dipencsv 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dipencsv/__init__.py +8 -0
- dipencsv/analytics.py +70 -0
- dipencsv/cleaner.py +154 -0
- dipencsv/core.py +226 -0
- dipencsv/engine/__init__.py +6 -0
- dipencsv/engine/base.py +42 -0
- dipencsv/engine/pandas_engine.py +53 -0
- dipencsv/engine/stream_engine.py +91 -0
- dipencsv/errors.py +23 -0
- dipencsv/exporter.py +45 -0
- dipencsv/intelligence.py +194 -0
- dipencsv/loader.py +24 -0
- dipencsv/summary.py +103 -0
- dipencsv-0.1.0.dist-info/METADATA +239 -0
- dipencsv-0.1.0.dist-info/RECORD +17 -0
- dipencsv-0.1.0.dist-info/WHEEL +5 -0
- dipencsv-0.1.0.dist-info/top_level.txt +1 -0
dipencsv/__init__.py
ADDED
dipencsv/analytics.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# dipencsv/analytics.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from dipencsv.errors import format_error, format_success
|
|
5
|
+
|
|
6
|
+
def describe(df: pd.DataFrame) -> pd.DataFrame:
    """Return descriptive statistics for the numeric columns, rounded to 2 places.

    Prints a warning and returns an empty DataFrame when the numeric
    selection is empty.
    """
    numbers_only = df.select_dtypes(include="number")
    if not numbers_only.empty:
        stats = numbers_only.describe()
        return stats.round(2)
    print("⚠️ No numeric columns found")
    return pd.DataFrame()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def value_counts(df: pd.DataFrame, col: str) -> pd.Series:
    """Return how often each distinct value occurs in column ``col``."""
    column = df[col]
    return column.value_counts()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise correlation of the numeric columns, rounded to 2 places.

    Prints a warning and returns an empty DataFrame when the numeric
    selection is empty.
    """
    numbers_only = df.select_dtypes(include="number")
    if not numbers_only.empty:
        matrix = numbers_only.corr()
        return matrix.round(2)
    print("⚠️ No numeric columns found")
    return pd.DataFrame()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return the rows whose ``col`` value lies outside the 1.5 * IQR fences.

    Rows with a missing value in ``col`` are never reported, because both
    comparisons evaluate to False for NaN. The number of matching rows is
    printed before returning.
    """
    series = df[col]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    is_outlier = (series < lower) | (series > upper)
    result = df[is_outlier]
    print(f"📊 Outliers in '{col}': {len(result)} rows")
    return result
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def distribution(df: pd.DataFrame, col: str) -> dict:
    """Return mean, median, std, min, max and skew of ``col`` as rounded floats.

    The keys appear in that order and every value is rounded to 2 places.
    """
    series = df[col]
    stat_names = ("mean", "median", "std", "min", "max", "skew")
    return {
        name: round(float(getattr(series, name)()), 2)
        for name in stat_names
    }
|
|
46
|
+
def sort(df: pd.DataFrame, col: str, asc: bool = True) -> pd.DataFrame:
    """Return ``df`` ordered by ``col`` with a fresh 0..n-1 index.

    When ``col`` does not exist an error is printed and ``df`` is returned
    unchanged.
    """
    if col in df.columns:
        ordered = df.sort_values(by=col, ascending=asc).reset_index(drop=True)
        direction = "ascending" if asc else "descending"
        print(format_success(f"Sorted by '{col}' ({direction})"))
        return ordered
    print(format_error(f"Column '{col}' not found"))
    return df
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def first(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the first ``n`` rows of ``df`` (5 by default)."""
    leading_rows = df.head(n)
    return leading_rows
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def last(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the last ``n`` rows of ``df`` (5 by default)."""
    trailing_rows = df.tail(n)
    return trailing_rows
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def find(df: pd.DataFrame, col: str, value) -> pd.DataFrame:
    """Return the rows where ``col`` equals ``value``.

    When ``col`` does not exist an error is printed and an empty DataFrame
    is returned.
    """
    if col in df.columns:
        matches = df[df[col] == value]
        print(format_success(f"Found {len(matches)} rows where '{col}' = '{value}'"))
        return matches
    print(format_error(f"Column '{col}' not found"))
    return pd.DataFrame()
|
dipencsv/cleaner.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# dipencsv/cleaner.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from dipencsv.errors import format_error, format_warning, format_success
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def clean(df: pd.DataFrame, strategy: str = "safe") -> pd.DataFrame:
    """Run the full cleaning pipeline on a copy of ``df`` and return the copy.

    Steps, in order: normalise column names, drop duplicate rows, handle
    missing values according to ``strategy``, then convert mostly-numeric
    text columns to numbers. The caller's frame is left untouched.
    """
    # Work on a copy so the helpers may assign columns in place.
    working = _fix_columns(df.copy())

    rows_before = len(working)
    working = working.drop_duplicates()
    dropped = rows_before - len(working)
    if dropped > 0:
        print(format_warning(f"Removed {dropped} duplicate rows"))

    working = _handle_missing(working, strategy)
    working = _fix_numeric(working)

    print(format_success("Cleaning done"))
    return working
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _fix_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
32
|
+
df.columns = (
|
|
33
|
+
df.columns
|
|
34
|
+
.str.strip()
|
|
35
|
+
.str.lower()
|
|
36
|
+
.str.replace(" ", "_")
|
|
37
|
+
.str.replace(r"[^\w]", "", regex=True)
|
|
38
|
+
)
|
|
39
|
+
return df
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _handle_missing(df: pd.DataFrame, strategy: str) -> pd.DataFrame:
|
|
43
|
+
if strategy == "safe":
|
|
44
|
+
for col in df.columns:
|
|
45
|
+
if df[col].isnull().sum() == 0:
|
|
46
|
+
continue
|
|
47
|
+
if df[col].dtype in ["float64", "int64"]:
|
|
48
|
+
df[col] = df[col].fillna(df[col].median())
|
|
49
|
+
else:
|
|
50
|
+
mode = df[col].mode()
|
|
51
|
+
if not mode.empty:
|
|
52
|
+
df[col] = df[col].fillna(mode[0])
|
|
53
|
+
elif strategy == "aggressive":
|
|
54
|
+
before = len(df)
|
|
55
|
+
df = df.dropna()
|
|
56
|
+
print(format_warning(f"Dropped {before - len(df)} rows with missing values"))
|
|
57
|
+
return df
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _fix_numeric(df: pd.DataFrame) -> pd.DataFrame:
|
|
61
|
+
for col in df.select_dtypes(include=["object", "str"]).columns:
|
|
62
|
+
converted = pd.to_numeric(df[col], errors="coerce")
|
|
63
|
+
if converted.notna().sum() > len(df) * 0.7:
|
|
64
|
+
df[col] = converted
|
|
65
|
+
return df
|
|
66
|
+
|
|
67
|
+
def drop(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Return ``df`` without the given column(s).

    ``cols`` may be a single column name or a collection of names. If any
    name is unknown, an error is printed and ``df`` is returned unchanged.
    """
    if isinstance(cols, str):
        cols = [cols]
    unknown = [name for name in cols if name not in df.columns]
    if not unknown:
        trimmed = df.drop(columns=cols)
        print(format_success(f"Dropped columns: {cols}"))
        return trimmed
    print(format_error(f"Columns not found: {unknown}"))
    return df
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def rename(df: pd.DataFrame, old: str, new: str) -> pd.DataFrame:
    """Return ``df`` with column ``old`` renamed to ``new``.

    When ``old`` does not exist an error is printed and ``df`` is returned
    unchanged.
    """
    if old in df.columns:
        renamed = df.rename(columns={old: new})
        print(format_success(f"Renamed '{old}' → '{new}'"))
        return renamed
    print(format_error(f"Column '{old}' not found"))
    return df
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def fill(df: pd.DataFrame, col: str, value) -> pd.DataFrame:
    """Replace missing values in ``col`` with ``value`` and return ``df``.

    The column is assigned back into ``df``, so the caller's frame is
    modified. When ``col`` does not exist an error is printed and ``df`` is
    returned unchanged.
    """
    if col in df.columns:
        patched = df[col].fillna(value)
        df[col] = patched
        print(format_success(f"Filled missing values in '{col}' with {value}"))
        return df
    print(format_error(f"Column '{col}' not found"))
    return df
|
|
95
|
+
|
|
96
|
+
def encode(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Replace the values of ``col`` with integer codes and return ``df``.

    Codes are assigned in order of first appearance, starting at 0. The
    column is assigned back into ``df`` and the mapping is printed. When
    ``col`` does not exist an error is printed and ``df`` is returned
    unchanged.
    """
    if col in df.columns:
        mapping = {
            value: position
            for position, value in enumerate(df[col].unique())
        }
        df[col] = df[col].map(mapping)
        print(format_success(f"Encoded '{col}': {mapping}"))
        return df
    print(format_error(f"Column '{col}' not found"))
    return df
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def normalize(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Min-max scale ``col`` and return ``df``.

    The column is assigned back into ``df``. An error is printed and ``df``
    is returned unchanged when ``col`` does not exist or when its minimum
    equals its maximum.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df

    low, high = df[col].min(), df[col].max()
    if high == low:
        # A zero-width range would mean dividing by zero.
        print(format_error(f"Cannot normalize '{col}' — all values are the same"))
        return df

    shifted = df[col] - low
    df[col] = shifted / (high - low)
    print(format_success(f"Normalized '{col}' to range [0, 1]"))
    return df
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def standardize(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Z-score ``col`` (subtract the mean, divide by the std) and return ``df``.

    The column is assigned back into ``df``. An error is printed and ``df``
    is returned unchanged when ``col`` does not exist or when its standard
    deviation is 0.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df

    center, spread = df[col].mean(), df[col].std()
    if spread == 0:
        print(format_error(f"Cannot standardize '{col}' — std is 0"))
        return df

    centered = df[col] - center
    df[col] = centered / spread
    print(format_success(f"Standardized '{col}' — mean=0, std=1"))
    return df
|
|
133
|
+
|
|
134
|
+
def remove_outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return only the rows whose ``col`` value lies within the 1.5 * IQR fences.

    Rows with a missing value in ``col`` fail both comparisons, so they are
    removed as well and counted in the printed total. When ``col`` does not
    exist an error is printed and ``df`` is returned unchanged.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df

    series = df[col]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    within_fences = (series >= lower) & (series <= upper)
    kept = df[within_fences]
    removed = len(df) - len(kept)
    print(format_success(f"Removed {removed} outliers from '{col}'"))
    return kept
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def split(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42):
    """Split ``df`` into a ``(train, test)`` pair with scikit-learn.

    scikit-learn is imported only when this function is called.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(df, test_size=test_size, random_state=random_state)
    train, test = parts
    print(format_success(f"Split done — train: {len(train)} rows, test: {len(test)} rows"))
    return train, test
|
dipencsv/core.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# dipencsv/core.py
|
|
2
|
+
|
|
3
|
+
from dipencsv.loader import load_file, get_file_size_mb
|
|
4
|
+
from dipencsv.engine.pandas_engine import PandasEngine
|
|
5
|
+
from dipencsv.engine.stream_engine import StreamEngine
|
|
6
|
+
from dipencsv.summary import summary, report
|
|
7
|
+
from dipencsv.cleaner import clean
|
|
8
|
+
from dipencsv.analytics import describe, value_counts, correlation_matrix, outliers, distribution
|
|
9
|
+
from dipencsv.intelligence import ask, explain, magic
|
|
10
|
+
from dipencsv.exporter import export
|
|
11
|
+
from dipencsv.cleaner import clean, drop, rename, fill
|
|
12
|
+
from dipencsv.cleaner import clean, drop, rename, fill, encode, normalize, standardize
|
|
13
|
+
from dipencsv.cleaner import clean, drop, rename, fill, encode, normalize, standardize, remove_outliers, split
|
|
14
|
+
from dipencsv.analytics import describe, value_counts, correlation_matrix, outliers, distribution, sort, find, first, last
|
|
15
|
+
|
|
16
|
+
SIZE_THRESHOLD_MB = 500
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Data:
    """User-facing wrapper around a dataset loaded from ``filepath``.

    The work is delegated to an engine: ``PandasEngine`` holds the whole
    file in memory as ``_df``, while ``StreamEngine`` reads the file in
    chunks. Operations that need the full frame only run when the engine
    exposes ``_df``; otherwise they print a warning and return ``None``.
    """

    def __init__(self, filepath: str, stream: bool = False, auto_mode: bool = True):
        self._filepath = filepath
        self._engine = self._init_engine(filepath, stream, auto_mode)

    def _init_engine(self, filepath, stream, auto_mode):
        """Choose the engine: an explicit ``stream`` wins, then the file size."""
        if stream:
            print("🌊 Stream mode activated")
            return StreamEngine(filepath)

        if auto_mode:
            size = get_file_size_mb(filepath)
            if size > SIZE_THRESHOLD_MB:
                print(f"🌊 File is {size:.1f}MB — auto switching to stream mode")
                return StreamEngine(filepath)
            print(f"🐼 File is {size:.1f}MB — using pandas mode")

        return PandasEngine(load_file(filepath))

    def _in_memory(self, op: str) -> bool:
        """Return True when the engine holds a full frame, else warn about ``op``."""
        if hasattr(self._engine, "_df"):
            return True
        print(f"⚠️ {op}() not supported in stream mode")
        return False

    # --- analytics delegated to the engine (available in both modes) ---
    def mean(self, col: str) -> float:
        return self._engine.mean(col)

    def median(self, col: str) -> float:
        return self._engine.median(col)

    def max(self, col: str) -> float:
        return self._engine.max(col)

    def min(self, col: str) -> float:
        return self._engine.min(col)

    def count(self, col: str) -> int:
        return self._engine.count(col)

    def group_mean(self, group_col: str, value_col: str) -> dict:
        return self._engine.group_mean(group_col, value_col)

    def correlation(self, col1: str, col2: str) -> float:
        return self._engine.correlation(col1, col2)

    def filter(self, condition: str):
        return self._engine.filter(condition)

    def top_n(self, col: str, n: int = 10):
        return self._engine.top_n(col, n)

    # --- reporting (in-memory only) ---
    # Inside a method, a bare name such as summary(...) resolves to the
    # module-level function of that name, not to the method being defined.
    def summary(self):
        if self._in_memory("summary"):
            summary(self._engine._df)

    def report(self):
        if self._in_memory("report"):
            report(self._engine._df)

    def describe(self):
        if self._in_memory("describe"):
            return describe(self._engine._df)
        return None

    def value_counts(self, col: str):
        if self._in_memory("value_counts"):
            return value_counts(self._engine._df, col)
        return None

    def correlation_matrix(self):
        if self._in_memory("correlation_matrix"):
            return correlation_matrix(self._engine._df)
        return None

    def outliers(self, col: str):
        if self._in_memory("outliers"):
            return outliers(self._engine._df, col)
        return None

    def distribution(self, col: str):
        if self._in_memory("distribution"):
            return distribution(self._engine._df, col)
        return None

    def ask(self, question: str):
        if self._in_memory("ask"):
            return ask(self._engine._df, question)
        return None

    def explain(self):
        if self._in_memory("explain"):
            explain(self._engine._df)

    def magic(self):
        if self._in_memory("magic"):
            magic(self._engine._df)

    def export(self, filepath: str):
        if self._in_memory("export"):
            export(self._engine._df, filepath)

    # --- transformations: the result replaces the engine's frame ---
    def clean(self, strategy: str = "safe"):
        if self._in_memory("clean"):
            self._engine._df = clean(self._engine._df, strategy)

    def drop(self, cols):
        if self._in_memory("drop"):
            self._engine._df = drop(self._engine._df, cols)

    def rename(self, old: str, new: str):
        if self._in_memory("rename"):
            self._engine._df = rename(self._engine._df, old, new)

    def fill(self, col: str, value):
        if self._in_memory("fill"):
            self._engine._df = fill(self._engine._df, col, value)

    def encode(self, col: str):
        if self._in_memory("encode"):
            self._engine._df = encode(self._engine._df, col)

    def normalize(self, col: str):
        if self._in_memory("normalize"):
            self._engine._df = normalize(self._engine._df, col)

    def standardize(self, col: str):
        if self._in_memory("standardize"):
            self._engine._df = standardize(self._engine._df, col)

    def remove_outliers(self, col: str):
        if self._in_memory("remove_outliers"):
            self._engine._df = remove_outliers(self._engine._df, col)

    def sort(self, col: str, asc: bool = True):
        if self._in_memory("sort"):
            self._engine._df = sort(self._engine._df, col, asc)

    def split(self, test_size: float = 0.2, random_state: int = 42):
        if self._in_memory("split"):
            return split(self._engine._df, test_size, random_state)
        return None

    # --- inspection ---
    @property
    def columns(self) -> list:
        """Column names, taken from the frame or from the stream engine's header."""
        if hasattr(self._engine, "_df"):
            return list(self._engine._df.columns)
        return self._engine._columns

    @property
    def shape(self) -> tuple:
        """``(rows, columns)`` of the in-memory frame; ``None`` in stream mode."""
        if hasattr(self._engine, "_df"):
            return self._engine._df.shape
        return None

    def first(self, n: int = 5):
        if self._in_memory("first"):
            return first(self._engine._df, n)
        return None

    def last(self, n: int = 5):
        if self._in_memory("last"):
            return last(self._engine._df, n)
        return None

    def find(self, col: str, value):
        if self._in_memory("find"):
            return find(self._engine._df, col, value)
        return None
|
dipencsv/engine/base.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# dipencsv/engine/base.py
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseEngine(ABC):
    """Abstract interface that every dataset engine implements."""

    @abstractmethod
    def mean(self, col: str) -> float:
        """Return the mean of column ``col``."""

    @abstractmethod
    def median(self, col: str) -> float:
        """Return the median of column ``col``."""

    @abstractmethod
    def max(self, col: str) -> float:
        """Return the largest value of column ``col``."""

    @abstractmethod
    def min(self, col: str) -> float:
        """Return the smallest value of column ``col``."""

    @abstractmethod
    def count(self, col: str) -> int:
        """Return a count for column ``col``."""

    @abstractmethod
    def group_mean(self, group_col: str, value_col: str) -> dict:
        """Return the mean of ``value_col`` per distinct ``group_col`` value."""

    @abstractmethod
    def correlation(self, col1: str, col2: str) -> float:
        """Return the correlation between ``col1`` and ``col2``."""

    @abstractmethod
    def filter(self, condition: str):
        """Return the rows that satisfy ``condition``."""

    @abstractmethod
    def top_n(self, col: str, n: int):
        """Return the ``n`` rows ranked highest by ``col``."""
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# dipencsv/engine/pandas_engine.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from dipencsv.engine.base import BaseEngine
|
|
5
|
+
from dipencsv.errors import suggest_column
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PandasEngine(BaseEngine):
    """Engine that answers every query from a DataFrame held in memory."""

    def __init__(self, df: pd.DataFrame):
        self._df = df

    def _series(self, col: str) -> pd.Series:
        """Validate ``col`` and return the matching column."""
        self._check_col(col)
        return self._df[col]

    def mean(self, col: str) -> float:
        return self._series(col).mean()

    def median(self, col: str) -> float:
        return self._series(col).median()

    def max(self, col: str) -> float:
        return self._series(col).max()

    def min(self, col: str) -> float:
        return self._series(col).min()

    def count(self, col: str) -> int:
        # Series.count() counts non-missing values only.
        return self._series(col).count()

    def group_mean(self, group_col: str, value_col: str) -> dict:
        for name in (group_col, value_col):
            self._check_col(name)
        per_group = self._df.groupby(group_col)[value_col].mean()
        return per_group.to_dict()

    def correlation(self, col1: str, col2: str) -> float:
        left = self._series(col1)
        right = self._series(col2)
        return left.corr(right)

    def filter(self, condition: str) -> pd.DataFrame:
        # The condition string is handed to DataFrame.query() unchanged.
        return self._df.query(condition)

    def top_n(self, col: str, n: int = 10) -> pd.DataFrame:
        self._check_col(col)
        return self._df.nlargest(n, col)

    def _check_col(self, col: str):
        """Raise ``ValueError`` with a suggestion when ``col`` is unknown."""
        if col in self._df.columns:
            return
        suggestion = suggest_column(col, list(self._df.columns))
        raise ValueError(f"❌ Column '{col}' not found.\n{suggestion}")
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# dipencsv/engine/stream_engine.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from dipencsv.engine.base import BaseEngine
|
|
5
|
+
from dipencsv.errors import suggest_column
|
|
6
|
+
|
|
7
|
+
CHUNK_SIZE = 100_000
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class StreamEngine(BaseEngine):
    """Engine that answers queries by reading the CSV file chunk by chunk.

    Every query re-reads the file from the start in chunks of ``chunk_size``
    rows. ``median`` and ``correlation`` are not implemented and raise
    ``NotImplementedError``.
    """

    def __init__(self, filepath: str, chunk_size: int = CHUNK_SIZE):
        self._filepath = filepath
        self._chunk_size = chunk_size
        self._columns = self._get_columns()

    def _get_columns(self) -> list:
        """Read only the start of the file to learn the column names."""
        df = pd.read_csv(self._filepath, nrows=1)
        return list(df.columns)

    def _chunks(self):
        """Return a fresh chunk iterator over the file."""
        return pd.read_csv(self._filepath, chunksize=self._chunk_size)

    def mean(self, col: str) -> float:
        """Return the mean of the non-missing values, or 0 when there are none."""
        self._check_col(col)
        total, count = 0, 0
        for chunk in self._chunks():
            total += chunk[col].sum()
            count += chunk[col].count()
        return total / count if count else 0

    def median(self, col: str) -> float:
        raise NotImplementedError("⚠️ median() not supported in stream mode — load a sample instead")

    def max(self, col: str) -> float:
        """Return the largest value of ``col`` across all chunks.

        Returns NaN when every value is missing and ``None`` when the file
        yields no chunks.
        """
        self._check_col(col)
        result = None
        for chunk in self._chunks():
            chunk_max = chunk[col].max()
            # A chunk with only missing values yields NaN. Never let NaN win
            # over a real value, and replace a NaN carried from earlier chunks.
            if result is None or pd.isna(result) or chunk_max > result:
                result = chunk_max
        return result

    def min(self, col: str) -> float:
        """Return the smallest value of ``col`` across all chunks.

        Returns NaN when every value is missing and ``None`` when the file
        yields no chunks.
        """
        self._check_col(col)
        result = None
        for chunk in self._chunks():
            chunk_min = chunk[col].min()
            # Same NaN handling as max().
            if result is None or pd.isna(result) or chunk_min < result:
                result = chunk_min
        return result

    def count(self, col: str) -> int:
        """Return the number of non-missing values in ``col``."""
        self._check_col(col)
        total = 0
        for chunk in self._chunks():
            total += chunk[col].count()
        return total

    def group_mean(self, group_col: str, value_col: str) -> dict:
        """Return the mean of ``value_col`` for each ``group_col`` value.

        Sums and counts are accumulated per group across chunks. A group
        whose values are all missing maps to NaN.
        """
        self._check_col(group_col)
        self._check_col(value_col)
        agg = {}
        for chunk in self._chunks():
            for key, grp in chunk.groupby(group_col):
                bucket = agg.setdefault(key, {"sum": 0, "count": 0})
                bucket["sum"] += grp[value_col].sum()
                bucket["count"] += grp[value_col].count()
        return {
            k: (v["sum"] / v["count"] if v["count"] else float("nan"))
            for k, v in agg.items()
        }

    def correlation(self, col1: str, col2: str) -> float:
        raise NotImplementedError("⚠️ correlation() not supported in stream mode")

    def filter(self, condition: str) -> pd.DataFrame:
        """Return all rows matching ``condition``, re-indexed from 0."""
        results = [chunk.query(condition) for chunk in self._chunks()]
        if not results:
            # pd.concat() raises on an empty list.
            return pd.DataFrame(columns=self._columns)
        return pd.concat(results, ignore_index=True)

    def top_n(self, col: str, n: int = 10) -> pd.DataFrame:
        """Return the ``n`` rows with the largest ``col`` values."""
        self._check_col(col)
        best = None
        for chunk in self._chunks():
            candidates = chunk.nlargest(n, col)
            # Only the running top n is kept, so memory stays bounded. The
            # first chunk is used as is: concatenating with an empty frame
            # is deprecated in pandas and can alter the result dtypes.
            if best is None:
                best = candidates
            else:
                best = pd.concat([best, candidates]).nlargest(n, col)
        return best if best is not None else pd.DataFrame()

    def _check_col(self, col: str):
        """Raise ``ValueError`` with a suggestion when ``col`` is unknown."""
        if col not in self._columns:
            suggestion = suggest_column(col, self._columns)
            raise ValueError(f"❌ Column '{col}' not found.\n{suggestion}")
|
dipencsv/errors.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# dipencsv/errors.py
|
|
2
|
+
|
|
3
|
+
from difflib import get_close_matches
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def suggest_column(col: str, available_cols: list) -> str:
    """Build a hint for an unknown column name.

    Returns up to three close matches for ``col`` (similarity cutoff 0.5),
    or the full list of available columns when nothing is close. Labels
    are compared and shown as strings so non-string labels (for example
    integers) do not make ``difflib`` raise.
    """
    names = [str(c) for c in available_cols]
    matches = get_close_matches(str(col), names, n=3, cutoff=0.5)
    if matches:
        suggestions = "\n".join(f" - {m}" for m in matches)
        return f"💡 Did you mean:\n{suggestions}"
    listing = "\n".join(f" - {c}" for c in names)
    return "💡 Available columns:\n" + listing
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def format_error(msg: str) -> str:
    """Return ``msg`` prefixed with the error marker."""
    return "❌ {}".format(msg)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def format_warning(msg: str) -> str:
    """Return ``msg`` prefixed with the warning marker."""
    return "⚠️ {}".format(msg)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def format_success(msg: str) -> str:
    """Return ``msg`` prefixed with the success marker."""
    return "✅ {}".format(msg)
|
dipencsv/exporter.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# dipencsv/exporter.py
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from dipencsv.errors import format_success, format_error
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def export(df: pd.DataFrame, filepath: str):
    """Write ``df`` to ``filepath``, choosing the format from the extension.

    Supported extensions are ``.csv``, ``.json`` and ``.xlsx``. The
    extension is matched case-insensitively, so ``REPORT.CSV`` works too.
    Any other extension prints an error and writes nothing.
    """
    # str() lets path-like objects through; the original path is passed on.
    lowered = str(filepath).lower()
    writers = (
        (".csv", _export_csv),
        (".json", _export_json),
        (".xlsx", _export_excel),
    )
    for suffix, writer in writers:
        if lowered.endswith(suffix):
            writer(df, filepath)
            return
    print(format_error(f"Unsupported format: {filepath}"))
    print("💡 Supported: .csv, .json, .xlsx")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _export_csv(df: pd.DataFrame, filepath: str):
    """Write ``df`` to ``filepath`` as CSV without the index column.

    Failures are reported with a printed error instead of being raised.
    """
    try:
        df.to_csv(filepath, index=False)
        confirmation = format_success(f"Exported to {filepath}")
        print(confirmation)
    except Exception as exc:
        print(format_error(f"Failed to export CSV: {exc}"))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _export_json(df: pd.DataFrame, filepath: str):
    """Write ``df`` to ``filepath`` as a JSON array with one object per row.

    The rows are serialised before the file is opened, so a value that
    cannot be encoded no longer leaves a truncated file behind. Failures
    are reported with a printed error instead of being raised.
    """
    try:
        data = df.to_dict(orient="records")
        payload = json.dumps(data, indent=2)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(payload)
        print(format_success(f"Exported to {filepath}"))
    except Exception as e:
        print(format_error(f"Failed to export JSON: {e}"))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _export_excel(df: pd.DataFrame, filepath: str):
    """Write ``df`` to ``filepath`` as an Excel workbook without the index.

    An ``ImportError`` is reported as a missing openpyxl install; any other
    failure is reported with its message. Nothing is raised.
    """
    try:
        df.to_excel(filepath, index=False)
        confirmation = format_success(f"Exported to {filepath}")
        print(confirmation)
    except ImportError:
        print(format_error("openpyxl not installed — run: pip install openpyxl"))
    except Exception as exc:
        print(format_error(f"Failed to export Excel: {exc}"))
|
dipencsv/intelligence.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# dipencsv/intelligence.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from dipencsv.errors import suggest_column
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def ask(df: pd.DataFrame, question: str):
    """Answer a simple plain-English question about ``df``.

    Recognised keywords, checked in this order: ``by`` (grouping), ``top``,
    ``average``/``mean``, ``highest``/``maximum``/``max``,
    ``lowest``/``minimum``/``min``, and ``count``/``how many``. The answer
    is printed and returned. When the question is not understood, usage
    hints are printed and ``None`` is returned.
    """
    q = question.lower().strip()
    # Keywords must match whole words. With substring matching, "nearby" or
    # a column called "ruby_price" triggered the "by" handler, and "admin"
    # triggered the "min" handler.
    words = {w.strip(".,;:!?\"'()") for w in q.split()}

    # group by — check FIRST before average/mean
    if "by" in words:
        result = _parse_group(q, df)
        if result is not None:
            print(result.to_string())
            return result
        print("🤔 Could not parse group query.")
        return None

    # top N
    if "top" in words:
        n, col = _parse_top(q, df)
        if col:
            result = df.nlargest(n, col)
            print(f"🏆 Top {n} by '{col}':")
            print(result[[col]].to_string())
            return result

    # average / mean
    if words & {"average", "mean"}:
        col = _find_col(q, df)
        if col:
            if not pd.api.types.is_numeric_dtype(df[col]):
                print(f"❌ '{col}' is a text column — cannot calculate average")
                return None
            result = df[col].mean()
            print(f"📊 Average {col}: {result:.2f}")
            return result

    # highest / maximum
    if words & {"highest", "maximum", "max"}:
        col = _find_col(q, df)
        if col:
            result = df[col].max()
            print(f"📈 Highest {col}: {result}")
            return result

    # lowest / minimum
    if words & {"lowest", "minimum", "min"}:
        col = _find_col(q, df)
        if col:
            result = df[col].min()
            print(f"📉 Lowest {col}: {result}")
            return result

    # count ("how many" is a phrase, so it is matched against the full text)
    if "count" in words or "how many" in q:
        col = _find_col(q, df)
        if col:
            result = df[col].count()
            print(f"🔢 Count of {col}: {result}")
            return result

    numeric_cols = list(df.select_dtypes(include="number").columns)
    # Inspect dtypes directly rather than select_dtypes(include=[..., "str"]),
    # which some pandas releases reject with a TypeError.
    text_cols = [
        name
        for name, dtype in df.dtypes.items()
        if pd.api.types.is_string_dtype(dtype)
    ]

    print("🤔 Could not understand the question.")
    print(f"\n💡 Numeric columns (use for average/top/highest): {numeric_cols}")
    print(f"💡 Text columns (use for grouping): {text_cols}")
    print("\n💡 Try:")
    if numeric_cols:
        print(f" - 'average {numeric_cols[0]}'")
        print(f" - 'top 10 {numeric_cols[0]}'")
    if numeric_cols and text_cols:
        print(f" - 'highest {numeric_cols[0]} by {text_cols[0]}'")
    return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def explain(df: pd.DataFrame):
    """Print a plain-text overview of ``df``: size, trends, issues, suggestions.

    The "ready for analysis" line is printed only when the frame has no
    missing values and no duplicate rows.
    """
    banner = "=" * 40
    rule = "-" * 40

    print(banner)
    print("🧠 DATASET EXPLANATION")
    print(banner)

    rows, cols = df.shape
    print(f"\n📦 Dataset has {rows} rows and {cols} columns.")

    numeric = df.select_dtypes(include="number")
    # Inspect dtypes directly rather than select_dtypes(include=[..., "str"]),
    # which some pandas releases reject with a TypeError.
    text_cols = [
        name
        for name, dtype in df.dtypes.items()
        if pd.api.types.is_string_dtype(dtype)
    ]

    print(f"🔢 Numeric columns : {list(numeric.columns)}")
    print(f"🔤 Text columns : {text_cols}")

    # trends
    print("\n📈 TRENDS")
    print(rule)
    high_skew = []
    for col in numeric.columns:
        mean = numeric[col].mean()
        std = numeric[col].std()
        skew = numeric[col].skew()
        if skew > 1:
            skew_label = "right-skewed"
        elif skew < -1:
            skew_label = "left-skewed"
        else:
            skew_label = "normal"
        if abs(skew) > 1:
            high_skew.append(col)
        print(f" {col}: mean={mean:.2f}, std={std:.2f}, distribution={skew_label}")

    # issues
    print("\n⚠️ ISSUES")
    print(rule)
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if missing.empty:
        print(" ✅ No missing values")
    else:
        for col, count in missing.items():
            pct = (count / rows) * 100
            print(f" ⚠️ '{col}' has {count} missing values ({pct:.1f}%)")

    dupes = df.duplicated().sum()
    if dupes > 0:
        print(f" ⚠️ {dupes} duplicate rows found")
    else:
        print(" ✅ No duplicates")

    # suggestions
    print("\n💡 SUGGESTIONS")
    print(rule)
    if not missing.empty:
        print(" - Run data.clean() to fix missing values")
    if dupes > 0:
        print(" - Run data.clean() to remove duplicates")
    if high_skew:
        print(f" - Columns with high skew: {high_skew} — consider log transform")
    # The earlier condition was inverted and announced "ready" whenever
    # values were missing.
    if missing.empty and dupes == 0:
        print(" - Data looks ready for analysis ✅")
    print(banner)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def magic(df: pd.DataFrame):
    """Print a compact one-shot overview of ``df`` to stdout.

    The output lists the frame's dimensions, the total number of missing
    cells and duplicate rows, the min/mean/max of every numeric column, and
    a closing recommendation on whether cleaning is needed first.

    Args:
        df: The DataFrame to inspect; it is only read.

    Returns:
        None. Everything is printed.
    """
    banner = "=" * 40
    divider = "-" * 40

    print(banner)
    print("✨ MAGIC MODE")
    print(banner)

    n_rows, n_cols = df.shape
    missing_total = df.isnull().sum().sum()
    duplicate_total = df.duplicated().sum()
    numeric_part = df.select_dtypes(include="number")

    print(f"\n📦 {n_rows} rows × {n_cols} cols")
    print(f"⚠️ Missing values : {missing_total}")
    print(f"⚠️ Duplicates : {duplicate_total}")

    print("\n📈 KEY INSIGHTS")
    print(divider)
    for name in numeric_part.columns:
        series = numeric_part[name]
        lowest = series.min()
        average = series.mean()
        highest = series.max()
        print(f" {name}: min={lowest:.2f}, mean={average:.2f}, max={highest:.2f}")

    print("\n🏆 RECOMMENDATION")
    print(divider)
    needs_cleaning = missing_total > 0 or duplicate_total > 0
    if not needs_cleaning:
        print(" ✅ Data is clean — ready to analyze")
    else:
        print(" ⚠️ Run data.clean() before analysis")
    print(banner)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# --- internal helpers ---
|
|
165
|
+
|
|
166
|
+
def _find_col(question: str, df: pd.DataFrame):
    """Return the first column of ``df`` that appears as a word in ``question``.

    Matching is case-insensitive and on whole words, never substrings.
    Punctuation attached to a word ("salary?", "city,") is ignored, and
    column labels that are not strings are compared by their ``str()`` form
    instead of raising.

    Args:
        question: Free-text question typed by the user.
        df: DataFrame whose column labels are searched, in column order.

    Returns:
        The matching column label exactly as stored in ``df.columns``, or
        None when nothing matches (a hint listing the columns is printed).
    """
    import string

    tokens = set()
    for raw in question.lower().split():
        tokens.add(raw)
        # Also accept the word without surrounding punctuation.
        tokens.add(raw.strip(string.punctuation))
    # A token made only of punctuation strips to "", which must not match.
    tokens.discard("")

    for col in df.columns:
        # str() keeps integer or other non-string labels from raising.
        if str(col).lower() in tokens:
            return col

    print("❌ No matching column found in your question.")
    print(f"💡 Your columns: {list(df.columns)}")
    return None
|
|
174
|
+
|
|
175
|
+
def _parse_top(question: str, df: pd.DataFrame):
    """Extract the row count and target column from a "top N <column>" question.

    Args:
        question: Free-text question, e.g. "top 10 salary".
        df: DataFrame whose columns are searched for a match.

    Returns:
        A ``(n, col)`` tuple. ``n`` is the integer following the word "top",
        or 10 when no such number is present. ``col`` is whatever
        ``_find_col`` returns for the question, which may be None.
    """
    import re

    n = 10
    # \b stops words such as "laptop 5" from matching. IGNORECASE keeps this
    # in line with the case-insensitive column lookup ("Top 5 salary").
    match = re.search(r"\btop\s+(\d+)", question, re.IGNORECASE)
    if match:
        n = int(match.group(1))
    col = _find_col(question, df)
    return n, col
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _parse_group(question: str, df: pd.DataFrame):
    """Answer a "<value> by <group>" style question with per-group means.

    The first two columns of ``df`` (in column order) whose names appear as
    whole words in ``question`` are used. The second is treated as the value
    column. If only the first of the two is numeric, the roles are swapped,
    so the answer does not depend on how the columns are ordered in the frame.

    Args:
        question: Free-text question, e.g. "average salary by city".
        df: DataFrame to aggregate.

    Returns:
        A Series of group means rounded to 2 decimals, or None when fewer
        than two columns are mentioned or neither candidate is numeric.
    """
    words = set(question.lower().split())
    # str() keeps non-string column labels from raising on .lower().
    mentioned = [c for c in df.columns if str(c).lower() in words]
    if len(mentioned) < 2:
        return None

    numeric_cols = df.select_dtypes(include="number").columns
    group_col, value_col = mentioned[0], mentioned[1]
    if value_col not in numeric_cols and group_col in numeric_cols:
        # Column order put the numeric column first; it can only be the value.
        group_col, value_col = value_col, group_col
    if value_col not in numeric_cols:
        return None
    return df.groupby(group_col)[value_col].mean().round(2)
|
dipencsv/loader.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# dipencsv/loader.py
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def load_file(filepath: str) -> pd.DataFrame:
    """Validate ``filepath`` and read it into a DataFrame.

    The path is first checked by ``_validate``; any exception it raises
    propagates unchanged. The file is then parsed with ``pd.read_csv`` and a
    one-line confirmation with the resulting dimensions is printed.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The parsed DataFrame.
    """
    _validate(filepath)
    frame = pd.read_csv(filepath)
    row_count, col_count = frame.shape
    print(f"✅ Loaded: {row_count} rows × {col_count} cols")
    return frame
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _validate(filepath: str):
    """Check that ``filepath`` points to a non-empty CSV file.

    The extension check is case-insensitive, so ``DATA.CSV`` is accepted,
    and ``os.PathLike`` objects are accepted as well as plain strings.

    Args:
        filepath: Path to check.

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``.
        ValueError: If the name does not end in ``.csv`` or the file is
            0 bytes long.
    """
    # Normalise PathLike input so the string checks below work on it too.
    path = os.fspath(filepath)
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ File not found: {filepath}")
    if not path.lower().endswith(".csv"):
        raise ValueError(f"❌ Only CSV files supported: {filepath}")
    if os.path.getsize(path) == 0:
        raise ValueError(f"❌ File is empty: {filepath}")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_file_size_mb(filepath: str) -> float:
    """Return the size of the file at ``filepath`` in units of 1024 * 1024 bytes.

    Args:
        filepath: Path of the file to measure.

    Returns:
        The size reported by ``os.path.getsize`` divided by 1024 * 1024.
    """
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(filepath)
    return size_in_bytes / bytes_per_mb
|
dipencsv/summary.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# dipencsv/summary.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from dipencsv.errors import format_warning
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def summary(df: pd.DataFrame):
    """Print a structural overview of ``df`` to stdout.

    Sections, in order: dimensions, the dtype of every column, per-column
    missing-value counts (or a note that there are none), the numeric
    columns, and the text columns.

    Args:
        df: The DataFrame to describe; it is only read.

    Returns:
        None. Everything is printed.
    """
    heavy_rule = "=" * 40
    light_rule = "-" * 40
    n_rows, n_cols = df.shape

    print(heavy_rule)
    print("📊 DATASET SUMMARY")
    print(heavy_rule)
    print(f" Rows : {n_rows}")
    print(f" Columns : {n_cols}")
    print(f" Total cells : {n_rows * n_cols}")
    print()

    print("📋 COLUMN TYPES")
    print(light_rule)
    for col, dtype in df.dtypes.items():
        print(f" {col:<25} {str(dtype)}")
    print()

    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if not missing.empty:
        print("⚠️ MISSING VALUES")
        print(light_rule)
        for col, count in missing.items():
            # A non-empty ``missing`` implies at least one row, so no 0-division.
            pct = (count / n_rows) * 100
            print(f" {col:<25} {count} missing ({pct:.1f}%)")
        print()
    else:
        print("✅ No missing values found")
        print()

    print("🔢 NUMERIC COLUMNS")
    print(light_rule)
    numeric = df.select_dtypes(include="number").columns.tolist()
    if numeric:
        for col in numeric:
            print(f" - {col}")
    else:
        print(" none")
    print()

    print("🔤 TEXT COLUMNS")
    print(light_rule)
    # select_dtypes(include=["object", "str"]) raises TypeError on pandas
    # releases that reject string dtypes there, so classify by dtype with
    # the pandas.api.types predicates, which accept object and string dtypes.
    text = [
        col
        for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    ]
    if text:
        for col in text:
            print(f" - {col}")
    else:
        print(" none")
    print(heavy_rule)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def report(df: pd.DataFrame):
    """Print a data-quality report for ``df`` to stdout.

    Sections, in order: duplicate-row count, percentage of missing values
    per column, ``describe()`` statistics for numeric columns, and a list of
    warnings (any duplicates, plus every column with more than 30% missing).

    A frame with zero rows reports 0.0% missing for every column rather
    than ``nan%``.

    Args:
        df: The DataFrame to report on; it is only read.

    Returns:
        None. Everything is printed.
    """
    heavy_rule = "=" * 40
    light_rule = "-" * 40

    print(heavy_rule)
    print("📝 FULL REPORT")
    print(heavy_rule)

    # duplicates
    dupes = df.duplicated().sum()
    if dupes > 0:
        print(format_warning(f"{dupes} duplicate rows found"))
    else:
        print("✅ No duplicates found")
    print()

    # missing %
    print("📉 MISSING VALUES %")
    print(light_rule)
    missing_counts = df.isnull().sum()
    total_rows = len(df)
    if total_rows:
        missing_pct = (missing_counts / total_rows * 100).round(2)
    else:
        # 0 / 0 would yield NaN; with no rows nothing is missing.
        missing_pct = missing_counts.astype(float)
    for col, pct in missing_pct.items():
        status = "⚠️ " if pct > 0 else "✅"
        print(f" {status} {col:<23} {pct}%")
    print()

    # basic stats
    print("📈 BASIC STATS")
    print(light_rule)
    numeric_df = df.select_dtypes(include="number")
    if not numeric_df.empty:
        stats = numeric_df.describe().round(2)
        print(stats.to_string())
    else:
        print(" No numeric columns")
    print()

    # warnings (local is named ``notes`` to avoid shadowing the stdlib module)
    print("⚠️ WARNINGS")
    print(light_rule)
    notes = []
    if dupes > 0:
        notes.append(f"Duplicate rows: {dupes}")
    high_missing = missing_pct[missing_pct > 30]
    for col, pct in high_missing.items():
        notes.append(f"'{col}' has {pct}% missing values")
    if notes:
        for note in notes:
            print(f" ⚠️ {note}")
    else:
        print(" ✅ No major warnings")
    print(heavy_rule)
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dipencsv
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A beginner-friendly CSV analysis and ML data preparation toolkit
|
|
5
|
+
Author: Dipendra
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pandas>=1.5.0
|
|
10
|
+
Requires-Dist: openpyxl>=3.0.0
|
|
11
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
14
|
+
|
|
15
|
+
# DipenCSV 🐼
|
|
16
|
+
|
|
17
|
+
> DipenCSV is to pandas what seaborn is to matplotlib — a friendlier, higher-level API.
|
|
18
|
+
|
|
19
|
+
A beginner-friendly CSV analysis and ML data preparation toolkit built on top of pandas. Helps students and small teams clean, analyze, and prepare CSV data for machine learning — without needing to know pandas.
|
|
20
|
+
|
|
21
|
+
## Who is it for?
|
|
22
|
+
- 🎓 Students cleaning data for ML assignments
|
|
23
|
+
- 🚀 Small startups needing quick data insights
|
|
24
|
+
- 👨‍💻 Developers who work with CSVs occasionally
|
|
25
|
+
- 📊 Anyone who finds pandas too complex
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install dipencsv
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from dipencsv import Data
|
|
37
|
+
|
|
38
|
+
data = Data("your_file.csv")
|
|
39
|
+
data.magic() # one click everything
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Full ML Workflow in 15 lines
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from dipencsv import Data
|
|
46
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
47
|
+
|
|
48
|
+
data = Data("titanic.csv")
|
|
49
|
+
data.clean()
|
|
50
|
+
data.drop(["passengerid", "name", "ticket", "cabin"])
|
|
51
|
+
data.encode("sex")
|
|
52
|
+
data.encode("embarked")
|
|
53
|
+
data.remove_outliers("fare")
|
|
54
|
+
data.normalize("fare")
|
|
55
|
+
data.normalize("age")
|
|
56
|
+
|
|
57
|
+
train, test = data.split(test_size=0.2)
|
|
58
|
+
|
|
59
|
+
X_train = train.drop("survived", axis=1)
|
|
60
|
+
y_train = train["survived"]
|
|
61
|
+
X_test = test.drop("survived", axis=1)
|
|
62
|
+
y_test = test["survived"]
|
|
63
|
+
|
|
64
|
+
model = RandomForestClassifier()
|
|
65
|
+
model.fit(X_train, y_train)
|
|
66
|
+
print(f"Accuracy: {model.score(X_test, y_test):.2f}")
|
|
67
|
+
# Accuracy: 0.76
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## All Commands
|
|
73
|
+
|
|
74
|
+
### Loading
|
|
75
|
+
```python
|
|
76
|
+
data = Data("file.csv") # auto mode
|
|
77
|
+
data = Data("bigfile.csv", stream=True) # force stream mode
|
|
78
|
+
data = Data("file.csv", auto_mode=False) # force pandas mode
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Properties
|
|
82
|
+
```python
|
|
83
|
+
print(data.columns) # list of column names
|
|
84
|
+
print(data.shape) # (rows, columns)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Understanding Data
|
|
88
|
+
```python
|
|
89
|
+
data.summary() # rows, cols, types, missing values
|
|
90
|
+
data.report() # duplicates, missing %, basic stats
|
|
91
|
+
data.explain() # trends, issues, suggestions
|
|
92
|
+
data.magic() # ⭐ one click full analysis
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Cleaning
|
|
96
|
+
```python
|
|
97
|
+
data.clean() # auto clean (safe mode)
|
|
98
|
+
data.clean(strategy="aggressive") # drop rows with missing values
|
|
99
|
+
data.drop("column") # drop one column
|
|
100
|
+
data.drop(["col1", "col2"]) # drop multiple columns
|
|
101
|
+
data.rename("old_name", "new_name") # rename column
|
|
102
|
+
data.fill("column", value) # fill missing values manually
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
> ⚠️ After clean(), column names become lowercase with underscores.
|
|
106
|
+
> Always check: `print(data.columns)`
|
|
107
|
+
|
|
108
|
+
### ML Preparation
|
|
109
|
+
```python
|
|
110
|
+
data.encode("gender") # text → numbers (label encoding)
|
|
111
|
+
data.normalize("age") # scale to [0, 1]
|
|
112
|
+
data.standardize("salary") # scale to mean=0, std=1
|
|
113
|
+
data.remove_outliers("price") # remove extreme values (IQR method)
|
|
114
|
+
train, test = data.split(test_size=0.2) # train/test split
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Analytics
|
|
118
|
+
```python
|
|
119
|
+
data.mean("salary")
|
|
120
|
+
data.median("age")
|
|
121
|
+
data.max("salary")
|
|
122
|
+
data.min("salary")
|
|
123
|
+
data.count("city")
|
|
124
|
+
data.correlation("age", "salary")
|
|
125
|
+
data.correlation_matrix()
|
|
126
|
+
data.distribution("salary")
|
|
127
|
+
data.outliers("salary")
|
|
128
|
+
data.describe()
|
|
129
|
+
data.value_counts("city")
|
|
130
|
+
data.group_mean("city", "salary")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Querying
|
|
134
|
+
```python
|
|
135
|
+
data.first() # first 5 rows
|
|
136
|
+
data.first(10) # first 10 rows
|
|
137
|
+
data.last() # last 5 rows
|
|
138
|
+
data.sort("age") # sort ascending
|
|
139
|
+
data.sort("age", asc=False) # sort descending
|
|
140
|
+
data.find("city", "Bangalore") # find rows by value
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Intelligence
|
|
144
|
+
```python
|
|
145
|
+
data.ask("average salary")
|
|
146
|
+
data.ask("top 10 salary")
|
|
147
|
+
data.ask("highest salary by city")
|
|
148
|
+
data.ask("lowest age")
|
|
149
|
+
data.ask("count city")
|
|
150
|
+
data.explain()
|
|
151
|
+
data.magic()
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Export
|
|
155
|
+
```python
|
|
156
|
+
data.export("output.csv") # CSV
|
|
157
|
+
data.export("output.json") # JSON
|
|
158
|
+
data.export("output.xlsx") # Excel (requires openpyxl)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Stream Mode (Big Files)
|
|
164
|
+
|
|
165
|
+
DipenCSV automatically detects the file size and switches to stream mode for files larger than 500 MB:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
data = Data("hugefile.csv") # auto detects
|
|
169
|
+
data = Data("hugefile.csv", stream=True) # force stream
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Supported in stream mode: `mean()`, `max()`, `min()`, `count()`, `group_mean()`, `filter()`, `top_n()`
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Smart Error Handling
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
data.mean("salry")
|
|
180
|
+
# ❌ Column 'salry' not found.
|
|
181
|
+
# 💡 Did you mean:
|
|
182
|
+
# - salary
|
|
183
|
+
# - salary_usd
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Common Issues
|
|
189
|
+
|
|
190
|
+
**KeyError after clean()**
|
|
191
|
+
```python
|
|
192
|
+
data.clean()
|
|
193
|
+
print(data.columns) # check actual column names
|
|
194
|
+
data.mean("salary") # use lowercase
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
**Excel export failing**
|
|
198
|
+
```bash
|
|
199
|
+
pip install openpyxl
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
**Big file crashes**
|
|
203
|
+
```python
|
|
204
|
+
data = Data("bigfile.csv", stream=True)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Dependencies
|
|
210
|
+
- `pandas >= 1.5.0`
|
|
211
|
+
- `scikit-learn >= 1.0.0`
|
|
212
|
+
- `openpyxl >= 3.0.0` (installed automatically as a dependency; used for Excel export)
|
|
213
|
+
|
|
214
|
+
## Documentation
|
|
215
|
+
Full docs available in the `docs/` folder.
|
|
216
|
+
### Documentation Index
|
|
217
|
+
|
|
218
|
+
| Doc | What it covers |
|
|
219
|
+
|-----|----------------|
|
|
220
|
+
| [Getting Started](docs/getting_started.md) | installation, quick start |
|
|
221
|
+
| [Loading Data](docs/loading_data.md) | Data(), auto mode, stream mode |
|
|
222
|
+
| [Properties](docs/properties.md) | columns, shape |
|
|
223
|
+
| [Understanding Data](docs/understanding_data.md) | summary, report, explain, magic |
|
|
224
|
+
| [Cleaning Data](docs/cleaning_data.md) | clean, drop, rename, fill |
|
|
225
|
+
| [ML Preparation](docs/ml_prep.md) | encode, normalize, split etc |
|
|
226
|
+
| [Analytics](docs/analytics.md) | mean, correlation, outliers etc |
|
|
227
|
+
| [Query](docs/query.md) | sort, find, first, last |
|
|
228
|
+
| [Intelligence](docs/intelligence.md) | ask, explain, magic |
|
|
229
|
+
| [Stream Mode](docs/stream_mode.md) | big file handling |
|
|
230
|
+
| [Export](docs/export.md) | csv, json, xlsx |
|
|
231
|
+
| [Examples](docs/examples.md) | real world examples |
|
|
232
|
+
| [FAQ](docs/faq.md) | common student questions |
|
|
233
|
+
| [ML Workflow](docs/ml_workflow.md) | full end-to-end ML example |
|
|
234
|
+
|
|
235
|
+
## License
|
|
236
|
+
MIT License
|
|
237
|
+
|
|
238
|
+
## Author
|
|
239
|
+
Built by Dipendra — a CS student who wanted pandas to be less painful.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
dipencsv/__init__.py,sha256=tt4CaUaGC7ReFCulu9Aiot29paRWhQWzDali3B3O7JE,121
|
|
2
|
+
dipencsv/analytics.py,sha256=nqni-CL9ekMaEOvSLk1VAcYiShMfsi4XMUFvhWA6EZc,2257
|
|
3
|
+
dipencsv/cleaner.py,sha256=jVyyegwGWyK0UUPYmrKDjebmci6hYvq7t5DmuT5htrI,4860
|
|
4
|
+
dipencsv/core.py,sha256=Z2GGR7s7qoni5zLXB8o5TYRqnI0wLX9IYuv5Ot3lOs0,8050
|
|
5
|
+
dipencsv/errors.py,sha256=NMLDt01VSr4KZMLdH5qeuJkrUwvYLOopjWGDG5Ebu8U,598
|
|
6
|
+
dipencsv/exporter.py,sha256=pUhjidm4F1r6VYOYNDxufeRfI-bNdKUuWqhN-X_PDPY,1428
|
|
7
|
+
dipencsv/intelligence.py,sha256=NYQFwT68sLLj4oECcknZLC4fYgBcBEwXG6Mq4p8t5PY,6089
|
|
8
|
+
dipencsv/loader.py,sha256=7UnG2Hzde08SN0FN-22pRrruiZEUHqx9Tyy-AuwnUWI,678
|
|
9
|
+
dipencsv/summary.py,sha256=_qwYZsytcO7SzGXorYtK5T1FO-Kbes9TeAGdjSi0Y8E,2711
|
|
10
|
+
dipencsv/engine/__init__.py,sha256=6f-E6-atDCyRVVgDAg4--OJal5aBBs2xKAD4y46xUu4,184
|
|
11
|
+
dipencsv/engine/base.py,sha256=D8CFeRUd_d7HKrIF48sgCfN3_rGny3ebrvSsxsJIXdU,788
|
|
12
|
+
dipencsv/engine/pandas_engine.py,sha256=hzilZ-12C_5nTTiz2lMNn3EkObnzBl4gyCnpKzrX1gk,1594
|
|
13
|
+
dipencsv/engine/stream_engine.py,sha256=IhCx42n5fZ59fe6RGaG-8-GsXi20-odX698PahSsAhM,3138
|
|
14
|
+
dipencsv-0.1.0.dist-info/METADATA,sha256=sM7xwAtwHkh38RQzQTIw5NY2Gp5tASgzDbcy7VRL_Zo,6181
|
|
15
|
+
dipencsv-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
16
|
+
dipencsv-0.1.0.dist-info/top_level.txt,sha256=vT6kHGNkvpiVjSve9w6c8fovS-RX9qun6DprSl3bSeI,9
|
|
17
|
+
dipencsv-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dipencsv
|