datasetops-toolkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasetops-toolkit
3
+ Version: 0.1.0
4
+ Summary: A Python toolkit for simplifying dataset preparation and preprocessing
5
+ Author-email: Your Name <your@email.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/rajazain2001/Dataset-Ops
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: matplotlib
13
+ Requires-Dist: seaborn
14
+ Requires-Dist: scikit-learn
15
+ Requires-Dist: openpyxl
16
+
17
+ # datasetops
18
+
19
+ A Python toolkit for simplifying dataset preparation and preprocessing.
20
+
21
+ ## Installation
22
+ ```bash
23
+ pip install datasetops-toolkit
24
+ ```
25
+
26
+ ## Usage
27
+ ```python
28
+ from datasetops import DatasetOps
29
+
30
+ ds = DatasetOps("data.csv")
31
+
32
+ ds.summary()
33
+ ds.missing_report()
34
+ ds.remove_duplicates()
35
+ ds.normalize()
36
+ ds.encode_categorical()
37
+ ds.save_csv("clean_data.csv")
38
+ ```
39
+
40
+ ## Features
41
+
42
+ - Load CSV, Excel, JSON datasets
43
+ - Dataset summary and inspection
44
+ - Missing value detection and filling
45
+ - Duplicate detection and removal
46
+ - Normalization and standardization
47
+ - Categorical encoding
48
+ - Correlation analysis
49
+ - Outlier detection
50
+ - Visualization (histogram, boxplot, scatter, heatmap)
51
+ - Export to CSV and Excel
@@ -0,0 +1,35 @@
1
+ # datasetops
2
+
3
+ A Python toolkit for simplifying dataset preparation and preprocessing.
4
+
5
+ ## Installation
6
+ ```bash
7
+ pip install datasetops-toolkit
8
+ ```
9
+
10
+ ## Usage
11
+ ```python
12
+ from datasetops import DatasetOps
13
+
14
+ ds = DatasetOps("data.csv")
15
+
16
+ ds.summary()
17
+ ds.missing_report()
18
+ ds.remove_duplicates()
19
+ ds.normalize()
20
+ ds.encode_categorical()
21
+ ds.save_csv("clean_data.csv")
22
+ ```
23
+
24
+ ## Features
25
+
26
+ - Load CSV, Excel, JSON datasets
27
+ - Dataset summary and inspection
28
+ - Missing value detection and filling
29
+ - Duplicate detection and removal
30
+ - Normalization and standardization
31
+ - Categorical encoding
32
+ - Correlation analysis
33
+ - Outlier detection
34
+ - Visualization (histogram, boxplot, scatter, heatmap)
35
+ - Export to CSV and Excel
@@ -0,0 +1,4 @@
1
"""Public surface of the datasetops package.

Re-exports the main :class:`DatasetOps` facade and declares package metadata.
"""

from .loader import DatasetOps

__version__ = "0.1.0"
__author__ = "Raja Zain"
@@ -0,0 +1,39 @@
1
class AnalysisMixin:
    """Exploratory-analysis helpers over the host's DataFrame ``self.df``.

    Assumes the host class (e.g. DatasetOps) provides ``self.df`` as a
    pandas DataFrame. Methods either return pandas objects or print a report.
    """

    def correlation_matrix(self):
        """Return pairwise correlations of the numeric columns."""
        return self.df.corr(numeric_only=True)

    def detect_outliers(self, column):
        """Return rows where *column* lies outside the Tukey 1.5*IQR fences."""
        series = self.df[column]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = q3 - q1
        lower = q1 - 1.5 * spread
        upper = q3 + 1.5 * spread
        outliers = self.df[(series < lower) | (series > upper)]
        print(f"Outliers detected in '{column}': {len(outliers)}")
        return outliers

    def column_unique_values(self, column):
        """Return the distinct values of *column* in order of appearance."""
        return self.df[column].unique()

    def value_counts(self, column):
        """Return value frequencies of *column*, most common first."""
        return self.df[column].value_counts()

    def group_by(self, column):
        """Return a pandas GroupBy keyed on *column*."""
        return self.df.groupby(column)

    def aggregate_stats(self, group_col, agg_col, func='mean'):
        """Aggregate *agg_col* within each *group_col* group using *func*."""
        return self.df.groupby(group_col)[agg_col].agg(func)

    def top_values(self, column, n=5):
        """Return the *n* most frequent values of *column* with their counts."""
        return self.df[column].value_counts().head(n)

    def data_quality_report(self):
        """Print a banner summarising size, missingness, duplicates and dtypes."""
        rule = "=" * 40
        numeric = self.df.select_dtypes(include='number').columns
        categoric = self.df.select_dtypes(include='object').columns
        print(rule)
        print("DATA QUALITY REPORT")
        print(rule)
        print(f"Total Rows : {self.df.shape[0]}")
        print(f"Total Columns : {self.df.shape[1]}")
        print(f"Missing Values : {self.df.isnull().sum().sum()}")
        print(f"Duplicate Rows : {self.df.duplicated().sum()}")
        print(f"Numeric Columns : {len(numeric)}")
        print(f"Categoric Columns: {len(categoric)}")
        print(rule)
@@ -0,0 +1,88 @@
1
class CleaningMixin:
    """Cleaning helpers for the host's pandas DataFrame ``self.df``.

    Mutating methods rebind or update ``self.df`` in place and print a short
    summary of what changed; report-style methods only print.
    """

    def missing_report(self):
        """Print the count of missing values per column (only columns with any)."""
        missing = self.df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            print("No missing values found.")
        else:
            print("Missing Values:")
            print(missing)

    def missing_percentage(self):
        """Print the percentage of missing values per column (only columns with any)."""
        pct = (self.df.isnull().sum() / len(self.df)) * 100
        pct = pct[pct > 0]
        if pct.empty:
            print("No missing values found.")
        else:
            print("Missing Value Percentage:")
            print(pct)

    def fill_missing_mean(self):
        """Fill NaNs in numeric columns with each column's mean."""
        numeric_cols = self.df.select_dtypes(include='number').columns
        self.df[numeric_cols] = self.df[numeric_cols].fillna(self.df[numeric_cols].mean())
        print("Missing numeric values filled with mean.")

    def fill_missing_median(self):
        """Fill NaNs in numeric columns with each column's median."""
        numeric_cols = self.df.select_dtypes(include='number').columns
        self.df[numeric_cols] = self.df[numeric_cols].fillna(self.df[numeric_cols].median())
        print("Missing numeric values filled with median.")

    def fill_missing_mode(self):
        """Fill NaNs in every column with that column's mode.

        Columns with no mode (e.g. entirely-NaN columns, where ``mode()``
        returns an empty Series) are left untouched instead of raising
        IndexError as the previous ``mode()[0]`` indexing did.
        """
        for col in self.df.columns:
            mode = self.df[col].mode()
            if not mode.empty:
                self.df[col] = self.df[col].fillna(mode.iloc[0])
        print("Missing values filled with mode.")

    def fill_missing_value(self, value):
        """Fill every NaN in the DataFrame with *value*."""
        self.df = self.df.fillna(value)
        print(f"Missing values filled with: {value}")

    def drop_missing_rows(self):
        """Drop rows containing any NaN and report how many were removed."""
        before = len(self.df)
        self.df = self.df.dropna()
        print(f"Removed {before - len(self.df)} rows with missing values.")

    def drop_missing_columns(self):
        """Drop columns containing any NaN and report how many were removed."""
        before = len(self.df.columns)
        self.df = self.df.dropna(axis=1)
        print(f"Removed {before - len(self.df.columns)} columns with missing values.")

    def duplicate_report(self):
        """Print the number of fully duplicated rows."""
        count = self.df.duplicated().sum()
        print(f"Duplicate rows found: {count}")

    def count_duplicates(self):
        """Return the number of fully duplicated rows."""
        return self.df.duplicated().sum()

    def remove_duplicates(self):
        """Drop duplicated rows (keeping the first) and report the removal."""
        before = len(self.df)
        self.df = self.df.drop_duplicates()
        print(f"Removed {before - len(self.df)} duplicate rows.")

    def rename_column(self, old_name, new_name):
        """Rename column *old_name* to *new_name*."""
        self.df = self.df.rename(columns={old_name: new_name})
        print(f"Column '{old_name}' renamed to '{new_name}'.")

    def drop_column(self, col_name):
        """Remove column *col_name* (KeyError if absent, as before)."""
        self.df = self.df.drop(columns=[col_name])
        print(f"Column '{col_name}' dropped.")

    def add_column(self, col_name, default_value=None):
        """Add column *col_name* filled with *default_value*."""
        self.df[col_name] = default_value
        print(f"Column '{col_name}' added.")

    def select_columns(self, columns):
        """Keep only *columns*, in the given order."""
        self.df = self.df[columns]
        print(f"Selected columns: {columns}")

    def filter_rows(self, condition):
        """Keep rows matching the pandas ``query`` expression *condition*."""
        before = len(self.df)
        self.df = self.df.query(condition)
        print(f"Filtered rows. Remaining: {len(self.df)} (removed {before - len(self.df)})")

    def sort_rows(self, column, ascending=True):
        """Sort rows by *column* (ascending by default)."""
        self.df = self.df.sort_values(by=column, ascending=ascending)
        print(f"Dataset sorted by '{column}'.")

    def sample_rows(self, n=5):
        """Return a random sample of up to *n* rows.

        *n* is clamped to the row count so asking for more rows than exist
        no longer raises ValueError.
        """
        return self.df.sample(min(n, len(self.df)))
@@ -0,0 +1,20 @@
1
class ExportMixin:
    """Persistence helpers: write ``self.df`` or a quality report to disk."""

    def save_csv(self, filepath="clean_data.csv"):
        """Write the dataset to *filepath* as CSV, without the index column."""
        self.df.to_csv(filepath, index=False)
        print(f"Dataset saved as CSV: {filepath}")

    def save_excel(self, filepath="clean_data.xlsx"):
        """Write the dataset to *filepath* as an Excel workbook (needs openpyxl)."""
        self.df.to_excel(filepath, index=False)
        print(f"Dataset saved as Excel: {filepath}")

    def export_report(self, filepath="report.txt"):
        """Write a plain-text data-quality summary to *filepath*."""
        rule = "=" * 40
        rows, cols = self.df.shape
        lines = [
            "DATA QUALITY REPORT",
            rule,
            f"Rows : {rows}",
            f"Columns : {cols}",
            f"Missing Values : {self.df.isnull().sum().sum()}",
            f"Duplicate Rows : {self.df.duplicated().sum()}",
            rule,
        ]
        with open(filepath, "w") as f:
            f.write("\n".join(lines) + "\n")
        print(f"Report exported: {filepath}")
@@ -0,0 +1,67 @@
1
+ import pandas as pd
2
+ from .cleaning import CleaningMixin
3
+ from .transformation import TransformationMixin
4
+ from .analysis import AnalysisMixin
5
+ from .visualization import VisualizationMixin
6
+ from .export import ExportMixin
7
+
8
+
9
class DatasetOps(CleaningMixin, TransformationMixin, AnalysisMixin, VisualizationMixin, ExportMixin):
    """One-stop facade around a pandas DataFrame.

    Combines loading with the cleaning, transformation, analysis,
    visualization and export mixins. ``self.df`` is the working frame;
    ``self._original`` keeps a pristine copy for ``reload_dataset``.
    """

    def __init__(self, filepath=None):
        self.df = None          # working DataFrame, mutated by the mixins
        self._original = None   # untouched snapshot taken right after loading
        self.filepath = filepath
        if filepath:
            # Convenience: a constructor path is treated as a CSV file.
            self.load_csv(filepath)

    def _adopt(self, frame):
        """Install *frame* as the working frame, snapshot it, announce the load."""
        self.df = frame
        self._original = frame.copy()
        print("Dataset loaded successfully")
        print(f"Rows: {self.df.shape[0]}, Columns: {self.df.shape[1]}")

    def load_csv(self, filepath):
        """Read *filepath* as CSV into the working DataFrame."""
        self._adopt(pd.read_csv(filepath))

    def load_excel(self, filepath):
        """Read *filepath* as an Excel workbook into the working DataFrame."""
        self._adopt(pd.read_excel(filepath))

    def load_json(self, filepath):
        """Read *filepath* as JSON into the working DataFrame."""
        self._adopt(pd.read_json(filepath))

    def reload_dataset(self):
        """Discard all edits and restore the frame captured at load time."""
        self.df = self._original.copy()
        print("Dataset reloaded to original state.")

    def summary(self):
        """Print a banner with size, memory usage, duplicate and missing counts."""
        rule = "=" * 40
        rows, cols = self.df.shape
        print(rule)
        print("DATASET SUMMARY")
        print(rule)
        print(f"Rows : {rows}")
        print(f"Columns : {cols}")
        print(f"Memory Usage: {self.df.memory_usage(deep=True).sum() / 1024:.2f} KB")
        print(f"Duplicates : {self.df.duplicated().sum()}")
        print(f"Missing : {self.df.isnull().sum().sum()}")
        print(rule)

    def head(self, n=5):
        """Return the first *n* rows."""
        return self.df.head(n)

    def tail(self, n=5):
        """Return the last *n* rows."""
        return self.df.tail(n)

    def shape(self):
        """Print the (rows, columns) shape tuple."""
        print(f"Shape: {self.df.shape}")

    def column_names(self):
        """Print the column labels as a list."""
        print("Columns:", list(self.df.columns))

    def column_types(self):
        """Print the dtype of every column."""
        print(self.df.dtypes)

    def describe_stats(self):
        """Return ``describe`` statistics over all columns, including non-numeric."""
        return self.df.describe(include='all')
@@ -0,0 +1,33 @@
1
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
2
+ import pandas as pd
3
+
4
+
5
class TransformationMixin:
    """Scaling and encoding transforms that mutate ``self.df`` in place."""

    def normalize(self):
        """Min-max scale every numeric column into the [0, 1] range."""
        numeric = self.df.select_dtypes(include='number').columns
        self.df[numeric] = MinMaxScaler().fit_transform(self.df[numeric])
        print("Numeric columns normalized to range 0-1.")

    def standardize(self):
        """Scale every numeric column to zero mean and unit variance."""
        numeric = self.df.select_dtypes(include='number').columns
        self.df[numeric] = StandardScaler().fit_transform(self.df[numeric])
        print("Numeric columns standardized.")

    def log_transform(self, column):
        """Apply log(1 + x) to *column* in place."""
        import numpy as np
        self.df[column] = np.log1p(self.df[column])
        print(f"Log transform applied to column '{column}'.")

    def encode_categorical(self):
        """Label-encode every object-dtype column.

        Values are cast to str first, so NaN is encoded as the literal
        string 'nan' rather than skipped — presumably intentional; verify.
        """
        encoder = LabelEncoder()
        cat_cols = self.df.select_dtypes(include='object').columns
        for col in cat_cols:
            self.df[col] = encoder.fit_transform(self.df[col].astype(str))
        print(f"Categorical columns encoded: {list(cat_cols)}")

    def one_hot_encode(self, column):
        """Replace *column* with one indicator column per distinct value."""
        self.df = pd.get_dummies(self.df, columns=[column])
        print(f"One-hot encoding applied to column '{column}'.")
@@ -0,0 +1,32 @@
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+
4
+
5
class VisualizationMixin:
    """Quick matplotlib/seaborn charts over ``self.df``.

    Every method renders a single figure and blocks on ``plt.show()``.
    """

    def plot_histogram(self, column, bins=20):
        """Show a frequency histogram of *column* with *bins* buckets."""
        self.df[column].hist(bins=bins)
        plt.title(f"Histogram - {column}")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.tight_layout()
        plt.show()

    def plot_boxplot(self, column):
        """Show a vertical boxplot of *column*."""
        sns.boxplot(y=self.df[column])
        plt.title(f"Boxplot - {column}")
        plt.tight_layout()
        plt.show()

    def plot_scatter(self, x_col, y_col):
        """Show a scatter plot of *y_col* against *x_col*."""
        self.df.plot.scatter(x=x_col, y=y_col)
        plt.title(f"Scatter - {x_col} vs {y_col}")
        plt.tight_layout()
        plt.show()

    def plot_correlation_heatmap(self):
        """Show an annotated heatmap of the numeric-column correlations."""
        corr = self.df.corr(numeric_only=True)
        sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
        plt.title("Correlation Heatmap")
        plt.tight_layout()
        plt.show()
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasetops-toolkit
3
+ Version: 0.1.0
4
+ Summary: A Python toolkit for simplifying dataset preparation and preprocessing
5
+ Author-email: Your Name <your@email.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/rajazain2001/Dataset-Ops
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: matplotlib
13
+ Requires-Dist: seaborn
14
+ Requires-Dist: scikit-learn
15
+ Requires-Dist: openpyxl
16
+
17
+ # datasetops
18
+
19
+ A Python toolkit for simplifying dataset preparation and preprocessing.
20
+
21
+ ## Installation
22
+ ```bash
23
+ pip install datasetops-toolkit
24
+ ```
25
+
26
+ ## Usage
27
+ ```python
28
+ from datasetops import DatasetOps
29
+
30
+ ds = DatasetOps("data.csv")
31
+
32
+ ds.summary()
33
+ ds.missing_report()
34
+ ds.remove_duplicates()
35
+ ds.normalize()
36
+ ds.encode_categorical()
37
+ ds.save_csv("clean_data.csv")
38
+ ```
39
+
40
+ ## Features
41
+
42
+ - Load CSV, Excel, JSON datasets
43
+ - Dataset summary and inspection
44
+ - Missing value detection and filling
45
+ - Duplicate detection and removal
46
+ - Normalization and standardization
47
+ - Categorical encoding
48
+ - Correlation analysis
49
+ - Outlier detection
50
+ - Visualization (histogram, boxplot, scatter, heatmap)
51
+ - Export to CSV and Excel
@@ -0,0 +1,14 @@
1
+ README.md
2
+ pyproject.toml
3
+ datasetops/__init__.py
4
+ datasetops/analysis.py
5
+ datasetops/cleaning.py
6
+ datasetops/export.py
7
+ datasetops/loader.py
8
+ datasetops/transformation.py
9
+ datasetops/visualization.py
10
+ datasetops_toolkit.egg-info/PKG-INFO
11
+ datasetops_toolkit.egg-info/SOURCES.txt
12
+ datasetops_toolkit.egg-info/dependency_links.txt
13
+ datasetops_toolkit.egg-info/requires.txt
14
+ datasetops_toolkit.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ pandas
2
+ numpy
3
+ matplotlib
4
+ seaborn
5
+ scikit-learn
6
+ openpyxl
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "datasetops-toolkit"
7
+ version = "0.1.0"
8
+ description = "A Python toolkit for simplifying dataset preparation and preprocessing"
9
+ readme = "README.md"
10
+ requires-python = ">=3.7"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Your Name", email = "your@email.com"}
14
+ ]
15
+ dependencies = [
16
+ "pandas",
17
+ "numpy",
18
+ "matplotlib",
19
+ "seaborn",
20
+ "scikit-learn",
21
+ "openpyxl"
22
+ ]
23
+
24
+ [project.urls]
25
+ Homepage = "https://github.com/rajazain2001/Dataset-Ops"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+