datasetops-toolkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasetops-toolkit
3
+ Version: 0.1.0
4
+ Summary: A Python toolkit for simplifying dataset preparation and preprocessing
5
+ Author-email: Your Name <your@email.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/rajazain2001/Dataset-Ops
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: matplotlib
13
+ Requires-Dist: seaborn
14
+ Requires-Dist: scikit-learn
15
+ Requires-Dist: openpyxl
16
+
17
+ # datasetops
18
+
19
+ A Python toolkit for simplifying dataset preparation and preprocessing.
20
+
21
+ ## Installation
22
+ ```bash
23
+ pip install datasetops-toolkit
24
+ ```
25
+
26
+ ## Usage
27
+ ```python
28
+ from datasetops import DatasetOps
29
+
30
+ ds = DatasetOps("data.csv")
31
+
32
+ ds.summary()
33
+ ds.missing_report()
34
+ ds.remove_duplicates()
35
+ ds.normalize()
36
+ ds.encode_categorical()
37
+ ds.save_csv("clean_data.csv")
38
+ ```
39
+
40
+ ## Features
41
+
42
+ - Load CSV, Excel, JSON datasets
43
+ - Dataset summary and inspection
44
+ - Missing value detection and filling
45
+ - Duplicate detection and removal
46
+ - Normalization and standardization
47
+ - Categorical encoding
48
+ - Correlation analysis
49
+ - Outlier detection
50
+ - Visualization (histogram, boxplot, scatter, heatmap)
51
+ - Export to CSV and Excel
@@ -0,0 +1,35 @@
1
+ # datasetops
2
+
3
+ A Python toolkit for simplifying dataset preparation and preprocessing.
4
+
5
+ ## Installation
6
+ ```bash
7
+ pip install datasetops-toolkit
8
+ ```
9
+
10
+ ## Usage
11
+ ```python
12
+ from datasetops import DatasetOps
13
+
14
+ ds = DatasetOps("data.csv")
15
+
16
+ ds.summary()
17
+ ds.missing_report()
18
+ ds.remove_duplicates()
19
+ ds.normalize()
20
+ ds.encode_categorical()
21
+ ds.save_csv("clean_data.csv")
22
+ ```
23
+
24
+ ## Features
25
+
26
+ - Load CSV, Excel, JSON datasets
27
+ - Dataset summary and inspection
28
+ - Missing value detection and filling
29
+ - Duplicate detection and removal
30
+ - Normalization and standardization
31
+ - Categorical encoding
32
+ - Correlation analysis
33
+ - Outlier detection
34
+ - Visualization (histogram, boxplot, scatter, heatmap)
35
+ - Export to CSV and Excel
@@ -0,0 +1,4 @@
1
"""Public surface of the datasetops package.

Re-exports the main :class:`DatasetOps` facade and declares package metadata.
"""

from .loader import DatasetOps

__version__ = "0.1.0"
__author__ = "Raja Zain"
@@ -0,0 +1,39 @@
1
class AnalysisMixin:
    """Exploratory-analysis helpers over the host's DataFrame ``self.df``.

    Assumes the host class (e.g. DatasetOps) provides ``self.df`` as a
    pandas DataFrame. Methods either return pandas objects or print a report.
    """

    def correlation_matrix(self):
        """Return pairwise correlations of the numeric columns."""
        return self.df.corr(numeric_only=True)

    def detect_outliers(self, column):
        """Return rows where *column* lies outside the Tukey 1.5*IQR fences."""
        series = self.df[column]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = q3 - q1
        lower = q1 - 1.5 * spread
        upper = q3 + 1.5 * spread
        outliers = self.df[(series < lower) | (series > upper)]
        print(f"Outliers detected in '{column}': {len(outliers)}")
        return outliers

    def column_unique_values(self, column):
        """Return the distinct values of *column* in order of appearance."""
        return self.df[column].unique()

    def value_counts(self, column):
        """Return value frequencies of *column*, most common first."""
        return self.df[column].value_counts()

    def group_by(self, column):
        """Return a pandas GroupBy keyed on *column*."""
        return self.df.groupby(column)

    def aggregate_stats(self, group_col, agg_col, func='mean'):
        """Aggregate *agg_col* within each *group_col* group using *func*."""
        return self.df.groupby(group_col)[agg_col].agg(func)

    def top_values(self, column, n=5):
        """Return the *n* most frequent values of *column* with their counts."""
        return self.df[column].value_counts().head(n)

    def data_quality_report(self):
        """Print a banner summarising size, missingness, duplicates and dtypes."""
        rule = "=" * 40
        numeric = self.df.select_dtypes(include='number').columns
        categoric = self.df.select_dtypes(include='object').columns
        print(rule)
        print("DATA QUALITY REPORT")
        print(rule)
        print(f"Total Rows : {self.df.shape[0]}")
        print(f"Total Columns : {self.df.shape[1]}")
        print(f"Missing Values : {self.df.isnull().sum().sum()}")
        print(f"Duplicate Rows : {self.df.duplicated().sum()}")
        print(f"Numeric Columns : {len(numeric)}")
        print(f"Categoric Columns: {len(categoric)}")
        print(rule)
@@ -0,0 +1,88 @@
1
class CleaningMixin:
    """Cleaning helpers for the host's pandas DataFrame ``self.df``.

    Mutating methods rebind or update ``self.df`` in place and print a short
    summary of what changed; report-style methods only print.
    """

    def missing_report(self):
        """Print the count of missing values per column (only columns with any)."""
        missing = self.df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            print("No missing values found.")
        else:
            print("Missing Values:")
            print(missing)

    def missing_percentage(self):
        """Print the percentage of missing values per column (only columns with any)."""
        pct = (self.df.isnull().sum() / len(self.df)) * 100
        pct = pct[pct > 0]
        if pct.empty:
            print("No missing values found.")
        else:
            print("Missing Value Percentage:")
            print(pct)

    def fill_missing_mean(self):
        """Fill NaNs in numeric columns with each column's mean."""
        numeric_cols = self.df.select_dtypes(include='number').columns
        self.df[numeric_cols] = self.df[numeric_cols].fillna(self.df[numeric_cols].mean())
        print("Missing numeric values filled with mean.")

    def fill_missing_median(self):
        """Fill NaNs in numeric columns with each column's median."""
        numeric_cols = self.df.select_dtypes(include='number').columns
        self.df[numeric_cols] = self.df[numeric_cols].fillna(self.df[numeric_cols].median())
        print("Missing numeric values filled with median.")

    def fill_missing_mode(self):
        """Fill NaNs in every column with that column's mode.

        Columns with no mode (e.g. entirely-NaN columns, where ``mode()``
        returns an empty Series) are left untouched instead of raising
        IndexError as the previous ``mode()[0]`` indexing did.
        """
        for col in self.df.columns:
            mode = self.df[col].mode()
            if not mode.empty:
                self.df[col] = self.df[col].fillna(mode.iloc[0])
        print("Missing values filled with mode.")

    def fill_missing_value(self, value):
        """Fill every NaN in the DataFrame with *value*."""
        self.df = self.df.fillna(value)
        print(f"Missing values filled with: {value}")

    def drop_missing_rows(self):
        """Drop rows containing any NaN and report how many were removed."""
        before = len(self.df)
        self.df = self.df.dropna()
        print(f"Removed {before - len(self.df)} rows with missing values.")

    def drop_missing_columns(self):
        """Drop columns containing any NaN and report how many were removed."""
        before = len(self.df.columns)
        self.df = self.df.dropna(axis=1)
        print(f"Removed {before - len(self.df.columns)} columns with missing values.")

    def duplicate_report(self):
        """Print the number of fully duplicated rows."""
        count = self.df.duplicated().sum()
        print(f"Duplicate rows found: {count}")

    def count_duplicates(self):
        """Return the number of fully duplicated rows."""
        return self.df.duplicated().sum()

    def remove_duplicates(self):
        """Drop duplicated rows (keeping the first) and report the removal."""
        before = len(self.df)
        self.df = self.df.drop_duplicates()
        print(f"Removed {before - len(self.df)} duplicate rows.")

    def rename_column(self, old_name, new_name):
        """Rename column *old_name* to *new_name*."""
        self.df = self.df.rename(columns={old_name: new_name})
        print(f"Column '{old_name}' renamed to '{new_name}'.")

    def drop_column(self, col_name):
        """Remove column *col_name* (KeyError if absent, as before)."""
        self.df = self.df.drop(columns=[col_name])
        print(f"Column '{col_name}' dropped.")

    def add_column(self, col_name, default_value=None):
        """Add column *col_name* filled with *default_value*."""
        self.df[col_name] = default_value
        print(f"Column '{col_name}' added.")

    def select_columns(self, columns):
        """Keep only *columns*, in the given order."""
        self.df = self.df[columns]
        print(f"Selected columns: {columns}")

    def filter_rows(self, condition):
        """Keep rows matching the pandas ``query`` expression *condition*."""
        before = len(self.df)
        self.df = self.df.query(condition)
        print(f"Filtered rows. Remaining: {len(self.df)} (removed {before - len(self.df)})")

    def sort_rows(self, column, ascending=True):
        """Sort rows by *column* (ascending by default)."""
        self.df = self.df.sort_values(by=column, ascending=ascending)
        print(f"Dataset sorted by '{column}'.")

    def sample_rows(self, n=5):
        """Return a random sample of up to *n* rows.

        *n* is clamped to the row count so asking for more rows than exist
        no longer raises ValueError.
        """
        return self.df.sample(min(n, len(self.df)))
@@ -0,0 +1,20 @@
1
class ExportMixin:
    """Persistence helpers: write ``self.df`` or a quality report to disk."""

    def save_csv(self, filepath="clean_data.csv"):
        """Write the dataset to *filepath* as CSV, without the index column."""
        self.df.to_csv(filepath, index=False)
        print(f"Dataset saved as CSV: {filepath}")

    def save_excel(self, filepath="clean_data.xlsx"):
        """Write the dataset to *filepath* as an Excel workbook (needs openpyxl)."""
        self.df.to_excel(filepath, index=False)
        print(f"Dataset saved as Excel: {filepath}")

    def export_report(self, filepath="report.txt"):
        """Write a plain-text data-quality summary to *filepath*."""
        rule = "=" * 40
        rows, cols = self.df.shape
        lines = [
            "DATA QUALITY REPORT",
            rule,
            f"Rows : {rows}",
            f"Columns : {cols}",
            f"Missing Values : {self.df.isnull().sum().sum()}",
            f"Duplicate Rows : {self.df.duplicated().sum()}",
            rule,
        ]
        with open(filepath, "w") as f:
            f.write("\n".join(lines) + "\n")
        print(f"Report exported: {filepath}")
@@ -0,0 +1,67 @@
1
+ import pandas as pd
2
+ from .cleaning import CleaningMixin
3
+ from .transformation import TransformationMixin
4
+ from .analysis import AnalysisMixin
5
+ from .visualization import VisualizationMixin
6
+ from .export import ExportMixin
7
+
8
+
9
class DatasetOps(CleaningMixin, TransformationMixin, AnalysisMixin, VisualizationMixin, ExportMixin):
    """One-stop facade around a pandas DataFrame.

    Combines loading with the cleaning, transformation, analysis,
    visualization and export mixins. ``self.df`` is the working frame;
    ``self._original`` keeps a pristine copy for ``reload_dataset``.
    """

    def __init__(self, filepath=None):
        self.df = None          # working DataFrame, mutated by the mixins
        self._original = None   # untouched snapshot taken right after loading
        self.filepath = filepath
        if filepath:
            # Convenience: a constructor path is treated as a CSV file.
            self.load_csv(filepath)

    def _adopt(self, frame):
        """Install *frame* as the working frame, snapshot it, announce the load."""
        self.df = frame
        self._original = frame.copy()
        print("Dataset loaded successfully")
        print(f"Rows: {self.df.shape[0]}, Columns: {self.df.shape[1]}")

    def load_csv(self, filepath):
        """Read *filepath* as CSV into the working DataFrame."""
        self._adopt(pd.read_csv(filepath))

    def load_excel(self, filepath):
        """Read *filepath* as an Excel workbook into the working DataFrame."""
        self._adopt(pd.read_excel(filepath))

    def load_json(self, filepath):
        """Read *filepath* as JSON into the working DataFrame."""
        self._adopt(pd.read_json(filepath))

    def reload_dataset(self):
        """Discard all edits and restore the frame captured at load time."""
        self.df = self._original.copy()
        print("Dataset reloaded to original state.")

    def summary(self):
        """Print a banner with size, memory usage, duplicate and missing counts."""
        rule = "=" * 40
        rows, cols = self.df.shape
        print(rule)
        print("DATASET SUMMARY")
        print(rule)
        print(f"Rows : {rows}")
        print(f"Columns : {cols}")
        print(f"Memory Usage: {self.df.memory_usage(deep=True).sum() / 1024:.2f} KB")
        print(f"Duplicates : {self.df.duplicated().sum()}")
        print(f"Missing : {self.df.isnull().sum().sum()}")
        print(rule)

    def head(self, n=5):
        """Return the first *n* rows."""
        return self.df.head(n)

    def tail(self, n=5):
        """Return the last *n* rows."""
        return self.df.tail(n)

    def shape(self):
        """Print the (rows, columns) shape tuple."""
        print(f"Shape: {self.df.shape}")

    def column_names(self):
        """Print the column labels as a list."""
        print("Columns:", list(self.df.columns))

    def column_types(self):
        """Print the dtype of every column."""
        print(self.df.dtypes)

    def describe_stats(self):
        """Return ``describe`` statistics over all columns, including non-numeric."""
        return self.df.describe(include='all')
@@ -0,0 +1,33 @@
1
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
2
+ import pandas as pd
3
+
4
+
5
class TransformationMixin:
    """Scaling and encoding transforms that mutate ``self.df`` in place."""

    def normalize(self):
        """Min-max scale every numeric column into the [0, 1] range."""
        numeric = self.df.select_dtypes(include='number').columns
        self.df[numeric] = MinMaxScaler().fit_transform(self.df[numeric])
        print("Numeric columns normalized to range 0-1.")

    def standardize(self):
        """Scale every numeric column to zero mean and unit variance."""
        numeric = self.df.select_dtypes(include='number').columns
        self.df[numeric] = StandardScaler().fit_transform(self.df[numeric])
        print("Numeric columns standardized.")

    def log_transform(self, column):
        """Apply log(1 + x) to *column* in place."""
        import numpy as np
        self.df[column] = np.log1p(self.df[column])
        print(f"Log transform applied to column '{column}'.")

    def encode_categorical(self):
        """Label-encode every object-dtype column.

        Values are cast to str first, so NaN is encoded as the literal
        string 'nan' rather than skipped — presumably intentional; verify.
        """
        encoder = LabelEncoder()
        cat_cols = self.df.select_dtypes(include='object').columns
        for col in cat_cols:
            self.df[col] = encoder.fit_transform(self.df[col].astype(str))
        print(f"Categorical columns encoded: {list(cat_cols)}")

    def one_hot_encode(self, column):
        """Replace *column* with one indicator column per distinct value."""
        self.df = pd.get_dummies(self.df, columns=[column])
        print(f"One-hot encoding applied to column '{column}'.")
@@ -0,0 +1,32 @@
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+
4
+
5
class VisualizationMixin:
    """Quick matplotlib/seaborn charts over ``self.df``.

    Every method renders a single figure and blocks on ``plt.show()``.
    """

    def plot_histogram(self, column, bins=20):
        """Show a frequency histogram of *column* with *bins* buckets."""
        self.df[column].hist(bins=bins)
        plt.title(f"Histogram - {column}")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.tight_layout()
        plt.show()

    def plot_boxplot(self, column):
        """Show a vertical boxplot of *column*."""
        sns.boxplot(y=self.df[column])
        plt.title(f"Boxplot - {column}")
        plt.tight_layout()
        plt.show()

    def plot_scatter(self, x_col, y_col):
        """Show a scatter plot of *y_col* against *x_col*."""
        self.df.plot.scatter(x=x_col, y=y_col)
        plt.title(f"Scatter - {x_col} vs {y_col}")
        plt.tight_layout()
        plt.show()

    def plot_correlation_heatmap(self):
        """Show an annotated heatmap of the numeric-column correlations."""
        corr = self.df.corr(numeric_only=True)
        sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
        plt.title("Correlation Heatmap")
        plt.tight_layout()
        plt.show()
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasetops-toolkit
3
+ Version: 0.1.0
4
+ Summary: A Python toolkit for simplifying dataset preparation and preprocessing
5
+ Author-email: Your Name <your@email.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/rajazain2001/Dataset-Ops
8
+ Requires-Python: >=3.7
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: matplotlib
13
+ Requires-Dist: seaborn
14
+ Requires-Dist: scikit-learn
15
+ Requires-Dist: openpyxl
16
+
17
+ # datasetops
18
+
19
+ A Python toolkit for simplifying dataset preparation and preprocessing.
20
+
21
+ ## Installation
22
+ ```bash
23
+ pip install datasetops-toolkit
24
+ ```
25
+
26
+ ## Usage
27
+ ```python
28
+ from datasetops import DatasetOps
29
+
30
+ ds = DatasetOps("data.csv")
31
+
32
+ ds.summary()
33
+ ds.missing_report()
34
+ ds.remove_duplicates()
35
+ ds.normalize()
36
+ ds.encode_categorical()
37
+ ds.save_csv("clean_data.csv")
38
+ ```
39
+
40
+ ## Features
41
+
42
+ - Load CSV, Excel, JSON datasets
43
+ - Dataset summary and inspection
44
+ - Missing value detection and filling
45
+ - Duplicate detection and removal
46
+ - Normalization and standardization
47
+ - Categorical encoding
48
+ - Correlation analysis
49
+ - Outlier detection
50
+ - Visualization (histogram, boxplot, scatter, heatmap)
51
+ - Export to CSV and Excel
@@ -0,0 +1,14 @@
1
+ README.md
2
+ pyproject.toml
3
+ datasetops/__init__.py
4
+ datasetops/analysis.py
5
+ datasetops/cleaning.py
6
+ datasetops/export.py
7
+ datasetops/loader.py
8
+ datasetops/transformation.py
9
+ datasetops/visualization.py
10
+ datasetops_toolkit.egg-info/PKG-INFO
11
+ datasetops_toolkit.egg-info/SOURCES.txt
12
+ datasetops_toolkit.egg-info/dependency_links.txt
13
+ datasetops_toolkit.egg-info/requires.txt
14
+ datasetops_toolkit.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ pandas
2
+ numpy
3
+ matplotlib
4
+ seaborn
5
+ scikit-learn
6
+ openpyxl
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "datasetops-toolkit"
7
+ version = "0.1.0"
8
+ description = "A Python toolkit for simplifying dataset preparation and preprocessing"
9
+ readme = "README.md"
10
+ requires-python = ">=3.7"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Your Name", email = "your@email.com"}
14
+ ]
15
+ dependencies = [
16
+ "pandas",
17
+ "numpy",
18
+ "matplotlib",
19
+ "seaborn",
20
+ "scikit-learn",
21
+ "openpyxl"
22
+ ]
23
+
24
+ [project.urls]
25
+ Homepage = "https://github.com/rajazain2001/Dataset-Ops"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+