datra 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datra/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .audit import Audit
2
+ from .cleaner import clean
3
+
4
+ __all__ = ["Audit", "clean"]
datra/audit.py ADDED
@@ -0,0 +1,60 @@
1
+ from functools import cached_property
2
+
3
+ from datra.io import load
4
+ from datra.checks.validate import validate as validate_df
5
+ from datra.checks.completeness import completeness as check_completeness
6
+ from datra.checks.uniqueness import uniqueness as check_uniqueness
7
+ from datra.checks.outliers import outliers as check_outliers
8
+ from datra.checks.profile import profile as check_profile
9
+ from datra.scoring.score import calculate_score
10
+ from datra.reports import build_report, save_report
11
+
12
+
13
class Audit:
    """Data-quality audit of a single dataset.

    The input is passed through ``load`` once, at construction. Each check
    is computed on first attribute access and cached on the instance
    afterwards.
    """

    def __init__(self, input_data):
        self.df = load(input_data)

    @cached_property
    def completeness(self):
        """Result of the completeness check for ``self.df``."""
        return check_completeness(self.df)

    @cached_property
    def uniqueness(self):
        """Result of the uniqueness check for ``self.df``."""
        return check_uniqueness(self.df)

    @cached_property
    def outliers(self):
        """Result of the outlier check for ``self.df``."""
        return check_outliers(self.df)

    @cached_property
    def profile(self):
        """Result of the profile check for ``self.df``."""
        return check_profile(self.df)

    @cached_property
    def score(self):
        """Score derived from the completeness, uniqueness and outlier results."""
        return calculate_score(
            {
                "completeness": self.completeness,
                "uniqueness": self.uniqueness,
                "outliers": self.outliers,
            }
        )

    @property
    def results(self):
        """All audit sections collected into one dictionary."""
        sections = ("profile", "completeness", "uniqueness", "outliers", "score")
        return {section: getattr(self, section) for section in sections}

    def validate(self, rules: dict):
        """Validate ``self.df`` against ``rules`` and return the validation result."""
        return validate_df(self.df, rules)

    def build_report(self):
        """Build the report dictionary from ``self.results``."""
        # The bare name resolves to the module-level function, not this method.
        report = build_report(self.results)
        return report

    def save_report(self, path="outputs", format="json"):
        """Build the report and save it under ``path`` in the given ``format``."""
        return save_report(self.build_report(), output=path, format=format)
File without changes
@@ -0,0 +1,19 @@
1
+ import pandas as pd
2
+
3
+
4
def completeness(df: pd.DataFrame):
    """Report missing and filled value counts for every column.

    Returns:
        A dict keyed by column label. Each value holds ``missing_values``,
        ``filled_values`` and ``completeness_ratio`` (filled / total rows,
        rounded to 4 decimals).

    For a frame with no rows the ratio is reported as ``0.0`` rather than
    dividing by zero, which would produce NaN.
    """
    total_rows = len(df)

    result = {}

    for col in df.columns:
        missing = df[col].isna().sum()
        filled = total_rows - missing

        # Guard the zero-row case: filled / 0 would yield NaN.
        ratio = round(filled / total_rows, 4) if total_rows else 0.0

        result[col] = {
            "missing_values": int(missing),
            "filled_values": int(filled),
            "completeness_ratio": float(ratio),
        }

    return result
@@ -0,0 +1,35 @@
1
+ import pandas as pd
2
+
3
+
4
def outliers(df: pd.DataFrame):
    """Detect outliers in numeric columns using the 1.5 * IQR rule.

    Missing values are dropped before the quartiles are computed. Numeric
    columns with no remaining values are omitted from the result.

    Returns:
        A dict keyed by column label. Each value holds ``lower_bound``,
        ``upper_bound``, ``outlier_count`` and ``outlier_ratio`` (share of
        non-missing values outside the bounds, rounded to 4 decimals).
    """
    results = {}

    numeric_cols = df.select_dtypes(include="number").columns

    for col in numeric_cols:
        series = df[col].dropna()

        if series.empty:
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outliers_mask = (series < lower_bound) | (series > upper_bound)

        results[col] = {
            "lower_bound": float(lower_bound),
            "upper_bound": float(upper_bound),
            "outlier_count": int(outliers_mask.sum()),
            "outlier_ratio": float(round(outliers_mask.mean(), 4)),
        }

    return results
@@ -0,0 +1,49 @@
1
+ import pandas as pd
2
+
3
+
4
def profile(df: pd.DataFrame):
    """Return a high-level profile of ``df``.

    Returns:
        A dict with the row and column counts, the number of numeric,
        categorical (``object`` / ``category``) and datetime columns, the
        deep memory usage in MiB (rounded to 2 decimals), the number of
        duplicated rows and the total number of missing cells.
    """
    rows, columns = df.shape

    numeric_columns = len(df.select_dtypes(include="number").columns)

    categorical_columns = len(
        df.select_dtypes(include=["object", "category"]).columns
    )

    # "datetime64" alone only matches naive datetimes; "datetimetz" adds the
    # timezone-aware columns so they are counted as well.
    datetime_columns = len(
        df.select_dtypes(include=["datetime64", "datetimetz"]).columns
    )

    memory_usage_mb = float(round(
        df.memory_usage(deep=True).sum() / (1024 * 1024),
        2,
    ))

    duplicate_rows = int(df.duplicated().sum())

    missing_cells = int(df.isna().sum().sum())

    return {
        "rows": rows,
        "columns": columns,
        "numeric_columns": numeric_columns,
        "categorical_columns": categorical_columns,
        "datetime_columns": datetime_columns,
        "memory_usage_mb": memory_usage_mb,
        "duplicate_rows": duplicate_rows,
        "missing_cells": missing_cells,
    }
@@ -0,0 +1,18 @@
1
+ import pandas as pd
2
+
3
+
4
def uniqueness(df: pd.DataFrame):
    """Summarise fully duplicated rows in ``df``.

    Returns:
        A dict with ``total_rows``, ``unique_rows``, ``duplicate_rows`` and
        ``duplicate_ratio`` (duplicates / total rows, rounded to 4 decimals;
        ``0`` when the frame has no rows).
    """
    total_rows = len(df)
    n_duplicates = df.duplicated().sum()

    if total_rows:
        ratio = round(n_duplicates / total_rows, 4)
    else:
        ratio = 0

    return {
        "total_rows": total_rows,
        "unique_rows": int(total_rows - n_duplicates),
        "duplicate_rows": int(n_duplicates),
        "duplicate_ratio": float(ratio),
    }
@@ -0,0 +1,151 @@
1
+ import pandas as pd
2
+
3
+
4
+ def _record_check(summary, column_result, check_name, passed, **details):
5
+ column_result["checks"][check_name] = {
6
+ "passed": passed,
7
+ **details,
8
+ }
9
+
10
+ summary["rules_checked"] += 1
11
+
12
+ if passed:
13
+ summary["rules_passed"] += 1
14
+ else:
15
+ summary["rules_failed"] += 1
16
+ column_result["passed"] = False
17
+
18
+
19
def _check_minimum(df, column, minimum, summary, column_result):
    """Record a "minimum" check: no value in ``column`` may be below ``minimum``."""
    below_minimum = df[column] < minimum
    violations = int(below_minimum.sum())

    _record_check(
        summary,
        column_result,
        "minimum",
        violations == 0,
        expected=minimum,
        violations=violations,
    )
32
+
33
+
34
def _check_maximum(df, column, maximum, summary, column_result):
    """Record a "maximum" check: no value in ``column`` may exceed ``maximum``."""
    above_maximum = df[column] > maximum
    violations = int(above_maximum.sum())

    _record_check(
        summary,
        column_result,
        "maximum",
        violations == 0,
        expected=maximum,
        violations=violations,
    )
47
+
48
+
49
def _check_allowed(df, column, allowed, summary, column_result):
    """Record an "allowed" check: every value in ``column`` must be in ``allowed``.

    Any value for which ``isin(allowed)`` is False counts as a violation.
    """
    is_allowed = df[column].isin(allowed)
    violations = int((~is_allowed).sum())

    _record_check(
        summary,
        column_result,
        "allowed",
        violations == 0,
        expected=allowed,
        violations=violations,
    )
64
+
65
+
66
def _check_unique(df, column, summary, column_result):
    """Record a "unique" check: ``column`` must not contain repeated values.

    Violations are the rows flagged by ``Series.duplicated()``, i.e. every
    occurrence after the first.
    """
    repeated = df[column].duplicated()
    violations = int(repeated.sum())

    _record_check(
        summary,
        column_result,
        "unique",
        violations == 0,
        violations=violations,
    )
78
+
79
+
80
def validate(df: pd.DataFrame, rules: dict):
    """Validate ``df`` against per-column ``rules``.

    ``rules`` maps a column label to a dict that may contain ``min``,
    ``max``, ``allowed`` and ``unique``. Columns named in ``rules`` but
    absent from ``df`` are skipped without being counted.

    Returns:
        ``{"summary": ..., "columns": ...}`` where ``summary`` holds the
        counters and ``validation_score`` (percentage of passed rules,
        rounded to 2 decimals) and ``columns`` holds the per-column results.
    """
    summary = {
        "columns_checked": 0,
        "rules_checked": 0,
        "rules_passed": 0,
        "rules_failed": 0,
        "validation_score": 0.0,
    }
    results = {}

    # Checks that take the rule's value as an argument, in evaluation order.
    value_checks = (
        ("min", _check_minimum),
        ("max", _check_maximum),
        ("allowed", _check_allowed),
    )

    for column, column_rules in rules.items():
        if column in df.columns:
            summary["columns_checked"] += 1
            column_result = {"passed": True, "checks": {}}

            for rule_key, run_check in value_checks:
                if rule_key in column_rules:
                    run_check(
                        df,
                        column,
                        column_rules[rule_key],
                        summary,
                        column_result,
                    )

            # "unique" is a flag, so it only runs when set to a truthy value.
            if column_rules.get("unique"):
                _check_unique(df, column, summary, column_result)

            results[column] = column_result

    checked = summary["rules_checked"]
    if checked:
        summary["validation_score"] = round(
            summary["rules_passed"] / checked * 100,
            2,
        )

    return {"summary": summary, "columns": results}
datra/cleaner.py ADDED
@@ -0,0 +1,74 @@
1
+ from copy import deepcopy
2
+
3
+ from datra.io import load, save
4
+ from datra.defaults import DEFAULT_CLEANING_RULES
5
+ from datra.cleaning.duplicates import remove_duplicates
6
+ from datra.cleaning.missing import fill_missing
7
+ from datra.cleaning.columns import standardize_column_names
8
+
9
+
10
def clean(
    data,
    *,
    rules=None,
    drop_duplicates=None,
    fill_numeric=None,
    fill_categorical=None,
    standardize_columns=None,
    output=None,
    format=None,
):
    """Load ``data``, apply the configured cleaning steps and return the result.

    Steps run in this order: duplicate removal, missing-value filling,
    column-name standardisation. Keyword arguments that are not ``None``
    override the corresponding entries of ``rules``. When ``output`` is
    given, the cleaned frame is also written via ``save``.
    """
    cleaned = load(data).copy()

    effective_rules = _build_rules(
        rules,
        drop_duplicates=drop_duplicates,
        fill_numeric=fill_numeric,
        fill_categorical=fill_categorical,
        standardize_columns=standardize_columns,
    )

    if effective_rules["duplicates"]["drop"]:
        cleaned = remove_duplicates(cleaned)

    # Only fill when at least one missing-value strategy has been configured.
    missing_rules = effective_rules["missing"]
    has_strategy = any(
        strategy is not None for strategy in missing_rules.values()
    )
    if has_strategy:
        cleaned = fill_missing(cleaned, missing_rules)

    if effective_rules["columns"]["standardize"]:
        cleaned = standardize_column_names(cleaned)

    if output is not None:
        save(cleaned, output, format)

    return cleaned
47
+
48
+
49
def _build_rules(
    rules,
    *,
    drop_duplicates,
    fill_numeric,
    fill_categorical,
    standardize_columns,
):
    """Combine the default rules, a user ``rules`` mapping and keyword overrides.

    Precedence, lowest to highest: ``DEFAULT_CLEANING_RULES``, ``rules``,
    then every keyword override that is not ``None``.

    Dict-valued sections of ``rules`` are merged key by key into the matching
    default section, so a partial section (e.g. only ``{"numeric": "median"}``
    under ``"missing"``) keeps the defaults it omits. All user values are
    copied, so the keyword overrides never write into the caller's ``rules``.

    Returns:
        A new rules dict.
    """
    merged = deepcopy(DEFAULT_CLEANING_RULES)

    if rules is not None:
        for section, options in rules.items():
            if isinstance(options, dict) and isinstance(merged.get(section), dict):
                merged[section].update(deepcopy(options))
            else:
                # Unknown sections or non-dict values replace the entry outright.
                merged[section] = deepcopy(options)

    if drop_duplicates is not None:
        merged["duplicates"]["drop"] = drop_duplicates

    if fill_numeric is not None:
        merged["missing"]["numeric"] = fill_numeric

    if fill_categorical is not None:
        merged["missing"]["categorical"] = fill_categorical

    if standardize_columns is not None:
        merged["columns"]["standardize"] = standardize_columns

    return merged
File without changes
@@ -0,0 +1,35 @@
1
+ import re
2
+ import pandas as pd
3
+
4
+
5
def standardize_column_names(df: pd.DataFrame):
    """Return a copy of ``df`` whose column labels have been passed through ``_standardize``."""
    renamed = df.copy()
    renamed.columns = list(map(_standardize, renamed.columns))
    return renamed
14
+
15
+
16
+ def _standardize(name: str) -> str:
17
+ name = name.strip()
18
+
19
+ #Ssplit acronym followed by a normal word.
20
+ # HTTPStatus -> HTTP_Status
21
+ name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", name)
22
+
23
+ # Split lowercase/ditgit followed by uppercase
24
+ name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)
25
+
26
+ # Replace any non-alphanumeric characters with underscores
27
+ name = re.sub(r"[^A-Za-z0-9]+", "_", name)
28
+
29
+ # Collapse repeated underscores
30
+ name = re.sub(r"_+", "_", name)
31
+
32
+ # Remove leading/trailing underscores.
33
+ name = name.strip("_")
34
+
35
+ return name.lower()
@@ -0,0 +1,6 @@
1
+ import pandas as pd
2
+
3
+
4
def remove_duplicates(df: pd.DataFrame):
    """Return a copy of ``df`` with fully duplicated rows dropped (first occurrence kept)."""
    deduplicated = df.drop_duplicates()
    return deduplicated.copy()
@@ -0,0 +1,56 @@
1
+ import pandas as pd
2
+
3
+
4
+ def _fill_numeric(df: pd.DataFrame, strategy):
5
+ numeric_columns = df.select_dtypes(include="number").columns
6
+
7
+ if numeric_columns.empty:
8
+ return df
9
+
10
+ if strategy == "zero":
11
+ value = 0
12
+
13
+ elif strategy in ("mean", "median"):
14
+ value = df[numeric_columns].agg(strategy)
15
+
16
+ elif strategy == "mode":
17
+ mode_df = df[numeric_columns].mode()
18
+ value = mode_df.iloc[0] if not mode_df.empty else pd.Series(dtype="float64")
19
+
20
+
21
+ else:
22
+ raise ValueError(f"Unknown numeric strategy '{strategy}'. Use 'mean', 'median', 'mode', or 'zero'.")
23
+
24
+ df[numeric_columns] = df[numeric_columns].fillna(value)
25
+
26
+
27
+
28
+ def _fill_categorical(df: pd.DataFrame, strategy):
29
+ categorical_columns = df.select_dtypes(exclude="number").columns
30
+
31
+ if categorical_columns.empty:
32
+ return df
33
+
34
+ if strategy == "mode":
35
+ mode_df = df[categorical_columns].mode()
36
+ value = mode_df.iloc[0] if not mode_df.empty else pd.Series(dtype="object")
37
+
38
+ else:
39
+ raise ValueError(f"Unknown categorical strategy '{strategy}'. Use 'mode'.")
40
+
41
+ df[categorical_columns] = df[categorical_columns].fillna(value)
42
+
43
+
44
def fill_missing(df: pd.DataFrame, rules: dict) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled according to ``rules``.

    ``rules`` may contain a ``"numeric"`` and a ``"categorical"`` strategy;
    a strategy that is absent or falsy is skipped. The input frame is not
    modified.
    """
    cleaned = df.copy()

    # Numeric columns are handled before categorical ones.
    fillers = (
        ("numeric", _fill_numeric),
        ("categorical", _fill_categorical),
    )

    for rule_key, fill in fillers:
        strategy = rules.get(rule_key)
        if strategy:
            fill(cleaned, strategy)

    return cleaned
datra/defaults.py ADDED
@@ -0,0 +1,12 @@
1
+ DEFAULT_CLEANING_RULES = {
2
+ "duplicates": {
3
+ "drop": True,
4
+ },
5
+ "missing": {
6
+ "numeric": None,
7
+ "categorical": None,
8
+ },
9
+ "columns": {
10
+ "standardize": True,
11
+ },
12
+ }
datra/io/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .load import load
2
+ from .save import save
3
+
4
+ __all__ = ["load", "save"]
datra/io/load.py ADDED
@@ -0,0 +1,34 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+
5
+
6
+ _READERS = {
7
+ ".csv": pd.read_csv,
8
+ ".xlsx": pd.read_excel,
9
+ ".xls": pd.read_excel,
10
+ }
11
+
12
+
13
def load(data):
    """Return ``data`` as a DataFrame.

    A DataFrame is returned as a copy. A ``str`` or ``Path`` is read with
    the reader registered in ``_READERS`` for its lower-cased suffix.

    Raises:
        TypeError: If ``data`` is neither a DataFrame, ``str`` nor ``Path``.
        FileNotFoundError: If the path does not exist.
        ValueError: If no reader is registered for the file suffix.
    """
    if isinstance(data, pd.DataFrame):
        return data.copy()

    if isinstance(data, (str, Path)):
        path = Path(data)
    else:
        raise TypeError(
            f"Unsupported input type: {type(data).__name__}."
        )

    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    extension = path.suffix.lower()

    try:
        reader = _READERS[extension]
    except KeyError as exc:
        raise ValueError(
            f"Unsupported file type: '{path.suffix}'."
        ) from exc

    return reader(path)
datra/io/save.py ADDED
@@ -0,0 +1,46 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+
5
+
6
+ def _save_csv(df: pd.DataFrame, path: Path):
7
+ df.to_csv(path, index=False)
8
+
9
+
10
def _save_excel(df: pd.DataFrame, path: Path):
    """Write ``df`` to ``path`` as an Excel workbook without the index column."""
    include_index = False
    df.to_excel(path, index=include_index)
12
+
13
+
14
# Writer functions keyed by lower-cased file suffix.
# NOTE: ".xls" is deliberately not registered. pandas >= 2.0 (the version
# range this package requires) no longer ships a writer engine for the
# legacy .xls format, so DataFrame.to_excel() fails for that suffix.
# Leaving it out lets save() report it as an unsupported format instead.
_WRITERS = {
    ".csv": _save_csv,
    ".xlsx": _save_excel,
}
19
+
20
+ def _resolve_suffix(output: Path, format):
21
+ if format is not None:
22
+ suffix = "." + format.lower().lstrip(".")
23
+ output = output.with_suffix(suffix)
24
+ else:
25
+ suffix = output.suffix.lower()
26
+
27
+ return output, suffix
28
+
29
def save(df: pd.DataFrame, output, format=None):
    """Write ``df`` to ``output`` and return the path that was written.

    The file type comes from ``format`` when given, otherwise from the
    suffix of ``output``. Missing parent directories are created.

    Raises:
        ValueError: If no writer is registered for the resolved suffix.
    """
    output, suffix = _resolve_suffix(Path(output), format)

    writer = _WRITERS.get(suffix)
    if writer is None:
        supported = ", ".join(_WRITERS)
        raise ValueError(
            f"Unsupported file format '{suffix}'. Supported formats are: {supported}"
        )

    output.parent.mkdir(parents=True, exist_ok=True)
    writer(df, output)

    return output
@@ -0,0 +1,7 @@
1
+ from .builder import build_report
2
+ from .exporter import save_report
3
+
4
+ __all__ = [
5
+ "build_report",
6
+ "save_report",
7
+ ]
@@ -0,0 +1,35 @@
1
+ from datetime import datetime
2
+
3
+
4
def build_report(results: dict):
    """Assemble the report dictionary from audit ``results``.

    The report contains ``metadata`` (generation timestamp and tool name),
    ``summary``, ``profile``, ``checks`` and ``score``. Sections missing
    from ``results`` default to an empty dict.
    """
    metadata = {
        "generated_at": datetime.now().isoformat(),
        "tool": "datra",
    }
    summary = _build_summary(results)
    checks = {
        name: results.get(name, {})
        for name in ("completeness", "uniqueness", "outliers")
    }

    return {
        "metadata": metadata,
        "summary": summary,
        "profile": results.get("profile", {}),
        "checks": checks,
        "score": results.get("score", {}),
    }
23
+
24
+
25
+ def _build_summary(results: dict):
26
+ score = results.get("score", {})
27
+
28
+ return {
29
+ "dataset_quality_score": score.get("overall_score"),
30
+ "rows": results.get("profile", {}).get("rows"),
31
+ "columns": results.get("profile", {}).get("columns"),
32
+ "has_duplicates": bool(results.get("uniqueness", {}).get("duplicate_rows", 0)),
33
+ "has_missing_values": bool(results.get("completeness")),
34
+ "has_outliers": bool(results.get("outliers")),
35
+ }
@@ -0,0 +1,35 @@
1
+ import json
2
+ from pathlib import Path
3
+ from datetime import datetime
4
+
5
+ from .templates import report_to_html
6
+
7
+
8
def save_report(
    report,
    output="outputs",
    format="json",
):
    """Write ``report`` to a timestamped file inside the ``output`` directory.

    Args:
        report: Report dictionary to serialise.
        output: Directory to write into; created if missing.
        format: ``"json"`` or ``"html"``.

    Returns:
        Path of the written file, named ``datra_report_<timestamp>.<format>``.

    Raises:
        ValueError: If ``format`` is not supported. This is checked before
            anything is created on disk.
    """
    if format not in ("json", "html"):
        raise ValueError(
            f"Unsupported report format '{format}'."
        )

    output = Path(output)
    output.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    path = output / f"datra_report_{timestamp}.{format}"

    # Serialise before opening the file so a failed serialisation does not
    # leave a truncated report behind.
    if format == "json":
        content = json.dumps(report, indent=4)
    else:
        content = report_to_html(report)

    path.write_text(content, encoding="utf-8")

    return path
@@ -0,0 +1,26 @@
1
+ import json
2
+
3
+
4
def report_to_html(report: dict):
    """Render the ``summary``, ``score`` and ``checks`` sections of ``report`` as HTML.

    Each section is dumped as indented JSON inside a ``<pre>`` element. The
    JSON text is HTML-escaped first, because it can contain strings taken
    from the dataset (such as column names) that must not be read as markup.

    Raises:
        KeyError: If ``report`` lacks one of the three sections.
    """
    from html import escape

    def _section_as_text(section):
        # quote=False keeps the JSON quotes literal; only &, < and > are escaped.
        return escape(json.dumps(report[section], indent=4), quote=False)

    summary_json = _section_as_text("summary")
    score_json = _section_as_text("score")
    checks_json = _section_as_text("checks")

    return f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Datra Report</title>
</head>

<body>
<h1>Datra Audit Report</h1>

<h2>Summary</h2>
<pre>{summary_json}</pre>

<h2>Score</h2>
<pre>{score_json}</pre>

<h2>Checks</h2>
<pre>{checks_json}</pre>
</body>
</html>
"""
File without changes
datra/scoring/score.py ADDED
@@ -0,0 +1,51 @@
1
def calculate_score(results: dict):
    """Compute per-dimension scores and their unweighted mean.

    Dimensions:
        completeness: mean ``completeness_ratio`` over all columns
            (``0`` when no completeness data is present).
        uniqueness: ``1 - duplicate_ratio`` (``0`` when no uniqueness data
            is present; a missing ratio is treated as ``1``).
        outliers: ``1 -`` mean ``outlier_ratio`` over all columns
            (``1`` when no outlier data is present).

    Returns:
        ``{"scores": {...}, "overall_score": ...}`` with every value
        rounded to 2 decimals.
    """

    def _average(values):
        return sum(values) / len(values)

    completeness = results.get("completeness", {})
    uniqueness = results.get("uniqueness", {})
    outliers = results.get("outliers", {})

    scores = {}

    if completeness:
        scores["completeness"] = _average(
            [column["completeness_ratio"] for column in completeness.values()]
        )
    else:
        scores["completeness"] = 0

    # Fewer duplicates means a better score.
    if uniqueness:
        scores["uniqueness"] = 1 - uniqueness.get("duplicate_ratio", 1)
    else:
        scores["uniqueness"] = 0

    if outliers:
        scores["outliers"] = 1 - _average(
            [column["outlier_ratio"] for column in outliers.values()]
        )
    else:
        scores["outliers"] = 1

    overall_score = round(sum(scores.values()) / len(scores), 2)
    rounded_scores = {name: round(value, 2) for name, value in scores.items()}

    return {
        "scores": rounded_scores,
        "overall_score": overall_score,
    }
@@ -0,0 +1,338 @@
1
+ Metadata-Version: 2.4
2
+ Name: datra
3
+ Version: 0.1.0
4
+ Summary: A lightweight Python library for cleaning, auditing, and validating tabular data.
5
+ Author-email: Raphael <raphaeljames897@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Raphael James
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/Raphaelj1/datra
29
+ Project-URL: Repository, https://github.com/Raphaelj1/datra
30
+ Project-URL: Issues, https://github.com/Raphaelj1/datra/issues
31
+ Keywords: data-cleaning,data-quality,validation,pandas,data-analysis
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
36
+ Classifier: Topic :: Software Development :: Libraries
37
+ Classifier: License :: OSI Approved :: MIT License
38
+ Classifier: Operating System :: OS Independent
39
+ Classifier: Programming Language :: Python :: 3
40
+ Classifier: Programming Language :: Python :: 3.13
41
+ Requires-Python: >=3.10
42
+ Description-Content-Type: text/markdown
43
+ License-File: LICENSE
44
+ Requires-Dist: pandas<3.0,>=2.0
45
+ Requires-Dist: openpyxl>=3.1
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=8; extra == "dev"
48
+ Requires-Dist: build; extra == "dev"
49
+ Requires-Dist: twine; extra == "dev"
50
+ Dynamic: license-file
51
+
52
+ # Datra
53
+
54
+ A lightweight Python library for cleaning, auditing, and validating tabular data. It helps data scientists, analysts, and engineers quickly identify data quality issues, clean datasets using simple rules, and generate reports.
55
+
56
+ ## Why Datra?
57
+
58
+ Data quality problems often consume more time than analysis itself. Missing values, duplicate records, inconsistent column names, and invalid entries can silently affect downstream models and business decisions.
59
+
60
+ Datra provides a simple workflow for understanding and improving dataset quality before analysis or machine learning.
61
+
62
+ With Datra, you can:
63
+
64
+ - Audit datasets to identify quality issues.
65
+ - Clean data using configurable rules.
66
+ - Validate datasets against business rules.
67
+ - Generate JSON and HTML quality reports.
68
+ - Work directly with Pandas DataFrames or CSV and Excel files.
69
+
70
+ ## Features
71
+
72
+ - Dataset profiling
73
+ - Missing value analysis
74
+ - Duplicate detection
75
+ - Outlier detection (IQR-based)
76
+ - Rule-based data validation
77
+ - Automated data quality scoring
78
+ - Configurable data cleaning
79
+ - Column name standardization
80
+ - Support for Pandas DataFrames
81
+ - CSV and Excel file support
82
+ - JSON and HTML report generation
83
+ - Save cleaned datasets directly to disk
84
+
85
+ ## Installation
86
+
87
+ Install Datra from PyPI:
88
+
89
+ ```bash
90
+ pip install datra
91
+ ```
92
+
93
+ Or install the latest development version:
94
+
95
+ ```bash
96
+ git clone https://github.com/raphaelj1/datra.git
97
+
98
+ cd datra
99
+
100
+ pip install -e .
101
+ ```
102
+
103
+ ## Quick Start
104
+
105
+ ### Clean a dataset
106
+
107
+ ```python
108
+ from datra import clean
109
+
110
+ cleaned = clean(
111
+ "patients.csv",
112
+ drop_duplicates=True,
113
+ fill_numeric="median",
114
+ fill_categorical="mode",
115
+ standardize_columns=True,
116
+ )
117
+ ```
118
+
119
+ ### Audit a dataset
120
+
121
+ ```python
122
+ from datra import Audit
123
+
124
+ audit = Audit("patients.csv")
125
+
126
+ print(audit.profile)
127
+ print(audit.score)
128
+ ```
129
+
130
+ ### Validate a dataset
131
+
132
+ ```python
133
+ rules = {
134
+ "Age": {
135
+ "min": 0,
136
+ "max": 120,
137
+ },
138
+ "Gender": {
139
+ "allowed": [
140
+ "Male",
141
+ "Female",
142
+ ],
143
+ },
144
+ }
145
+
146
+ report = audit.validate(rules)
147
+ ```
148
+
149
+ ## Cleaning Data
150
+
151
+ The `clean()` function applies one or more cleaning operations to a dataset and returns a new DataFrame. It accepts either a Pandas DataFrame or the path to a CSV or Excel file.
152
+
153
+ ### Using keyword arguments
154
+
155
+ ```python
156
+ from datra import clean
157
+
158
+ cleaned = clean(
159
+ "patients.csv",
160
+ drop_duplicates=True,
161
+ fill_numeric="median",
162
+ fill_categorical="mode",
163
+ standardize_columns=True,
164
+ )
165
+ ```
166
+
167
+ ### Using cleaning rules
168
+
169
+ ```python
170
+ rules = {
171
+ "duplicates": {
172
+ "drop": True,
173
+ },
174
+ "missing": {
175
+ "numeric": "median",
176
+ "categorical": "mode",
177
+ },
178
+ "columns": {
179
+ "standardize": True,
180
+ },
181
+ }
182
+
183
+ cleaned = clean("patients.csv", rules=rules)
184
+ ```
185
+
186
+ ### Save the cleaned dataset
187
+
188
+ ```python
189
+ clean(
190
+ "patients.csv",
191
+ drop_duplicates=True,
192
+ output="cleaned_patients.xlsx",
193
+ )
194
+ ```
195
+
196
+ ## Auditing Data
197
+
198
+ Create an audit object to inspect dataset quality.
199
+
200
+ ```python
201
+ from datra import Audit
202
+
203
+ audit = Audit("patients.csv")
204
+ ```
205
+
206
+ Retrieve individual quality checks.
207
+
208
+ ```python
209
+ audit.profile
210
+
211
+ audit.completeness
212
+
213
+ audit.uniqueness
214
+
215
+ audit.outliers
216
+
217
+ audit.score
218
+ ```
219
+
220
+ Or access all audit results at once.
221
+
222
+ ```python
223
+ audit.results
224
+ ```
225
+
226
+ ## Validation
227
+
228
+ Validate datasets against custom business rules.
229
+
230
+ ```python
231
+ rules = {
232
+ "Age": {
233
+ "min": 0,
234
+ "max": 120,
235
+ },
236
+ "Patient ID": {
237
+ "unique": True,
238
+ },
239
+ "Gender": {
240
+ "allowed": [
241
+ "Male",
242
+ "Female",
243
+ ],
244
+ },
245
+ }
246
+
247
+ report = audit.validate(rules)
248
+ ```
249
+
250
+ Validation returns a structured report describing which checks passed, which failed, and the number of violations for each rule.
251
+
252
+ ## Reports
253
+
254
+ Build a data quality report as a Python dictionary.
255
+
256
+ ```python
257
+ from datra import Audit
258
+
259
+ audit = Audit("patients.csv")
260
+
261
+ report = audit.build_report()
262
+ ```
263
+
264
+ Save the report as JSON.
265
+
266
+ ```python
267
+ audit.save_report(
268
+ format="json",
269
+ )
270
+ ```
271
+
272
+ Or save it as an HTML report.
273
+
274
+ ```python
275
+ audit.save_report(
276
+ format="html",
277
+ )
278
+ ```
279
+
280
+ ## Supported File Formats
281
+
282
+ Datra supports both Pandas DataFrames and common tabular file formats.
283
+
284
+ | Input | Supported |
285
+ | ---------------- | --------- |
286
+ | Pandas DataFrame | ✅ |
287
+ | CSV | ✅ |
288
+ | Excel (.xlsx) | ✅ |
289
+ | Excel (.xls) | ✅ |
290
+
291
+ ### Report Formats
292
+
293
+ | Format | Supported |
294
+ | ------ | ---------- |
295
+ | JSON | ✅ |
296
+ | HTML | ✅ |
297
+ | PDF | 🚧 Planned |
298
+
299
+ ## Project Structure
300
+
301
+ ```
302
+ datra/
303
+ ├── datra/ # Library source code
304
+ ├── examples/ # Example usage
305
+ ├── tests/
306
+ ├── pyproject.toml
307
+ ├── README.md
308
+ └── LICENSE
309
+ ```
310
+
311
+ ## Roadmap
312
+
313
+ Planned improvements include:
314
+
315
+ - PDF report generation
316
+ - Command-line interface (CLI)
317
+ - Additional cleaning operations
318
+ - Additional validation rules
319
+ - More data quality checks
320
+ - Interactive HTML reports
321
+ - Support for additional file formats
322
+
323
+ ## Contributing
324
+
325
+ Contributions, feature requests, and bug reports are welcome.
326
+
327
+ If you would like to contribute:
328
+
329
+ 1. Fork the repository.
330
+ 2. Create a new feature branch.
331
+ 3. Commit your changes.
332
+ 4. Open a pull request.
333
+
334
+ Please ensure all tests pass before submitting a pull request.
335
+
336
+ ## License
337
+
338
+ This project is licensed under the MIT License.
@@ -0,0 +1,28 @@
1
+ datra/__init__.py,sha256=1hClFgDmV-XDtPolfJHiIYsnkiSXCnoWLxKmk-FQZiw,84
2
+ datra/audit.py,sha256=6R13Pn4gDvDcNEr0cs2s4UdwcDU3m7_Ph1lxQZDE7_A,1813
3
+ datra/cleaner.py,sha256=eBLk_aKj6Ot-VKwTRUV7m-l9rZbNhwW0WWMKcU8sKYs,1850
4
+ datra/defaults.py,sha256=nMpeF2HUm72zBfnFxWfBAKf_uZEr5zkL-xqim1kfePI,219
5
+ datra/checks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ datra/checks/completeness.py,sha256=Mmj2WpVvr6BtoGd7s5ZIPeEFKRPhGoXhV_Y5IJf4QzA,433
7
+ datra/checks/outliers.py,sha256=sSui1_-nFxRNEC2jnuKn3jrkDn6lhFSNcI9ZDm2e3mI,906
8
+ datra/checks/profile.py,sha256=FjTwEFHjLh0DdhJkQoIaGOfpsmZfkqFJ-fzFJACX8Z4,1335
9
+ datra/checks/uniqueness.py,sha256=O8nLFvQzj7ryEn_9kxcBqEXn3yUATHmSpKco6GTaUvQ,512
10
+ datra/checks/validate.py,sha256=aaCZvnLPt7txUMtyB_SxodKVa2mS047ChG9lVFjVkiI,3441
11
+ datra/cleaning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ datra/cleaning/columns.py,sha256=4YSTWOhxa0sVBZNyNP6Yq0ONz4rY7cEPMda8RSj9PxM,844
13
+ datra/cleaning/duplicates.py,sha256=pEOR_Zo4TnITPSck64ohmrFc0AWz0TTxjjEYBJ6YS5g,107
14
+ datra/cleaning/missing.py,sha256=yrtC265kicr7iogGXJsFv7jZb_uOq_-T1mHNaYx0INg,1598
15
+ datra/io/__init__.py,sha256=2gVWbsNF1AnyonLfvIANNxC0kKipQZf0gUv4KO2jsWk,76
16
+ datra/io/load.py,sha256=pWVexMzIfeVN3xvYClFLbO415_ctuodLtvz-B1FM4MA,718
17
+ datra/io/save.py,sha256=zB54SYGyjRjteyXj5k_02c1RlJIQZTL9fiesWJlS-s4,1053
18
+ datra/reports/__init__.py,sha256=n-8S8-Ru9tKkuCyKaXzsEerFCIb-wBh9taHAdTBu444,127
19
+ datra/reports/builder.py,sha256=Od1S7ozc4w1eeGyc5ZA8K2jTpMyIXX8xlN1fD-cguok,1058
20
+ datra/reports/exporter.py,sha256=uv0m1ELxH2y7qKQP-SZnlv0ihIxK396qDOCs8YIKT1A,791
21
+ datra/reports/templates.py,sha256=pzvTMmN1SYupLZ3J8A6bzRzyCUCXxBRCNxeRZAhLGf4,664
22
+ datra/scoring/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
+ datra/scoring/score.py,sha256=oOnZuKIWEkiaV5Ae_KMtg1Vl29Rkj9ttLMlXuyI31iU,1189
24
+ datra-0.1.0.dist-info/licenses/LICENSE,sha256=06G7x2RjMHSDF3YlizZYiWN6NKucfVoKIfIM9YLmZQQ,1091
25
+ datra-0.1.0.dist-info/METADATA,sha256=BGNss3O0t5tAj42HNxjvnc85YF4xvW-WzclbfR4IBjc,8036
26
+ datra-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
27
+ datra-0.1.0.dist-info/top_level.txt,sha256=tikJ6dyoHKVnA0xDo5hELjedEDb-NxP4IURNI1FQpaE,6
28
+ datra-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Raphael James
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ datra