datra 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datra/__init__.py +4 -0
- datra/audit.py +60 -0
- datra/checks/__init__.py +0 -0
- datra/checks/completeness.py +19 -0
- datra/checks/outliers.py +35 -0
- datra/checks/profile.py +49 -0
- datra/checks/uniqueness.py +18 -0
- datra/checks/validate.py +151 -0
- datra/cleaner.py +74 -0
- datra/cleaning/__init__.py +0 -0
- datra/cleaning/columns.py +35 -0
- datra/cleaning/duplicates.py +6 -0
- datra/cleaning/missing.py +56 -0
- datra/defaults.py +12 -0
- datra/io/__init__.py +4 -0
- datra/io/load.py +34 -0
- datra/io/save.py +46 -0
- datra/reports/__init__.py +7 -0
- datra/reports/builder.py +35 -0
- datra/reports/exporter.py +35 -0
- datra/reports/templates.py +26 -0
- datra/scoring/__init__.py +0 -0
- datra/scoring/score.py +51 -0
- datra-0.1.0.dist-info/METADATA +338 -0
- datra-0.1.0.dist-info/RECORD +28 -0
- datra-0.1.0.dist-info/WHEEL +5 -0
- datra-0.1.0.dist-info/licenses/LICENSE +21 -0
- datra-0.1.0.dist-info/top_level.txt +1 -0
datra/__init__.py
ADDED
datra/audit.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from functools import cached_property
|
|
2
|
+
|
|
3
|
+
from datra.io import load
|
|
4
|
+
from datra.checks.validate import validate as validate_df
|
|
5
|
+
from datra.checks.completeness import completeness as check_completeness
|
|
6
|
+
from datra.checks.uniqueness import uniqueness as check_uniqueness
|
|
7
|
+
from datra.checks.outliers import outliers as check_outliers
|
|
8
|
+
from datra.checks.profile import profile as check_profile
|
|
9
|
+
from datra.scoring.score import calculate_score
|
|
10
|
+
from datra.reports import build_report, save_report
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Audit:
    """Data-quality audit over a single dataset.

    The input is handed to ``load`` once in the constructor. Each individual
    check is a ``cached_property``: it runs on first access and the result is
    stored on the instance afterwards.
    """

    def __init__(self, input_data):
        """Load *input_data* and keep the resulting frame as ``self.df``."""
        self.df = load(input_data)

    @cached_property
    def completeness(self):
        """Result of the completeness check on ``self.df``."""
        return check_completeness(self.df)

    @cached_property
    def uniqueness(self):
        """Result of the uniqueness check on ``self.df``."""
        return check_uniqueness(self.df)

    @cached_property
    def outliers(self):
        """Result of the outlier check on ``self.df``."""
        return check_outliers(self.df)

    @cached_property
    def profile(self):
        """Result of the profile check on ``self.df``."""
        return check_profile(self.df)

    @cached_property
    def score(self):
        """Score computed from the completeness, uniqueness and outlier results."""
        return calculate_score(
            {
                "completeness": self.completeness,
                "uniqueness": self.uniqueness,
                "outliers": self.outliers,
            }
        )

    @property
    def results(self):
        """All check results plus the score, gathered into one dict."""
        gathered = {}
        for key in ("profile", "completeness", "uniqueness", "outliers", "score"):
            gathered[key] = getattr(self, key)
        return gathered

    def validate(self, rules: dict):
        """Validate ``self.df`` against *rules* and return the validation result."""
        return validate_df(self.df, rules)

    def build_report(self):
        """Build a report dict from ``self.results``."""
        # The bare name resolves to the module-level function, not this method.
        return build_report(self.results)

    def save_report(self, path="outputs", format="json"):
        """Build the report and write it via the module-level ``save_report``.

        Args:
            path: Forwarded as ``output``.
            format: Forwarded unchanged.

        Returns:
            Whatever the module-level ``save_report`` returns.
        """
        return save_report(self.build_report(), output=path, format=format)
|
datra/checks/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def completeness(df: pd.DataFrame):
    """Count missing and filled cells for every column.

    Args:
        df: Table to inspect.

    Returns:
        A dict keyed by column label. Each value holds ``missing_values``,
        ``filled_values`` and ``completeness_ratio`` (filled / rows, rounded
        to four decimals). For a table with no rows the ratio is ``0.0``.
    """
    total_rows = len(df)

    result = {}

    for col in df.columns:
        missing = int(df[col].isna().sum())
        filled = total_rows - missing

        # With zero rows the ratio is 0/0; report 0.0 instead of letting a
        # NaN leak into the score and the serialized report.
        ratio = round(filled / total_rows, 4) if total_rows else 0.0

        result[col] = {
            "missing_values": missing,
            "filled_values": filled,
            "completeness_ratio": float(ratio),
        }

    return result
|
datra/checks/outliers.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def outliers(df: pd.DataFrame, multiplier: float = 1.5):
    """Flag values outside the IQR fences of each numeric column.

    Args:
        df: Table to inspect.
        multiplier: How many interquartile ranges beyond Q1/Q3 the fences
            sit. Defaults to 1.5.

    Returns:
        A dict keyed by numeric column label with ``lower_bound``,
        ``upper_bound``, ``outlier_count`` and ``outlier_ratio`` (share of
        the column's non-missing values, rounded to four decimals). Numeric
        columns with no non-missing values are omitted.
    """
    results = {}

    for col in df.select_dtypes(include="number").columns:
        series = df[col].dropna()

        # Quantiles of an empty series are undefined, so skip the column.
        if series.empty:
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr

        outliers_mask = (series < lower_bound) | (series > upper_bound)

        results[col] = {
            "lower_bound": float(lower_bound),
            "upper_bound": float(upper_bound),
            "outlier_count": int(outliers_mask.sum()),
            "outlier_ratio": float(round(outliers_mask.mean(), 4)),
        }

    return results
|
datra/checks/profile.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def profile(df: pd.DataFrame):
    """Summarize the overall shape and content of a table.

    Args:
        df: Table to inspect.

    Returns:
        A dict with ``rows``, ``columns``, the number of numeric,
        categorical (object/category) and datetime columns,
        ``memory_usage_mb`` (deep usage, rounded to two decimals),
        ``duplicate_rows`` and ``missing_cells``.
    """
    rows, columns = df.shape

    numeric_columns = len(df.select_dtypes(include="number").columns)

    categorical_columns = len(
        df.select_dtypes(include=["object", "category"]).columns
    )

    # "datetime64" only matches timezone-naive columns; timezone-aware ones
    # need the separate "datetimetz" selector to be counted.
    datetime_columns = len(
        df.select_dtypes(include=["datetime64", "datetimetz"]).columns
    )

    memory_usage_mb = float(
        round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2)
    )

    duplicate_rows = int(df.duplicated().sum())

    missing_cells = int(df.isna().sum().sum())

    return {
        "rows": rows,
        "columns": columns,
        "numeric_columns": numeric_columns,
        "categorical_columns": categorical_columns,
        "datetime_columns": datetime_columns,
        "memory_usage_mb": memory_usage_mb,
        "duplicate_rows": duplicate_rows,
        "missing_cells": missing_cells,
    }
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def uniqueness(df: pd.DataFrame):
    """Measure how many rows repeat an earlier row.

    Returns:
        A dict with ``total_rows``, ``unique_rows``, ``duplicate_rows`` and
        ``duplicate_ratio`` (duplicates / rows, rounded to four decimals;
        ``0.0`` when the table has no rows).
    """
    row_count = len(df)
    repeated = df.duplicated().sum()

    # Guard the division: an empty table has no duplicates to speak of.
    if row_count:
        ratio = round(repeated / row_count, 4)
    else:
        ratio = 0

    return {
        "total_rows": row_count,
        "unique_rows": int(row_count - repeated),
        "duplicate_rows": int(repeated),
        "duplicate_ratio": float(ratio),
    }
|
datra/checks/validate.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _record_check(summary, column_result, check_name, passed, **details):
|
|
5
|
+
column_result["checks"][check_name] = {
|
|
6
|
+
"passed": passed,
|
|
7
|
+
**details,
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
summary["rules_checked"] += 1
|
|
11
|
+
|
|
12
|
+
if passed:
|
|
13
|
+
summary["rules_passed"] += 1
|
|
14
|
+
else:
|
|
15
|
+
summary["rules_failed"] += 1
|
|
16
|
+
column_result["passed"] = False
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _check_minimum(df, column, minimum, summary, column_result):
    """Record a "minimum" check: values strictly below *minimum* are violations."""
    too_small = df[column] < minimum
    violations = int(too_small.sum())

    _record_check(
        summary,
        column_result,
        "minimum",
        violations == 0,
        expected=minimum,
        violations=violations,
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _check_maximum(df, column, maximum, summary, column_result):
    """Record a "maximum" check: values strictly above *maximum* are violations."""
    too_large = df[column] > maximum
    violations = int(too_large.sum())

    _record_check(
        summary,
        column_result,
        "maximum",
        violations == 0,
        expected=maximum,
        violations=violations,
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _check_allowed(df, column, allowed, summary, column_result):
    """Record an "allowed" check: entries for which ``isin(allowed)`` is False are violations."""
    is_allowed = df[column].isin(allowed)
    violations = int((~is_allowed).sum())

    _record_check(
        summary,
        column_result,
        "allowed",
        violations == 0,
        expected=allowed,
        violations=violations,
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _check_unique(df, column, summary, column_result):
    """Record a "unique" check: entries flagged by ``duplicated()`` are violations."""
    repeats = df[column].duplicated()
    violations = int(repeats.sum())

    _record_check(
        summary,
        column_result,
        "unique",
        violations == 0,
        violations=violations,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def validate(df: pd.DataFrame, rules: dict):
    """Check columns of *df* against per-column rules.

    Args:
        df: Table to validate.
        rules: Maps a column label to a dict of rules. Recognized keys are
            ``min``, ``max``, ``allowed`` and ``unique`` (applied only when
            truthy). Columns absent from *df* are skipped.

    Returns:
        ``{"summary": ..., "columns": ...}``. ``summary`` holds the counters
        and ``validation_score`` (percentage of passed rules, two decimals,
        ``0.0`` when nothing was checked). ``columns`` maps each checked
        column to its pass flag and per-check details.
    """
    summary = dict.fromkeys(
        ("columns_checked", "rules_checked", "rules_passed", "rules_failed"), 0
    )
    summary["validation_score"] = 0.0

    # Rules that carry an expected value, in the order they are evaluated.
    valued_checks = (
        ("min", _check_minimum),
        ("max", _check_maximum),
        ("allowed", _check_allowed),
    )

    results = {}

    for column, column_rules in rules.items():
        if column not in df.columns:
            continue

        summary["columns_checked"] += 1
        column_result = {"passed": True, "checks": {}}

        for rule_key, checker in valued_checks:
            if rule_key in column_rules:
                checker(df, column, column_rules[rule_key], summary, column_result)

        if column_rules.get("unique"):
            _check_unique(df, column, summary, column_result)

        results[column] = column_result

    checked = summary["rules_checked"]
    if checked:
        passed_share = summary["rules_passed"] / checked
        summary["validation_score"] = round(passed_share * 100, 2)

    return {"summary": summary, "columns": results}
|
datra/cleaner.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from copy import deepcopy
|
|
2
|
+
|
|
3
|
+
from datra.io import load, save
|
|
4
|
+
from datra.defaults import DEFAULT_CLEANING_RULES
|
|
5
|
+
from datra.cleaning.duplicates import remove_duplicates
|
|
6
|
+
from datra.cleaning.missing import fill_missing
|
|
7
|
+
from datra.cleaning.columns import standardize_column_names
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def clean(
    data,
    *,
    rules=None,
    drop_duplicates=None,
    fill_numeric=None,
    fill_categorical=None,
    standardize_columns=None,
    output=None,
    format=None,
):
    """Load a dataset, apply the configured cleaning steps and return the result.

    Steps run in this order when enabled: duplicate removal, missing-value
    filling, column-name standardization. The keyword options are merged
    with *rules* by ``_build_rules``.

    Args:
        data: Anything accepted by ``load``.
        rules: Optional rules dict.
        drop_duplicates, fill_numeric, fill_categorical, standardize_columns:
            Per-option overrides; ``None`` leaves the rule untouched.
        output: When not ``None``, the cleaned table is passed to ``save``
            with this destination.
        format: Forwarded to ``save`` together with *output*.

    Returns:
        The cleaned DataFrame.
    """
    loaded = load(data)

    effective = _build_rules(
        rules,
        drop_duplicates=drop_duplicates,
        fill_numeric=fill_numeric,
        fill_categorical=fill_categorical,
        standardize_columns=standardize_columns,
    )

    cleaned = loaded.copy()

    if effective["duplicates"]["drop"]:
        cleaned = remove_duplicates(cleaned)

    # Only call the filler when at least one strategy is configured.
    missing_rules = effective["missing"]
    nothing_to_fill = all(value is None for value in missing_rules.values())
    if not nothing_to_fill:
        cleaned = fill_missing(cleaned, missing_rules)

    if effective["columns"]["standardize"]:
        cleaned = standardize_column_names(cleaned)

    if output is not None:
        save(cleaned, output, format)

    return cleaned
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _build_rules(
    rules,
    *,
    drop_duplicates,
    fill_numeric,
    fill_categorical,
    standardize_columns,
):
    """Combine the default rules, user rules and keyword overrides.

    Precedence, lowest to highest: ``DEFAULT_CLEANING_RULES``, *rules*, then
    any keyword override that is not ``None``.

    User rules are merged section by section, so a partially specified
    section keeps the remaining defaults of that section. Everything taken
    from *rules* is copied, so the caller's dict is never modified.

    Returns:
        A new rules dict that is independent of both the defaults and *rules*.
    """
    merged = deepcopy(DEFAULT_CLEANING_RULES)

    if rules is not None:
        for section, options in rules.items():
            current = merged.get(section)
            if isinstance(options, dict) and isinstance(current, dict):
                # Overlay the user's options on the section defaults.
                current.update(deepcopy(options))
            else:
                merged[section] = deepcopy(options)

    if drop_duplicates is not None:
        merged["duplicates"]["drop"] = drop_duplicates

    if fill_numeric is not None:
        merged["missing"]["numeric"] = fill_numeric

    if fill_categorical is not None:
        merged["missing"]["categorical"] = fill_categorical

    if standardize_columns is not None:
        merged["columns"]["standardize"] = standardize_columns

    return merged
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def standardize_column_names(df: pd.DataFrame):
    """Return a copy of *df* whose column labels have been run through ``_standardize``."""
    renamed = df.copy()
    renamed.columns = list(map(_standardize, renamed.columns))
    return renamed
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _standardize(name: str) -> str:
|
|
17
|
+
name = name.strip()
|
|
18
|
+
|
|
19
|
+
#Ssplit acronym followed by a normal word.
|
|
20
|
+
# HTTPStatus -> HTTP_Status
|
|
21
|
+
name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", name)
|
|
22
|
+
|
|
23
|
+
# Split lowercase/ditgit followed by uppercase
|
|
24
|
+
name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)
|
|
25
|
+
|
|
26
|
+
# Replace any non-alphanumeric characters with underscores
|
|
27
|
+
name = re.sub(r"[^A-Za-z0-9]+", "_", name)
|
|
28
|
+
|
|
29
|
+
# Collapse repeated underscores
|
|
30
|
+
name = re.sub(r"_+", "_", name)
|
|
31
|
+
|
|
32
|
+
# Remove leading/trailing underscores.
|
|
33
|
+
name = name.strip("_")
|
|
34
|
+
|
|
35
|
+
return name.lower()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _fill_numeric(df: pd.DataFrame, strategy):
|
|
5
|
+
numeric_columns = df.select_dtypes(include="number").columns
|
|
6
|
+
|
|
7
|
+
if numeric_columns.empty:
|
|
8
|
+
return df
|
|
9
|
+
|
|
10
|
+
if strategy == "zero":
|
|
11
|
+
value = 0
|
|
12
|
+
|
|
13
|
+
elif strategy in ("mean", "median"):
|
|
14
|
+
value = df[numeric_columns].agg(strategy)
|
|
15
|
+
|
|
16
|
+
elif strategy == "mode":
|
|
17
|
+
mode_df = df[numeric_columns].mode()
|
|
18
|
+
value = mode_df.iloc[0] if not mode_df.empty else pd.Series(dtype="float64")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
else:
|
|
22
|
+
raise ValueError(f"Unknown numeric strategy '{strategy}'. Use 'mean', 'median', 'mode', or 'zero'.")
|
|
23
|
+
|
|
24
|
+
df[numeric_columns] = df[numeric_columns].fillna(value)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _fill_categorical(df: pd.DataFrame, strategy):
|
|
29
|
+
categorical_columns = df.select_dtypes(exclude="number").columns
|
|
30
|
+
|
|
31
|
+
if categorical_columns.empty:
|
|
32
|
+
return df
|
|
33
|
+
|
|
34
|
+
if strategy == "mode":
|
|
35
|
+
mode_df = df[categorical_columns].mode()
|
|
36
|
+
value = mode_df.iloc[0] if not mode_df.empty else pd.Series(dtype="object")
|
|
37
|
+
|
|
38
|
+
else:
|
|
39
|
+
raise ValueError(f"Unknown categorical strategy '{strategy}'. Use 'mode'.")
|
|
40
|
+
|
|
41
|
+
df[categorical_columns] = df[categorical_columns].fillna(value)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def fill_missing(df: pd.DataFrame, rules: dict) -> pd.DataFrame:
    """Return a copy of *df* with missing values filled according to *rules*.

    Args:
        df: Source table; it is copied, not modified.
        rules: May contain ``"numeric"`` and/or ``"categorical"`` strategies.
            A missing or falsy entry skips that kind of column.
    """
    cleaned = df.copy()

    fillers = (
        ("numeric", _fill_numeric),
        ("categorical", _fill_categorical),
    )

    for rule_key, filler in fillers:
        strategy = rules.get(rule_key)
        if strategy:
            filler(cleaned, strategy)

    return cleaned
|
datra/defaults.py
ADDED
datra/io/__init__.py
ADDED
datra/io/load.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Maps a lower-cased file suffix to the pandas function that reads it.
# NOTE(review): reading ".xls" through pd.read_excel presumably needs an
# engine other than openpyxl — confirm that dependency is installed.
_READERS = {
    ".csv": pd.read_csv,
    ".xlsx": pd.read_excel,
    ".xls": pd.read_excel,
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def load(data):
    """Return *data* as a DataFrame.

    Args:
        data: A DataFrame (returned as a copy) or a ``str``/``Path`` pointing
            to a file whose suffix is registered in ``_READERS``.

    Raises:
        TypeError: If *data* is neither a DataFrame nor a path-like string.
        FileNotFoundError: If the path does not exist.
        ValueError: If the file suffix has no registered reader.
    """
    if isinstance(data, pd.DataFrame):
        return data.copy()

    is_path_like = isinstance(data, (str, Path))
    if not is_path_like:
        type_name = type(data).__name__
        raise TypeError(f"Unsupported input type: {type_name}.")

    source = Path(data)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    # The suffix lookup is case-insensitive; the error shows it as given.
    extension = source.suffix.lower()
    try:
        read = _READERS[extension]
    except KeyError as exc:
        raise ValueError(f"Unsupported file type: '{source.suffix}'.") from exc

    return read(source)
|
datra/io/save.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _save_csv(df: pd.DataFrame, path: Path):
    """Write *df* to *path* as CSV, without the index column."""
    df.to_csv(path, index=False)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _save_excel(df: pd.DataFrame, path: Path):
    """Write *df* to *path* as an Excel workbook, without the index column."""
    df.to_excel(path, index=False)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Maps a lower-cased file suffix to the function that writes that format.
# NOTE(review): pandas 2.x appears to ship no writer engine for ".xls", so
# `to_excel` may reject that suffix — confirm before advertising it.
_WRITERS = {
    ".csv": _save_csv,
    ".xlsx": _save_excel,
    ".xls": _save_excel,
}
|
|
19
|
+
|
|
20
|
+
def _resolve_suffix(output: Path, format):
|
|
21
|
+
if format is not None:
|
|
22
|
+
suffix = "." + format.lower().lstrip(".")
|
|
23
|
+
output = output.with_suffix(suffix)
|
|
24
|
+
else:
|
|
25
|
+
suffix = output.suffix.lower()
|
|
26
|
+
|
|
27
|
+
return output, suffix
|
|
28
|
+
|
|
29
|
+
def save(df: pd.DataFrame, output, format=None):
    """Write *df* to disk and return the path that was written.

    Args:
        df: Table to write.
        output: Destination path.
        format: Optional format name overriding the suffix of *output*.

    Returns:
        The final destination ``Path``. Its parent directories are created
        if they do not exist.

    Raises:
        ValueError: If the resolved suffix has no entry in ``_WRITERS``.
    """
    destination, suffix = _resolve_suffix(Path(output), format)

    write = _WRITERS.get(suffix)
    if write is None:
        supported = ", ".join(_WRITERS)
        raise ValueError(
            f"Unsupported file format '{suffix}'. Supported formats are: {supported}"
        )

    destination.parent.mkdir(parents=True, exist_ok=True)
    write(df, destination)

    return destination
|
datra/reports/builder.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def build_report(results: dict):
    """Arrange audit results into the report structure.

    Args:
        results: Dict that may contain ``profile``, ``completeness``,
            ``uniqueness``, ``outliers`` and ``score``; absent entries
            become empty dicts.

    Returns:
        A dict with ``metadata`` (generation timestamp and tool name),
        ``summary``, ``profile``, ``checks`` and ``score``.
    """
    metadata = {
        "generated_at": datetime.now().isoformat(),
        "tool": "datra",
    }
    summary = _build_summary(results)
    checks = {
        check: results.get(check, {})
        for check in ("completeness", "uniqueness", "outliers")
    }

    return {
        "metadata": metadata,
        "summary": summary,
        "profile": results.get("profile", {}),
        "checks": checks,
        "score": results.get("score", {}),
    }
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _build_summary(results: dict):
|
|
26
|
+
score = results.get("score", {})
|
|
27
|
+
|
|
28
|
+
return {
|
|
29
|
+
"dataset_quality_score": score.get("overall_score"),
|
|
30
|
+
"rows": results.get("profile", {}).get("rows"),
|
|
31
|
+
"columns": results.get("profile", {}).get("columns"),
|
|
32
|
+
"has_duplicates": bool(results.get("uniqueness", {}).get("duplicate_rows", 0)),
|
|
33
|
+
"has_missing_values": bool(results.get("completeness")),
|
|
34
|
+
"has_outliers": bool(results.get("outliers")),
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from .templates import report_to_html
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def save_report(
    report,
    output="outputs",
    format="json",
):
    """Write *report* to a timestamped file inside *output*.

    Args:
        report: The report dict.
        output: Directory to write into; created if missing.
        format: ``"json"`` or ``"html"``.

    Returns:
        The ``Path`` of the written file, named
        ``datra_report_<YYYYmmdd_HHMMSS>.<format>``.

    Raises:
        ValueError: If *format* is not supported. Nothing is created on
            disk in that case.
    """
    # Validate before touching the filesystem so that a bad format does not
    # leave an empty output directory behind.
    if format not in ("json", "html"):
        raise ValueError(
            f"Unsupported report format '{format}'."
        )

    # Render fully in memory first: a serialization error then cannot leave
    # a truncated report file on disk.
    if format == "json":
        content = json.dumps(report, indent=4)
    else:
        content = report_to_html(report)

    output = Path(output)
    output.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    path = output / f"datra_report_{timestamp}.{format}"

    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

    return path
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def report_to_html(report: dict):
    """Render the summary, score and checks of *report* as an HTML page.

    Each section is shown as indented JSON inside a ``<pre>`` element. The
    JSON text is HTML-escaped because it can contain strings taken from the
    dataset (column labels, for example).

    Args:
        report: Must contain the keys ``"summary"``, ``"score"`` and
            ``"checks"``.

    Returns:
        The HTML document as a string.

    Raises:
        KeyError: If one of the three sections is missing.
    """
    from html import escape

    def _section(key):
        # quote=False: the text sits in element content, not in an attribute.
        return escape(json.dumps(report[key], indent=4), quote=False)

    return f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Datra Report</title>
</head>

<body>
<h1>Datra Audit Report</h1>

<h2>Summary</h2>
<pre>{_section("summary")}</pre>

<h2>Score</h2>
<pre>{_section("score")}</pre>

<h2>Checks</h2>
<pre>{_section("checks")}</pre>
</body>
</html>
"""
|
|
File without changes
|
datra/scoring/score.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
def calculate_score(results: dict):
    """Turn check results into per-dimension scores and an overall score.

    Args:
        results: May contain ``completeness`` (per-column dicts with
            ``completeness_ratio``), ``uniqueness`` (with
            ``duplicate_ratio``) and ``outliers`` (per-column dicts with
            ``outlier_ratio``).

    Returns:
        ``{"scores": {...}, "overall_score": ...}``, all rounded to two
        decimals. The overall score is the plain mean of the three
        dimension scores.
    """

    def _mean(values):
        return sum(values) / len(values)

    completeness = results.get("completeness", {})
    uniqueness = results.get("uniqueness", {})
    outliers = results.get("outliers", {})

    scores = {}

    # Mean filled ratio across columns; no completeness data scores 0.
    if completeness:
        scores["completeness"] = _mean(
            [column["completeness_ratio"] for column in completeness.values()]
        )
    else:
        scores["completeness"] = 0

    # Fewer duplicates is better; no uniqueness data scores 0.
    if uniqueness:
        scores["uniqueness"] = 1 - uniqueness.get("duplicate_ratio", 1)
    else:
        scores["uniqueness"] = 0

    # Fewer outliers is better; no outlier data scores 1.
    if outliers:
        scores["outliers"] = 1 - _mean(
            [column["outlier_ratio"] for column in outliers.values()]
        )
    else:
        scores["outliers"] = 1

    overall_score = round(_mean(list(scores.values())), 2)
    rounded_scores = {name: round(value, 2) for name, value in scores.items()}

    return {
        "scores": rounded_scores,
        "overall_score": overall_score,
    }
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datra
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight Python library for cleaning, auditing, and validating tabular data.
|
|
5
|
+
Author-email: Raphael <raphaeljames897@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Raphael James
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/Raphaelj1/datra
|
|
29
|
+
Project-URL: Repository, https://github.com/Raphaelj1/datra
|
|
30
|
+
Project-URL: Issues, https://github.com/Raphaelj1/datra/issues
|
|
31
|
+
Keywords: data-cleaning,data-quality,validation,pandas,data-analysis
|
|
32
|
+
Classifier: Development Status :: 3 - Alpha
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
36
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
37
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Classifier: Programming Language :: Python :: 3
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
41
|
+
Requires-Python: >=3.10
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
License-File: LICENSE
|
|
44
|
+
Requires-Dist: pandas<3.0,>=2.0
|
|
45
|
+
Requires-Dist: openpyxl>=3.1
|
|
46
|
+
Provides-Extra: dev
|
|
47
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
48
|
+
Requires-Dist: build; extra == "dev"
|
|
49
|
+
Requires-Dist: twine; extra == "dev"
|
|
50
|
+
Dynamic: license-file
|
|
51
|
+
|
|
52
|
+
# Datra
|
|
53
|
+
|
|
54
|
+
A lightweight Python library for cleaning, auditing, and validating tabular data. It helps data scientists, analysts, and engineers quickly identify data quality issues, clean datasets using simple rules, and generate reports.
|
|
55
|
+
|
|
56
|
+
## Why Datra?
|
|
57
|
+
|
|
58
|
+
Data quality problems often consume more time than analysis itself. Missing values, duplicate records, inconsistent column names, and invalid entries can silently affect downstream models and business decisions.
|
|
59
|
+
|
|
60
|
+
Datra provides a simple workflow for understanding and improving dataset quality before analysis or machine learning.
|
|
61
|
+
|
|
62
|
+
With Datra, you can:
|
|
63
|
+
|
|
64
|
+
- Audit datasets to identify quality issues.
|
|
65
|
+
- Clean data using configurable rules.
|
|
66
|
+
- Validate datasets against business rules.
|
|
67
|
+
- Generate JSON and HTML quality reports.
|
|
68
|
+
- Work directly with Pandas DataFrames or CSV and Excel files.
|
|
69
|
+
|
|
70
|
+
## Features
|
|
71
|
+
|
|
72
|
+
- Dataset profiling
|
|
73
|
+
- Missing value analysis
|
|
74
|
+
- Duplicate detection
|
|
75
|
+
- Outlier detection (IQR-based)
|
|
76
|
+
- Rule-based data validation
|
|
77
|
+
- Automated data quality scoring
|
|
78
|
+
- Configurable data cleaning
|
|
79
|
+
- Column name standardization
|
|
80
|
+
- Support for Pandas DataFrames
|
|
81
|
+
- CSV and Excel file support
|
|
82
|
+
- JSON and HTML report generation
|
|
83
|
+
- Save cleaned datasets directly to disk
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
Install Datra from PyPI:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install datra
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or install the latest development version:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/raphaelj1/datra.git
|
|
97
|
+
|
|
98
|
+
cd datra
|
|
99
|
+
|
|
100
|
+
pip install -e .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Quick Start
|
|
104
|
+
|
|
105
|
+
### Clean a dataset
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from datra import clean
|
|
109
|
+
|
|
110
|
+
cleaned = clean(
|
|
111
|
+
"patients.csv",
|
|
112
|
+
drop_duplicates=True,
|
|
113
|
+
fill_numeric="median",
|
|
114
|
+
fill_categorical="mode",
|
|
115
|
+
standardize_columns=True,
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Audit a dataset
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from datra import Audit
|
|
123
|
+
|
|
124
|
+
audit = Audit("patients.csv")
|
|
125
|
+
|
|
126
|
+
print(audit.profile)
|
|
127
|
+
print(audit.score)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Validate a dataset
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
rules = {
|
|
134
|
+
"Age": {
|
|
135
|
+
"min": 0,
|
|
136
|
+
"max": 120,
|
|
137
|
+
},
|
|
138
|
+
"Gender": {
|
|
139
|
+
"allowed": [
|
|
140
|
+
"Male",
|
|
141
|
+
"Female",
|
|
142
|
+
],
|
|
143
|
+
},
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
report = audit.validate(rules)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Cleaning Data
|
|
150
|
+
|
|
151
|
+
The `clean()` function applies one or more cleaning operations to a dataset and returns a new DataFrame. It accepts either a Pandas DataFrame or the path to a CSV or Excel file.
|
|
152
|
+
|
|
153
|
+
### Using keyword arguments
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from datra import clean
|
|
157
|
+
|
|
158
|
+
cleaned = clean(
|
|
159
|
+
"patients.csv",
|
|
160
|
+
drop_duplicates=True,
|
|
161
|
+
fill_numeric="median",
|
|
162
|
+
fill_categorical="mode",
|
|
163
|
+
standardize_columns=True,
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Using cleaning rules
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
rules = {
|
|
171
|
+
"duplicates": {
|
|
172
|
+
"drop": True,
|
|
173
|
+
},
|
|
174
|
+
"missing": {
|
|
175
|
+
"numeric": "median",
|
|
176
|
+
"categorical": "mode",
|
|
177
|
+
},
|
|
178
|
+
"columns": {
|
|
179
|
+
"standardize": True,
|
|
180
|
+
},
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
cleaned = clean("patients.csv", rules=rules)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Save the cleaned dataset
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
clean(
|
|
190
|
+
"patients.csv",
|
|
191
|
+
drop_duplicates=True,
|
|
192
|
+
output="cleaned_patients.xlsx",
|
|
193
|
+
)
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Auditing Data
|
|
197
|
+
|
|
198
|
+
Create an audit object to inspect dataset quality.
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
from datra import Audit
|
|
202
|
+
|
|
203
|
+
audit = Audit("patients.csv")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
Retrieve individual quality checks.
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
audit.profile
|
|
210
|
+
|
|
211
|
+
audit.completeness
|
|
212
|
+
|
|
213
|
+
audit.uniqueness
|
|
214
|
+
|
|
215
|
+
audit.outliers
|
|
216
|
+
|
|
217
|
+
audit.score
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Or access all audit results at once.
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
audit.results
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Validation
|
|
227
|
+
|
|
228
|
+
Validate datasets against custom business rules.
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
rules = {
|
|
232
|
+
"Age": {
|
|
233
|
+
"min": 0,
|
|
234
|
+
"max": 120,
|
|
235
|
+
},
|
|
236
|
+
"Patient ID": {
|
|
237
|
+
"unique": True,
|
|
238
|
+
},
|
|
239
|
+
"Gender": {
|
|
240
|
+
"allowed": [
|
|
241
|
+
"Male",
|
|
242
|
+
"Female",
|
|
243
|
+
],
|
|
244
|
+
},
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
report = audit.validate(rules)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
Validation returns a structured report describing which checks passed, which failed, and the number of violations for each rule.
|
|
251
|
+
|
|
252
|
+
## Reports
|
|
253
|
+
|
|
254
|
+
Build a data quality report as a Python dictionary.
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
from datra import Audit
|
|
258
|
+
|
|
259
|
+
audit = Audit("patients.csv")
|
|
260
|
+
|
|
261
|
+
report = audit.build_report()
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Save the report as JSON.
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
audit.save_report(
|
|
268
|
+
format="json",
|
|
269
|
+
)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
Or save it as an HTML report.
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
audit.save_report(
|
|
276
|
+
format="html",
|
|
277
|
+
)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## Supported File Formats
|
|
281
|
+
|
|
282
|
+
Datra supports both pandas DataFrames and common tabular file formats.
|
|
283
|
+
|
|
284
|
+
| Input | Supported |
|
|
285
|
+
| ---------------- | --------- |
|
|
286
|
+
| pandas DataFrame | ✅ |
|
|
287
|
+
| CSV | ✅ |
|
|
288
|
+
| Excel (.xlsx) | ✅ |
|
|
289
|
+
| Excel (.xls) | ✅ |
|
|
290
|
+
|
|
291
|
+
### Report Formats
|
|
292
|
+
|
|
293
|
+
| Format | Supported |
|
|
294
|
+
| ------ | ---------- |
|
|
295
|
+
| JSON | ✅ |
|
|
296
|
+
| HTML | ✅ |
|
|
297
|
+
| PDF | 🚧 Planned |
|
|
298
|
+
|
|
299
|
+
## Project Structure
|
|
300
|
+
|
|
301
|
+
```
|
|
302
|
+
datra/
|
|
303
|
+
├── datra/ # Library source code
|
|
304
|
+
├── examples/ # Example usage
|
|
305
|
+
├── tests/
|
|
306
|
+
├── pyproject.toml
|
|
307
|
+
├── README.md
|
|
308
|
+
└── LICENSE
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## Roadmap
|
|
312
|
+
|
|
313
|
+
Planned improvements include:
|
|
314
|
+
|
|
315
|
+
- PDF report generation
|
|
316
|
+
- Command-line interface (CLI)
|
|
317
|
+
- Additional cleaning operations
|
|
318
|
+
- Additional validation rules
|
|
319
|
+
- More data quality checks
|
|
320
|
+
- Interactive HTML reports
|
|
321
|
+
- Support for additional file formats
|
|
322
|
+
|
|
323
|
+
## Contributing
|
|
324
|
+
|
|
325
|
+
Contributions, feature requests, and bug reports are welcome.
|
|
326
|
+
|
|
327
|
+
If you would like to contribute:
|
|
328
|
+
|
|
329
|
+
1. Fork the repository.
|
|
330
|
+
2. Create a new feature branch.
|
|
331
|
+
3. Commit your changes.
|
|
332
|
+
4. Open a pull request.
|
|
333
|
+
|
|
334
|
+
Please ensure all tests pass before submitting a pull request.
|
|
335
|
+
|
|
336
|
+
## License
|
|
337
|
+
|
|
338
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
datra/__init__.py,sha256=1hClFgDmV-XDtPolfJHiIYsnkiSXCnoWLxKmk-FQZiw,84
|
|
2
|
+
datra/audit.py,sha256=6R13Pn4gDvDcNEr0cs2s4UdwcDU3m7_Ph1lxQZDE7_A,1813
|
|
3
|
+
datra/cleaner.py,sha256=eBLk_aKj6Ot-VKwTRUV7m-l9rZbNhwW0WWMKcU8sKYs,1850
|
|
4
|
+
datra/defaults.py,sha256=nMpeF2HUm72zBfnFxWfBAKf_uZEr5zkL-xqim1kfePI,219
|
|
5
|
+
datra/checks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
datra/checks/completeness.py,sha256=Mmj2WpVvr6BtoGd7s5ZIPeEFKRPhGoXhV_Y5IJf4QzA,433
|
|
7
|
+
datra/checks/outliers.py,sha256=sSui1_-nFxRNEC2jnuKn3jrkDn6lhFSNcI9ZDm2e3mI,906
|
|
8
|
+
datra/checks/profile.py,sha256=FjTwEFHjLh0DdhJkQoIaGOfpsmZfkqFJ-fzFJACX8Z4,1335
|
|
9
|
+
datra/checks/uniqueness.py,sha256=O8nLFvQzj7ryEn_9kxcBqEXn3yUATHmSpKco6GTaUvQ,512
|
|
10
|
+
datra/checks/validate.py,sha256=aaCZvnLPt7txUMtyB_SxodKVa2mS047ChG9lVFjVkiI,3441
|
|
11
|
+
datra/cleaning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
datra/cleaning/columns.py,sha256=4YSTWOhxa0sVBZNyNP6Yq0ONz4rY7cEPMda8RSj9PxM,844
|
|
13
|
+
datra/cleaning/duplicates.py,sha256=pEOR_Zo4TnITPSck64ohmrFc0AWz0TTxjjEYBJ6YS5g,107
|
|
14
|
+
datra/cleaning/missing.py,sha256=yrtC265kicr7iogGXJsFv7jZb_uOq_-T1mHNaYx0INg,1598
|
|
15
|
+
datra/io/__init__.py,sha256=2gVWbsNF1AnyonLfvIANNxC0kKipQZf0gUv4KO2jsWk,76
|
|
16
|
+
datra/io/load.py,sha256=pWVexMzIfeVN3xvYClFLbO415_ctuodLtvz-B1FM4MA,718
|
|
17
|
+
datra/io/save.py,sha256=zB54SYGyjRjteyXj5k_02c1RlJIQZTL9fiesWJlS-s4,1053
|
|
18
|
+
datra/reports/__init__.py,sha256=n-8S8-Ru9tKkuCyKaXzsEerFCIb-wBh9taHAdTBu444,127
|
|
19
|
+
datra/reports/builder.py,sha256=Od1S7ozc4w1eeGyc5ZA8K2jTpMyIXX8xlN1fD-cguok,1058
|
|
20
|
+
datra/reports/exporter.py,sha256=uv0m1ELxH2y7qKQP-SZnlv0ihIxK396qDOCs8YIKT1A,791
|
|
21
|
+
datra/reports/templates.py,sha256=pzvTMmN1SYupLZ3J8A6bzRzyCUCXxBRCNxeRZAhLGf4,664
|
|
22
|
+
datra/scoring/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
|
+
datra/scoring/score.py,sha256=oOnZuKIWEkiaV5Ae_KMtg1Vl29Rkj9ttLMlXuyI31iU,1189
|
|
24
|
+
datra-0.1.0.dist-info/licenses/LICENSE,sha256=06G7x2RjMHSDF3YlizZYiWN6NKucfVoKIfIM9YLmZQQ,1091
|
|
25
|
+
datra-0.1.0.dist-info/METADATA,sha256=BGNss3O0t5tAj42HNxjvnc85YF4xvW-WzclbfR4IBjc,8036
|
|
26
|
+
datra-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
27
|
+
datra-0.1.0.dist-info/top_level.txt,sha256=tikJ6dyoHKVnA0xDo5hELjedEDb-NxP4IURNI1FQpaE,6
|
|
28
|
+
datra-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Raphael James
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datra
|