tidytable-core 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tidytable/__init__.py +19 -0
- tidytable/dedup/__init__.py +6 -0
- tidytable/dedup/engine.py +17 -0
- tidytable/merge/__init__.py +6 -0
- tidytable/merge/engine.py +34 -0
- tidytable/missing/__init__.py +7 -0
- tidytable/missing/engine.py +27 -0
- tidytable/parse/__init__.py +9 -0
- tidytable/parse/engine.py +73 -0
- tidytable/profile/__init__.py +9 -0
- tidytable/profile/engine.py +62 -0
- tidytable/reconcile/__init__.py +6 -0
- tidytable/reconcile/engine.py +38 -0
- tidytable/structural/__init__.py +7 -0
- tidytable/structural/engine.py +44 -0
- tidytable/xl/__init__.py +8 -0
- tidytable/xl/engine.py +56 -0
- tidytable_core-1.0.0.dist-info/METADATA +330 -0
- tidytable_core-1.0.0.dist-info/RECORD +22 -0
- tidytable_core-1.0.0.dist-info/WHEEL +5 -0
- tidytable_core-1.0.0.dist-info/licenses/LICENSE +0 -0
- tidytable_core-1.0.0.dist-info/top_level.txt +1 -0
tidytable/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from . import xl
|
|
2
|
+
from . import structural
|
|
3
|
+
from . import parse
|
|
4
|
+
from . import missing
|
|
5
|
+
from . import dedup
|
|
6
|
+
from . import merge
|
|
7
|
+
from . import reconcile
|
|
8
|
+
from . import profile
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"xl",
|
|
12
|
+
"structural",
|
|
13
|
+
"parse",
|
|
14
|
+
"missing",
|
|
15
|
+
"dedup",
|
|
16
|
+
"merge",
|
|
17
|
+
"reconcile",
|
|
18
|
+
"profile"
|
|
19
|
+
]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def absolute(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without rows that repeat an earlier row in every column.

    The first occurrence of each fully identical row is kept and the result
    is re-indexed from zero. The input frame is not modified.
    """
    unique_rows = df.drop_duplicates()
    return unique_rows.reset_index(drop=True)
|
|
6
|
+
|
|
7
|
+
def partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame:
    """Drop rows that share the same values in ``subset``, keeping one per key.

    Args:
        df: Frame to de-duplicate; it is not modified.
        subset: Columns that together identify a record.
        keep: ``"latest"`` or ``"earliest"`` pick the row by ``timestamp_col``;
            any other value (``"first"``, ``"last"``, ``False``) is handed to
            ``DataFrame.drop_duplicates`` unchanged and uses the existing row
            order.
        timestamp_col: Column used for ordering. Required for
            ``"latest"``/``"earliest"`` and ignored otherwise.

    Returns:
        A new frame re-indexed from zero. For ``"latest"``/``"earliest"`` the
        rows come back ordered by ``timestamp_col``.

    Raises:
        ValueError: If ``keep`` is ``"latest"``/``"earliest"`` and no
            ``timestamp_col`` is given.
    """
    time_ordered = keep in ("latest", "earliest")
    if time_ordered and not timestamp_col:
        raise ValueError("timestamp_col must be specified when keeping latest/earliest items.")

    if not time_ordered:
        # Positional policies must see the caller's row order; sorting by
        # timestamp here would silently change which row counts as "first".
        return df.drop_duplicates(subset=subset, keep=keep).reset_index(drop=True)

    # A stable sort keeps rows with equal timestamps in their original
    # relative order, so tie-breaking is deterministic.
    ordered = df.sort_values(by=timestamp_col, ascending=(keep == "earliest"), kind="stable")
    return ordered.drop_duplicates(subset=subset, keep="first").reset_index(drop=True)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from rapidfuzz import process
|
|
3
|
+
|
|
4
|
+
def fuzzy_vlookup(left_df: pd.DataFrame, right_df: pd.DataFrame, left_on: str, right_on: str, threshold: float = 0.85) -> pd.DataFrame:
    """Left-join two frames on keys that only need to be similar, not equal.

    Every distinct non-null left key (as text) is scored against the non-null
    right keys with ``process.extractOne``; a key is linked to the returned
    candidate when one clears ``threshold * 100``. Left rows without a link
    keep their columns and get missing values for the right-hand columns.

    Args:
        left_df: Frame whose rows are all preserved.
        right_df: Lookup frame.
        left_on: Key column in ``left_df``.
        right_on: Key column in ``right_df``.
        threshold: Minimum similarity, multiplied by 100 before being passed
            as ``score_cutoff``.

    Returns:
        The merged frame with the temporary join column removed.
    """
    join_col = "_fuzzy_key_join"
    left = left_df.copy()
    right = right_df.copy()

    candidates = right[right_on].dropna().astype(str).tolist()

    # Resolve each distinct left key once, then broadcast through ``map``.
    resolved = {}
    for query in left[left_on].dropna().astype(str).unique():
        hit = process.extractOne(query, candidates, score_cutoff=threshold * 100)
        if hit:
            resolved[query] = hit[0]

    left[join_col] = left[left_on].astype(str).map(resolved)
    right[join_col] = right[right_on].astype(str)

    joined = pd.merge(left, right, on=join_col, how="left")
    return joined.drop(columns=[join_col])
|
|
22
|
+
|
|
23
|
+
def join_diagnose(left_df: pd.DataFrame, right_df: pd.DataFrame, left_on: str, right_on: str) -> dict:
    """Summarise how well two key columns line up before an exact join.

    Returns:
        A dict with the number of distinct non-null keys on each side, the
        number of left keys that have no exact counterpart on the right, and
        whether both key columns share the same dtype.
    """
    left_col = left_df[left_on]
    right_col = right_df[right_on]

    left_keys = set(left_col.dropna().unique())
    right_keys = set(right_col.dropna().unique())

    report = {
        "left_total_unique_keys": len(left_keys),
        "right_total_unique_keys": len(right_keys),
        "unmatched_left_keys_count": len(left_keys.difference(right_keys)),
        "type_alignment_match": left_col.dtype == right_col.dtype,
    }
    return report
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
def drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame:
    """Drop columns whose share of missing values exceeds ``threshold``.

    Args:
        df: Frame to prune; it is not modified.
        threshold: Largest tolerated fraction of missing values per column.
            A column sitting exactly at the threshold is kept.

    Returns:
        A new frame containing only the columns that pass.
    """
    if len(df) == 0:
        # No rows means no measurable missing ratio; keep every column.
        return df.copy()

    # Compare the ratio directly. Converting the threshold into a truncated
    # row count lets columns slightly over the limit slip through and is
    # sensitive to float rounding such as (1 - 0.9) * 10 == 0.999...
    missing_ratio = df.isna().mean()
    keep_mask = (missing_ratio <= threshold).to_numpy()
    return df.loc[:, keep_mask]
|
|
7
|
+
|
|
8
|
+
def flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Add a boolean ``<column>_is_missing`` companion for each listed column.

    Names in ``columns`` that are not present in the frame are skipped
    without error. The input frame is left untouched; a copy is returned.
    """
    flagged = df.copy()
    for name in columns:
        if name not in flagged.columns:
            continue
        flagged[f"{name}_is_missing"] = flagged[name].isna()
    return flagged
|
|
15
|
+
|
|
16
|
+
def impute(series: pd.Series, strategy: str = "median") -> pd.Series:
    """Fill missing entries of ``series`` and return the filled copy.

    Args:
        series: Values to fill; not modified.
        strategy: ``"mean"``, ``"median"`` or ``"mode"`` select the matching
            statistic of the series as the fill value. Any other value is
            used as the literal replacement.

    Returns:
        A new series with missing entries replaced.
    """
    data = series.copy()

    if strategy == "mean":
        return data.fillna(data.mean())

    if strategy == "median":
        return data.fillna(data.median())

    if strategy == "mode":
        modes = data.mode()
        # An empty mode result leaves nothing to fill with.
        most_common = np.nan if modes.empty else modes[0]
        return data.fillna(most_common)

    # Anything else is treated as the explicit fill value itself.
    return data.fillna(strategy)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from dateutil import parser as date_parser
|
|
4
|
+
|
|
5
|
+
def financials(series: pd.Series) -> pd.Series:
    """Convert accounting-style text such as ``"$ (1,250.00)"`` or ``"12K"`` to floats.

    Handled per value:
      * thousands separators (``,``) and the symbols ``$``, ``€``, ``£`` are removed;
      * parentheses, a leading minus or a trailing minus mark a negative amount;
      * a trailing ``k``/``m``/``b`` (any case) scales by 1e3/1e6/1e9;
      * blanks, ``nan``/``none``/``null`` and unparseable text become NaN.

    Args:
        series: Raw values (text or numbers).

    Returns:
        The result of applying the conversion to every element.
    """
    null_tokens = {"nan", "none", "null"}
    suffix_scale = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}

    def _clean(val):
        if pd.isna(val):
            return np.nan
        s = str(val).strip().replace(",", "")
        if not s or s.lower() in null_tokens:
            return np.nan

        if s.lower() in ("inf", "-inf", "infinity"):
            return -np.inf if "-" in s else np.inf

        is_negative = "(" in s and ")" in s
        for symbol in ("$", "€", "£", "(", ")"):
            s = s.replace(symbol, "")
        s = s.strip()

        # Only a leading or trailing minus is a sign. A minus inside the
        # number belongs to an exponent ("1e-5") and must be left for float().
        if s.startswith("-") or s.endswith("-"):
            is_negative = True
            s = s.strip("-").strip()

        if not s:
            return np.nan

        multiplier = suffix_scale.get(s[-1].lower(), 1)
        if multiplier != 1:
            s = s[:-1]

        try:
            num = float(s) * multiplier
        except ValueError:
            return np.nan
        return -num if is_negative else num

    return series.apply(_clean)
|
|
40
|
+
|
|
41
|
+
def dates(series: pd.Series, dayfirst: bool = False) -> pd.Series:
    """Parse loosely formatted date values element by element.

    Args:
        series: Raw values (text, dates, datetimes, or missing).
        dayfirst: Passed to ``dateutil``'s parser to resolve ambiguous
            day/month order in text values.

    Returns:
        The series produced by applying the parser to each element. Missing
        values, the tokens ``""``/``NaT``/``NaN``/``None``/``null`` and text
        that cannot be parsed become ``pd.NaT``.
    """
    # Local import keeps this fix self-contained within the function.
    from datetime import date, datetime

    null_tokens = ("", "NaT", "NaN", "None", "null")

    def _parse(val):
        if pd.isna(val) or str(val).strip() in null_tokens:
            return pd.NaT
        # Values that are already date-like must not take a round trip
        # through text: their ISO rendering is re-read with ``dayfirst`` and
        # day and month can be swapped.
        if isinstance(val, datetime):
            return val
        if isinstance(val, date):
            return datetime(val.year, val.month, val.day)
        try:
            return date_parser.parse(str(val), dayfirst=dayfirst)
        except (ValueError, OverflowError, TypeError):
            return pd.NaT

    return series.apply(_parse)
|
|
50
|
+
|
|
51
|
+
def handle_formula_ghosts(series: pd.Series, error_strategy: str = "coerce") -> pd.Series:
    """Replace Excel formula error markers (``#REF!``, ``#N/A``, ...) with NaN.

    Matching ignores surrounding whitespace and letter case; every other
    value is returned unchanged.

    Args:
        series: Values that may contain Excel error strings.
        error_strategy: Accepted for interface compatibility; this
            implementation does not read it.

    Returns:
        The series with recognised error markers replaced by NaN.
    """
    ghost_strings = ["#VALUE!", "#REF!", "#DIV/0!", "#NAME?", "#N/A", "#NUM!", "#NULL!"]

    def _blank_if_ghost(cell):
        token = str(cell).strip().upper()
        if token in ghost_strings:
            return np.nan
        return cell

    return series.apply(_blank_if_ghost)
|
|
55
|
+
|
|
56
|
+
def repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series:
    """Turn identifier values back into zero-padded text codes.

    Args:
        series: Identifier values, possibly read as floats (``401.0``).
        pad_length: When truthy, results are left-padded with zeros to this
            width via ``str.zfill``.

    Returns:
        A series of strings. Missing or blank values become ``""``. Finite
        floats are rendered as their integer part; for any other value the
        text after the first ``.`` is dropped.
    """
    def _pad(val):
        if pd.isna(val) or str(val).strip() in ["", "nan", "NaN"]:
            return ""
        if isinstance(val, float) and np.isfinite(val):
            # str() switches to exponent notation for large or tiny floats
            # ("1e+16"), which would corrupt the code; format via int instead.
            s = str(int(val))
        else:
            s = str(val).split(".")[0].strip()
        return s.zfill(pad_length) if pad_length else s

    return series.apply(_pad)
|
|
63
|
+
|
|
64
|
+
def boolean(series: pd.Series) -> pd.Series:
    """Map yes/no style indicators to ``True``/``False``.

    Values are converted to text, stripped and lower-cased before lookup.
    Recognised truthy tokens: ``y, yes, 1, 1.0, t, true``; falsy tokens:
    ``n, no, 0, 0.0, f, false``. Anything else maps to NaN.

    Args:
        series: Raw indicator values.

    Returns:
        A series holding ``True``, ``False`` or NaN per element.
    """
    truth_map = {
        "y": True, "yes": True, "1": True, "t": True, "true": True,
        "n": False, "no": False, "0": False, "f": False, "false": False,
        # Flag columns that contain gaps are read as floats, so 1 and 0
        # arrive here as the text "1.0" and "0.0".
        "1.0": True, "0.0": False,
    }
    normalized = series.astype(str).str.strip().str.lower()
    return normalized.map(truth_map)
|
|
72
|
+
|
|
73
|
+
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
def blueprint(df: pd.DataFrame) -> dict:
    """Describe a frame as a plain, JSON-friendly dictionary.

    Returns:
        A dict with three entries: ``"shape"`` (row and column counts),
        ``"dtypes"`` (column label as text -> dtype name) and
        ``"null_counts"`` (column label as text -> number of missing values).
    """
    n_rows, n_cols = df.shape
    dtype_names = {str(label): str(dtype) for label, dtype in df.dtypes.items()}
    missing_per_column = {str(label): int(count) for label, count in df.isna().sum().items()}

    return {
        "shape": {"rows": n_rows, "columns": n_cols},
        "dtypes": dtype_names,
        "null_counts": missing_per_column,
    }
|
|
12
|
+
|
|
13
|
+
def check_anomalies(df: pd.DataFrame) -> list[str]:
    """Report columns that contain text placeholders standing in for nulls.

    Each non-null value is converted to text, stripped and lower-cased, then
    compared with the placeholders ``?``, ``n/a``, ``none``, ``-``, ``nan``
    and ``null``. Values that are already missing are not reported.

    Args:
        df: Frame to scan.

    Returns:
        One message per affected column, listing the placeholders found.
    """
    placeholders = {"?", "n/a", "none", "-", "nan", "null"}
    alerts = []

    for col in df.columns:
        # Drop real nulls first: once stringified they read "nan"/"none" and
        # would be reported as if they were unhandled text.
        present = df[col].dropna()
        normalized = present.astype(str).str.strip().str.lower()
        found = normalized[normalized.isin(placeholders)].unique()
        if len(found) > 0:
            alerts.append(f"Column '{col}' contains unhandled string anomalies: {list(found)}")

    return alerts
|
|
25
|
+
|
|
26
|
+
def audit_report(df: pd.DataFrame, output: str = "cli") -> str:
    """Render the frame's blueprint as a multi-line text report.

    Args:
        df: Frame to summarise.
        output: Accepted for interface compatibility; this implementation
            does not read it.

    Returns:
        The report text: a banner, the final dimensions, then one line per
        column with its dtype and missing-value count.
    """
    bp = blueprint(df)
    shape = bp["shape"]

    lines = [
        "=== TIDYTABLE DATA CLEANING AUDIT REPORT ===",
        f"Final Dimensions: {shape['rows']} rows x {shape['columns']} columns",
        "Column Composition and Null Status Logs:",
    ]
    lines.extend(
        f" - {col} ({dtype}): Missing values: {bp['null_counts'][col]}"
        for col, dtype in bp["dtypes"].items()
    )
    # Every line, including the last, is newline-terminated.
    return "".join(f"{line}\n" for line in lines)
|
|
35
|
+
|
|
36
|
+
def pin_schema(df: pd.DataFrame, path: str) -> None:
    """Write the frame's blueprint to ``path`` as indented JSON.

    The blueprint is computed before the file is opened; an existing file at
    ``path`` is overwritten.
    """
    snapshot = blueprint(df)
    with open(path, 'w') as handle:
        json.dump(snapshot, handle, indent=4)
|
|
41
|
+
|
|
42
|
+
def validate(df: pd.DataFrame, schema_path: str) -> bool:
    """Check a frame's columns and dtypes against a saved blueprint file.

    Args:
        df: Frame to check.
        schema_path: JSON file whose ``"dtypes"`` entry maps column names to
            dtype names.

    Returns:
        ``True`` when every column and dtype matches.

    Raises:
        ValueError: If the set of column names differs from the saved one.
        TypeError: If a column's dtype differs, except that a current
            ``object`` column is accepted where ``string`` was saved.
    """
    with open(schema_path, 'r') as f:
        saved_schema = json.load(f)

    saved_types = saved_schema["dtypes"]
    # JSON keys are always text, so compare on stringified labels; raw
    # non-string labels (e.g. integer column names) could never match.
    current_types = {str(col): str(dtype) for col, dtype in df.dtypes.items()}

    current_cols = set(current_types)
    saved_cols = set(saved_types.keys())

    if current_cols != saved_cols:
        missing = saved_cols - current_cols
        extra = current_cols - saved_cols
        raise ValueError(f"Schema mismatch. Missing expected columns: {missing}. Unexpected extra columns: {extra}")

    for col, curr_type in current_types.items():
        saved_type = saved_types[col]
        # An object column is tolerated where the schema recorded "string".
        if curr_type != saved_type and not (curr_type == "object" and saved_type == "string"):
            raise TypeError(f"Type validation alert on '{col}': Expected {saved_type}, found {curr_type}")

    return True
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def sheet_diff(df_old: pd.DataFrame, df_new: pd.DataFrame, key_column: str) -> dict[str, pd.DataFrame]:
    """Compare two versions of a table keyed by ``key_column``.

    Args:
        df_old: Earlier version.
        df_new: Later version.
        key_column: Column that uniquely identifies a record in both frames.

    Returns:
        A dict with three frames: ``"Added"`` (rows whose key exists only in
        ``df_new``), ``"Dropped"`` (rows whose key exists only in ``df_old``)
        and ``"Modified"`` (one row per shared key with changed values,
        holding the key plus ``<col>_old``/``<col>_new`` for each changed
        column).

    Raises:
        ValueError: If ``key_column`` contains duplicate values in either frame.
    """
    old_idx = df_old.set_index(key_column)
    new_idx = df_new.set_index(key_column)

    # A duplicated key makes ``.loc[key]`` return several rows, so a
    # row-to-row comparison is undefined; fail with a clear message.
    for label, frame in (("df_old", old_idx), ("df_new", new_idx)):
        if frame.index.has_duplicates:
            raise ValueError(f"key_column '{key_column}' contains duplicate values in {label}")

    added_keys = new_idx.index.difference(old_idx.index)
    dropped_keys = old_idx.index.difference(new_idx.index)
    shared_keys = old_idx.index.intersection(new_idx.index)

    added_df = new_idx.loc[added_keys].reset_index()
    dropped_df = old_idx.loc[dropped_keys].reset_index()

    modified_rows = []
    for key in shared_keys:
        old_row = old_idx.loc[key]
        new_row = new_idx.loc[key]
        if old_row.equals(new_row):
            continue

        # Missing values never compare equal to each other, so cells that
        # are null on both sides must be excluded from the change mask.
        both_missing = old_row.isna() & new_row.isna()
        diff_mask = (old_row != new_row) & ~both_missing

        row_data = {key_column: key}
        for col in old_idx.columns:
            if diff_mask.get(col, False):
                row_data[f"{col}_old"] = old_row[col]
                row_data[f"{col}_new"] = new_row[col]

        # ``equals`` can be False without any cell differing (e.g. dtype
        # only); such rows carry no change and are not reported.
        if len(row_data) > 1:
            modified_rows.append(row_data)

    return {
        "Added": added_df,
        "Dropped": dropped_df,
        "Modified": pd.DataFrame(modified_rows),
    }
|
|
34
|
+
|
|
35
|
+
def align_schemas(df_old: pd.DataFrame, df_new: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Restrict both frames to the columns they share, in ``df_old``'s order.

    Returns:
        Copies of ``df_old`` and ``df_new`` limited to the common columns.
    """
    common = []
    for name in df_old.columns:
        if name in df_new.columns:
            common.append(name)

    old_aligned = df_old[common].copy()
    new_aligned = df_new[common].copy()
    return old_aligned, new_aligned
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import re
|
|
3
|
+
from rapidfuzz import process, utils
|
|
4
|
+
|
|
5
|
+
def rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column labels.

    With ``style="snake_case"`` each label is converted to text, stripped of
    punctuation, has dashes and whitespace runs collapsed into single
    underscores, and is lower-cased. For any other ``style`` the labels are
    only converted to text and stripped of surrounding whitespace.
    """
    def _normalize(label):
        text = str(label).strip()
        if style != "snake_case":
            return text
        text = re.sub(r'[^\w\s-]', '', text)   # remove punctuation
        text = text.replace('-', '_')          # dashes become underscores
        text = re.sub(r'[\s_]+', '_', text)    # collapse separators
        return text.strip('_').lower()

    renamed = df.copy()
    renamed.columns = [_normalize(label) for label in renamed.columns]
    return renamed
|
|
21
|
+
|
|
22
|
+
def strip_whitespace(series: pd.Series) -> pd.Series:
    """Collapse whitespace runs and trim both ends of every non-null value.

    Tabs, newlines and non-breaking spaces (U+00A0) count as whitespace.
    Non-null values are converted to text; missing values are passed through
    unchanged.

    Args:
        series: Values to clean.

    Returns:
        A new series with cleaned text.
    """
    def _squash(cell):
        # Test for null BEFORE stringifying, otherwise a missing value turns
        # into the literal text "nan"/"None".
        if pd.isna(cell):
            return cell
        return re.sub(r'\s+', ' ', str(cell)).strip()

    return series.apply(_squash)
|
|
25
|
+
|
|
26
|
+
def standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series:
    """Collapse spelling variants of a category into one label.

    Args:
        series: Category values.
        mapping: ``{canonical: [variant, ...]}``. When given it takes
            precedence; each value (as stripped text) found among the
            variants is replaced by its canonical label, others are kept.
        auto_cluster: When true and no mapping is given, distinct values are
            grouped by fuzzy similarity (cutoff 85) and every member of a
            group is replaced by the first value that seeded it.

    Returns:
        A new series; an unchanged copy when neither option applies.
    """
    s = series.copy()

    if mapping:
        reverse_map = {variant: key for key, variants in mapping.items() for variant in variants}
        return s.apply(lambda x: reverse_map.get(str(x).strip(), x))

    if auto_cluster:
        unique_vals = [str(v).strip() for v in s.dropna().unique()]
        processed_map = {}
        for val in unique_vals:
            if val in processed_map:
                continue
            # limit=None: the default limit would cap the group at the top
            # few matches and leave the remaining variants unmerged.
            matches = process.extract(val, unique_vals, score_cutoff=85.0, limit=None)
            for match in matches:
                # First assignment wins, so a value already placed in an
                # earlier group is not pulled into a later one.
                processed_map.setdefault(match[0], val)
        return s.apply(lambda x: processed_map.get(str(x).strip(), x) if pd.notna(x) else x)

    return s
|
tidytable/xl/__init__.py
ADDED
tidytable/xl/engine.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import openpyxl
|
|
4
|
+
|
|
5
|
+
def load_workbook(file_path: str) -> dict[str, pd.DataFrame]:
    """Read every sheet of an Excel workbook.

    Returns:
        The mapping produced by ``pd.read_excel(..., sheet_name=None)``:
        sheet name -> DataFrame.
    """
    sheets = pd.read_excel(file_path, sheet_name=None)
    return sheets
|
|
8
|
+
|
|
9
|
+
def unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame:
    """Fill gaps by carrying the previous value forward.

    Args:
        sheet_data: Sheet contents; not modified.
        strategy: ``"ffill"`` carries values down each column, ``"lfill"``
            carries them left-to-right along each row.

    Returns:
        A filled copy of ``sheet_data``.

    Raises:
        ValueError: For any other ``strategy``.
    """
    if strategy not in ("ffill", "lfill"):
        raise ValueError("Strategy must be either 'ffill' or 'lfill'")

    working = sheet_data.copy()
    fill_axis = 0 if strategy == "ffill" else 1
    return working.ffill(axis=fill_axis)
|
|
18
|
+
|
|
19
|
+
def sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]:
    """Guess which row holds the column headers.

    The first ``scan_rows`` rows are inspected and the one with the most
    non-null cells wins; on a tie the earliest row is chosen.

    Args:
        sheet_data: Raw sheet contents.
        scan_rows: Number of leading rows to inspect.

    Returns:
        ``(position, headers)`` where ``position`` is the zero-based row
        position (independent of the frame's index labels) and ``headers``
        are that row's cells as stripped text. An empty sheet yields
        ``(0, [])``.
    """
    if len(sheet_data) == 0:
        return 0, []

    # Work with positions throughout: index labels need not be 0..n-1, so
    # they cannot be fed to ``iloc``.
    non_null_counts = sheet_data.head(scan_rows).notna().sum(axis=1).to_numpy()
    # argmax returns the first maximum, i.e. the earliest best row.
    best_pos = int(non_null_counts.argmax()) if non_null_counts.size else 0

    detected_headers = [str(val).strip() for val in sheet_data.iloc[best_pos].tolist()]
    return best_pos, detected_headers
|
|
33
|
+
|
|
34
|
+
def split_multi_tables(sheet_data: pd.DataFrame) -> list[pd.DataFrame]:
    """Split a sheet into side-by-side tables separated by fully empty columns.

    Returns:
        ``[sheet_data]`` itself when no column is entirely null. Otherwise
        one frame per run of columns between empty ones, each with its
        all-null rows dropped and its index reset; runs that end up empty are
        omitted.
    """
    blank_positions = np.where(sheet_data.isna().all(axis=0))[0]
    if len(blank_positions) == 0:
        return [sheet_data]

    # Bracket the separators with virtual ones just outside the frame so
    # that every run of columns lies strictly between two neighbouring edges.
    edges = [-1, *blank_positions.tolist(), sheet_data.shape[1]]

    tables = []
    for left_edge, right_edge in zip(edges, edges[1:]):
        first, stop = left_edge + 1, right_edge
        if first >= stop:
            # Adjacent separators (or a separator at the border): no columns.
            continue
        block = sheet_data.iloc[:, first:stop].dropna(how='all')
        if block.empty:
            continue
        tables.append(block.reset_index(drop=True))

    return tables
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tidytable-core
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An ecosystem-style explicit data cleaning framework for Excel and CSV pipelines.
|
|
5
|
+
Author-email: Aayush Vijay <aayushvj8699@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/aayushvijay/tidytable
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: pandas>=2.0.0
|
|
15
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
16
|
+
Requires-Dist: python-dateutil>=2.8.2
|
|
17
|
+
Requires-Dist: rapidfuzz>=3.0.0
|
|
18
|
+
Dynamic: license-file
|
|
19
|
+
|
|
20
|
+
Here is the raw Markdown block. You can copy everything inside this block and paste it directly into your `README.md` file:
|
|
21
|
+
|
|
22
|
+
```markdown
|
|
23
|
+
# tidytable-core 🧹
|
|
24
|
+
|
|
25
|
+
An ecosystem-style, explicit data cleaning framework built for data analysts who bridge the gap between messy, human-formatted Excel/CSV spreadsheets and production-ready Python data structures.
|
|
26
|
+
|
|
27
|
+
Unlike "black-box" cleaning scripts that automatically change your underlying values, `tidytable` forces an **explicit pipeline paradigm**. Each transformation engine is completely decoupled, granting you absolute control and step-by-step data lineage tracking.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 🚀 Installation & System Configuration
|
|
32
|
+
|
|
33
|
+
Install `tidytable-core` globally from the Python Package Index (PyPI):
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install tidytable-core
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Core External System Dependencies
|
|
41
|
+
|
|
42
|
+
* **`pandas`**: Core tabular dataframe manipulation engine.
|
|
43
|
+
* **`openpyxl`**: Reader/writer engine for modern `.xlsx` workbooks.
|
|
44
|
+
* **`python-dateutil`**: Flexible parser for date and time strings written in mixed formats.
|
|
45
|
+
* **`rapidfuzz`**: High-performance fuzzy string matching based on Levenshtein-style similarity scores.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 🏗️ Architectural Execution Pipeline
|
|
50
|
+
|
|
51
|
+
Data flows sequentially through your explicitly invoked processing domains:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
[ Messy Spreadsheet File ]
|
|
55
|
+
│
|
|
56
|
+
▼
|
|
57
|
+
┌───────────────────────┐
|
|
58
|
+
│ tidytable.xl │ ──► Resolves unaligned human-formatted layout grids.
|
|
59
|
+
└───────────┬───────────┘
|
|
60
|
+
│ (Tabular Data Stream)
|
|
61
|
+
▼
|
|
62
|
+
┌───────────────────────┐
|
|
63
|
+
│ tidytable.structural │ ──► Standardizes variable labels and strips whitespace.
|
|
64
|
+
└───────────┬───────────┘
|
|
65
|
+
│
|
|
66
|
+
▼
|
|
67
|
+
┌───────────────────────┐
|
|
68
|
+
│ tidytable.parse │ ──► Casts values safely into clean data types.
|
|
69
|
+
└───────────┬───────────┘
|
|
70
|
+
│
|
|
71
|
+
▼
|
|
72
|
+
┌───────────────────────┐
|
|
73
|
+
│ tidytable.missing │ ──► Drops empty panels and applies imputation profiles.
|
|
74
|
+
└───────────┬───────────┘
|
|
75
|
+
│
|
|
76
|
+
▼
|
|
77
|
+
┌───────────────────────┐
|
|
78
|
+
│ tidytable.dedup │ ──► Filters duplicate records safely.
|
|
79
|
+
└───────────┬───────────┘
|
|
80
|
+
│
|
|
81
|
+
▼
|
|
82
|
+
┌───────────────────────┐
|
|
83
|
+
│ tidytable.profile │ ──► Validates constraints and generates audit ledger logs.
|
|
84
|
+
└───────────┬───────────┘
|
|
85
|
+
│
|
|
86
|
+
▼
|
|
87
|
+
[ Pristine DataFrame ]
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## 📚 Complete Sub-Library Blueprint Reference
|
|
94
|
+
|
|
95
|
+
### 1. `tidytable.xl` (The Excel Surgeon)
|
|
96
|
+
|
|
97
|
+
Extracts pristine data grids from visually stylized worksheets.
|
|
98
|
+
|
|
99
|
+
#### `xl.load_workbook(file_path: str) -> dict[str, pd.DataFrame]`
|
|
100
|
+
|
|
101
|
+
* **Use**: Loads an entire workbook into memory.
|
|
102
|
+
* **Arguments**: `file_path` (*str*): System destination path pointing to an Excel document.
|
|
103
|
+
|
|
104
|
+
#### `xl.unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame`
|
|
105
|
+
|
|
106
|
+
* **Use**: Flattens merged cells and fills empty fields down or across so rows stay linked.
|
|
107
|
+
* **Arguments**:
|
|
108
|
+
* `sheet_data` (*DataFrame*): The input sheet table matrix.
|
|
109
|
+
* `strategy` (*str*): Direction constraint rule. `"ffill"` (forward fill down) or `"lfill"` (lateral fill across).
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
#### `xl.sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]`
|
|
114
|
+
|
|
115
|
+
* **Use**: Skips title banners and KPI cards to find where the actual table headers start.
|
|
116
|
+
* **Arguments**: `scan_rows` (*int*): Search depth row index limit.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
import tidytable as tt
|
|
120
|
+
|
|
121
|
+
sheets = tt.xl.load_workbook("sales_report.xlsx")
|
|
122
|
+
raw_data = sheets["Q2_Leads"]
|
|
123
|
+
|
|
124
|
+
# Sniff header index location and clean names
|
|
125
|
+
header_idx, headers = tt.xl.sniff_headers(raw_data, scan_rows=15)
|
|
126
|
+
|
|
127
|
+
# Unmerge and propagate values downward to form a database-style table
|
|
128
|
+
df = tt.xl.unmerge_and_fill(raw_data, strategy="ffill")
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
### 2. `tidytable.structural` (The Text Blacksmith)
|
|
135
|
+
|
|
136
|
+
Cleans and standardizes column structures and text anomalies.
|
|
137
|
+
|
|
138
|
+
#### `structural.rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame`
|
|
139
|
+
|
|
140
|
+
* **Use**: Converts columns (like `"Gross Profit (%)"`) into clean variables (`"gross_profit"`).
|
|
141
|
+
* **Arguments**: `style` (*str*): Re-casing format rules. Default is `"snake_case"`.
|
|
142
|
+
|
|
143
|
+
#### `structural.strip_whitespace(series: pd.Series) -> pd.Series`
|
|
144
|
+
|
|
145
|
+
* **Use**: Deep-strips leading/trailing spaces, tab breaks, and hidden non-breaking spaces (`\xa0`).
|
|
146
|
+
|
|
147
|
+
#### `structural.standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series`
|
|
148
|
+
|
|
149
|
+
* **Use**: Groups manual typos and naming variations into a single target category name.
|
|
150
|
+
* **Arguments**:
|
|
151
|
+
* `mapping` (*dict*): Manual dictionary rules map (e.g., `{"USA": ["usa", "U.S.A.", "us"]}`).
|
|
152
|
+
* `auto_cluster` (*bool*): Uses Levenshtein Distance to merge variations automatically.
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
df = tt.structural.rename_columns(df, style="snake_case")
|
|
158
|
+
df["product_name"] = tt.structural.strip_whitespace(df["product_name"])
|
|
159
|
+
|
|
160
|
+
# Merge regional text typos automatically using string distance clustering
|
|
161
|
+
df["region"] = tt.structural.standardize_categories(df["region"], auto_cluster=True)
|
|
162
|
+
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
### 3. `tidytable.parse` (The Type Whisperer)
|
|
168
|
+
|
|
169
|
+
Converts raw string text blocks into strict mathematical datatypes without crashing.
|
|
170
|
+
|
|
171
|
+
#### `parse.dates(series: pd.Series, dayfirst: bool = False) -> pd.Series`
|
|
172
|
+
|
|
173
|
+
* **Use**: Parses mixed date format variations in a single column into uniform ISO datetimes.
|
|
174
|
+
|
|
175
|
+
#### `parse.financials(series: pd.Series) -> pd.Series`
|
|
176
|
+
|
|
177
|
+
* **Use**: Extracts numeric values from accounting styles like `"$ (1,250.00)"` or `"12K"`.
|
|
178
|
+
|
|
179
|
+
#### `parse.repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series`
|
|
180
|
+
|
|
181
|
+
* **Use**: Restores dropped leading zeroes on data codes (e.g., converts float `401.0` back to `"00401"`).
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
df["invoice_date"] = tt.parse.dates(df["invoice_date"], dayfirst=False)
|
|
185
|
+
df["net_revenue"] = tt.parse.financials(df["net_revenue"])
|
|
186
|
+
df["zip_code"] = tt.parse.repair_identifiers(df["zip_code"], pad_length=5)
|
|
187
|
+
df["roi_metric"] = tt.parse.handle_formula_ghosts(df["roi_metric"], error_strategy="coerce")
|
|
188
|
+
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
### 4. `tidytable.missing` (The Ghost Hunter)
|
|
194
|
+
|
|
195
|
+
Identifies and resolves gaps in data matrices.
|
|
196
|
+
|
|
197
|
+
#### `missing.drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame`
|
|
198
|
+
|
|
199
|
+
* **Use**: Drops columns whose ratio of missing values exceeds `threshold`.
|
|
200
|
+
|
|
201
|
+
#### `missing.flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame`
|
|
202
|
+
|
|
203
|
+
* **Use**: Appends a binary companion indicator column (`{column}_is_missing`) to keep the data signal before running imputation.
|
|
204
|
+
|
|
205
|
+
#### `missing.impute(series: pd.Series, strategy: str = "median") -> pd.Series`
|
|
206
|
+
|
|
207
|
+
* **Use**: Fills null voids based on chosen statistical parameters (`"mean"`, `"median"`, `"mode"`).
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
df = tt.missing.drop_empty_cols(df, threshold=0.40)
|
|
211
|
+
df = tt.missing.flag_absence(df, columns=["customer_age"])
|
|
212
|
+
df["customer_age"] = tt.missing.impute(df["customer_age"], strategy="median")
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
### 5. `tidytable.dedup` (The Twin Eliminator)
|
|
219
|
+
|
|
220
|
+
Detects and drops duplicate entries across your rows.
|
|
221
|
+
|
|
222
|
+
#### `dedup.absolute(df: pd.DataFrame) -> pd.DataFrame`
|
|
223
|
+
|
|
224
|
+
* **Use**: Drops rows only if they match exactly across every single field.
|
|
225
|
+
|
|
226
|
+
#### `dedup.partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame`
|
|
227
|
+
|
|
228
|
+
* **Use**: Resolves record updates by keeping the earliest or latest transaction entry for a unique key.
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
# Clear absolute identical rows
|
|
232
|
+
df = tt.dedup.absolute(df)
|
|
233
|
+
|
|
234
|
+
# For matching customer IDs, keep only the record with the most recent update timestamp
|
|
235
|
+
df = tt.dedup.partial(df, subset=["customer_id"], keep="latest", timestamp_col="updated_at")
|
|
236
|
+
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
### 6. `tidytable.merge` (The VLOOKUP Bridge)
|
|
242
|
+
|
|
243
|
+
Joins separate files together even when keys are messy, incomplete, or slightly misspelled.
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
# Identify mismatched elements before executing joins
|
|
247
|
+
pre_flight = tt.merge.join_diagnose(left_df=leads_df, right_df=master_df, left_on="vendor", right_on="v_name")
|
|
248
|
+
|
|
249
|
+
# Join matching rows even if there are typos (e.g., matches "Apple Inc." to "Apple, Inc.")
|
|
250
|
+
joined_df = tt.merge.fuzzy_vlookup(leads_df, master_df, left_on="vendor", right_on="v_name", threshold=0.88)
|
|
251
|
+
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
### 7. `tidytable.reconcile` (The Ledger Auditor)
|
|
257
|
+
|
|
258
|
+
Automates version control checks between separate instances of the same file structure.
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
# Generate structural audits comparing January data against February data
|
|
262
|
+
ledger_updates = tt.reconcile.sheet_diff(df_old=jan_df, df_new=feb_df, key_column="transaction_id")
|
|
263
|
+
|
|
264
|
+
print("New rows added this month:", len(ledger_updates["Added"]))
|
|
265
|
+
print("Row modifications captured:", len(ledger_updates["Modified"]))
|
|
266
|
+
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
### 8. `tidytable.profile` (The Auditor & Schema Guard)
|
|
272
|
+
|
|
273
|
+
Handles file schema pinning, anomaly detection, and automated audit trails.
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
# Scan for raw strings masking null values (e.g., "?", "n/a", "-")
|
|
277
|
+
anomalies = tt.profile.check_anomalies(df)
|
|
278
|
+
|
|
279
|
+
# Validate current file structure against last month's blueprint to make sure scripts don't crash
|
|
280
|
+
if tt.profile.validate(df, schema_path="schemas/prod_blueprint.json"):
|
|
281
|
+
# Output file pipeline performance audit change log metrics
|
|
282
|
+
print(tt.profile.audit_report(df, output="cli"))
|
|
283
|
+
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## 🎯 Complete End-to-End Explicit Analyst Workflow
|
|
289
|
+
|
|
290
|
+
Here is a complete real-world script showing how an analyst runs a detailed cleaning pipeline manually:
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
import tidytable as tt
|
|
294
|
+
import pandas as pd
|
|
295
|
+
|
|
296
|
+
# Step 1: Layout Normalization
|
|
297
|
+
workbook = tt.xl.load_workbook("raw_factory_data.xlsx")
|
|
298
|
+
sheet_grid = workbook["Master_Log"]
|
|
299
|
+
df = tt.xl.unmerge_and_fill(sheet_grid, strategy="ffill")
|
|
300
|
+
|
|
301
|
+
# Step 2: Structural Column Cleaning
|
|
302
|
+
df = tt.structural.rename_columns(df, style="snake_case")
|
|
303
|
+
df["part_name"] = tt.structural.strip_whitespace(df["part_name"])
|
|
304
|
+
|
|
305
|
+
# Step 3: Type Safe Parsing
|
|
306
|
+
df["serial_id"] = tt.parse.repair_identifiers(df["serial_id"], pad_length=6)
|
|
307
|
+
df["cost"] = tt.parse.financials(df["cost"])
|
|
308
|
+
df["log_date"] = tt.parse.dates(df["log_date"])
|
|
309
|
+
|
|
310
|
+
# Step 4: Integrity and Row Refinement
|
|
311
|
+
df = tt.missing.flag_absence(df, columns=["efficiency_score"])
|
|
312
|
+
df["efficiency_score"] = tt.missing.impute(df["efficiency_score"], strategy="mean")
|
|
313
|
+
df = tt.dedup.absolute(df)
|
|
314
|
+
|
|
315
|
+
# Step 5: Verification & Schema Pinning
|
|
316
|
+
if tt.profile.validate(df, schema_path="schemas/factory_spec.json"):
|
|
317
|
+
df.to_csv("clean_factory_data.csv", index=False)
|
|
318
|
+
print(tt.profile.audit_report(df, output="cli"))
|
|
319
|
+
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## ⚖️ License
|
|
325
|
+
|
|
326
|
+
Distributed under the MIT License. See `LICENSE` for details.
|
|
327
|
+
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
tidytable/__init__.py,sha256=OhNae229kC5RbGOwFtVpSzXKpkIuXMTUwBoeFjDQoQk,298
|
|
2
|
+
tidytable/dedup/__init__.py,sha256=d3l94dahpt9698AAEKjD615WsL9FacKYPDGRGIPr2y0,83
|
|
3
|
+
tidytable/dedup/engine.py,sha256=8wiMmQToV7AprEr-siQlBFqxaGl5_hX3xb_9kQCIHx8,896
|
|
4
|
+
tidytable/merge/__init__.py,sha256=BU-Z1T2Zw6U6y-RV5CdznU5VHNgJid4rWJi7H2fHEME,105
|
|
5
|
+
tidytable/merge/engine.py,sha256=0rRRFiEHwIdr8IRmto9II7X9p-ZjQhJ7TMFJhf__MqA,1490
|
|
6
|
+
tidytable/missing/__init__.py,sha256=B33QsEgD_37bwTtTZtKwTf0j0DXg8q5hpxEXgAFQcjg,129
|
|
7
|
+
tidytable/missing/engine.py,sha256=w3pQ0IPpGp0xDEnCz5yiK5bp4OlUw5LMuM24UJNfQlI,1071
|
|
8
|
+
tidytable/parse/__init__.py,sha256=9iGY8AbhVxUnGOpJy0KzNEWypvxamLvtnRhguSrqZ-0,205
|
|
9
|
+
tidytable/parse/engine.py,sha256=73kdQsBGy848FiYiUv1MIFlCu4Z0shgKEPg3wINuxWw,3071
|
|
10
|
+
tidytable/profile/__init__.py,sha256=vD7IOmIGlT8A_WW9AYNuIxuFxIVuPBcdZ0iryx-W9ds,191
|
|
11
|
+
tidytable/profile/engine.py,sha256=r5NRlRmvstii9ZItlqjtzDxritDVxrHnUdFU3jty5gY,2676
|
|
12
|
+
tidytable/reconcile/__init__.py,sha256=k2wQKsrb9WIjBjzIRxDGWn8dlADYC52qqK-dPV8kafw,99
|
|
13
|
+
tidytable/reconcile/engine.py,sha256=ELuE-ByULeIboJ621m9dtsZouQgykdktIpSvIkWC-Vs,1592
|
|
14
|
+
tidytable/structural/__init__.py,sha256=rO4Ed2QkuRFrmbi9S76Cc-_qNAVdSIJbjsvjrlkMDwo,167
|
|
15
|
+
tidytable/structural/engine.py,sha256=n_eTiPa3PZeC1FhR8h5vbpFIl_N5oM33JuiwHavUGvU,1933
|
|
16
|
+
tidytable/xl/__init__.py,sha256=iiuAckb6es9o6ut4f07qBJQzk0VNYeYHlSQtrqmCUqQ,193
|
|
17
|
+
tidytable/xl/engine.py,sha256=vNzTYvA8AQk3lirkCCuQnOIF6QH1c4jdBfPIORsFVp0,2148
|
|
18
|
+
tidytable_core-1.0.0.dist-info/licenses/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
tidytable_core-1.0.0.dist-info/METADATA,sha256=ZMT4TnhoQwx_1IgW9GXViSDimzDL6fZm6JFtE91-mu0,11828
|
|
20
|
+
tidytable_core-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
21
|
+
tidytable_core-1.0.0.dist-info/top_level.txt,sha256=DYeHvMlVFQwgpGUmlMe26iZrkoha6of6-nkB3B5vMg0,10
|
|
22
|
+
tidytable_core-1.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tidytable
|