tidytable-core 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tidytable/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ from . import xl
2
+ from . import structural
3
+ from . import parse
4
+ from . import missing
5
+ from . import dedup
6
+ from . import merge
7
+ from . import reconcile
8
+ from . import profile
9
+
10
+ __all__ = [
11
+ "xl",
12
+ "structural",
13
+ "parse",
14
+ "missing",
15
+ "dedup",
16
+ "merge",
17
+ "reconcile",
18
+ "profile"
19
+ ]
@@ -0,0 +1,6 @@
1
+ from .engine import absolute, partial
2
+
3
+ __all__ = [
4
+ "absolute",
5
+ "partial"
6
+ ]
@@ -0,0 +1,17 @@
1
+ import pandas as pd
2
+
3
def absolute(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without rows that duplicate an earlier row in every column.

    The first occurrence of each row is kept and the result is re-indexed
    from 0. The input frame is not modified.
    """
    unique_rows = df.drop_duplicates()
    unique_rows = unique_rows.reset_index(drop=True)
    return unique_rows
6
+
7
def partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame:
    """Drop rows that repeat the same values in *subset*, keeping one per key.

    Args:
        df: Input frame; it is not modified.
        subset: Columns that together identify a record.
        keep: ``"latest"`` or ``"earliest"`` pick the survivor by
            *timestamp_col*; any other value is forwarded unchanged to
            ``DataFrame.drop_duplicates(keep=...)``.
        timestamp_col: Column to order rows by. Required for
            ``"latest"``/``"earliest"``. When given, rows are sorted by it
            (ascending only for ``"earliest"``) before de-duplication.

    Returns:
        A new frame with a fresh 0-based index. When *timestamp_col* is
        given, the rows come back in the sorted order.

    Raises:
        ValueError: If *keep* is ``"latest"``/``"earliest"`` and no
            *timestamp_col* was supplied.
    """
    time_based = keep in ("latest", "earliest")
    if time_based and not timestamp_col:
        raise ValueError("timestamp_col must be specified when keeping latest/earliest items.")

    working_df = df
    if timestamp_col:
        # A stable sort keeps rows with tied timestamps in their original
        # relative order, so the surviving row is deterministic.
        # sort_values already returns a new frame, so no copy is needed.
        working_df = df.sort_values(
            by=timestamp_col,
            ascending=(keep == "earliest"),
            kind="stable",
        )

    keep_policy = "first" if time_based else keep
    return working_df.drop_duplicates(subset=subset, keep=keep_policy).reset_index(drop=True)
@@ -0,0 +1,6 @@
1
+ from .engine import fuzzy_vlookup, join_diagnose
2
+
3
+ __all__ = [
4
+ "fuzzy_vlookup",
5
+ "join_diagnose"
6
+ ]
@@ -0,0 +1,34 @@
1
+ import pandas as pd
2
+ from rapidfuzz import process
3
+
4
def fuzzy_vlookup(left_df: pd.DataFrame, right_df: pd.DataFrame, left_on: str, right_on: str, threshold: float = 0.85) -> pd.DataFrame:
    """Left-join *right_df* onto *left_df* using fuzzy string matching of keys.

    Each distinct non-null left key is compared, as text, with the right
    keys via ``process.extractOne``; the best candidate scoring at least
    ``threshold * 100`` becomes its join partner. Left rows without a
    partner are kept, with the right-hand columns left empty.

    Args:
        left_df: Frame whose rows are all preserved.
        right_df: Lookup frame.
        left_on: Key column in *left_df*.
        right_on: Key column in *right_df*.
        threshold: Minimum similarity, expressed as a 0-1 fraction.

    Returns:
        The merged frame. Neither input is modified.
    """
    join_col = "_fuzzy_key_join"
    ldf = left_df.copy()
    # Right rows with a null key are excluded from the join: stringified they
    # would read "nan" and could be joined to a literal "nan" key.
    rdf = right_df[right_df[right_on].notna()].copy()

    right_key_text = rdf[right_on].astype(str)
    # De-duplicate while preserving order; repeated keys score identically,
    # so scanning them again cannot change the best match.
    right_keys = list(dict.fromkeys(right_key_text))

    match_map = {}
    for l_key in ldf[left_on].dropna().astype(str).unique():
        best_match = process.extractOne(l_key, right_keys, score_cutoff=threshold * 100)
        if best_match:
            match_map[l_key] = best_match[0]

    # Null left keys must stay unmatched even if the text "nan" found a match.
    mapped = ldf[left_on].astype(str).map(match_map).where(ldf[left_on].notna())
    # Force object dtype on both sides so an all-unmatched (all-NaN) left key
    # column still merges against string keys.
    ldf[join_col] = mapped.astype(object)
    rdf[join_col] = right_key_text.astype(object)

    merged = pd.merge(ldf, rdf, on=join_col, how="left").drop(columns=[join_col])
    return merged
22
+
23
def join_diagnose(left_df: pd.DataFrame, right_df: pd.DataFrame, left_on: str, right_on: str) -> dict:
    """Summarize how well two key columns line up before a join.

    Returns a dict with the number of distinct non-null keys on each side,
    the number of left keys that have no exact counterpart on the right,
    and whether the two key columns share the same dtype.
    """
    left_col = left_df[left_on]
    right_col = right_df[right_on]

    l_keys = set(left_col.dropna().unique())
    r_keys = set(right_col.dropna().unique())

    report = {}
    report["left_total_unique_keys"] = len(l_keys)
    report["right_total_unique_keys"] = len(r_keys)
    report["unmatched_left_keys_count"] = len(l_keys.difference(r_keys))
    report["type_alignment_match"] = left_col.dtype == right_col.dtype
    return report
@@ -0,0 +1,7 @@
1
+ from .engine import drop_empty_cols, flag_absence, impute
2
+
3
+ __all__ = [
4
+ "drop_empty_cols",
5
+ "flag_absence",
6
+ "impute"
7
+ ]
@@ -0,0 +1,27 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
def drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame:
    """Drop columns whose share of missing values exceeds *threshold*.

    Args:
        df: Input frame; it is not modified.
        threshold: Largest tolerated missing ratio (0-1). A column is kept
            when ``missing / rows <= threshold``.

    Returns:
        A new frame holding only the retained columns. A frame with no rows
        is returned with all of its columns.
    """
    if len(df) == 0:
        # No rows means no ratio can be computed; keep every column.
        return df.copy()

    # Compare the ratio directly rather than converting the threshold into an
    # integer count: truncating that count let columns with too many gaps
    # survive (e.g. 2 of 3 values missing at threshold=0.5).
    missing_ratio = df.isna().mean()
    keep_mask = (missing_ratio <= threshold).to_numpy()
    # Positional boolean mask, so duplicate column labels are handled.
    return df.loc[:, keep_mask]
7
+
8
def flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Add a ``<column>_is_missing`` boolean column for each requested column.

    Names in *columns* that are not present in the frame are skipped. The
    input frame is left untouched; a copy carrying the new columns is
    returned.
    """
    flagged = df.copy()
    for name in columns:
        if name not in flagged.columns:
            continue
        flagged[f"{name}_is_missing"] = flagged[name].isna()
    return flagged
15
+
16
def impute(series: pd.Series, strategy: str = "median") -> pd.Series:
    """Fill missing values in *series*.

    Args:
        series: Input data; it is not modified.
        strategy: ``"mean"``, ``"median"`` or ``"mode"`` fill with that
            statistic of the series. Any other value is used directly as
            the fill value.

    Returns:
        A new series with the gaps filled.
    """
    if strategy == "mean":
        fill_val = series.mean()
    elif strategy == "median":
        fill_val = series.median()
    elif strategy == "mode":
        # Compute the mode once; the first entry is used when several tie.
        modes = series.mode()
        fill_val = modes.iloc[0] if not modes.empty else np.nan
    else:
        fill_val = strategy  # Interpret as explicit fallback text/value
    # fillna returns a new object, so no defensive copy is required.
    return series.fillna(fill_val)
@@ -0,0 +1,9 @@
1
+ from .engine import dates, financials, boolean, repair_identifiers, handle_formula_ghosts
2
+
3
+ __all__ = [
4
+ "dates",
5
+ "financials",
6
+ "boolean",
7
+ "repair_identifiers",
8
+ "handle_formula_ghosts"
9
+ ]
@@ -0,0 +1,73 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from dateutil import parser as date_parser
4
+
5
def financials(series: pd.Series) -> pd.Series:
    """Convert accounting-style text such as ``"$ (1,250.00)"`` or ``"12K"`` to floats.

    Handles thousands separators, the ``$``/``€``/``£`` symbols, negatives
    written in parentheses or with a leading/trailing minus sign, and the
    ``k``/``m``/``b`` magnitude suffixes. Values that cannot be read as a
    number become NaN.

    Args:
        series: Raw values (strings or numbers).

    Returns:
        A series of the parsed values, produced by ``Series.apply``.
    """
    null_tokens = {"", "nan", "none", "null"}
    infinity_tokens = {"inf", "-inf", "infinity"}
    multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}

    def _clean(val):
        if pd.isna(val):
            return np.nan
        text = str(val).strip().replace(",", "")
        lowered = text.lower()
        if lowered in null_tokens:
            return np.nan
        if lowered in infinity_tokens:
            return -np.inf if "-" in text else np.inf

        is_negative = "(" in text and ")" in text
        for symbol in "$€£()":
            text = text.replace(symbol, "")
        text = text.strip()

        # Only a leading or trailing minus is a sign. A minus inside the
        # number (the exponent of "1e-5", or a date like "2023-01-15") must
        # not flip the sign or be stripped out.
        if text.startswith("-") or text.endswith("-"):
            is_negative = True
            text = text.strip("-").strip()

        if not text:
            return np.nan

        # Handle magnitude suffixes (12K, 3.5m, 2B).
        multiplier = multipliers.get(text[-1].lower(), 1)
        if multiplier != 1:
            text = text[:-1]

        try:
            num = float(text) * multiplier
        except ValueError:
            return np.nan
        return -num if is_negative else num

    return series.apply(_clean)
40
+
41
def dates(series: pd.Series, dayfirst: bool = False) -> pd.Series:
    """Parse each value of *series* into a datetime with ``dateutil``.

    Missing values, blank strings and the tokens ``NaT``/``NaN``/``None``/
    ``null`` yield ``pd.NaT``, as does any value the parser rejects with a
    ``ValueError``, ``OverflowError`` or ``TypeError``.

    Args:
        series: Raw date values.
        dayfirst: Forwarded to the parser to resolve ambiguous day/month order.
    """
    blank_tokens = ["", "NaT", "NaN", "None", "null"]

    def _parse(val):
        if pd.isna(val):
            return pd.NaT
        text = str(val)
        if text.strip() in blank_tokens:
            return pd.NaT
        try:
            parsed = date_parser.parse(text, dayfirst=dayfirst)
        except (ValueError, OverflowError, TypeError):
            return pd.NaT
        return parsed

    return series.apply(_parse)
50
+
51
def handle_formula_ghosts(series: pd.Series, error_strategy: str = "coerce") -> pd.Series:
    """Replace Excel error markers (``#REF!``, ``#DIV/0!``, ...) with NaN.

    A value counts as a marker when its stripped, upper-cased text equals
    one of the known error strings; every other value passes through
    unchanged. ``error_strategy`` is accepted but not consulted by this
    implementation.
    """
    ghost_strings = ["#VALUE!", "#REF!", "#DIV/0!", "#NAME?", "#N/A", "#NUM!", "#NULL!"]

    def _blank_if_ghost(cell):
        token = str(cell).strip().upper()
        if token in ghost_strings:
            return np.nan
        return cell

    return series.apply(_blank_if_ghost)
55
+
56
def repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series:
    """Turn identifier values back into text, optionally left-padded with zeros.

    Whole-number floats (``401.0``) and strings ending in a zero-only
    fraction (``"401.0"``) lose the fractional artifact. Identifiers that
    merely contain a dot (``"AB.12"``) are preserved in full.

    Args:
        series: Raw identifier values.
        pad_length: When truthy, results are zero-filled to this width.

    Returns:
        A series of strings; missing or blank inputs become ``""``.
    """
    def _pad(val):
        if pd.isna(val) or str(val).strip() in ["", "nan", "NaN"]:
            return ""
        if isinstance(val, float) and val.is_integer():
            # int() avoids scientific notation for large values (1e16).
            text = str(int(val))
        else:
            text = str(val).strip()
            head, dot, tail = text.partition(".")
            # Remove only a fractional part made of zeros; anything else
            # after the dot is part of the identifier.
            if dot and tail.strip("0") == "":
                text = head
        return text.zfill(pad_length) if pad_length else text

    return series.apply(_pad)
63
+
64
def boolean(series: pd.Series) -> pd.Series:
    """Map yes/no style indicators to ``True``/``False``.

    Values are compared as stripped, lower-cased text. Recognized truthy
    tokens are ``y, yes, 1, t, true``; falsy tokens are ``n, no, 0, f,
    false``. The float renderings ``1.0``/``0.0`` are accepted as well,
    because an integer flag column that contained nulls is stored as
    floats. Anything else maps to NaN.
    """
    truth_map = {
        "y": True, "yes": True, "1": True, "t": True, "true": True,
        "n": False, "no": False, "0": False, "f": False, "false": False,
        # Float-encoded flags, e.g. a 0/1 column that also held missing values.
        "1.0": True, "0.0": False,
    }
    normalized = series.astype(str).str.strip().str.lower()
    return normalized.map(truth_map)
72
+
73
+
@@ -0,0 +1,9 @@
1
+ from .engine import blueprint, check_anomalies, audit_report, pin_schema, validate
2
+
3
+ __all__ = [
4
+ "blueprint",
5
+ "check_anomalies",
6
+ "audit_report",
7
+ "pin_schema",
8
+ "validate"
9
+ ]
@@ -0,0 +1,62 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import json
4
+
5
def blueprint(df: pd.DataFrame) -> dict:
    """Describe *df* as a plain dict of shape, dtypes and null counts.

    Column labels and dtypes are rendered as strings and null counts as
    built-in ints.
    """
    n_rows, n_cols = df.shape
    dtype_names = {str(col): str(kind) for col, kind in df.dtypes.to_dict().items()}
    null_totals = {str(col): int(count) for col, count in df.isna().sum().to_dict().items()}

    summary = {}
    summary["shape"] = {"rows": n_rows, "columns": n_cols}
    summary["dtypes"] = dtype_names
    summary["null_counts"] = null_totals
    return summary
12
+
13
def check_anomalies(df: pd.DataFrame) -> list[str]:
    """Report columns that contain placeholder text standing in for missing data.

    Every non-null value is compared, as stripped lower-cased text, with a
    fixed list of placeholders (``?``, ``n/a``, ``none``, ``-``, ``nan``,
    ``null``). Genuine nulls are ignored: they are already missing, not
    disguised.

    Returns:
        One alert message per affected column, in column order.
    """
    alerts = []
    placeholders = ["?", "n/a", "none", "-", "nan", "null"]

    for col in df.columns:
        # Drop real nulls first; stringifying them would produce "nan"/"none"
        # and raise a false alert on every column that has gaps.
        present = df[col].dropna()
        stringified = present.astype(str).str.strip().str.lower()
        found = stringified[stringified.isin(placeholders)].unique()
        if len(found) > 0:
            alerts.append(f"Column '{col}' contains unhandled string anomalies: {list(found)}")

    return alerts
25
+
26
def audit_report(df: pd.DataFrame, output: str = "cli") -> str:
    """Render the frame's blueprint as a plain-text report.

    The report has a banner line, the final row/column counts, and one line
    per column with its dtype and number of missing values. ``output`` is
    accepted but not consulted by this implementation.
    """
    bp = blueprint(df)
    shape = bp["shape"]
    null_counts = bp["null_counts"]

    lines = [
        "=== TIDYTABLE DATA CLEANING AUDIT REPORT ===\n",
        f"Final Dimensions: {shape['rows']} rows x {shape['columns']} columns\n",
        "Column Composition and Null Status Logs:\n",
    ]
    lines.extend(
        f" - {col} ({dtype}): Missing values: {null_counts[col]}\n"
        for col, dtype in bp["dtypes"].items()
    )
    return "".join(lines)
35
+
36
def pin_schema(df: pd.DataFrame, path: str) -> None:
    """Write the blueprint of *df* to *path* as JSON indented by 4 spaces."""
    schema = blueprint(df)
    with open(path, 'w') as handle:
        handle.write(json.dumps(schema, indent=4))
41
+
42
def validate(df: pd.DataFrame, schema_path: str) -> bool:
    """Check *df* against a JSON blueprint stored at *schema_path*.

    Column labels are compared in their string form, matching how the
    blueprint stores them, so frames with non-string labels (e.g. integer
    column names) validate against their own pinned schema.

    Args:
        df: Frame to check.
        schema_path: Path of a JSON file containing a ``"dtypes"`` mapping
            of column name to dtype name.

    Returns:
        ``True`` when the column sets and dtypes agree.

    Raises:
        ValueError: If the sets of column names differ.
        TypeError: If a column's dtype differs from the saved one. A current
            ``object`` column is accepted where ``string`` was saved.
    """
    with open(schema_path, 'r') as f:
        saved_schema = json.load(f)

    saved_types = saved_schema["dtypes"]
    # JSON keys are always strings, so stringify the live labels before comparing.
    current_types = {str(col): str(dtype) for col, dtype in df.dtypes.items()}

    current_cols = set(current_types)
    saved_cols = set(saved_types.keys())

    if current_cols != saved_cols:
        missing = saved_cols - current_cols
        extra = current_cols - saved_cols
        raise ValueError(f"Schema mismatch. Missing expected columns: {missing}. Unexpected extra columns: {extra}")

    for col, curr_type in current_types.items():
        saved_type = saved_types[col]
        # Allow an object column to satisfy a schema that pinned "string".
        if curr_type != saved_type and not (curr_type == "object" and saved_type == "string"):
            raise TypeError(f"Type validation alert on '{col}': Expected {saved_type}, found {curr_type}")

    return True
@@ -0,0 +1,6 @@
1
+ from .engine import sheet_diff, align_schemas
2
+
3
+ __all__ = [
4
+ "sheet_diff",
5
+ "align_schemas"
6
+ ]
@@ -0,0 +1,38 @@
1
+ import pandas as pd
2
+
3
def sheet_diff(df_old: pd.DataFrame, df_new: pd.DataFrame, key_column: str) -> dict[str, pd.DataFrame]:
    """Compare two versions of a table keyed by *key_column*.

    Args:
        df_old: Earlier version.
        df_new: Later version.
        key_column: Column identifying a record in both frames.

    Returns:
        A dict with three frames:
        ``"Added"`` holds rows whose key exists only in *df_new*.
        ``"Dropped"`` holds rows whose key exists only in *df_old*.
        ``"Modified"`` holds one row per shared key with at least one
        changed value, containing the key plus ``<col>_old``/``<col>_new``
        for each changed column. Only columns present in both frames are
        compared, and two missing values count as equal.

    Raises:
        ValueError: If a key shared by both frames occurs more than once in
            either of them, since its rows cannot be paired.
    """
    old_idx = df_old.set_index(key_column)
    new_idx = df_new.set_index(key_column)

    added_keys = new_idx.index.difference(old_idx.index)
    dropped_keys = old_idx.index.difference(new_idx.index)
    shared_keys = old_idx.index.intersection(new_idx.index)

    for label, frame in (("df_old", old_idx), ("df_new", new_idx)):
        ambiguous = frame.index.duplicated(keep=False) & frame.index.isin(shared_keys)
        if ambiguous.any():
            raise ValueError(
                f"{label} has duplicate '{key_column}' values among the shared keys; "
                "rows cannot be paired for comparison."
            )

    added_df = new_idx.loc[added_keys].reset_index()
    dropped_df = old_idx.loc[dropped_keys].reset_index()

    # Compare only columns both versions have; a column that exists on one
    # side only is a schema change, not a cell modification.
    shared_cols = [col for col in old_idx.columns if col in new_idx.columns]

    modified_rows = []
    for key in shared_keys:
        old_row = old_idx.loc[key]
        new_row = new_idx.loc[key]
        row_data = {key_column: key}
        for col in shared_cols:
            old_val, new_val = old_row[col], new_row[col]
            old_missing, new_missing = pd.isna(old_val), pd.isna(new_val)
            if old_missing or new_missing:
                # NaN != NaN is True, so decide on missingness explicitly.
                changed = not (old_missing and new_missing)
            else:
                changed = old_val != new_val
            if changed:
                row_data[f"{col}_old"] = old_val
                row_data[f"{col}_new"] = new_val
        # Record the key only when some value actually changed.
        if len(row_data) > 1:
            modified_rows.append(row_data)

    return {
        "Added": added_df,
        "Dropped": dropped_df,
        "Modified": pd.DataFrame(modified_rows)
    }
34
+
35
def align_schemas(df_old: pd.DataFrame, df_new: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Restrict both frames to their common columns, in *df_old*'s column order.

    Returns copies of the two frames; the inputs are not modified.
    """
    new_columns = df_new.columns
    shared_cols = []
    for col in df_old.columns:
        if col in new_columns:
            shared_cols.append(col)

    old_aligned = df_old[shared_cols].copy()
    new_aligned = df_new[shared_cols].copy()
    return old_aligned, new_aligned
@@ -0,0 +1,7 @@
1
+ from .engine import rename_columns, strip_whitespace, standardize_categories
2
+
3
+ __all__ = [
4
+ "rename_columns",
5
+ "strip_whitespace",
6
+ "standardize_categories"
7
+ ]
@@ -0,0 +1,44 @@
1
+ import pandas as pd
2
+ import re
3
+ from rapidfuzz import process, utils
4
+
5
def rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame:
    """Return a copy of *df* with normalized, unique column labels.

    Every label is converted to text and stripped. With
    ``style="snake_case"`` punctuation is removed, dashes and whitespace
    become single underscores, edge underscores are trimmed and the result
    is lower-cased. Other styles apply only the strip.

    Labels that would collide after cleaning (``"Profit ($)"`` and
    ``"Profit (%)"`` both clean to ``profit``) get a numeric suffix
    (``profit``, ``profit_2``, ...) so each column stays addressable.
    """
    cleaned_df = df.copy()
    new_names = []
    used = set()

    for col in cleaned_df.columns:
        name = str(col).strip()
        if style == "snake_case":
            name = re.sub(r'[^\w\s-]', '', name)   # Drop punctuation marks
            name = name.replace('-', '_')          # Dashes become underscores
            name = re.sub(r'[\s_]+', '_', name)    # Collapse space/underscore runs
            name = name.strip('_').lower()         # Trim edges and lowercase

        # Disambiguate collisions instead of emitting duplicate labels.
        candidate = name
        suffix = 1
        while candidate in used:
            suffix += 1
            candidate = f"{name}_{suffix}"
        used.add(candidate)
        new_names.append(candidate)

    cleaned_df.columns = new_names
    return cleaned_df
21
+
22
def strip_whitespace(series: pd.Series) -> pd.Series:
    """Collapse whitespace runs to one space and trim both ends of each value.

    The pattern ``\\s`` matches Unicode whitespace, so tabs, newlines and
    non-breaking spaces are covered. Non-null values are converted to
    text; missing values are passed through as missing rather than being
    turned into the literal string ``"nan"``.
    """
    def _tidy(value):
        # Test for null BEFORE stringifying; afterwards a null is
        # indistinguishable from real text.
        if pd.isna(value):
            return value
        return re.sub(r'\s+', ' ', str(value)).strip()

    return series.apply(_tidy)
25
+
26
def standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series:
    """Collapse spelling variants of a category onto one canonical label.

    Args:
        series: Raw category values; it is not modified.
        mapping: ``{canonical: [variant, ...]}``. When given (and
            non-empty) it takes precedence: values whose stripped text is a
            listed variant are replaced by the canonical label.
        auto_cluster: When true and no mapping is given, distinct values
            are grouped with ``process.extract`` at a score cutoff of 85.
            Each value is assigned to the first seed value it matches.

    Returns:
        A new series; values with no mapping or cluster are unchanged.
    """
    s = series.copy()
    if mapping:
        reverse_map = {variant: key for key, variants in mapping.items() for variant in variants}
        return s.apply(lambda x: reverse_map.get(str(x).strip(), x))

    if auto_cluster:
        # Stripping can make distinct raw values identical; keep one of each.
        unique_vals = list(dict.fromkeys(str(v).strip() for v in s.dropna().unique()))
        processed_map = {}
        for val in unique_vals:
            if val in processed_map:
                continue
            # limit=None: the library returns only the top 5 by default,
            # which would silently leave larger clusters incomplete.
            matches = process.extract(val, unique_vals, score_cutoff=85.0, limit=None)
            for match in matches:
                # Never reassign a value that already belongs to an earlier
                # cluster, so the assignment is stable.
                processed_map.setdefault(match[0], val)
        return s.apply(lambda x: processed_map.get(str(x).strip(), x) if pd.notna(x) else x)

    return s
@@ -0,0 +1,8 @@
1
+ from .engine import load_workbook, unmerge_and_fill, sniff_headers, split_multi_tables
2
+
3
+ __all__ = [
4
+ "load_workbook",
5
+ "unmerge_and_fill",
6
+ "sniff_headers",
7
+ "split_multi_tables"
8
+ ]
tidytable/xl/engine.py ADDED
@@ -0,0 +1,56 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import openpyxl
4
+
5
def load_workbook(file_path: str) -> dict[str, pd.DataFrame]:
    """Read every sheet of the Excel file at *file_path*.

    ``sheet_name=None`` asks pandas for all sheets, returned as a dict of
    frames keyed by sheet name.
    """
    sheets = pd.read_excel(file_path, sheet_name=None)
    return sheets
8
+
9
def unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame:
    """Fill the gaps left by merged cells by propagating the last seen value.

    Args:
        sheet_data: Input sheet; it is not modified.
        strategy: ``"ffill"`` propagates values down each column,
            ``"lfill"`` propagates them across each row.

    Raises:
        ValueError: For any other *strategy*.
    """
    if strategy not in ("ffill", "lfill"):
        raise ValueError("Strategy must be either 'ffill' or 'lfill'")

    df = sheet_data.copy()
    fill_axis = 0 if strategy == "ffill" else 1
    return df.ffill(axis=fill_axis)
18
+
19
def sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]:
    """Guess which of the first *scan_rows* rows holds the column headers.

    The candidate is the first row with the most non-null cells.

    Args:
        sheet_data: Raw sheet contents.
        scan_rows: Number of leading rows to inspect.

    Returns:
        ``(position, headers)``: the 0-based row position of the candidate,
        independent of the frame's index labels, and that row's values as
        stripped strings. A frame without rows yields ``(0, [])``.
    """
    if len(sheet_data) == 0:
        return 0, []

    df_scan = sheet_data.head(scan_rows)
    non_null_counts = df_scan.notna().sum(axis=1).to_numpy()
    # argmax returns the first maximum. Working on positions (not index
    # labels) keeps the iloc lookup below valid for any index.
    best_row_idx = int(non_null_counts.argmax()) if len(non_null_counts) else 0

    detected_headers = [str(val).strip() for val in sheet_data.iloc[best_row_idx].tolist()]
    return best_row_idx, detected_headers
33
+
34
def split_multi_tables(sheet_data: pd.DataFrame) -> list[pd.DataFrame]:
    """Split a sheet into side-by-side tables separated by fully empty columns.

    Each run of columns between empty separator columns becomes its own
    frame, with all-empty rows removed and the index reset. Runs that end
    up with no data are skipped. When the sheet has no empty column, the
    original frame is returned as the only element.
    """
    blank_mask = sheet_data.isna().all(axis=0)
    gap_positions = np.where(blank_mask)[0]

    if gap_positions.size == 0:
        return [sheet_data]

    total_cols = sheet_data.shape[1]
    # Virtual separators before the first and after the last column, so
    # every table is the span between two neighbouring edges.
    edges = [-1, *gap_positions.tolist(), total_cols]

    tables = []
    for left_edge, right_edge in zip(edges[:-1], edges[1:]):
        first_col = left_edge + 1
        if right_edge <= first_col:
            continue
        chunk = sheet_data.iloc[:, first_col:right_edge].dropna(how='all')
        if chunk.empty:
            continue
        tables.append(chunk.reset_index(drop=True))

    return tables
@@ -0,0 +1,330 @@
1
+ Metadata-Version: 2.4
2
+ Name: tidytable-core
3
+ Version: 1.0.0
4
+ Summary: An ecosystem-style explicit data cleaning framework for Excel and CSV pipelines.
5
+ Author-email: Aayush Vijay <aayushvj8699@gmail.com>
6
+ Project-URL: Homepage, https://github.com/aayushvijay/tidytable
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: MacOS :: MacOS X
10
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pandas>=2.0.0
15
+ Requires-Dist: openpyxl>=3.1.0
16
+ Requires-Dist: python-dateutil>=2.8.2
17
+ Requires-Dist: rapidfuzz>=3.0.0
18
+ Dynamic: license-file
19
+
20
+
21
+
22
+
23
+ # tidytable-core 🧹
24
+
25
+ An ecosystem-style, explicit data cleaning framework built for data analysts who bridge the gap between messy, human-formatted Excel/CSV spreadsheets and production-ready Python data structures.
26
+
27
+ Unlike "black-box" cleaning scripts that automatically change your underlying values, `tidytable` forces an **explicit pipeline paradigm**. Each transformation engine is completely decoupled, granting you absolute control and step-by-step data lineage tracking.
28
+
29
+ ---
30
+
31
+ ## 🚀 Installation & System Configuration
32
+
33
+ Install `tidytable-core` globally from the Python Package Index (PyPI):
34
+
35
+ ```bash
36
+ pip install tidytable-core
37
+
38
+ ```
39
+
40
+ ### Core External System Dependencies
41
+
42
+ * **`pandas`**: Core tabular dataframe manipulation engine.
43
+ * **`openpyxl`**: Reader/writer engine for modern `.xlsx` workbooks.
44
+ * **`python-dateutil`**: Flexible parser for free-form date and time strings.
45
+ * **`rapidfuzz`**: High-performance fuzzy string matching based on Levenshtein-style similarity scores.
46
+
47
+ ---
48
+
49
+ ## 🏗️ Architectural Execution Pipeline
50
+
51
+ Data flows sequentially through your explicitly invoked processing domains:
52
+
53
+ ```
54
+ [ Messy Spreadsheet File ]
55
+
56
+
57
+ ┌───────────────────────┐
58
+ │ tidytable.xl │ ──► Resolves unaligned human-formatted layout grids.
59
+ └───────────┬───────────┘
60
+ │ (Tabular Data Stream)
61
+
62
+ ┌───────────────────────┐
63
+ │ tidytable.structural │ ──► Standardizes variable labels and strips whitespace.
64
+ └───────────┬───────────┘
65
+
66
+
67
+ ┌───────────────────────┐
68
+ │ tidytable.parse │ ──► Casts values safely into clean data types.
69
+ └───────────┬───────────┘
70
+
71
+
72
+ ┌───────────────────────┐
73
+ │ tidytable.missing │ ──► Drops empty panels and applies imputation profiles.
74
+ └───────────┬───────────┘
75
+
76
+
77
+ ┌───────────────────────┐
78
+ │ tidytable.dedup │ ──► Filters duplicate records safely.
79
+ └───────────┬───────────┘
80
+
81
+
82
+ ┌───────────────────────┐
83
+ │ tidytable.profile │ ──► Validates constraints and generates audit ledger logs.
84
+ └───────────┬───────────┘
85
+
86
+
87
+ [ Pristine DataFrame ]
88
+
89
+ ```
90
+
91
+ ---
92
+
93
+ ## 📚 Complete Sub-Library Blueprint Reference
94
+
95
+ ### 1. `tidytable.xl` (The Excel Surgeon)
96
+
97
+ Extracts pristine data grids from visually stylized worksheets.
98
+
99
+ #### `xl.load_workbook(file_path: str) -> dict[str, pd.DataFrame]`
100
+
101
+ * **Use**: Loads an entire workbook into memory.
102
+ * **Arguments**: `file_path` (*str*): System destination path pointing to an Excel document.
103
+
104
+ #### `xl.unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame`
105
+
106
+ * **Use**: Flattens merged cells and fills empty fields down or across so rows stay linked.
107
+ * **Arguments**:
108
+ * `sheet_data` (*DataFrame*): The input sheet table matrix.
109
+ * `strategy` (*str*): Direction constraint rule. `"ffill"` (forward fill down) or `"lfill"` (lateral fill across).
110
+
111
+
112
+
113
+ #### `xl.sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]`
114
+
115
+ * **Use**: Skips title banners and KPI cards to find where the actual table headers start.
116
+ * **Arguments**: `scan_rows` (*int*): Search depth row index limit.
117
+
118
+ ```python
119
+ import tidytable as tt
120
+
121
+ sheets = tt.xl.load_workbook("sales_report.xlsx")
122
+ raw_data = sheets["Q2_Leads"]
123
+
124
+ # Sniff header index location and clean names
125
+ header_idx, headers = tt.xl.sniff_headers(raw_data, scan_rows=15)
126
+
127
+ # Unmerge and propagate values down to form a database structure
128
+ df = tt.xl.unmerge_and_fill(raw_data, strategy="ffill")
129
+
130
+ ```
131
+
132
+ ---
133
+
134
+ ### 2. `tidytable.structural` (The Text Blacksmith)
135
+
136
+ Cleans and standardizes column structures and text anomalies.
137
+
138
+ #### `structural.rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame`
139
+
140
+ * **Use**: Converts columns (like `"Gross Profit (%)"`) into clean variables (`"gross_profit"`).
141
+ * **Arguments**: `style` (*str*): Re-casing format rules. Default is `"snake_case"`.
142
+
143
+ #### `structural.strip_whitespace(series: pd.Series) -> pd.Series`
144
+
145
+ * **Use**: Deep-strips leading/trailing spaces, tab breaks, and hidden non-breaking spaces (`\xa0`).
146
+
147
+ #### `structural.standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series`
148
+
149
+ * **Use**: Groups manual typos and naming variations into a single target category name.
150
+ * **Arguments**:
151
+ * `mapping` (*dict*): Manual dictionary rules map (e.g., `{"USA": ["usa", "U.S.A.", "us"]}`).
152
+ * `auto_cluster` (*bool*): Uses Levenshtein Distance to merge variations automatically.
153
+
154
+
155
+
156
+ ```python
157
+ df = tt.structural.rename_columns(df, style="snake_case")
158
+ df["product_name"] = tt.structural.strip_whitespace(df["product_name"])
159
+
160
+ # Merge regional text typos automatically using string distance clustering
161
+ df["region"] = tt.structural.standardize_categories(df["region"], auto_cluster=True)
162
+
163
+ ```
164
+
165
+ ---
166
+
167
+ ### 3. `tidytable.parse` (The Type Whisperer)
168
+
169
+ Converts raw string text blocks into strict mathematical datatypes without crashing.
170
+
171
+ #### `parse.dates(series: pd.Series, dayfirst: bool = False) -> pd.Series`
172
+
173
+ * **Use**: Parses mixed date format variations in a single column into uniform ISO datetimes.
174
+
175
+ #### `parse.financials(series: pd.Series) -> pd.Series`
176
+
177
+ * **Use**: Extracts numeric values from accounting styles like `"$ (1,250.00)"` or `"12K"`.
178
+
179
+ #### `parse.repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series`
180
+
181
+ * **Use**: Restores dropped leading zeroes on data codes (e.g., converts float `401.0` back to `"00401"`).
182
+
183
+ ```python
184
+ df["invoice_date"] = tt.parse.dates(df["invoice_date"], dayfirst=False)
185
+ df["net_revenue"] = tt.parse.financials(df["net_revenue"])
186
+ df["zip_code"] = tt.parse.repair_identifiers(df["zip_code"], pad_length=5)
187
+ df["roi_metric"] = tt.parse.handle_formula_ghosts(df["roi_metric"], error_strategy="coerce")
188
+
189
+ ```
190
+
191
+ ---
192
+
193
+ ### 4. `tidytable.missing` (The Ghost Hunter)
194
+
195
+ Identifies and resolves gaps in data matrices.
196
+
197
+ #### `missing.drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame`
198
+
199
+ * **Use**: Drops column attributes where the missing values ratio exceeds the threshold boundary limit.
200
+
201
+ #### `missing.flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame`
202
+
203
+ * **Use**: Appends a binary companion indicator column (`{column}_is_missing`) to keep the data signal before running imputation.
204
+
205
+ #### `missing.impute(series: pd.Series, strategy: str = "median") -> pd.Series`
206
+
207
+ * **Use**: Fills null voids based on chosen statistical parameters (`"mean"`, `"median"`, `"mode"`).
208
+
209
+ ```python
210
+ df = tt.missing.drop_empty_cols(df, threshold=0.40)
211
+ df = tt.missing.flag_absence(df, columns=["customer_age"])
212
+ df["customer_age"] = tt.missing.impute(df["customer_age"], strategy="median")
213
+
214
+ ```
215
+
216
+ ---
217
+
218
+ ### 5. `tidytable.dedup` (The Twin Eliminator)
219
+
220
+ Detects and drops duplicate entries across your rows.
221
+
222
+ #### `dedup.absolute(df: pd.DataFrame) -> pd.DataFrame`
223
+
224
+ * **Use**: Drops rows only if they match exactly across every single field.
225
+
226
+ #### `dedup.partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame`
227
+
228
+ * **Use**: Resolves record updates by keeping the earliest or latest transaction entry for a unique key.
229
+
230
+ ```python
231
+ # Clear absolute identical rows
232
+ df = tt.dedup.absolute(df)
233
+
234
+ # For matching customer IDs, keep only the record with the most recent update timestamp
235
+ df = tt.dedup.partial(df, subset=["customer_id"], keep="latest", timestamp_col="updated_at")
236
+
237
+ ```
238
+
239
+ ---
240
+
241
+ ### 6. `tidytable.merge` (The VLOOKUP Bridge)
242
+
243
+ Joins separate files together even when keys are messy, incomplete, or slightly misspelled.
244
+
245
+ ```python
246
+ # Identify mismatched elements before executing joins
247
+ pre_flight = tt.merge.join_diagnose(left_df=leads_df, right_df=master_df, left_on="vendor", right_on="v_name")
248
+
249
+ # Join matching rows even if there are typos (e.g., matches "Apple Inc." to "Apple, Inc.")
250
+ joined_df = tt.merge.fuzzy_vlookup(leads_df, master_df, left_on="vendor", right_on="v_name", threshold=0.88)
251
+
252
+ ```
253
+
254
+ ---
255
+
256
+ ### 7. `tidytable.reconcile` (The Ledger Auditor)
257
+
258
+ Automates version control checks between separate instances of the same file structure.
259
+
260
+ ```python
261
+ # Generate structural audits comparing January data against February data
262
+ ledger_updates = tt.reconcile.sheet_diff(df_old=jan_df, df_new=feb_df, key_column="transaction_id")
263
+
264
+ print("New rows added this month:", len(ledger_updates["Added"]))
265
+ print("Row modifications captured:", len(ledger_updates["Modified"]))
266
+
267
+ ```
268
+
269
+ ---
270
+
271
+ ### 8. `tidytable.profile` (The Auditor & Schema Guard)
272
+
273
+ Handles file schema pinning, anomaly detection, and automated audit trails.
274
+
275
+ ```python
276
+ # Scan for raw strings masking null values (e.g., "?", "n/a", "-")
277
+ anomalies = tt.profile.check_anomalies(df)
278
+
279
+ # Validate current file structure against last month's blueprint to make sure scripts don't crash
280
+ if tt.profile.validate(df, schema_path="schemas/prod_blueprint.json"):
281
+ # Print the audit report (final dimensions, column dtypes, and missing-value counts)
282
+ print(tt.profile.audit_report(df, output="cli"))
283
+
284
+ ```
285
+
286
+ ---
287
+
288
+ ## 🎯 Complete End-to-End Explicit Analyst Workflow
289
+
290
+ Here is a complete real-world script showing how an analyst runs a detailed cleaning pipeline manually:
291
+
292
+ ```python
293
+ import tidytable as tt
294
+ import pandas as pd
295
+
296
+ # Step 1: Layout Normalization
297
+ workbook = tt.xl.load_workbook("raw_factory_data.xlsx")
298
+ sheet_grid = workbook["Master_Log"]
299
+ df = tt.xl.unmerge_and_fill(sheet_grid, strategy="ffill")
300
+
301
+ # Step 2: Structural Column Cleaning
302
+ df = tt.structural.rename_columns(df, style="snake_case")
303
+ df["part_name"] = tt.structural.strip_whitespace(df["part_name"])
304
+
305
+ # Step 3: Type Safe Parsing
306
+ df["serial_id"] = tt.parse.repair_identifiers(df["serial_id"], pad_length=6)
307
+ df["cost"] = tt.parse.financials(df["cost"])
308
+ df["log_date"] = tt.parse.dates(df["log_date"])
309
+
310
+ # Step 4: Integrity and Row Refinement
311
+ df = tt.missing.flag_absence(df, columns=["efficiency_score"])
312
+ df["efficiency_score"] = tt.missing.impute(df["efficiency_score"], strategy="mean")
313
+ df = tt.dedup.absolute(df)
314
+
315
+ # Step 5: Verification & Schema Pinning
316
+ if tt.profile.validate(df, schema_path="schemas/factory_spec.json"):
317
+ df.to_csv("clean_factory_data.csv", index=False)
318
+ print(tt.profile.audit_report(df, output="cli"))
319
+
320
+ ```
321
+
322
+ ---
323
+
324
+ ## ⚖️ License
325
+
326
+ Distributed under the MIT License. See `LICENSE` for details.
327
+
328
+
329
+
330
+
@@ -0,0 +1,22 @@
1
+ tidytable/__init__.py,sha256=OhNae229kC5RbGOwFtVpSzXKpkIuXMTUwBoeFjDQoQk,298
2
+ tidytable/dedup/__init__.py,sha256=d3l94dahpt9698AAEKjD615WsL9FacKYPDGRGIPr2y0,83
3
+ tidytable/dedup/engine.py,sha256=8wiMmQToV7AprEr-siQlBFqxaGl5_hX3xb_9kQCIHx8,896
4
+ tidytable/merge/__init__.py,sha256=BU-Z1T2Zw6U6y-RV5CdznU5VHNgJid4rWJi7H2fHEME,105
5
+ tidytable/merge/engine.py,sha256=0rRRFiEHwIdr8IRmto9II7X9p-ZjQhJ7TMFJhf__MqA,1490
6
+ tidytable/missing/__init__.py,sha256=B33QsEgD_37bwTtTZtKwTf0j0DXg8q5hpxEXgAFQcjg,129
7
+ tidytable/missing/engine.py,sha256=w3pQ0IPpGp0xDEnCz5yiK5bp4OlUw5LMuM24UJNfQlI,1071
8
+ tidytable/parse/__init__.py,sha256=9iGY8AbhVxUnGOpJy0KzNEWypvxamLvtnRhguSrqZ-0,205
9
+ tidytable/parse/engine.py,sha256=73kdQsBGy848FiYiUv1MIFlCu4Z0shgKEPg3wINuxWw,3071
10
+ tidytable/profile/__init__.py,sha256=vD7IOmIGlT8A_WW9AYNuIxuFxIVuPBcdZ0iryx-W9ds,191
11
+ tidytable/profile/engine.py,sha256=r5NRlRmvstii9ZItlqjtzDxritDVxrHnUdFU3jty5gY,2676
12
+ tidytable/reconcile/__init__.py,sha256=k2wQKsrb9WIjBjzIRxDGWn8dlADYC52qqK-dPV8kafw,99
13
+ tidytable/reconcile/engine.py,sha256=ELuE-ByULeIboJ621m9dtsZouQgykdktIpSvIkWC-Vs,1592
14
+ tidytable/structural/__init__.py,sha256=rO4Ed2QkuRFrmbi9S76Cc-_qNAVdSIJbjsvjrlkMDwo,167
15
+ tidytable/structural/engine.py,sha256=n_eTiPa3PZeC1FhR8h5vbpFIl_N5oM33JuiwHavUGvU,1933
16
+ tidytable/xl/__init__.py,sha256=iiuAckb6es9o6ut4f07qBJQzk0VNYeYHlSQtrqmCUqQ,193
17
+ tidytable/xl/engine.py,sha256=vNzTYvA8AQk3lirkCCuQnOIF6QH1c4jdBfPIORsFVp0,2148
18
+ tidytable_core-1.0.0.dist-info/licenses/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ tidytable_core-1.0.0.dist-info/METADATA,sha256=ZMT4TnhoQwx_1IgW9GXViSDimzDL6fZm6JFtE91-mu0,11828
20
+ tidytable_core-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
21
+ tidytable_core-1.0.0.dist-info/top_level.txt,sha256=DYeHvMlVFQwgpGUmlMe26iZrkoha6of6-nkB3B5vMg0,10
22
+ tidytable_core-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
File without changes
@@ -0,0 +1 @@
1
+ tidytable