tidytable-core 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tidytable_core-1.0.0/LICENSE +0 -0
- tidytable_core-1.0.0/PKG-INFO +330 -0
- tidytable_core-1.0.0/README.md +311 -0
- tidytable_core-1.0.0/pyproject.toml +31 -0
- tidytable_core-1.0.0/setup.cfg +4 -0
- tidytable_core-1.0.0/src/tidytable/__init__.py +19 -0
- tidytable_core-1.0.0/src/tidytable/dedup/__init__.py +6 -0
- tidytable_core-1.0.0/src/tidytable/dedup/engine.py +17 -0
- tidytable_core-1.0.0/src/tidytable/merge/__init__.py +6 -0
- tidytable_core-1.0.0/src/tidytable/merge/engine.py +34 -0
- tidytable_core-1.0.0/src/tidytable/missing/__init__.py +7 -0
- tidytable_core-1.0.0/src/tidytable/missing/engine.py +27 -0
- tidytable_core-1.0.0/src/tidytable/parse/__init__.py +9 -0
- tidytable_core-1.0.0/src/tidytable/parse/engine.py +73 -0
- tidytable_core-1.0.0/src/tidytable/profile/__init__.py +9 -0
- tidytable_core-1.0.0/src/tidytable/profile/engine.py +62 -0
- tidytable_core-1.0.0/src/tidytable/reconcile/__init__.py +6 -0
- tidytable_core-1.0.0/src/tidytable/reconcile/engine.py +38 -0
- tidytable_core-1.0.0/src/tidytable/structural/__init__.py +7 -0
- tidytable_core-1.0.0/src/tidytable/structural/engine.py +44 -0
- tidytable_core-1.0.0/src/tidytable/xl/__init__.py +8 -0
- tidytable_core-1.0.0/src/tidytable/xl/engine.py +56 -0
- tidytable_core-1.0.0/src/tidytable_core.egg-info/PKG-INFO +330 -0
- tidytable_core-1.0.0/src/tidytable_core.egg-info/SOURCES.txt +26 -0
- tidytable_core-1.0.0/src/tidytable_core.egg-info/dependency_links.txt +1 -0
- tidytable_core-1.0.0/src/tidytable_core.egg-info/requires.txt +4 -0
- tidytable_core-1.0.0/src/tidytable_core.egg-info/top_level.txt +1 -0
- tidytable_core-1.0.0/tests/test_core.py +100 -0
|
File without changes
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tidytable-core
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An ecosystem-style explicit data cleaning framework for Excel and CSV pipelines.
|
|
5
|
+
Author-email: Aayush Vijay <aayushvj8699@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/aayushvijay/tidytable
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: pandas>=2.0.0
|
|
15
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
16
|
+
Requires-Dist: python-dateutil>=2.8.2
|
|
17
|
+
Requires-Dist: rapidfuzz>=3.0.0
|
|
18
|
+
Dynamic: license-file
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# tidytable-core 🧹
|
|
24
|
+
|
|
25
|
+
An ecosystem-style, explicit data cleaning framework built for data analysts who bridge the gap between messy, human-formatted Excel/CSV spreadsheets and production-ready Python data structures.
|
|
26
|
+
|
|
27
|
+
Unlike "black-box" cleaning scripts that automatically change your underlying values, `tidytable` forces an **explicit pipeline paradigm**. Each transformation engine is completely decoupled, granting you absolute control and step-by-step data lineage tracking.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 🚀 Installation & System Configuration
|
|
32
|
+
|
|
33
|
+
Install `tidytable-core` globally from the Python Package Index (PyPI):
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install tidytable-core
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Core External System Dependencies
|
|
41
|
+
|
|
42
|
+
* **`pandas`**: Core tabular dataframe manipulation engine.
|
|
43
|
+
* **`openpyxl`**: Reads and writes modern `.xlsx` workbooks.
|
|
44
|
+
* **`python-dateutil`**: Flexible parsing of date and time strings in mixed formats.
|
|
45
|
+
* **`rapidfuzz`**: Fast fuzzy string matching based on Levenshtein-style similarity scores.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 🏗️ Architectural Execution Pipeline
|
|
50
|
+
|
|
51
|
+
Data flows sequentially through your explicitly invoked processing domains:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
[ Messy Spreadsheet File ]
|
|
55
|
+
            │
|
|
56
|
+
            ▼
|
|
57
|
+
┌───────────────────────┐
|
|
58
|
+
│     tidytable.xl      │ ──► Resolves unaligned human-formatted layout grids.
|
|
59
|
+
└───────────┬───────────┘
|
|
60
|
+
            │ (Tabular Data Stream)
|
|
61
|
+
            ▼
|
|
62
|
+
┌───────────────────────┐
|
|
63
|
+
│ tidytable.structural  │ ──► Standardizes variable labels and strips whitespace.
|
|
64
|
+
└───────────┬───────────┘
|
|
65
|
+
            │
|
|
66
|
+
            ▼
|
|
67
|
+
┌───────────────────────┐
|
|
68
|
+
│    tidytable.parse    │ ──► Casts values safely into clean data types.
|
|
69
|
+
└───────────┬───────────┘
|
|
70
|
+
            │
|
|
71
|
+
            ▼
|
|
72
|
+
┌───────────────────────┐
|
|
73
|
+
│   tidytable.missing   │ ──► Drops empty panels and applies imputation profiles.
|
|
74
|
+
└───────────┬───────────┘
|
|
75
|
+
            │
|
|
76
|
+
            ▼
|
|
77
|
+
┌───────────────────────┐
|
|
78
|
+
│    tidytable.dedup    │ ──► Filters duplicate records safely.
|
|
79
|
+
└───────────┬───────────┘
|
|
80
|
+
            │
|
|
81
|
+
            ▼
|
|
82
|
+
┌───────────────────────┐
|
|
83
|
+
│   tidytable.profile   │ ──► Validates constraints and generates audit ledger logs.
|
|
84
|
+
└───────────┬───────────┘
|
|
85
|
+
            │
|
|
86
|
+
            ▼
|
|
87
|
+
[ Pristine DataFrame ]
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## 📚 Complete Sub-Library Blueprint Reference
|
|
94
|
+
|
|
95
|
+
### 1. `tidytable.xl` (The Excel Surgeon)
|
|
96
|
+
|
|
97
|
+
Extracts pristine data grids from visually stylized worksheets.
|
|
98
|
+
|
|
99
|
+
#### `xl.load_workbook(file_path: str) -> dict[str, pd.DataFrame]`
|
|
100
|
+
|
|
101
|
+
* **Use**: Loads an entire workbook into memory.
|
|
102
|
+
* **Arguments**: `file_path` (*str*): System destination path pointing to an Excel document.
|
|
103
|
+
|
|
104
|
+
#### `xl.unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame`
|
|
105
|
+
|
|
106
|
+
* **Use**: Flattens merged cells and fills empty fields down or across so rows stay linked.
|
|
107
|
+
* **Arguments**:
|
|
108
|
+
    * `sheet_data` (*DataFrame*): The input sheet table matrix.
|
|
109
|
+
    * `strategy` (*str*): Direction constraint rule. `"ffill"` (forward fill down) or `"lfill"` (lateral fill across).
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
#### `xl.sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]`
|
|
114
|
+
|
|
115
|
+
* **Use**: Skips title banners and KPI cards to find where the actual table headers start.
|
|
116
|
+
* **Arguments**: `scan_rows` (*int*): Search depth row index limit.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
import tidytable as tt
|
|
120
|
+
|
|
121
|
+
sheets = tt.xl.load_workbook("sales_report.xlsx")
|
|
122
|
+
raw_data = sheets["Q2_Leads"]
|
|
123
|
+
|
|
124
|
+
# Sniff header index location and clean names
|
|
125
|
+
header_idx, headers = tt.xl.sniff_headers(raw_data, scan_rows=15)
|
|
126
|
+
|
|
127
|
+
# Unmerge and propagate values downward to form a database-style table
|
|
128
|
+
df = tt.xl.unmerge_and_fill(raw_data, strategy="ffill")
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
### 2. `tidytable.structural` (The Text Blacksmith)
|
|
135
|
+
|
|
136
|
+
Cleans and standardizes column structures and text anomalies.
|
|
137
|
+
|
|
138
|
+
#### `structural.rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame`
|
|
139
|
+
|
|
140
|
+
* **Use**: Converts columns (like `"Gross Profit (%)"`) into clean variables (`"gross_profit"`).
|
|
141
|
+
* **Arguments**: `style` (*str*): Re-casing format rules. Default is `"snake_case"`.
|
|
142
|
+
|
|
143
|
+
#### `structural.strip_whitespace(series: pd.Series) -> pd.Series`
|
|
144
|
+
|
|
145
|
+
* **Use**: Deep-strips leading/trailing spaces, tab breaks, and hidden non-breaking spaces (`\xa0`).
|
|
146
|
+
|
|
147
|
+
#### `structural.standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series`
|
|
148
|
+
|
|
149
|
+
* **Use**: Groups manual typos and naming variations into a single target category name.
|
|
150
|
+
* **Arguments**:
|
|
151
|
+
    * `mapping` (*dict*): Manual dictionary rules map (e.g., `{"USA": ["usa", "U.S.A.", "us"]}`).
|
|
152
|
+
    * `auto_cluster` (*bool*): Uses Levenshtein Distance to merge variations automatically.
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
df = tt.structural.rename_columns(df, style="snake_case")
|
|
158
|
+
df["product_name"] = tt.structural.strip_whitespace(df["product_name"])
|
|
159
|
+
|
|
160
|
+
# Merge regional text typos automatically using string distance clustering
|
|
161
|
+
df["region"] = tt.structural.standardize_categories(df["region"], auto_cluster=True)
|
|
162
|
+
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
### 3. `tidytable.parse` (The Type Whisperer)
|
|
168
|
+
|
|
169
|
+
Converts raw string text blocks into strict mathematical datatypes without crashing.
|
|
170
|
+
|
|
171
|
+
#### `parse.dates(series: pd.Series, dayfirst: bool = False) -> pd.Series`
|
|
172
|
+
|
|
173
|
+
* **Use**: Parses mixed date format variations in a single column into uniform ISO datetimes.
|
|
174
|
+
|
|
175
|
+
#### `parse.financials(series: pd.Series) -> pd.Series`
|
|
176
|
+
|
|
177
|
+
* **Use**: Extracts numeric values from accounting styles like `"$ (1,250.00)"` or `"12K"`.
|
|
178
|
+
|
|
179
|
+
#### `parse.repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series`
|
|
180
|
+
|
|
181
|
+
* **Use**: Restores dropped leading zeroes on data codes (e.g., converts float `401.0` back to `"00401"`).
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
df["invoice_date"] = tt.parse.dates(df["invoice_date"], dayfirst=False)
|
|
185
|
+
df["net_revenue"] = tt.parse.financials(df["net_revenue"])
|
|
186
|
+
df["zip_code"] = tt.parse.repair_identifiers(df["zip_code"], pad_length=5)
|
|
187
|
+
df["roi_metric"] = tt.parse.handle_formula_ghosts(df["roi_metric"], error_strategy="coerce")
|
|
188
|
+
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
### 4. `tidytable.missing` (The Ghost Hunter)
|
|
194
|
+
|
|
195
|
+
Identifies and resolves gaps in data matrices.
|
|
196
|
+
|
|
197
|
+
#### `missing.drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame`
|
|
198
|
+
|
|
199
|
+
* **Use**: Drops column attributes where the missing values ratio exceeds the threshold boundary limit.
|
|
200
|
+
|
|
201
|
+
#### `missing.flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame`
|
|
202
|
+
|
|
203
|
+
* **Use**: Appends a binary companion indicator column (`{column}_is_missing`) to keep the data signal before running imputation.
|
|
204
|
+
|
|
205
|
+
#### `missing.impute(series: pd.Series, strategy: str = "median") -> pd.Series`
|
|
206
|
+
|
|
207
|
+
* **Use**: Fills null voids based on chosen statistical parameters (`"mean"`, `"median"`, `"mode"`).
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
df = tt.missing.drop_empty_cols(df, threshold=0.40)
|
|
211
|
+
df = tt.missing.flag_absence(df, columns=["customer_age"])
|
|
212
|
+
df["customer_age"] = tt.missing.impute(df["customer_age"], strategy="median")
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
### 5. `tidytable.dedup` (The Twin Eliminator)
|
|
219
|
+
|
|
220
|
+
Detects and drops duplicate entries across your rows.
|
|
221
|
+
|
|
222
|
+
#### `dedup.absolute(df: pd.DataFrame) -> pd.DataFrame`
|
|
223
|
+
|
|
224
|
+
* **Use**: Drops rows only if they match exactly across every single field.
|
|
225
|
+
|
|
226
|
+
#### `dedup.partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame`
|
|
227
|
+
|
|
228
|
+
* **Use**: Resolves record updates by keeping the earliest or latest transaction entry for a unique key.
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
# Clear absolute identical rows
|
|
232
|
+
df = tt.dedup.absolute(df)
|
|
233
|
+
|
|
234
|
+
# For matching customer IDs, keep only the record with the most recent update timestamp
|
|
235
|
+
df = tt.dedup.partial(df, subset=["customer_id"], keep="latest", timestamp_col="updated_at")
|
|
236
|
+
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
### 6. `tidytable.merge` (The VLOOKUP Bridge)
|
|
242
|
+
|
|
243
|
+
Joins separate files together even when keys are messy, incomplete, or slightly misspelled.
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
# Identify mismatched elements before executing joins
|
|
247
|
+
pre_flight = tt.merge.join_diagnose(left_df=leads_df, right_df=master_df, left_on="vendor", right_on="v_name")
|
|
248
|
+
|
|
249
|
+
# Join matching rows even if there are typos (e.g., matches "Apple Inc." to "Apple, Inc.")
|
|
250
|
+
joined_df = tt.merge.fuzzy_vlookup(leads_df, master_df, left_on="vendor", right_on="v_name", threshold=0.88)
|
|
251
|
+
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
### 7. `tidytable.reconcile` (The Ledger Auditor)
|
|
257
|
+
|
|
258
|
+
Automates version control checks between separate instances of the same file structure.
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
# Generate structural audits comparing January data against February data
|
|
262
|
+
ledger_updates = tt.reconcile.sheet_diff(df_old=jan_df, df_new=feb_df, key_column="transaction_id")
|
|
263
|
+
|
|
264
|
+
print("New rows added this month:", len(ledger_updates["Added"]))
|
|
265
|
+
print("Row modifications captured:", len(ledger_updates["Modified"]))
|
|
266
|
+
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
### 8. `tidytable.profile` (The Auditor & Schema Guard)
|
|
272
|
+
|
|
273
|
+
Handles file schema pinning, anomaly detection, and automated audit trails.
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
# Scan for raw strings masking null values (e.g., "?", "n/a", "-")
|
|
277
|
+
anomalies = tt.profile.check_anomalies(df)
|
|
278
|
+
|
|
279
|
+
# Validate current file structure against last month's blueprint to make sure scripts don't crash
|
|
280
|
+
if tt.profile.validate(df, schema_path="schemas/prod_blueprint.json"):
|
|
281
|
+
    # Output file pipeline performance audit change log metrics
|
|
282
|
+
    print(tt.profile.audit_report(df, output="cli"))
|
|
283
|
+
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## 🎯 Complete End-to-End Explicit Analyst Workflow
|
|
289
|
+
|
|
290
|
+
Here is a complete real-world script showing how an analyst runs a detailed cleaning pipeline manually:
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
import tidytable as tt
|
|
294
|
+
import pandas as pd
|
|
295
|
+
|
|
296
|
+
# Step 1: Layout Normalization
|
|
297
|
+
workbook = tt.xl.load_workbook("raw_factory_data.xlsx")
|
|
298
|
+
sheet_grid = workbook["Master_Log"]
|
|
299
|
+
df = tt.xl.unmerge_and_fill(sheet_grid, strategy="ffill")
|
|
300
|
+
|
|
301
|
+
# Step 2: Structural Column Cleaning
|
|
302
|
+
df = tt.structural.rename_columns(df, style="snake_case")
|
|
303
|
+
df["part_name"] = tt.structural.strip_whitespace(df["part_name"])
|
|
304
|
+
|
|
305
|
+
# Step 3: Type Safe Parsing
|
|
306
|
+
df["serial_id"] = tt.parse.repair_identifiers(df["serial_id"], pad_length=6)
|
|
307
|
+
df["cost"] = tt.parse.financials(df["cost"])
|
|
308
|
+
df["log_date"] = tt.parse.dates(df["log_date"])
|
|
309
|
+
|
|
310
|
+
# Step 4: Integrity and Row Refinement
|
|
311
|
+
df = tt.missing.flag_absence(df, columns=["efficiency_score"])
|
|
312
|
+
df["efficiency_score"] = tt.missing.impute(df["efficiency_score"], strategy="mean")
|
|
313
|
+
df = tt.dedup.absolute(df)
|
|
314
|
+
|
|
315
|
+
# Step 5: Verification & Schema Pinning
|
|
316
|
+
if tt.profile.validate(df, schema_path="schemas/factory_spec.json"):
|
|
317
|
+
    df.to_csv("clean_factory_data.csv", index=False)
|
|
318
|
+
    print(tt.profile.audit_report(df, output="cli"))
|
|
319
|
+
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## ⚖️ License
|
|
325
|
+
|
|
326
|
+
Distributed under the MIT License. See `LICENSE` for details.
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# tidytable-core 🧹
|
|
5
|
+
|
|
6
|
+
An ecosystem-style, explicit data cleaning framework built for data analysts who bridge the gap between messy, human-formatted Excel/CSV spreadsheets and production-ready Python data structures.
|
|
7
|
+
|
|
8
|
+
Unlike "black-box" cleaning scripts that automatically change your underlying values, `tidytable` forces an **explicit pipeline paradigm**. Each transformation engine is completely decoupled, granting you absolute control and step-by-step data lineage tracking.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## 🚀 Installation & System Configuration
|
|
13
|
+
|
|
14
|
+
Install `tidytable-core` globally from the Python Package Index (PyPI):
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install tidytable-core
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### Core External System Dependencies
|
|
22
|
+
|
|
23
|
+
* **`pandas`**: Core tabular dataframe manipulation engine.
|
|
24
|
+
* **`openpyxl`**: Reads and writes modern `.xlsx` workbooks.
|
|
25
|
+
* **`python-dateutil`**: Flexible parsing of date and time strings in mixed formats.
|
|
26
|
+
* **`rapidfuzz`**: Fast fuzzy string matching based on Levenshtein-style similarity scores.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## 🏗️ Architectural Execution Pipeline
|
|
31
|
+
|
|
32
|
+
Data flows sequentially through your explicitly invoked processing domains:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
[ Messy Spreadsheet File ]
|
|
36
|
+
            │
|
|
37
|
+
            ▼
|
|
38
|
+
┌───────────────────────┐
|
|
39
|
+
│     tidytable.xl      │ ──► Resolves unaligned human-formatted layout grids.
|
|
40
|
+
└───────────┬───────────┘
|
|
41
|
+
            │ (Tabular Data Stream)
|
|
42
|
+
            ▼
|
|
43
|
+
┌───────────────────────┐
|
|
44
|
+
│ tidytable.structural  │ ──► Standardizes variable labels and strips whitespace.
|
|
45
|
+
└───────────┬───────────┘
|
|
46
|
+
            │
|
|
47
|
+
            ▼
|
|
48
|
+
┌───────────────────────┐
|
|
49
|
+
│    tidytable.parse    │ ──► Casts values safely into clean data types.
|
|
50
|
+
└───────────┬───────────┘
|
|
51
|
+
            │
|
|
52
|
+
            ▼
|
|
53
|
+
┌───────────────────────┐
|
|
54
|
+
│   tidytable.missing   │ ──► Drops empty panels and applies imputation profiles.
|
|
55
|
+
└───────────┬───────────┘
|
|
56
|
+
            │
|
|
57
|
+
            ▼
|
|
58
|
+
┌───────────────────────┐
|
|
59
|
+
│    tidytable.dedup    │ ──► Filters duplicate records safely.
|
|
60
|
+
└───────────┬───────────┘
|
|
61
|
+
            │
|
|
62
|
+
            ▼
|
|
63
|
+
┌───────────────────────┐
|
|
64
|
+
│   tidytable.profile   │ ──► Validates constraints and generates audit ledger logs.
|
|
65
|
+
└───────────┬───────────┘
|
|
66
|
+
            │
|
|
67
|
+
            ▼
|
|
68
|
+
[ Pristine DataFrame ]
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## 📚 Complete Sub-Library Blueprint Reference
|
|
75
|
+
|
|
76
|
+
### 1. `tidytable.xl` (The Excel Surgeon)
|
|
77
|
+
|
|
78
|
+
Extracts pristine data grids from visually stylized worksheets.
|
|
79
|
+
|
|
80
|
+
#### `xl.load_workbook(file_path: str) -> dict[str, pd.DataFrame]`
|
|
81
|
+
|
|
82
|
+
* **Use**: Loads an entire workbook into memory.
|
|
83
|
+
* **Arguments**: `file_path` (*str*): System destination path pointing to an Excel document.
|
|
84
|
+
|
|
85
|
+
#### `xl.unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame`
|
|
86
|
+
|
|
87
|
+
* **Use**: Flattens merged cells and fills empty fields down or across so rows stay linked.
|
|
88
|
+
* **Arguments**:
|
|
89
|
+
    * `sheet_data` (*DataFrame*): The input sheet table matrix.
|
|
90
|
+
    * `strategy` (*str*): Direction constraint rule. `"ffill"` (forward fill down) or `"lfill"` (lateral fill across).
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
#### `xl.sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]`
|
|
95
|
+
|
|
96
|
+
* **Use**: Skips title banners and KPI cards to find where the actual table headers start.
|
|
97
|
+
* **Arguments**: `scan_rows` (*int*): Search depth row index limit.
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import tidytable as tt
|
|
101
|
+
|
|
102
|
+
sheets = tt.xl.load_workbook("sales_report.xlsx")
|
|
103
|
+
raw_data = sheets["Q2_Leads"]
|
|
104
|
+
|
|
105
|
+
# Sniff header index location and clean names
|
|
106
|
+
header_idx, headers = tt.xl.sniff_headers(raw_data, scan_rows=15)
|
|
107
|
+
|
|
108
|
+
# Unmerge and propagate values downward to form a database-style table
|
|
109
|
+
df = tt.xl.unmerge_and_fill(raw_data, strategy="ffill")
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
### 2. `tidytable.structural` (The Text Blacksmith)
|
|
116
|
+
|
|
117
|
+
Cleans and standardizes column structures and text anomalies.
|
|
118
|
+
|
|
119
|
+
#### `structural.rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame`
|
|
120
|
+
|
|
121
|
+
* **Use**: Converts columns (like `"Gross Profit (%)"`) into clean variables (`"gross_profit"`).
|
|
122
|
+
* **Arguments**: `style` (*str*): Re-casing format rules. Default is `"snake_case"`.
|
|
123
|
+
|
|
124
|
+
#### `structural.strip_whitespace(series: pd.Series) -> pd.Series`
|
|
125
|
+
|
|
126
|
+
* **Use**: Deep-strips leading/trailing spaces, tab breaks, and hidden non-breaking spaces (`\xa0`).
|
|
127
|
+
|
|
128
|
+
#### `structural.standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series`
|
|
129
|
+
|
|
130
|
+
* **Use**: Groups manual typos and naming variations into a single target category name.
|
|
131
|
+
* **Arguments**:
|
|
132
|
+
    * `mapping` (*dict*): Manual dictionary rules map (e.g., `{"USA": ["usa", "U.S.A.", "us"]}`).
|
|
133
|
+
    * `auto_cluster` (*bool*): Uses Levenshtein Distance to merge variations automatically.
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
df = tt.structural.rename_columns(df, style="snake_case")
|
|
139
|
+
df["product_name"] = tt.structural.strip_whitespace(df["product_name"])
|
|
140
|
+
|
|
141
|
+
# Merge regional text typos automatically using string distance clustering
|
|
142
|
+
df["region"] = tt.structural.standardize_categories(df["region"], auto_cluster=True)
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
### 3. `tidytable.parse` (The Type Whisperer)
|
|
149
|
+
|
|
150
|
+
Converts raw string text blocks into strict mathematical datatypes without crashing.
|
|
151
|
+
|
|
152
|
+
#### `parse.dates(series: pd.Series, dayfirst: bool = False) -> pd.Series`
|
|
153
|
+
|
|
154
|
+
* **Use**: Parses mixed date format variations in a single column into uniform ISO datetimes.
|
|
155
|
+
|
|
156
|
+
#### `parse.financials(series: pd.Series) -> pd.Series`
|
|
157
|
+
|
|
158
|
+
* **Use**: Extracts numeric values from accounting styles like `"$ (1,250.00)"` or `"12K"`.
|
|
159
|
+
|
|
160
|
+
#### `parse.repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series`
|
|
161
|
+
|
|
162
|
+
* **Use**: Restores dropped leading zeroes on data codes (e.g., converts float `401.0` back to `"00401"`).
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
df["invoice_date"] = tt.parse.dates(df["invoice_date"], dayfirst=False)
|
|
166
|
+
df["net_revenue"] = tt.parse.financials(df["net_revenue"])
|
|
167
|
+
df["zip_code"] = tt.parse.repair_identifiers(df["zip_code"], pad_length=5)
|
|
168
|
+
df["roi_metric"] = tt.parse.handle_formula_ghosts(df["roi_metric"], error_strategy="coerce")
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
### 4. `tidytable.missing` (The Ghost Hunter)
|
|
175
|
+
|
|
176
|
+
Identifies and resolves gaps in data matrices.
|
|
177
|
+
|
|
178
|
+
#### `missing.drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame`
|
|
179
|
+
|
|
180
|
+
* **Use**: Drops column attributes where the missing values ratio exceeds the threshold boundary limit.
|
|
181
|
+
|
|
182
|
+
#### `missing.flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame`
|
|
183
|
+
|
|
184
|
+
* **Use**: Appends a binary companion indicator column (`{column}_is_missing`) to keep the data signal before running imputation.
|
|
185
|
+
|
|
186
|
+
#### `missing.impute(series: pd.Series, strategy: str = "median") -> pd.Series`
|
|
187
|
+
|
|
188
|
+
* **Use**: Fills null voids based on chosen statistical parameters (`"mean"`, `"median"`, `"mode"`).
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
df = tt.missing.drop_empty_cols(df, threshold=0.40)
|
|
192
|
+
df = tt.missing.flag_absence(df, columns=["customer_age"])
|
|
193
|
+
df["customer_age"] = tt.missing.impute(df["customer_age"], strategy="median")
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
### 5. `tidytable.dedup` (The Twin Eliminator)
|
|
200
|
+
|
|
201
|
+
Detects and drops duplicate entries across your rows.
|
|
202
|
+
|
|
203
|
+
#### `dedup.absolute(df: pd.DataFrame) -> pd.DataFrame`
|
|
204
|
+
|
|
205
|
+
* **Use**: Drops rows only if they match exactly across every single field.
|
|
206
|
+
|
|
207
|
+
#### `dedup.partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame`
|
|
208
|
+
|
|
209
|
+
* **Use**: Resolves record updates by keeping the earliest or latest transaction entry for a unique key.
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
# Clear absolute identical rows
|
|
213
|
+
df = tt.dedup.absolute(df)
|
|
214
|
+
|
|
215
|
+
# For matching customer IDs, keep only the record with the most recent update timestamp
|
|
216
|
+
df = tt.dedup.partial(df, subset=["customer_id"], keep="latest", timestamp_col="updated_at")
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
### 6. `tidytable.merge` (The VLOOKUP Bridge)
|
|
223
|
+
|
|
224
|
+
Joins separate files together even when keys are messy, incomplete, or slightly misspelled.
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
# Identify mismatched elements before executing joins
|
|
228
|
+
pre_flight = tt.merge.join_diagnose(left_df=leads_df, right_df=master_df, left_on="vendor", right_on="v_name")
|
|
229
|
+
|
|
230
|
+
# Join matching rows even if there are typos (e.g., matches "Apple Inc." to "Apple, Inc.")
|
|
231
|
+
joined_df = tt.merge.fuzzy_vlookup(leads_df, master_df, left_on="vendor", right_on="v_name", threshold=0.88)
|
|
232
|
+
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
### 7. `tidytable.reconcile` (The Ledger Auditor)
|
|
238
|
+
|
|
239
|
+
Automates version control checks between separate instances of the same file structure.
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
# Generate structural audits comparing January data against February data
|
|
243
|
+
ledger_updates = tt.reconcile.sheet_diff(df_old=jan_df, df_new=feb_df, key_column="transaction_id")
|
|
244
|
+
|
|
245
|
+
print("New rows added this month:", len(ledger_updates["Added"]))
|
|
246
|
+
print("Row modifications captured:", len(ledger_updates["Modified"]))
|
|
247
|
+
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
### 8. `tidytable.profile` (The Auditor & Schema Guard)
|
|
253
|
+
|
|
254
|
+
Handles file schema pinning, anomaly detection, and automated audit trails.
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
# Scan for raw strings masking null values (e.g., "?", "n/a", "-")
|
|
258
|
+
anomalies = tt.profile.check_anomalies(df)
|
|
259
|
+
|
|
260
|
+
# Validate current file structure against last month's blueprint to make sure scripts don't crash
|
|
261
|
+
if tt.profile.validate(df, schema_path="schemas/prod_blueprint.json"):
|
|
262
|
+
    # Output file pipeline performance audit change log metrics
|
|
263
|
+
    print(tt.profile.audit_report(df, output="cli"))
|
|
264
|
+
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## 🎯 Complete End-to-End Explicit Analyst Workflow
|
|
270
|
+
|
|
271
|
+
Here is a complete real-world script showing how an analyst runs a detailed cleaning pipeline manually:
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
import tidytable as tt
|
|
275
|
+
import pandas as pd
|
|
276
|
+
|
|
277
|
+
# Step 1: Layout Normalization
|
|
278
|
+
workbook = tt.xl.load_workbook("raw_factory_data.xlsx")
|
|
279
|
+
sheet_grid = workbook["Master_Log"]
|
|
280
|
+
df = tt.xl.unmerge_and_fill(sheet_grid, strategy="ffill")
|
|
281
|
+
|
|
282
|
+
# Step 2: Structural Column Cleaning
|
|
283
|
+
df = tt.structural.rename_columns(df, style="snake_case")
|
|
284
|
+
df["part_name"] = tt.structural.strip_whitespace(df["part_name"])
|
|
285
|
+
|
|
286
|
+
# Step 3: Type Safe Parsing
|
|
287
|
+
df["serial_id"] = tt.parse.repair_identifiers(df["serial_id"], pad_length=6)
|
|
288
|
+
df["cost"] = tt.parse.financials(df["cost"])
|
|
289
|
+
df["log_date"] = tt.parse.dates(df["log_date"])
|
|
290
|
+
|
|
291
|
+
# Step 4: Integrity and Row Refinement
|
|
292
|
+
df = tt.missing.flag_absence(df, columns=["efficiency_score"])
|
|
293
|
+
df["efficiency_score"] = tt.missing.impute(df["efficiency_score"], strategy="mean")
|
|
294
|
+
df = tt.dedup.absolute(df)
|
|
295
|
+
|
|
296
|
+
# Step 5: Verification & Schema Pinning
|
|
297
|
+
if tt.profile.validate(df, schema_path="schemas/factory_spec.json"):
|
|
298
|
+
    df.to_csv("clean_factory_data.csv", index=False)
|
|
299
|
+
    print(tt.profile.audit_report(df, output="cli"))
|
|
300
|
+
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## ⚖️ License
|
|
306
|
+
|
|
307
|
+
Distributed under the MIT License. See `LICENSE` for details.
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tidytable-core" # Ensure this name is unique on PyPI
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Aayush Vijay", email="aayushvj8699@gmail.com" }
|
|
10
|
+
]
|
|
11
|
+
description = "An ecosystem-style explicit data cleaning framework for Excel and CSV pipelines."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.9"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: MacOS :: MacOS X",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Information Analysis"
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"pandas>=2.0.0",
|
|
22
|
+
"openpyxl>=3.1.0",
|
|
23
|
+
"python-dateutil>=2.8.2",
|
|
24
|
+
"rapidfuzz>=3.0.0"
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
"Homepage" = "https://github.com/aayushvijay/tidytable"
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
where = ["src"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from . import xl
|
|
2
|
+
from . import structural
|
|
3
|
+
from . import parse
|
|
4
|
+
from . import missing
|
|
5
|
+
from . import dedup
|
|
6
|
+
from . import merge
|
|
7
|
+
from . import reconcile
|
|
8
|
+
from . import profile
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"xl",
|
|
12
|
+
"structural",
|
|
13
|
+
"parse",
|
|
14
|
+
"missing",
|
|
15
|
+
"dedup",
|
|
16
|
+
"merge",
|
|
17
|
+
"reconcile",
|
|
18
|
+
"profile"
|
|
19
|
+
]
|