tidytable-core 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. tidytable_core-1.0.0/LICENSE +0 -0
  2. tidytable_core-1.0.0/PKG-INFO +330 -0
  3. tidytable_core-1.0.0/README.md +311 -0
  4. tidytable_core-1.0.0/pyproject.toml +31 -0
  5. tidytable_core-1.0.0/setup.cfg +4 -0
  6. tidytable_core-1.0.0/src/tidytable/__init__.py +19 -0
  7. tidytable_core-1.0.0/src/tidytable/dedup/__init__.py +6 -0
  8. tidytable_core-1.0.0/src/tidytable/dedup/engine.py +17 -0
  9. tidytable_core-1.0.0/src/tidytable/merge/__init__.py +6 -0
  10. tidytable_core-1.0.0/src/tidytable/merge/engine.py +34 -0
  11. tidytable_core-1.0.0/src/tidytable/missing/__init__.py +7 -0
  12. tidytable_core-1.0.0/src/tidytable/missing/engine.py +27 -0
  13. tidytable_core-1.0.0/src/tidytable/parse/__init__.py +9 -0
  14. tidytable_core-1.0.0/src/tidytable/parse/engine.py +73 -0
  15. tidytable_core-1.0.0/src/tidytable/profile/__init__.py +9 -0
  16. tidytable_core-1.0.0/src/tidytable/profile/engine.py +62 -0
  17. tidytable_core-1.0.0/src/tidytable/reconcile/__init__.py +6 -0
  18. tidytable_core-1.0.0/src/tidytable/reconcile/engine.py +38 -0
  19. tidytable_core-1.0.0/src/tidytable/structural/__init__.py +7 -0
  20. tidytable_core-1.0.0/src/tidytable/structural/engine.py +44 -0
  21. tidytable_core-1.0.0/src/tidytable/xl/__init__.py +8 -0
  22. tidytable_core-1.0.0/src/tidytable/xl/engine.py +56 -0
  23. tidytable_core-1.0.0/src/tidytable_core.egg-info/PKG-INFO +330 -0
  24. tidytable_core-1.0.0/src/tidytable_core.egg-info/SOURCES.txt +26 -0
  25. tidytable_core-1.0.0/src/tidytable_core.egg-info/dependency_links.txt +1 -0
  26. tidytable_core-1.0.0/src/tidytable_core.egg-info/requires.txt +4 -0
  27. tidytable_core-1.0.0/src/tidytable_core.egg-info/top_level.txt +1 -0
  28. tidytable_core-1.0.0/tests/test_core.py +100 -0
File without changes
@@ -0,0 +1,330 @@
1
+ Metadata-Version: 2.4
2
+ Name: tidytable-core
3
+ Version: 1.0.0
4
+ Summary: An ecosystem-style explicit data cleaning framework for Excel and CSV pipelines.
5
+ Author-email: Aayush Vijay <aayushvj8699@gmail.com>
6
+ Project-URL: Homepage, https://github.com/aayushvijay/tidytable
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: MacOS :: MacOS X
10
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pandas>=2.0.0
15
+ Requires-Dist: openpyxl>=3.1.0
16
+ Requires-Dist: python-dateutil>=2.8.2
17
+ Requires-Dist: rapidfuzz>=3.0.0
18
+ Dynamic: license-file
19
+
20
+
21
+
22
+
23
+ # tidytable-core 🧹
24
+
25
+ An ecosystem-style, explicit data cleaning framework built for data analysts who bridge the gap between messy, human-formatted Excel/CSV spreadsheets and production-ready Python data structures.
26
+
27
+ Unlike "black-box" cleaning scripts that automatically change your underlying values, `tidytable` forces an **explicit pipeline paradigm**. Each transformation engine is completely decoupled, granting you absolute control and step-by-step data lineage tracking.
28
+
29
+ ---
30
+
31
+ ## 🚀 Installation & System Configuration
32
+
33
+ Install `tidytable-core` globally from the Python Package Index (PyPI):
34
+
35
+ ```bash
36
+ pip install tidytable-core
37
+
38
+ ```
39
+
40
+ ### Core External System Dependencies
41
+
42
+ * **`pandas`**: Core tabular dataframe manipulation engine.
43
+ * **`openpyxl`**: Engine for reading modern `.xlsx` workbooks.
44
+ * **`python-dateutil`**: Flexible parser for date and time strings in mixed formats.
45
+ * **`rapidfuzz`**: High-performance fuzzy string matching (Levenshtein distance and related similarity scores).
46
+
47
+ ---
48
+
49
+ ## 🏗️ Architectural Execution Pipeline
50
+
51
+ Data flows sequentially through your explicitly invoked processing domains:
52
+
53
+ ```
54
+  [ Messy Spreadsheet File ]
55
+               │
56
+               ▼
57
+   ┌───────────────────────┐
58
+   │     tidytable.xl      │ ──► Resolves unaligned human-formatted layout grids.
59
+   └───────────┬───────────┘
60
+               │ (Tabular Data Stream)
61
+               ▼
62
+   ┌───────────────────────┐
63
+   │ tidytable.structural  │ ──► Standardizes variable labels and strips whitespace.
64
+   └───────────┬───────────┘
65
+               │
66
+               ▼
67
+   ┌───────────────────────┐
68
+   │    tidytable.parse    │ ──► Casts values safely into clean data types.
69
+   └───────────┬───────────┘
70
+               │
71
+               ▼
72
+   ┌───────────────────────┐
73
+   │   tidytable.missing   │ ──► Drops empty panels and applies imputation profiles.
74
+   └───────────┬───────────┘
75
+               │
76
+               ▼
77
+   ┌───────────────────────┐
78
+   │    tidytable.dedup    │ ──► Filters duplicate records safely.
79
+   └───────────┬───────────┘
80
+               │
81
+               ▼
82
+   ┌───────────────────────┐
83
+   │   tidytable.profile   │ ──► Validates constraints and generates audit ledger logs.
84
+   └───────────┬───────────┘
85
+               │
86
+               ▼
87
+    [ Pristine DataFrame ]
88
+
89
+ ```
90
+
91
+ ---
92
+
93
+ ## 📚 Complete Sub-Library Blueprint Reference
94
+
95
+ ### 1. `tidytable.xl` (The Excel Surgeon)
96
+
97
+ Extracts pristine data grids from visually stylized worksheets.
98
+
99
+ #### `xl.load_workbook(file_path: str) -> dict[str, pd.DataFrame]`
100
+
101
+ * **Use**: Loads an entire workbook into memory.
102
+ * **Arguments**: `file_path` (*str*): System destination path pointing to an Excel document.
103
+
104
+ #### `xl.unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame`
105
+
106
+ * **Use**: Flattens merged cells and fills empty fields down or across so rows stay linked.
107
+ * **Arguments**:
108
+ * `sheet_data` (*DataFrame*): The input sheet table matrix.
109
+ * `strategy` (*str*): Direction constraint rule. `"ffill"` (forward fill down) or `"lfill"` (lateral fill across).
110
+
111
+
112
+
113
+ #### `xl.sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]`
114
+
115
+ * **Use**: Skips title banners and KPI cards to find where the actual table headers start.
116
+ * **Arguments**: `scan_rows` (*int*): Search depth row index limit.
117
+
118
+ ```python
119
+ import tidytable as tt
120
+
121
+ sheets = tt.xl.load_workbook("sales_report.xlsx")
122
+ raw_data = sheets["Q2_Leads"]
123
+
124
+ # Sniff header index location and clean names
125
+ header_idx, headers = tt.xl.sniff_headers(raw_data, scan_rows=15)
126
+
127
+ # Unmerge and propagate values downward to form a database-style table
128
+ df = tt.xl.unmerge_and_fill(raw_data, strategy="ffill")
129
+
130
+ ```
131
+
132
+ ---
133
+
134
+ ### 2. `tidytable.structural` (The Text Blacksmith)
135
+
136
+ Cleans and standardizes column structures and text anomalies.
137
+
138
+ #### `structural.rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame`
139
+
140
+ * **Use**: Converts columns (like `"Gross Profit (%)"`) into clean variables (`"gross_profit"`).
141
+ * **Arguments**: `style` (*str*): Re-casing format rules. Default is `"snake_case"`.
142
+
143
+ #### `structural.strip_whitespace(series: pd.Series) -> pd.Series`
144
+
145
+ * **Use**: Deep-strips leading/trailing spaces, tab breaks, and hidden non-breaking spaces (`\xa0`).
146
+
147
+ #### `structural.standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series`
148
+
149
+ * **Use**: Groups manual typos and naming variations into a single target category name.
150
+ * **Arguments**:
151
+ * `mapping` (*dict*): Manual dictionary rules map (e.g., `{"USA": ["usa", "U.S.A.", "us"]}`).
152
+ * `auto_cluster` (*bool*): Uses Levenshtein Distance to merge variations automatically.
153
+
154
+
155
+
156
+ ```python
157
+ df = tt.structural.rename_columns(df, style="snake_case")
158
+ df["product_name"] = tt.structural.strip_whitespace(df["product_name"])
159
+
160
+ # Merge regional text typos automatically using string distance clustering
161
+ df["region"] = tt.structural.standardize_categories(df["region"], auto_cluster=True)
162
+
163
+ ```
164
+
165
+ ---
166
+
167
+ ### 3. `tidytable.parse` (The Type Whisperer)
168
+
169
+ Converts raw string text blocks into strict mathematical datatypes without crashing.
170
+
171
+ #### `parse.dates(series: pd.Series, dayfirst: bool = False) -> pd.Series`
172
+
173
+ * **Use**: Parses mixed date format variations in a single column into uniform ISO datetimes.
174
+
175
+ #### `parse.financials(series: pd.Series) -> pd.Series`
176
+
177
+ * **Use**: Extracts numeric values from accounting styles like `"$ (1,250.00)"` or `"12K"`.
178
+
179
+ #### `parse.repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series`
180
+
181
+ * **Use**: Restores dropped leading zeroes on data codes (e.g., converts float `401.0` back to `"00401"`).
182
+
183
+ ```python
184
+ df["invoice_date"] = tt.parse.dates(df["invoice_date"], dayfirst=False)
185
+ df["net_revenue"] = tt.parse.financials(df["net_revenue"])
186
+ df["zip_code"] = tt.parse.repair_identifiers(df["zip_code"], pad_length=5)
187
+ df["roi_metric"] = tt.parse.handle_formula_ghosts(df["roi_metric"], error_strategy="coerce")
188
+
189
+ ```
190
+
191
+ ---
192
+
193
+ ### 4. `tidytable.missing` (The Ghost Hunter)
194
+
195
+ Identifies and resolves gaps in data matrices.
196
+
197
+ #### `missing.drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame`
198
+
199
+ * **Use**: Drops column attributes where the missing values ratio exceeds the threshold boundary limit.
200
+
201
+ #### `missing.flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame`
202
+
203
+ * **Use**: Appends a binary companion indicator column (`{column}_is_missing`) to keep the data signal before running imputation.
204
+
205
+ #### `missing.impute(series: pd.Series, strategy: str = "median") -> pd.Series`
206
+
207
+ * **Use**: Fills null voids based on chosen statistical parameters (`"mean"`, `"median"`, `"mode"`).
208
+
209
+ ```python
210
+ df = tt.missing.drop_empty_cols(df, threshold=0.40)
211
+ df = tt.missing.flag_absence(df, columns=["customer_age"])
212
+ df["customer_age"] = tt.missing.impute(df["customer_age"], strategy="median")
213
+
214
+ ```
215
+
216
+ ---
217
+
218
+ ### 5. `tidytable.dedup` (The Twin Eliminator)
219
+
220
+ Detects and drops duplicate entries across your rows.
221
+
222
+ #### `dedup.absolute(df: pd.DataFrame) -> pd.DataFrame`
223
+
224
+ * **Use**: Drops rows only if they match exactly across every single field.
225
+
226
+ #### `dedup.partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame`
227
+
228
+ * **Use**: Resolves record updates by keeping the earliest or latest transaction entry for a unique key.
229
+
230
+ ```python
231
+ # Clear absolute identical rows
232
+ df = tt.dedup.absolute(df)
233
+
234
+ # For matching customer IDs, keep only the record with the most recent update timestamp
235
+ df = tt.dedup.partial(df, subset=["customer_id"], keep="latest", timestamp_col="updated_at")
236
+
237
+ ```
238
+
239
+ ---
240
+
241
+ ### 6. `tidytable.merge` (The VLOOKUP Bridge)
242
+
243
+ Joins separate files together even when keys are messy, incomplete, or slightly misspelled.
244
+
245
+ ```python
246
+ # Identify mismatched elements before executing joins
247
+ pre_flight = tt.merge.join_diagnose(left_df=leads_df, right_df=master_df, left_on="vendor", right_on="v_name")
248
+
249
+ # Join matching rows even if there are typos (e.g., matches "Apple Inc." to "Apple, Inc.")
250
+ joined_df = tt.merge.fuzzy_vlookup(leads_df, master_df, left_on="vendor", right_on="v_name", threshold=0.88)
251
+
252
+ ```
253
+
254
+ ---
255
+
256
+ ### 7. `tidytable.reconcile` (The Ledger Auditor)
257
+
258
+ Automates version control checks between separate instances of the same file structure.
259
+
260
+ ```python
261
+ # Generate structural audits comparing January data against February data
262
+ ledger_updates = tt.reconcile.sheet_diff(df_old=jan_df, df_new=feb_df, key_column="transaction_id")
263
+
264
+ print("New rows added this month:", len(ledger_updates["Added"]))
265
+ print("Row modifications captured:", len(ledger_updates["Modified"]))
266
+
267
+ ```
268
+
269
+ ---
270
+
271
+ ### 8. `tidytable.profile` (The Auditor & Schema Guard)
272
+
273
+ Handles file schema pinning, anomaly detection, and automated audit trails.
274
+
275
+ ```python
276
+ # Scan for raw strings masking null values (e.g., "?", "n/a", "-")
277
+ anomalies = tt.profile.check_anomalies(df)
278
+
279
+ # Validate current file structure against last month's blueprint to make sure scripts don't crash
280
+ if tt.profile.validate(df, schema_path="schemas/prod_blueprint.json"):
281
+ # Output file pipeline performance audit change log metrics
282
+ print(tt.profile.audit_report(df, output="cli"))
283
+
284
+ ```
285
+
286
+ ---
287
+
288
+ ## 🎯 Complete End-to-End Explicit Analyst Workflow
289
+
290
+ Here is a complete real-world script showing how an analyst runs a detailed cleaning pipeline manually:
291
+
292
+ ```python
293
+ import tidytable as tt
294
+ import pandas as pd
295
+
296
+ # Step 1: Layout Normalization
297
+ workbook = tt.xl.load_workbook("raw_factory_data.xlsx")
298
+ sheet_grid = workbook["Master_Log"]
299
+ df = tt.xl.unmerge_and_fill(sheet_grid, strategy="ffill")
300
+
301
+ # Step 2: Structural Column Cleaning
302
+ df = tt.structural.rename_columns(df, style="snake_case")
303
+ df["part_name"] = tt.structural.strip_whitespace(df["part_name"])
304
+
305
+ # Step 3: Type Safe Parsing
306
+ df["serial_id"] = tt.parse.repair_identifiers(df["serial_id"], pad_length=6)
307
+ df["cost"] = tt.parse.financials(df["cost"])
308
+ df["log_date"] = tt.parse.dates(df["log_date"])
309
+
310
+ # Step 4: Integrity and Row Refinement
311
+ df = tt.missing.flag_absence(df, columns=["efficiency_score"])
312
+ df["efficiency_score"] = tt.missing.impute(df["efficiency_score"], strategy="mean")
313
+ df = tt.dedup.absolute(df)
314
+
315
+ # Step 5: Verification & Schema Pinning
316
+ if tt.profile.validate(df, schema_path="schemas/factory_spec.json"):
317
+ df.to_csv("clean_factory_data.csv", index=False)
318
+ print(tt.profile.audit_report(df, output="cli"))
319
+
320
+ ```
321
+
322
+ ---
323
+
324
+ ## ⚖️ License
325
+
326
+ Distributed under the MIT License. See `LICENSE` for details.
327
+
328
+
329
+
330
+
@@ -0,0 +1,311 @@
1
+
2
+
3
+
4
+ # tidytable-core 🧹
5
+
6
+ An ecosystem-style, explicit data cleaning framework built for data analysts who bridge the gap between messy, human-formatted Excel/CSV spreadsheets and production-ready Python data structures.
7
+
8
+ Unlike "black-box" cleaning scripts that automatically change your underlying values, `tidytable` forces an **explicit pipeline paradigm**. Each transformation engine is completely decoupled, granting you absolute control and step-by-step data lineage tracking.
9
+
10
+ ---
11
+
12
+ ## 🚀 Installation & System Configuration
13
+
14
+ Install `tidytable-core` globally from the Python Package Index (PyPI):
15
+
16
+ ```bash
17
+ pip install tidytable-core
18
+
19
+ ```
20
+
21
+ ### Core External System Dependencies
22
+
23
+ * **`pandas`**: Core tabular dataframe manipulation engine.
24
+ * **`openpyxl`**: Engine for reading modern `.xlsx` workbooks.
25
+ * **`python-dateutil`**: Flexible parser for date and time strings in mixed formats.
26
+ * **`rapidfuzz`**: High-performance fuzzy string matching (Levenshtein distance and related similarity scores).
27
+
28
+ ---
29
+
30
+ ## 🏗️ Architectural Execution Pipeline
31
+
32
+ Data flows sequentially through your explicitly invoked processing domains:
33
+
34
+ ```
35
+  [ Messy Spreadsheet File ]
36
+               │
37
+               ▼
38
+   ┌───────────────────────┐
39
+   │     tidytable.xl      │ ──► Resolves unaligned human-formatted layout grids.
40
+   └───────────┬───────────┘
41
+               │ (Tabular Data Stream)
42
+               ▼
43
+   ┌───────────────────────┐
44
+   │ tidytable.structural  │ ──► Standardizes variable labels and strips whitespace.
45
+   └───────────┬───────────┘
46
+               │
47
+               ▼
48
+   ┌───────────────────────┐
49
+   │    tidytable.parse    │ ──► Casts values safely into clean data types.
50
+   └───────────┬───────────┘
51
+               │
52
+               ▼
53
+   ┌───────────────────────┐
54
+   │   tidytable.missing   │ ──► Drops empty panels and applies imputation profiles.
55
+   └───────────┬───────────┘
56
+               │
57
+               ▼
58
+   ┌───────────────────────┐
59
+   │    tidytable.dedup    │ ──► Filters duplicate records safely.
60
+   └───────────┬───────────┘
61
+               │
62
+               ▼
63
+   ┌───────────────────────┐
64
+   │   tidytable.profile   │ ──► Validates constraints and generates audit ledger logs.
65
+   └───────────┬───────────┘
66
+               │
67
+               ▼
68
+    [ Pristine DataFrame ]
69
+
70
+ ```
71
+
72
+ ---
73
+
74
+ ## 📚 Complete Sub-Library Blueprint Reference
75
+
76
+ ### 1. `tidytable.xl` (The Excel Surgeon)
77
+
78
+ Extracts pristine data grids from visually stylized worksheets.
79
+
80
+ #### `xl.load_workbook(file_path: str) -> dict[str, pd.DataFrame]`
81
+
82
+ * **Use**: Loads an entire workbook into memory.
83
+ * **Arguments**: `file_path` (*str*): System destination path pointing to an Excel document.
84
+
85
+ #### `xl.unmerge_and_fill(sheet_data: pd.DataFrame, strategy: str = "ffill") -> pd.DataFrame`
86
+
87
+ * **Use**: Flattens merged cells and fills empty fields down or across so rows stay linked.
88
+ * **Arguments**:
89
+ * `sheet_data` (*DataFrame*): The input sheet table matrix.
90
+ * `strategy` (*str*): Direction constraint rule. `"ffill"` (forward fill down) or `"lfill"` (lateral fill across).
91
+
92
+
93
+
94
+ #### `xl.sniff_headers(sheet_data: pd.DataFrame, scan_rows: int = 20) -> tuple[int, list[str]]`
95
+
96
+ * **Use**: Skips title banners and KPI cards to find where the actual table headers start.
97
+ * **Arguments**: `scan_rows` (*int*): Search depth row index limit.
98
+
99
+ ```python
100
+ import tidytable as tt
101
+
102
+ sheets = tt.xl.load_workbook("sales_report.xlsx")
103
+ raw_data = sheets["Q2_Leads"]
104
+
105
+ # Sniff header index location and clean names
106
+ header_idx, headers = tt.xl.sniff_headers(raw_data, scan_rows=15)
107
+
108
+ # Unmerge and propagate values downward to form a database-style table
109
+ df = tt.xl.unmerge_and_fill(raw_data, strategy="ffill")
110
+
111
+ ```
112
+
113
+ ---
114
+
115
+ ### 2. `tidytable.structural` (The Text Blacksmith)
116
+
117
+ Cleans and standardizes column structures and text anomalies.
118
+
119
+ #### `structural.rename_columns(df: pd.DataFrame, style: str = "snake_case") -> pd.DataFrame`
120
+
121
+ * **Use**: Converts columns (like `"Gross Profit (%)"`) into clean variables (`"gross_profit"`).
122
+ * **Arguments**: `style` (*str*): Re-casing format rules. Default is `"snake_case"`.
123
+
124
+ #### `structural.strip_whitespace(series: pd.Series) -> pd.Series`
125
+
126
+ * **Use**: Deep-strips leading/trailing spaces, tab breaks, and hidden non-breaking spaces (`\xa0`).
127
+
128
+ #### `structural.standardize_categories(series: pd.Series, mapping: dict = None, auto_cluster: bool = False) -> pd.Series`
129
+
130
+ * **Use**: Groups manual typos and naming variations into a single target category name.
131
+ * **Arguments**:
132
+ * `mapping` (*dict*): Manual dictionary rules map (e.g., `{"USA": ["usa", "U.S.A.", "us"]}`).
133
+ * `auto_cluster` (*bool*): Uses Levenshtein Distance to merge variations automatically.
134
+
135
+
136
+
137
+ ```python
138
+ df = tt.structural.rename_columns(df, style="snake_case")
139
+ df["product_name"] = tt.structural.strip_whitespace(df["product_name"])
140
+
141
+ # Merge regional text typos automatically using string distance clustering
142
+ df["region"] = tt.structural.standardize_categories(df["region"], auto_cluster=True)
143
+
144
+ ```
145
+
146
+ ---
147
+
148
+ ### 3. `tidytable.parse` (The Type Whisperer)
149
+
150
+ Converts raw string text blocks into strict mathematical datatypes without crashing.
151
+
152
+ #### `parse.dates(series: pd.Series, dayfirst: bool = False) -> pd.Series`
153
+
154
+ * **Use**: Parses mixed date format variations in a single column into uniform ISO datetimes.
155
+
156
+ #### `parse.financials(series: pd.Series) -> pd.Series`
157
+
158
+ * **Use**: Extracts numeric values from accounting styles like `"$ (1,250.00)"` or `"12K"`.
159
+
160
+ #### `parse.repair_identifiers(series: pd.Series, pad_length: int = None) -> pd.Series`
161
+
162
+ * **Use**: Restores dropped leading zeroes on data codes (e.g., converts float `401.0` back to `"00401"`).
163
+
164
+ ```python
165
+ df["invoice_date"] = tt.parse.dates(df["invoice_date"], dayfirst=False)
166
+ df["net_revenue"] = tt.parse.financials(df["net_revenue"])
167
+ df["zip_code"] = tt.parse.repair_identifiers(df["zip_code"], pad_length=5)
168
+ df["roi_metric"] = tt.parse.handle_formula_ghosts(df["roi_metric"], error_strategy="coerce")
169
+
170
+ ```
171
+
172
+ ---
173
+
174
+ ### 4. `tidytable.missing` (The Ghost Hunter)
175
+
176
+ Identifies and resolves gaps in data matrices.
177
+
178
+ #### `missing.drop_empty_cols(df: pd.DataFrame, threshold: float = 0.50) -> pd.DataFrame`
179
+
180
+ * **Use**: Drops column attributes where the missing values ratio exceeds the threshold boundary limit.
181
+
182
+ #### `missing.flag_absence(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame`
183
+
184
+ * **Use**: Appends a binary companion indicator column (`{column}_is_missing`) to keep the data signal before running imputation.
185
+
186
+ #### `missing.impute(series: pd.Series, strategy: str = "median") -> pd.Series`
187
+
188
+ * **Use**: Fills null voids based on chosen statistical parameters (`"mean"`, `"median"`, `"mode"`).
189
+
190
+ ```python
191
+ df = tt.missing.drop_empty_cols(df, threshold=0.40)
192
+ df = tt.missing.flag_absence(df, columns=["customer_age"])
193
+ df["customer_age"] = tt.missing.impute(df["customer_age"], strategy="median")
194
+
195
+ ```
196
+
197
+ ---
198
+
199
+ ### 5. `tidytable.dedup` (The Twin Eliminator)
200
+
201
+ Detects and drops duplicate entries across your rows.
202
+
203
+ #### `dedup.absolute(df: pd.DataFrame) -> pd.DataFrame`
204
+
205
+ * **Use**: Drops rows only if they match exactly across every single field.
206
+
207
+ #### `dedup.partial(df: pd.DataFrame, subset: list[str], keep: str = "latest", timestamp_col: str = None) -> pd.DataFrame`
208
+
209
+ * **Use**: Resolves record updates by keeping the earliest or latest transaction entry for a unique key.
210
+
211
+ ```python
212
+ # Clear absolute identical rows
213
+ df = tt.dedup.absolute(df)
214
+
215
+ # For matching customer IDs, keep only the record with the most recent update timestamp
216
+ df = tt.dedup.partial(df, subset=["customer_id"], keep="latest", timestamp_col="updated_at")
217
+
218
+ ```
219
+
220
+ ---
221
+
222
+ ### 6. `tidytable.merge` (The VLOOKUP Bridge)
223
+
224
+ Joins separate files together even when keys are messy, incomplete, or slightly misspelled.
225
+
226
+ ```python
227
+ # Identify mismatched elements before executing joins
228
+ pre_flight = tt.merge.join_diagnose(left_df=leads_df, right_df=master_df, left_on="vendor", right_on="v_name")
229
+
230
+ # Join matching rows even if there are typos (e.g., matches "Apple Inc." to "Apple, Inc.")
231
+ joined_df = tt.merge.fuzzy_vlookup(leads_df, master_df, left_on="vendor", right_on="v_name", threshold=0.88)
232
+
233
+ ```
234
+
235
+ ---
236
+
237
+ ### 7. `tidytable.reconcile` (The Ledger Auditor)
238
+
239
+ Automates version control checks between separate instances of the same file structure.
240
+
241
+ ```python
242
+ # Generate structural audits comparing January data against February data
243
+ ledger_updates = tt.reconcile.sheet_diff(df_old=jan_df, df_new=feb_df, key_column="transaction_id")
244
+
245
+ print("New rows added this month:", len(ledger_updates["Added"]))
246
+ print("Row modifications captured:", len(ledger_updates["Modified"]))
247
+
248
+ ```
249
+
250
+ ---
251
+
252
+ ### 8. `tidytable.profile` (The Auditor & Schema Guard)
253
+
254
+ Handles file schema pinning, anomaly detection, and automated audit trails.
255
+
256
+ ```python
257
+ # Scan for raw strings masking null values (e.g., "?", "n/a", "-")
258
+ anomalies = tt.profile.check_anomalies(df)
259
+
260
+ # Validate current file structure against last month's blueprint to make sure scripts don't crash
261
+ if tt.profile.validate(df, schema_path="schemas/prod_blueprint.json"):
262
+ # Output file pipeline performance audit change log metrics
263
+ print(tt.profile.audit_report(df, output="cli"))
264
+
265
+ ```
266
+
267
+ ---
268
+
269
+ ## 🎯 Complete End-to-End Explicit Analyst Workflow
270
+
271
+ Here is a complete real-world script showing how an analyst runs a detailed cleaning pipeline manually:
272
+
273
+ ```python
274
+ import tidytable as tt
275
+ import pandas as pd
276
+
277
+ # Step 1: Layout Normalization
278
+ workbook = tt.xl.load_workbook("raw_factory_data.xlsx")
279
+ sheet_grid = workbook["Master_Log"]
280
+ df = tt.xl.unmerge_and_fill(sheet_grid, strategy="ffill")
281
+
282
+ # Step 2: Structural Column Cleaning
283
+ df = tt.structural.rename_columns(df, style="snake_case")
284
+ df["part_name"] = tt.structural.strip_whitespace(df["part_name"])
285
+
286
+ # Step 3: Type Safe Parsing
287
+ df["serial_id"] = tt.parse.repair_identifiers(df["serial_id"], pad_length=6)
288
+ df["cost"] = tt.parse.financials(df["cost"])
289
+ df["log_date"] = tt.parse.dates(df["log_date"])
290
+
291
+ # Step 4: Integrity and Row Refinement
292
+ df = tt.missing.flag_absence(df, columns=["efficiency_score"])
293
+ df["efficiency_score"] = tt.missing.impute(df["efficiency_score"], strategy="mean")
294
+ df = tt.dedup.absolute(df)
295
+
296
+ # Step 5: Verification & Schema Pinning
297
+ if tt.profile.validate(df, schema_path="schemas/factory_spec.json"):
298
+ df.to_csv("clean_factory_data.csv", index=False)
299
+ print(tt.profile.audit_report(df, output="cli"))
300
+
301
+ ```
302
+
303
+ ---
304
+
305
+ ## ⚖️ License
306
+
307
+ Distributed under the MIT License. See `LICENSE` for details.
308
+
309
+
310
+
311
+
@@ -0,0 +1,31 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tidytable-core" # Ensure this name is unique on PyPI
7
+ version = "1.0.0"
8
+ authors = [
9
+ { name="Aayush Vijay", email="aayushvj8699@gmail.com" }
10
+ ]
11
+ description = "An ecosystem-style explicit data cleaning framework for Excel and CSV pipelines."
12
+ readme = "README.md"
13
+ requires-python = ">=3.9"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: MacOS :: MacOS X",
18
+ "Topic :: Scientific/Engineering :: Information Analysis"
19
+ ]
20
+ dependencies = [
21
+ "pandas>=2.0.0",
22
+ "openpyxl>=3.1.0",
23
+ "python-dateutil>=2.8.2",
24
+ "rapidfuzz>=3.0.0"
25
+ ]
26
+
27
+ [project.urls]
28
+ "Homepage" = "https://github.com/aayushvijay/tidytable"
29
+
30
+ [tool.setuptools.packages.find]
31
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,19 @@
1
+ from . import xl
2
+ from . import structural
3
+ from . import parse
4
+ from . import missing
5
+ from . import dedup
6
+ from . import merge
7
+ from . import reconcile
8
+ from . import profile
9
+
10
+ __all__ = [
11
+ "xl",
12
+ "structural",
13
+ "parse",
14
+ "missing",
15
+ "dedup",
16
+ "merge",
17
+ "reconcile",
18
+ "profile"
19
+ ]