deepcsv 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepcsv-0.6.2/CHANGELOG.md +25 -0
- deepcsv-0.6.2/PKG-INFO +250 -0
- deepcsv-0.6.2/README.md +188 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/deepcsv/deepcsv.py +11 -11
- deepcsv-0.6.2/deepcsv/utils.py +302 -0
- deepcsv-0.6.2/deepcsv.egg-info/PKG-INFO +250 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/setup.py +2 -2
- deepcsv-0.6.0/CHANGELOG.md +0 -10
- deepcsv-0.6.0/PKG-INFO +0 -192
- deepcsv-0.6.0/README.md +0 -145
- deepcsv-0.6.0/deepcsv/utils.py +0 -159
- deepcsv-0.6.0/deepcsv.egg-info/PKG-INFO +0 -192
- {deepcsv-0.6.0 → deepcsv-0.6.2}/LICENSE +0 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/MANIFEST.in +0 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/deepcsv/__init__.py +0 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/deepcsv.egg-info/SOURCES.txt +0 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/deepcsv.egg-info/dependency_links.txt +0 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/deepcsv.egg-info/requires.txt +0 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/deepcsv.egg-info/top_level.txt +0 -0
- {deepcsv-0.6.0 → deepcsv-0.6.2}/setup.cfg +0 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- `process_all_files` — Added option for user to customize the output folder name via the `output_dir` parameter
|
|
8
|
+
- `read_any()` — Reads any supported file format and returns a pandas DataFrame automatically. Supports: `.csv`, `.txt`, `.tsv`, `.xls`, `.xlsx`, `.json`, `.parquet`, `.pkl`, `.feather`, `.db`, `.sqlite`
|
|
9
|
+
- `clean_values()` — Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index. Supports optional condition filtering with 6 operators
|
|
10
|
+
- `_validate_cols()` — Internal helper: validates cols is a non-empty list and all columns exist in the DataFrame
|
|
11
|
+
- `_validate_index()` — Internal helper: validates index is a non-empty list and all indexes exist in the DataFrame. Supports optional `reset_index` before validation
|
|
12
|
+
- `_validate_condition()` — Internal helper: validates condition list and returns `(operator_func, value)`
|
|
13
|
+
- `_parse_operator()` — Internal helper: converts operator string like `'>='` into its Python operator function
|
|
14
|
+
- `finding_value` parameter in `clean_values(data_input, finding_value)` — finds and removes rows that contain this specific value
|
|
15
|
+
- `finding_type` parameter in `clean_values(data_input, finding_type)` — finds and removes rows whose values are of this specific Python type (ex: `str`, `int`)
|
|
16
|
+
- `condition` parameter in `clean_values(data_input, condition=[operator, value])` — ex: `['>=', 500]`; applied only together with `finding_value` or `finding_type`
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
|
|
22
|
+
- `process_file()` — Added `save_file_extension` parameter. Now supports saving the processed DataFrame in any format after conversion, not just returning it
|
|
23
|
+
- `process_all_files()` — Added `file_extension` parameter. Now supports saving converted files in any format instead of always saving as Parquet. Also expanded supported input formats beyond `.csv` and `.xlsx` to cover all formats supported by `read_any()`
|
|
24
|
+
|
|
25
|
+
---
|
deepcsv-0.6.2/PKG-INFO
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deepcsv
|
|
3
|
+
Version: 0.6.2
|
|
4
|
+
Summary: Automatically processes data files in directories, converts array-like strings to NumPy arrays, detects and fixes data type issues, and saves results as optimized Parquet files and MORE!
|
|
5
|
+
Home-page: https://github.com/abdubakr77/deepcsv
|
|
6
|
+
Author: Abdullah Bakr
|
|
7
|
+
Author-email: abdubakora1232@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Source, https://github.com/abdubakr77/deepcsv
|
|
10
|
+
Project-URL: Tracker, https://github.com/abdubakr77/deepcsv/issues
|
|
11
|
+
Keywords: data-processing pandas numpy etl data-cleaning file-conversion automation parquet bulk-conversion
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Requires-Python: >=3.7
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: pandas
|
|
21
|
+
Requires-Dist: pyarrow
|
|
22
|
+
Requires-Dist: requests
|
|
23
|
+
Dynamic: author
|
|
24
|
+
Dynamic: author-email
|
|
25
|
+
Dynamic: classifier
|
|
26
|
+
Dynamic: description
|
|
27
|
+
Dynamic: description-content-type
|
|
28
|
+
Dynamic: home-page
|
|
29
|
+
Dynamic: keywords
|
|
30
|
+
Dynamic: license
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
Dynamic: project-url
|
|
33
|
+
Dynamic: requires-dist
|
|
34
|
+
Dynamic: requires-python
|
|
35
|
+
Dynamic: summary
|
|
36
|
+
|
|
37
|
+
# deepcsv
|
|
38
|
+
> *"You think you saved a list. You open it tomorrow — and it's a string."*
|
|
39
|
+
|
|
40
|
+
`deepcsv` was built to solve exactly this problem.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## The Problem
|
|
45
|
+
|
|
46
|
+
Your CSV files are lying to you.
|
|
47
|
+
|
|
48
|
+
You save a list — you open it tomorrow and it's a string.
|
|
49
|
+
Your column has numbers — it secretly has 3 different data types.
|
|
50
|
+
You have 200 CSV files across 40 folders — and you process them one by one.
|
|
51
|
+
You load a file and spend 20 minutes just picking the right reader.
|
|
52
|
+
You have nulls scattered everywhere with no clean way to handle them.
|
|
53
|
+
|
|
54
|
+
This is the silent killer of every data pipeline.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## The Solution
|
|
59
|
+
|
|
60
|
+
`deepcsv` handles all of this in one import.
|
|
61
|
+
|
|
62
|
+
- Walks through every folder and subfolder automatically
|
|
63
|
+
- Finds every CSV and XLSX file
|
|
64
|
+
- Detects columns storing lists as strings and converts them to real NumPy arrays
|
|
65
|
+
- Catches mixed-type columns and fixes them automatically
|
|
66
|
+
- Saves everything in any format you choose — not just Parquet
|
|
67
|
+
- Reads any file format with one function — no more picking the right reader
|
|
68
|
+
- Cleans nulls with full control over columns, rows, indexes, values, and types
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Installation
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install deepcsv
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Functions
|
|
81
|
+
|
|
82
|
+
### `process_file(data_input, save_file_extension=None)`
|
|
83
|
+
|
|
84
|
+
Reads a file or DataFrame, converts array-like strings to NumPy arrays, fixes mixed-type columns, and optionally saves the result in any format you choose.
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
import deepcsv
|
|
88
|
+
|
|
89
|
+
# Process only
|
|
90
|
+
df = deepcsv.process_file('path/to/file.csv')
|
|
91
|
+
|
|
92
|
+
# Process and save as parquet
|
|
93
|
+
df = deepcsv.process_file('path/to/file.csv', save_file_extension='parquet')
|
|
94
|
+
|
|
95
|
+
# Process and save as Excel
|
|
96
|
+
df = deepcsv.process_file('path/to/file.csv', save_file_extension='xlsx')
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Supported save formats:** `.csv` `.tsv` `.txt` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.html` `.xml`
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
### `process_all_files(directory_path, output_dir="All CSV Files is Converted Here", file_extension="parquet")`
|
|
104
|
+
|
|
105
|
+
Walks through all folders and subfolders, applies `process_file` on every supported file, and saves results in the format you choose.
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
import deepcsv
|
|
109
|
+
|
|
110
|
+
# Default — saves as parquet
|
|
111
|
+
deepcsv.process_all_files('path/to/folder')
|
|
112
|
+
|
|
113
|
+
# Custom output folder
|
|
114
|
+
deepcsv.process_all_files('path/to/folder', output_dir='Converted Files')
|
|
115
|
+
|
|
116
|
+
# Save as CSV instead
|
|
117
|
+
deepcsv.process_all_files('path/to/folder', file_extension='csv')
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Supported input formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
### `read_any(file_path)` ✨
|
|
125
|
+
|
|
126
|
+
Reads any supported file format and returns a pandas DataFrame — one function for everything.
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from deepcsv import read_any
|
|
130
|
+
|
|
131
|
+
df = read_any('data/users.csv')
|
|
132
|
+
df = read_any('reports/sales.xlsx')
|
|
133
|
+
df = read_any('warehouse/orders.parquet')
|
|
134
|
+
df = read_any('local.db')
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**Supported formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
### `clean_values(data_input, ...)` ✨
|
|
142
|
+
|
|
143
|
+
Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index — with full control over which columns to target and optional conditions.
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from deepcsv import clean_values
|
|
147
|
+
|
|
148
|
+
# Drop fully-null columns
|
|
149
|
+
df = clean_values('data.csv', cols=['age', 'salary'])
|
|
150
|
+
|
|
151
|
+
# Drop rows that have nulls in specific cols
|
|
152
|
+
df = clean_values('data.csv', cols=['age', 'salary'], ax_0=True)
|
|
153
|
+
|
|
154
|
+
# Drop rows by index
|
|
155
|
+
df = clean_values(df, index=[0, 5, 12])
|
|
156
|
+
|
|
157
|
+
# Remove rows where a specific value exists
|
|
158
|
+
df = clean_values(df, cols=['status'], finding_value='N/A')
|
|
159
|
+
|
|
160
|
+
# Remove rows where value meets a condition
|
|
161
|
+
df = clean_values(df, cols=['score'], finding_value='N/A', condition=['>=', 500])
|
|
162
|
+
|
|
163
|
+
# Remove rows by Python type
|
|
164
|
+
df = clean_values(df, cols=['age'], finding_type=str)
|
|
165
|
+
|
|
166
|
+
# Apply on all columns except some
|
|
167
|
+
df = clean_values('data.csv', all_cols_except=['id', 'name'])
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
| Parameter | Type | Default | Description |
|
|
171
|
+
|---|---|---|---|
|
|
172
|
+
| `data_input` | `str \| DataFrame` | required | File path or DataFrame |
|
|
173
|
+
| `cols` | `list` | `None` | Columns to apply on |
|
|
174
|
+
| `ax_0` | `bool` | `False` | `True`: drop rows with nulls — `False`: drop fully-null cols |
|
|
175
|
+
| `index` | `list` | `None` | Row indexes to drop |
|
|
176
|
+
| `condition` | `list` | `None` | `[operator, value]` — ex: `['>=', 500]` |
|
|
177
|
+
| `all_cols_except` | `list` | `None` | Apply on all columns except these |
|
|
178
|
+
| `finding_value` | `any` | `None` | Find and remove rows containing this value |
|
|
179
|
+
| `finding_type` | `type` | `None` | Find and remove rows matching this Python type |
|
|
180
|
+
|
|
181
|
+
**Supported condition operators:** `>=` `<=` `>` `<` `==` `!=`
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Function Signatures
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
process_file(data_input: Union[str, pd.DataFrame], save_file_extension: str = None) -> pd.DataFrame
|
|
189
|
+
process_all_files(directory_path: str, output_dir: str = "All CSV Files is Converted Here", file_extension: str = "parquet") -> None
|
|
190
|
+
read_any(file_path: str) -> pd.DataFrame
|
|
191
|
+
clean_values(data_input, cols=None, ax_0=False, index=None, condition=None, all_cols_except=None, finding_value=None, finding_type=None) -> pd.DataFrame
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Key Features
|
|
197
|
+
|
|
198
|
+
- String list → real NumPy array conversion (fast, no manual parsing)
|
|
199
|
+
- Mixed-type column detection and auto-fix
|
|
200
|
+
- Save in any format — CSV, Excel, JSON, Parquet, Feather, and more
|
|
201
|
+
- One universal file reader for 10+ formats
|
|
202
|
+
- Flexible null cleaning by column, row, index, value, or type
|
|
203
|
+
- Conditional filtering with 6 operators
|
|
204
|
+
- Recursive directory traversal
|
|
205
|
+
- Warning messages for full transparency
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Notes
|
|
210
|
+
|
|
211
|
+
- Requires `pyarrow` for Parquet and Feather support
|
|
212
|
+
- Only saves files in `process_all_files` if the DataFrame contains converted array columns
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## Requirements
|
|
217
|
+
|
|
218
|
+
- Python >= 3.7
|
|
219
|
+
- pandas
|
|
220
|
+
- pyarrow
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
**By: Abdullah Bakr**
|
|
225
|
+
|
|
226
|
+
# Changelog
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
### Added
|
|
231
|
+
|
|
232
|
+
- `process_all_files` — Added option for user to customize the output folder name via the `output_dir` parameter
|
|
233
|
+
- `read_any()` — Reads any supported file format and returns a pandas DataFrame automatically. Supports: `.csv`, `.txt`, `.tsv`, `.xls`, `.xlsx`, `.json`, `.parquet`, `.pkl`, `.feather`, `.db`, `.sqlite`
|
|
234
|
+
- `clean_values()` — Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index. Supports optional condition filtering with 6 operators
|
|
235
|
+
- `_validate_cols()` — Internal helper: validates cols is a non-empty list and all columns exist in the DataFrame
|
|
236
|
+
- `_validate_index()` — Internal helper: validates index is a non-empty list and all indexes exist in the DataFrame. Supports optional `reset_index` before validation
|
|
237
|
+
- `_validate_condition()` — Internal helper: validates condition list and returns `(operator_func, value)`
|
|
238
|
+
- `_parse_operator()` — Internal helper: converts operator string like `'>='` into its Python operator function
|
|
239
|
+
- `finding_value` parameter in `clean_values(data_input, finding_value)` — finds and removes rows that contain this specific value
|
|
240
|
+
- `finding_type` parameter in `clean_values(data_input, finding_type)` — finds and removes rows whose values are of this specific Python type (ex: `str`, `int`)
|
|
241
|
+
- `condition` parameter in `clean_values(data_input, condition=[operator, value])` — ex: `['>=', 500]`; applied only together with `finding_value` or `finding_type`
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
### Changed
|
|
246
|
+
|
|
247
|
+
- `process_file()` — Added `save_file_extension` parameter. Now supports saving the processed DataFrame in any format after conversion, not just returning it
|
|
248
|
+
- `process_all_files()` — Added `file_extension` parameter. Now supports saving converted files in any format instead of always saving as Parquet. Also expanded supported input formats beyond `.csv` and `.xlsx` to cover all formats supported by `read_any()`
|
|
249
|
+
|
|
250
|
+
---
|
deepcsv-0.6.2/README.md
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# deepcsv
|
|
2
|
+
> *"You think you saved a list. You open it tomorrow — and it's a string."*
|
|
3
|
+
|
|
4
|
+
`deepcsv` was built to solve exactly this problem.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## The Problem
|
|
9
|
+
|
|
10
|
+
Your CSV files are lying to you.
|
|
11
|
+
|
|
12
|
+
You save a list — you open it tomorrow and it's a string.
|
|
13
|
+
Your column has numbers — it secretly has 3 different data types.
|
|
14
|
+
You have 200 CSV files across 40 folders — and you process them one by one.
|
|
15
|
+
You load a file and spend 20 minutes just picking the right reader.
|
|
16
|
+
You have nulls scattered everywhere with no clean way to handle them.
|
|
17
|
+
|
|
18
|
+
This is the silent killer of every data pipeline.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## The Solution
|
|
23
|
+
|
|
24
|
+
`deepcsv` handles all of this in one import.
|
|
25
|
+
|
|
26
|
+
- Walks through every folder and subfolder automatically
|
|
27
|
+
- Finds every CSV and XLSX file
|
|
28
|
+
- Detects columns storing lists as strings and converts them to real NumPy arrays
|
|
29
|
+
- Catches mixed-type columns and fixes them automatically
|
|
30
|
+
- Saves everything in any format you choose — not just Parquet
|
|
31
|
+
- Reads any file format with one function — no more picking the right reader
|
|
32
|
+
- Cleans nulls with full control over columns, rows, indexes, values, and types
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install deepcsv
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Functions
|
|
45
|
+
|
|
46
|
+
### `process_file(data_input, save_file_extension=None)`
|
|
47
|
+
|
|
48
|
+
Reads a file or DataFrame, converts array-like strings to NumPy arrays, fixes mixed-type columns, and optionally saves the result in any format you choose.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
import deepcsv
|
|
52
|
+
|
|
53
|
+
# Process only
|
|
54
|
+
df = deepcsv.process_file('path/to/file.csv')
|
|
55
|
+
|
|
56
|
+
# Process and save as parquet
|
|
57
|
+
df = deepcsv.process_file('path/to/file.csv', save_file_extension='parquet')
|
|
58
|
+
|
|
59
|
+
# Process and save as Excel
|
|
60
|
+
df = deepcsv.process_file('path/to/file.csv', save_file_extension='xlsx')
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Supported save formats:** `.csv` `.tsv` `.txt` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.html` `.xml`
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
### `process_all_files(directory_path, output_dir="All CSV Files is Converted Here", file_extension="parquet")`
|
|
68
|
+
|
|
69
|
+
Walks through all folders and subfolders, applies `process_file` on every supported file, and saves results in the format you choose.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import deepcsv
|
|
73
|
+
|
|
74
|
+
# Default — saves as parquet
|
|
75
|
+
deepcsv.process_all_files('path/to/folder')
|
|
76
|
+
|
|
77
|
+
# Custom output folder
|
|
78
|
+
deepcsv.process_all_files('path/to/folder', output_dir='Converted Files')
|
|
79
|
+
|
|
80
|
+
# Save as CSV instead
|
|
81
|
+
deepcsv.process_all_files('path/to/folder', file_extension='csv')
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Supported input formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
### `read_any(file_path)` ✨
|
|
89
|
+
|
|
90
|
+
Reads any supported file format and returns a pandas DataFrame — one function for everything.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from deepcsv import read_any
|
|
94
|
+
|
|
95
|
+
df = read_any('data/users.csv')
|
|
96
|
+
df = read_any('reports/sales.xlsx')
|
|
97
|
+
df = read_any('warehouse/orders.parquet')
|
|
98
|
+
df = read_any('local.db')
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Supported formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
### `clean_values(data_input, ...)` ✨
|
|
106
|
+
|
|
107
|
+
Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index — with full control over which columns to target and optional conditions.
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from deepcsv import clean_values
|
|
111
|
+
|
|
112
|
+
# Drop fully-null columns
|
|
113
|
+
df = clean_values('data.csv', cols=['age', 'salary'])
|
|
114
|
+
|
|
115
|
+
# Drop rows that have nulls in specific cols
|
|
116
|
+
df = clean_values('data.csv', cols=['age', 'salary'], ax_0=True)
|
|
117
|
+
|
|
118
|
+
# Drop rows by index
|
|
119
|
+
df = clean_values(df, index=[0, 5, 12])
|
|
120
|
+
|
|
121
|
+
# Remove rows where a specific value exists
|
|
122
|
+
df = clean_values(df, cols=['status'], finding_value='N/A')
|
|
123
|
+
|
|
124
|
+
# Remove rows where value meets a condition
|
|
125
|
+
df = clean_values(df, cols=['score'], finding_value='N/A', condition=['>=', 500])
|
|
126
|
+
|
|
127
|
+
# Remove rows by Python type
|
|
128
|
+
df = clean_values(df, cols=['age'], finding_type=str)
|
|
129
|
+
|
|
130
|
+
# Apply on all columns except some
|
|
131
|
+
df = clean_values('data.csv', all_cols_except=['id', 'name'])
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
| Parameter | Type | Default | Description |
|
|
135
|
+
|---|---|---|---|
|
|
136
|
+
| `data_input` | `str \| DataFrame` | required | File path or DataFrame |
|
|
137
|
+
| `cols` | `list` | `None` | Columns to apply on |
|
|
138
|
+
| `ax_0` | `bool` | `False` | `True`: drop rows with nulls — `False`: drop fully-null cols |
|
|
139
|
+
| `index` | `list` | `None` | Row indexes to drop |
|
|
140
|
+
| `condition` | `list` | `None` | `[operator, value]` — ex: `['>=', 500]` |
|
|
141
|
+
| `all_cols_except` | `list` | `None` | Apply on all columns except these |
|
|
142
|
+
| `finding_value` | `any` | `None` | Find and remove rows containing this value |
|
|
143
|
+
| `finding_type` | `type` | `None` | Find and remove rows matching this Python type |
|
|
144
|
+
|
|
145
|
+
**Supported condition operators:** `>=` `<=` `>` `<` `==` `!=`
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Function Signatures
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
process_file(data_input: Union[str, pd.DataFrame], save_file_extension: str = None) -> pd.DataFrame
|
|
153
|
+
process_all_files(directory_path: str, output_dir: str = "All CSV Files is Converted Here", file_extension: str = "parquet") -> None
|
|
154
|
+
read_any(file_path: str) -> pd.DataFrame
|
|
155
|
+
clean_values(data_input, cols=None, ax_0=False, index=None, condition=None, all_cols_except=None, finding_value=None, finding_type=None) -> pd.DataFrame
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Key Features
|
|
161
|
+
|
|
162
|
+
- String list → real NumPy array conversion (fast, no manual parsing)
|
|
163
|
+
- Mixed-type column detection and auto-fix
|
|
164
|
+
- Save in any format — CSV, Excel, JSON, Parquet, Feather, and more
|
|
165
|
+
- One universal file reader for 10+ formats
|
|
166
|
+
- Flexible null cleaning by column, row, index, value, or type
|
|
167
|
+
- Conditional filtering with 6 operators
|
|
168
|
+
- Recursive directory traversal
|
|
169
|
+
- Warning messages for full transparency
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Notes
|
|
174
|
+
|
|
175
|
+
- Requires `pyarrow` for Parquet and Feather support
|
|
176
|
+
- Only saves files in `process_all_files` if the DataFrame contains converted array columns
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Requirements
|
|
181
|
+
|
|
182
|
+
- Python >= 3.7
|
|
183
|
+
- pandas
|
|
184
|
+
- pyarrow
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
**By: Abdullah Bakr**
|
|
@@ -1,16 +1,15 @@
|
|
|
1
1
|
import pyarrow
|
|
2
2
|
import pandas as pd
|
|
3
|
-
from .utils import read_any, clean_values, _validate_cols, _validate_index
|
|
4
|
-
from typing import Union
|
|
3
|
+
from .utils import read_any, clean_values, _validate_cols, _validate_index,_parse_operator,_validate_condition,_save_as
|
|
4
|
+
from typing import Union
|
|
5
5
|
from ast import literal_eval
|
|
6
6
|
from numpy import nan,array
|
|
7
7
|
from os import listdir,makedirs
|
|
8
8
|
from os.path import join,relpath,dirname,isfile,isdir
|
|
9
9
|
from warnings import filterwarnings
|
|
10
|
-
from typing import Union
|
|
11
10
|
filterwarnings("ignore")
|
|
12
11
|
|
|
13
|
-
def process_file(data_input: Union[str, pd.DataFrame]) -> pd.DataFrame:
|
|
12
|
+
def process_file(data_input: Union[str, pd.DataFrame] , save_file_extension=str) -> pd.DataFrame:
|
|
14
13
|
"""
|
|
15
14
|
Parses string representations of lists in DataFrame columns to actual NumPy arrays.
|
|
16
15
|
|
|
@@ -58,11 +57,13 @@ def process_file(data_input: Union[str, pd.DataFrame]) -> pd.DataFrame:
|
|
|
58
57
|
|
|
59
58
|
data[f"{ColName.capitalize()}List"] = data[ColName].apply(lambda x : array(literal_eval(x)) if pd.notna(x) else nan)
|
|
60
59
|
data.drop(ColName,inplace=True,axis=1)
|
|
61
|
-
|
|
60
|
+
|
|
61
|
+
if save_file_extension.strip().lower() in ['csv','txt','tsv','xls','xlsx','json','parquet','pkl','feather','db','sqlite']:
|
|
62
|
+
_save_as(data=data,ext=save_file_extension)
|
|
62
63
|
return data
|
|
63
64
|
|
|
64
65
|
|
|
65
|
-
def process_all_files(directory_path: str, output_dir="All CSV Files is Converted Here") -> None:
|
|
66
|
+
def process_all_files(directory_path: str, output_dir="All CSV Files is Converted Here",file_extension= "parquet") -> None:
|
|
66
67
|
"""
|
|
67
68
|
Recursively processes all CSV and XLSX files in a directory,
|
|
68
69
|
converts array strings to NumPy arrays, and saves as Parquet files.
|
|
@@ -99,7 +100,7 @@ def process_all_files(directory_path: str, output_dir="All CSV Files is Converte
|
|
|
99
100
|
|
|
100
101
|
Sub_Item_Path = join(Curr_Path,item_name)
|
|
101
102
|
|
|
102
|
-
if isfile(Sub_Item_Path) and (Sub_Item_Path.
|
|
103
|
+
if isfile(Sub_Item_Path) and (Sub_Item_Path.split(".")[-1].strip().lower() in ['csv','txt','tsv','xls','xlsx','json','parquet','pkl','feather','db','sqlite']):
|
|
103
104
|
|
|
104
105
|
print(f"{Sub_Item_Path} File Is Processing Now...")
|
|
105
106
|
|
|
@@ -111,10 +112,9 @@ def process_all_files(directory_path: str, output_dir="All CSV Files is Converte
|
|
|
111
112
|
if "List" in df_converted.columns[-1]:
|
|
112
113
|
print(Sub_Item_Path)
|
|
113
114
|
makedirs(dirname(output),exist_ok=True)
|
|
114
|
-
df_converted
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
print("-"*50)
|
|
115
|
+
_save_as(data=df_converted,
|
|
116
|
+
current_dir=output.replace(f".{Sub_Item_Path.split(".")[-1].strip().lower()}", f".{file_extension}"),
|
|
117
|
+
ext=file_extension)
|
|
118
118
|
|
|
119
119
|
elif isdir(Sub_Item_Path):
|
|
120
120
|
|