deepcsv 0.6.0__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ # Changelog
2
+
3
+ ---
4
+
5
+ ### Added
6
+
7
+ - `process_all_files` — Added option for the user to customize the output folder name via the `output_dir` parameter
8
+ - `read_any()` — Reads any supported file format and returns a pandas DataFrame automatically. Supports: `.csv`, `.txt`, `.tsv`, `.xls`, `.xlsx`, `.json`, `.parquet`, `.pkl`, `.feather`, `.db`, `.sqlite`
9
+ - `clean_values()` — Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index. Supports optional condition filtering with 6 operators
10
+ - `_validate_cols()` — Internal helper: validates cols is a non-empty list and all columns exist in the DataFrame
11
+ - `_validate_index()` — Internal helper: validates index is a non-empty list and all indexes exist in the DataFrame. Supports optional `reset_index` before validation
12
+ - `_validate_condition()` — Internal helper: validates condition list and returns `(operator_func, value)`
13
+ - `_parse_operator()` — Internal helper: converts operator string like `'>='` into its Python operator function
14
+ - `finding_value` parameter in `clean_values(data_input, finding_value)` — finds and removes rows that contain this specific value
15
+ - `finding_type` parameter in `clean_values(data_input, finding_type)` — finds and removes rows whose values are of this specific Python type (ex: `str`, `int`)
16
+ - `condition` parameter in `clean_values(data_input, condition=[operator, value])` — ex: `['>=', 500]`; applies only in combination with `finding_value` or `finding_type`
17
+
18
+ ---
19
+
20
+ ### Changed
21
+
22
+ - `process_file()` — Added `save_file_extension` parameter. Now supports saving the processed DataFrame in any format after conversion, not just returning it
23
+ - `process_all_files()` — Added `file_extension` parameter. Now supports saving converted files in any format instead of always saving as Parquet. Also expanded supported input formats beyond `.csv` and `.xlsx` to cover all formats supported by `read_any()`
24
+
25
+ ---
deepcsv-0.6.2/PKG-INFO ADDED
@@ -0,0 +1,250 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepcsv
3
+ Version: 0.6.2
4
+ Summary: Automatically processes data files in directories, converts array-like strings to NumPy arrays, detects and fixes data type issues, and saves results as optimized Parquet files and MORE!
5
+ Home-page: https://github.com/abdubakr77/deepcsv
6
+ Author: Abdullah Bakr
7
+ Author-email: abdubakora1232@gmail.com
8
+ License: MIT
9
+ Project-URL: Source, https://github.com/abdubakr77/deepcsv
10
+ Project-URL: Tracker, https://github.com/abdubakr77/deepcsv/issues
11
+ Keywords: data-processing pandas numpy etl data-cleaning file-conversion automation parquet bulk-conversion
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
14
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Requires-Python: >=3.7
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: pandas
21
+ Requires-Dist: pyarrow
22
+ Requires-Dist: requests
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: keywords
30
+ Dynamic: license
31
+ Dynamic: license-file
32
+ Dynamic: project-url
33
+ Dynamic: requires-dist
34
+ Dynamic: requires-python
35
+ Dynamic: summary
36
+
37
+ # deepcsv
38
+ > *"You think you saved a list. You open it tomorrow — and it's a string."*
39
+
40
+ `deepcsv` was built to solve exactly this problem.
41
+
42
+ ---
43
+
44
+ ## The Problem
45
+
46
+ Your CSV files are lying to you.
47
+
48
+ You save a list — you open it tomorrow and it's a string.
49
+ Your column has numbers — it secretly has 3 different data types.
50
+ You have 200 CSV files across 40 folders — and you process them one by one.
51
+ You load a file and spend 20 minutes just picking the right reader.
52
+ You have nulls scattered everywhere with no clean way to handle them.
53
+
54
+ This is the silent killer of every data pipeline.
55
+
56
+ ---
57
+
58
+ ## The Solution
59
+
60
+ `deepcsv` handles all of this in one import.
61
+
62
+ - Walks through every folder and subfolder automatically
63
+ - Finds every CSV and XLSX file
64
+ - Detects columns storing lists as strings and converts them to real NumPy arrays
65
+ - Catches mixed-type columns and fixes them automatically
66
+ - Saves everything in any format you choose — not just Parquet
67
+ - Reads any file format with one function — no more picking the right reader
68
+ - Cleans nulls with full control over columns, rows, indexes, values, and types
69
+
70
+ ---
71
+
72
+ ## Installation
73
+
74
+ ```bash
75
+ pip install deepcsv
76
+ ```
77
+
78
+ ---
79
+
80
+ ## Functions
81
+
82
+ ### `process_file(data_input, save_file_extension: str = None)`
83
+
84
+ Reads a file or DataFrame, converts array-like strings to NumPy arrays, fixes mixed-type columns, and optionally saves the result in any format you choose.
85
+
86
+ ```python
87
+ import deepcsv
88
+
89
+ # Process only
90
+ df = deepcsv.process_file('path/to/file.csv')
91
+
92
+ # Process and save as parquet
93
+ df = deepcsv.process_file('path/to/file.csv', save_file_extension='parquet')
94
+
95
+ # Process and save as Excel
96
+ df = deepcsv.process_file('path/to/file.csv', save_file_extension='xlsx')
97
+ ```
98
+
99
+ **Supported save formats:** `.csv` `.tsv` `.txt` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.html` `.xml`
100
+
101
+ ---
102
+
103
+ ### `process_all_files(directory_path, output_dir="All CSV Files is Converted Here", file_extension="parquet")`
104
+
105
+ Walks through all folders and subfolders, applies `process_file` on every supported file, and saves results in the format you choose.
106
+
107
+ ```python
108
+ import deepcsv
109
+
110
+ # Default — saves as parquet
111
+ deepcsv.process_all_files('path/to/folder')
112
+
113
+ # Custom output folder
114
+ deepcsv.process_all_files('path/to/folder', output_dir='Converted Files')
115
+
116
+ # Save as CSV instead
117
+ deepcsv.process_all_files('path/to/folder', file_extension='csv')
118
+ ```
119
+
120
+ **Supported input formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
121
+
122
+ ---
123
+
124
+ ### `read_any(file_path)` ✨
125
+
126
+ Reads any supported file format and returns a pandas DataFrame — one function for everything.
127
+
128
+ ```python
129
+ from deepcsv import read_any
130
+
131
+ df = read_any('data/users.csv')
132
+ df = read_any('reports/sales.xlsx')
133
+ df = read_any('warehouse/orders.parquet')
134
+ df = read_any('local.db')
135
+ ```
136
+
137
+ **Supported formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
138
+
139
+ ---
140
+
141
+ ### `clean_values(data_input, ...)` ✨
142
+
143
+ Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index — with full control over which columns to target and optional conditions.
144
+
145
+ ```python
146
+ from deepcsv import clean_values
147
+
148
+ # Drop fully-null columns
149
+ df = clean_values('data.csv', cols=['age', 'salary'])
150
+
151
+ # Drop rows that have nulls in specific cols
152
+ df = clean_values('data.csv', cols=['age', 'salary'], ax_0=True)
153
+
154
+ # Drop rows by index
155
+ df = clean_values(df, index=[0, 5, 12])
156
+
157
+ # Remove rows where a specific value exists
158
+ df = clean_values(df, cols=['status'], finding_value='N/A')
159
+
160
+ # Remove rows where value meets a condition
161
+ df = clean_values(df, cols=['score'], finding_value='N/A', condition=['>=', 500])
162
+
163
+ # Remove rows by Python type
164
+ df = clean_values(df, cols=['age'], finding_type=str)
165
+
166
+ # Apply on all columns except some
167
+ df = clean_values('data.csv', all_cols_except=['id', 'name'])
168
+ ```
169
+
170
+ | Parameter | Type | Default | Description |
171
+ |---|---|---|---|
172
+ | `data_input` | `str \| DataFrame` | required | File path or DataFrame |
173
+ | `cols` | `list` | `None` | Columns to apply on |
174
+ | `ax_0` | `bool` | `False` | `True`: drop rows with nulls — `False`: drop fully-null cols |
175
+ | `index` | `list` | `None` | Row indexes to drop |
176
+ | `condition` | `list` | `None` | `[operator, value]` — ex: `['>=', 500]` |
177
+ | `all_cols_except` | `list` | `None` | Apply on all columns except these |
178
+ | `finding_value` | `any` | `None` | Find and remove rows containing this value |
179
+ | `finding_type` | `type` | `None` | Find and remove rows matching this Python type |
180
+
181
+ **Supported condition operators:** `>=` `<=` `>` `<` `==` `!=`
182
+
183
+ ---
184
+
185
+ ## Function Signatures
186
+
187
+ ```python
188
+ process_file(data_input: Union[str, pd.DataFrame], save_file_extension: str = None) -> pd.DataFrame
189
+ process_all_files(directory_path: str, output_dir: str = "All CSV Files is Converted Here", file_extension: str = "parquet") -> None
190
+ read_any(file_path: str) -> pd.DataFrame
191
+ clean_values(data_input, cols=None, ax_0=False, index=None, condition=None, all_cols_except=None, finding_value=None, finding_type=None) -> pd.DataFrame
192
+ ```
193
+
194
+ ---
195
+
196
+ ## Key Features
197
+
198
+ - String list → real NumPy array conversion (fast, no manual parsing)
199
+ - Mixed-type column detection and auto-fix
200
+ - Save in any format — CSV, Excel, JSON, Parquet, Feather, and more
201
+ - One universal file reader for 10+ formats
202
+ - Flexible null cleaning by column, row, index, value, or type
203
+ - Conditional filtering with 6 operators
204
+ - Recursive directory traversal
205
+ - Warning messages for full transparency
206
+
207
+ ---
208
+
209
+ ## Notes
210
+
211
+ - Requires `pyarrow` for Parquet and Feather support
212
+ - Only saves files in `process_all_files` if the DataFrame contains converted array columns
213
+
214
+ ---
215
+
216
+ ## Requirements
217
+
218
+ - Python >= 3.7
219
+ - pandas
220
+ - pyarrow
221
+
222
+ ---
223
+
224
+ **By: Abdullah Bakr**
225
+
226
+ # Changelog
227
+
228
+ ---
229
+
230
+ ### Added
231
+
232
+ - `process_all_files` — Added option for the user to customize the output folder name via the `output_dir` parameter
233
+ - `read_any()` — Reads any supported file format and returns a pandas DataFrame automatically. Supports: `.csv`, `.txt`, `.tsv`, `.xls`, `.xlsx`, `.json`, `.parquet`, `.pkl`, `.feather`, `.db`, `.sqlite`
234
+ - `clean_values()` — Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index. Supports optional condition filtering with 6 operators
235
+ - `_validate_cols()` — Internal helper: validates cols is a non-empty list and all columns exist in the DataFrame
236
+ - `_validate_index()` — Internal helper: validates index is a non-empty list and all indexes exist in the DataFrame. Supports optional `reset_index` before validation
237
+ - `_validate_condition()` — Internal helper: validates condition list and returns `(operator_func, value)`
238
+ - `_parse_operator()` — Internal helper: converts operator string like `'>='` into its Python operator function
239
+ - `finding_value` parameter in `clean_values(data_input, finding_value)` — finds and removes rows that contain this specific value
240
+ - `finding_type` parameter in `clean_values(data_input, finding_type)` — finds and removes rows whose values are of this specific Python type (ex: `str`, `int`)
241
+ - `condition` parameter in `clean_values(data_input, condition=[operator, value])` — ex: `['>=', 500]`; applies only in combination with `finding_value` or `finding_type`
242
+
243
+ ---
244
+
245
+ ### Changed
246
+
247
+ - `process_file()` — Added `save_file_extension` parameter. Now supports saving the processed DataFrame in any format after conversion, not just returning it
248
+ - `process_all_files()` — Added `file_extension` parameter. Now supports saving converted files in any format instead of always saving as Parquet. Also expanded supported input formats beyond `.csv` and `.xlsx` to cover all formats supported by `read_any()`
249
+
250
+ ---
@@ -0,0 +1,188 @@
1
+ # deepcsv
2
+ > *"You think you saved a list. You open it tomorrow — and it's a string."*
3
+
4
+ `deepcsv` was built to solve exactly this problem.
5
+
6
+ ---
7
+
8
+ ## The Problem
9
+
10
+ Your CSV files are lying to you.
11
+
12
+ You save a list — you open it tomorrow and it's a string.
13
+ Your column has numbers — it secretly has 3 different data types.
14
+ You have 200 CSV files across 40 folders — and you process them one by one.
15
+ You load a file and spend 20 minutes just picking the right reader.
16
+ You have nulls scattered everywhere with no clean way to handle them.
17
+
18
+ This is the silent killer of every data pipeline.
19
+
20
+ ---
21
+
22
+ ## The Solution
23
+
24
+ `deepcsv` handles all of this in one import.
25
+
26
+ - Walks through every folder and subfolder automatically
27
+ - Finds every CSV and XLSX file
28
+ - Detects columns storing lists as strings and converts them to real NumPy arrays
29
+ - Catches mixed-type columns and fixes them automatically
30
+ - Saves everything in any format you choose — not just Parquet
31
+ - Reads any file format with one function — no more picking the right reader
32
+ - Cleans nulls with full control over columns, rows, indexes, values, and types
33
+
34
+ ---
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install deepcsv
40
+ ```
41
+
42
+ ---
43
+
44
+ ## Functions
45
+
46
+ ### `process_file(data_input, save_file_extension: str = None)`
47
+
48
+ Reads a file or DataFrame, converts array-like strings to NumPy arrays, fixes mixed-type columns, and optionally saves the result in any format you choose.
49
+
50
+ ```python
51
+ import deepcsv
52
+
53
+ # Process only
54
+ df = deepcsv.process_file('path/to/file.csv')
55
+
56
+ # Process and save as parquet
57
+ df = deepcsv.process_file('path/to/file.csv', save_file_extension='parquet')
58
+
59
+ # Process and save as Excel
60
+ df = deepcsv.process_file('path/to/file.csv', save_file_extension='xlsx')
61
+ ```
62
+
63
+ **Supported save formats:** `.csv` `.tsv` `.txt` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.html` `.xml`
64
+
65
+ ---
66
+
67
+ ### `process_all_files(directory_path, output_dir="All CSV Files is Converted Here", file_extension="parquet")`
68
+
69
+ Walks through all folders and subfolders, applies `process_file` on every supported file, and saves results in the format you choose.
70
+
71
+ ```python
72
+ import deepcsv
73
+
74
+ # Default — saves as parquet
75
+ deepcsv.process_all_files('path/to/folder')
76
+
77
+ # Custom output folder
78
+ deepcsv.process_all_files('path/to/folder', output_dir='Converted Files')
79
+
80
+ # Save as CSV instead
81
+ deepcsv.process_all_files('path/to/folder', file_extension='csv')
82
+ ```
83
+
84
+ **Supported input formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
85
+
86
+ ---
87
+
88
+ ### `read_any(file_path)` ✨
89
+
90
+ Reads any supported file format and returns a pandas DataFrame — one function for everything.
91
+
92
+ ```python
93
+ from deepcsv import read_any
94
+
95
+ df = read_any('data/users.csv')
96
+ df = read_any('reports/sales.xlsx')
97
+ df = read_any('warehouse/orders.parquet')
98
+ df = read_any('local.db')
99
+ ```
100
+
101
+ **Supported formats:** `.csv` `.txt` `.tsv` `.xls` `.xlsx` `.json` `.parquet` `.pkl` `.feather` `.db` `.sqlite`
102
+
103
+ ---
104
+
105
+ ### `clean_values(data_input, ...)` ✨
106
+
107
+ Cleans a DataFrame by removing nulls, specific values, specific types, or rows by index — with full control over which columns to target and optional conditions.
108
+
109
+ ```python
110
+ from deepcsv import clean_values
111
+
112
+ # Drop fully-null columns
113
+ df = clean_values('data.csv', cols=['age', 'salary'])
114
+
115
+ # Drop rows that have nulls in specific cols
116
+ df = clean_values('data.csv', cols=['age', 'salary'], ax_0=True)
117
+
118
+ # Drop rows by index
119
+ df = clean_values(df, index=[0, 5, 12])
120
+
121
+ # Remove rows where a specific value exists
122
+ df = clean_values(df, cols=['status'], finding_value='N/A')
123
+
124
+ # Remove rows where value meets a condition
125
+ df = clean_values(df, cols=['score'], finding_value='N/A', condition=['>=', 500])
126
+
127
+ # Remove rows by Python type
128
+ df = clean_values(df, cols=['age'], finding_type=str)
129
+
130
+ # Apply on all columns except some
131
+ df = clean_values('data.csv', all_cols_except=['id', 'name'])
132
+ ```
133
+
134
+ | Parameter | Type | Default | Description |
135
+ |---|---|---|---|
136
+ | `data_input` | `str \| DataFrame` | required | File path or DataFrame |
137
+ | `cols` | `list` | `None` | Columns to apply on |
138
+ | `ax_0` | `bool` | `False` | `True`: drop rows with nulls — `False`: drop fully-null cols |
139
+ | `index` | `list` | `None` | Row indexes to drop |
140
+ | `condition` | `list` | `None` | `[operator, value]` — ex: `['>=', 500]` |
141
+ | `all_cols_except` | `list` | `None` | Apply on all columns except these |
142
+ | `finding_value` | `any` | `None` | Find and remove rows containing this value |
143
+ | `finding_type` | `type` | `None` | Find and remove rows matching this Python type |
144
+
145
+ **Supported condition operators:** `>=` `<=` `>` `<` `==` `!=`
146
+
147
+ ---
148
+
149
+ ## Function Signatures
150
+
151
+ ```python
152
+ process_file(data_input: Union[str, pd.DataFrame], save_file_extension: str = None) -> pd.DataFrame
153
+ process_all_files(directory_path: str, output_dir: str = "All CSV Files is Converted Here", file_extension: str = "parquet") -> None
154
+ read_any(file_path: str) -> pd.DataFrame
155
+ clean_values(data_input, cols=None, ax_0=False, index=None, condition=None, all_cols_except=None, finding_value=None, finding_type=None) -> pd.DataFrame
156
+ ```
157
+
158
+ ---
159
+
160
+ ## Key Features
161
+
162
+ - String list → real NumPy array conversion (fast, no manual parsing)
163
+ - Mixed-type column detection and auto-fix
164
+ - Save in any format — CSV, Excel, JSON, Parquet, Feather, and more
165
+ - One universal file reader for 10+ formats
166
+ - Flexible null cleaning by column, row, index, value, or type
167
+ - Conditional filtering with 6 operators
168
+ - Recursive directory traversal
169
+ - Warning messages for full transparency
170
+
171
+ ---
172
+
173
+ ## Notes
174
+
175
+ - Requires `pyarrow` for Parquet and Feather support
176
+ - Only saves files in `process_all_files` if the DataFrame contains converted array columns
177
+
178
+ ---
179
+
180
+ ## Requirements
181
+
182
+ - Python >= 3.7
183
+ - pandas
184
+ - pyarrow
185
+
186
+ ---
187
+
188
+ **By: Abdullah Bakr**
@@ -1,16 +1,15 @@
1
1
  import pyarrow
2
2
  import pandas as pd
3
- from .utils import read_any, clean_values, _validate_cols, _validate_index
4
- from typing import Union, Optional
3
+ from .utils import read_any, clean_values, _validate_cols, _validate_index,_parse_operator,_validate_condition,_save_as
4
+ from typing import Union
5
5
  from ast import literal_eval
6
6
  from numpy import nan,array
7
7
  from os import listdir,makedirs
8
8
  from os.path import join,relpath,dirname,isfile,isdir
9
9
  from warnings import filterwarnings
10
- from typing import Union
11
10
  filterwarnings("ignore")
12
11
 
13
- def process_file(data_input: Union[str, pd.DataFrame]) -> pd.DataFrame:
12
+ def process_file(data_input: Union[str, pd.DataFrame] , save_file_extension=str) -> pd.DataFrame:
14
13
  """
15
14
  Parses string representations of lists in DataFrame columns to actual NumPy arrays.
16
15
 
@@ -58,11 +57,13 @@ def process_file(data_input: Union[str, pd.DataFrame]) -> pd.DataFrame:
58
57
 
59
58
  data[f"{ColName.capitalize()}List"] = data[ColName].apply(lambda x : array(literal_eval(x)) if pd.notna(x) else nan)
60
59
  data.drop(ColName,inplace=True,axis=1)
61
-
60
+
61
+ if save_file_extension.strip().lower() in ['csv','txt','tsv','xls','xlsx','json','parquet','pkl','feather','db','sqlite']:
62
+ _save_as(data=data,ext=save_file_extension)
62
63
  return data
63
64
 
64
65
 
65
- def process_all_files(directory_path: str, output_dir="All CSV Files is Converted Here") -> None:
66
+ def process_all_files(directory_path: str, output_dir="All CSV Files is Converted Here",file_extension= "parquet") -> None:
66
67
  """
67
68
  Recursively processes all CSV and XLSX files in a directory,
68
69
  converts array strings to NumPy arrays, and saves as Parquet files.
@@ -99,7 +100,7 @@ def process_all_files(directory_path: str, output_dir="All CSV Files is Converte
99
100
 
100
101
  Sub_Item_Path = join(Curr_Path,item_name)
101
102
 
102
- if isfile(Sub_Item_Path) and (Sub_Item_Path.endswith(".csv") or Sub_Item_Path.endswith(".xlsx")):
103
+ if isfile(Sub_Item_Path) and (Sub_Item_Path.split(".")[-1].strip().lower() in ['csv','txt','tsv','xls','xlsx','json','parquet','pkl','feather','db','sqlite']):
103
104
 
104
105
  print(f"{Sub_Item_Path} File Is Processing Now...")
105
106
 
@@ -111,10 +112,9 @@ def process_all_files(directory_path: str, output_dir="All CSV Files is Converte
111
112
  if "List" in df_converted.columns[-1]:
112
113
  print(Sub_Item_Path)
113
114
  makedirs(dirname(output),exist_ok=True)
114
- df_converted.to_parquet(output.replace(".csv", ".parquet"))
115
-
116
- print(f"Done!")
117
- print("-"*50)
115
+ _save_as(data=df_converted,
116
+ current_dir=output.replace(f".{Sub_Item_Path.split(".")[-1].strip().lower()}", f".{file_extension}"),
117
+ ext=file_extension)
118
118
 
119
119
  elif isdir(Sub_Item_Path):
120
120