deepcsv 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepcsv-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepcsv
3
+ Version: 0.3.0
4
+ Summary: Automatically walks through folders and subfolders, finds all CSV and XLSX files, detects and fixes data issues, and saves the results as Parquet files while keeping the exact same folder structure.
5
+ Author: Abdullah Bakr
6
+ Requires-Python: >=3.7
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: pandas
10
+ Requires-Dist: pyarrow
11
+ Dynamic: author
12
+ Dynamic: description
13
+ Dynamic: description-content-type
14
+ Dynamic: license-file
15
+ Dynamic: requires-dist
16
+ Dynamic: requires-python
17
+ Dynamic: summary
18
+
19
+ # deepcsv
20
+
21
+ A Python library that automatically walks through folders and subfolders, finds all CSV and XLSX files, detects and fixes data issues, and saves the results as Parquet files while keeping the exact same folder structure.
22
+
23
+ ## Installation
24
+ ```bash
25
+ pip install deepcsv
26
+ ```
27
+
28
+ ## What it does
29
+
30
+ - Walks through all folders and subfolders automatically
31
+ - Finds every CSV and XLSX file
32
+ - Detects columns that contain list strings like `"['item1', 'item2']"` and converts them into real Python arrays for faster performance
33
+ - Detects columns with mixed data types and tries to fix them automatically
34
+ - Warns you when a column has mixed types so you know what was changed
35
+ - Saves the results as Parquet files to preserve the converted data types
36
+
37
+ > **Why Parquet?**
38
+ > CSV files cannot store arrays or preserve data types. Parquet solves this by keeping the exact types after conversion.
39
+
40
+ > **Why arrays instead of Python lists?**
41
+ > Arrays are significantly faster for numerical operations and machine learning workflows.
42
+
43
+ ## Functions
44
+
45
+ ### `ConvertListStrToList(file_path)`
46
+
47
+ Reads a CSV file, converts list strings to arrays, fixes mixed-type columns, and returns a clean DataFrame.
48
+ ```python
49
+ import deepcsv
50
+
51
+ df = deepcsv.ConvertListStrToList("path/to/file.csv")
52
+ ```
53
+
54
+ ### `ReadAllCSVData(path)`
55
+
56
+ Walks through all folders and subfolders, applies `ConvertListStrToList` on every CSV and XLSX file, and saves the results as Parquet files in a new folder called `All CSV Data is Converted Here`.
57
+ ```python
58
+ import deepcsv
59
+
60
+ deepcsv.ReadAllCSVData("path/to/folder")
61
+ ```
62
+
63
+ ## Notes
64
+
65
+ - Only files that contain list string columns are saved as Parquet
66
+ - Mixed-type columns are converted to float automatically when possible
67
+ - Skips NaN values without breaking
68
+ - Requires `pyarrow` for Parquet support
69
+
70
+ ## Requirements
71
+
72
+ - Python >= 3.7
73
+ - pandas
74
+ - pyarrow
@@ -0,0 +1,56 @@
1
+ # deepcsv
2
+
3
+ A Python library that automatically walks through folders and subfolders, finds all CSV and XLSX files, detects and fixes data issues, and saves the results as Parquet files while keeping the exact same folder structure.
4
+
5
+ ## Installation
6
+ ```bash
7
+ pip install deepcsv
8
+ ```
9
+
10
+ ## What it does
11
+
12
+ - Walks through all folders and subfolders automatically
13
+ - Finds every CSV and XLSX file
14
+ - Detects columns that contain list strings like `"['item1', 'item2']"` and converts them into real Python arrays for faster performance
15
+ - Detects columns with mixed data types and tries to fix them automatically
16
+ - Warns you when a column has mixed types so you know what was changed
17
+ - Saves the results as Parquet files to preserve the converted data types
18
+
19
+ > **Why Parquet?**
20
+ > CSV files cannot store arrays or preserve data types. Parquet solves this by keeping the exact types after conversion.
21
+
22
+ > **Why arrays instead of Python lists?**
23
+ > Arrays are significantly faster for numerical operations and machine learning workflows.
24
+
25
+ ## Functions
26
+
27
+ ### `ConvertListStrToList(File_Path)`
28
+
29
+ Reads a CSV file, converts list strings to arrays, fixes mixed-type columns, and returns a clean DataFrame.
30
+ ```python
31
+ import deepcsv
32
+
33
+ df = deepcsv.ConvertListStrToList("path/to/file.csv")
34
+ ```
35
+
36
+ ### `ReadAllCSVData(WorkDirectoryPath)`
37
+
38
+ Walks through all folders and subfolders, applies `ConvertListStrToList` on every CSV and XLSX file, and saves the results as Parquet files in a new folder called `All CSV Data is Converted Here`.
39
+ ```python
40
+ import deepcsv
41
+
42
+ deepcsv.ReadAllCSVData("path/to/folder")
43
+ ```
44
+
45
+ ## Notes
46
+
47
+ - Only files that contain list string columns are saved as Parquet
48
+ - Mixed-type columns are converted to float automatically when possible
49
+ - Skips NaN values without breaking
50
+ - Requires `pyarrow` for Parquet support
51
+
52
+ ## Requirements
53
+
54
+ - Python >= 3.7
55
+ - pandas
56
+ - pyarrow
@@ -0,0 +1,76 @@
1
+ import pyarrow
2
+ import os
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ from ast import literal_eval
7
+ warnings.filterwarnings("ignore")
8
+
9
def ConvertListStrToList(File_Path):
    """Load a CSV/XLSX file and clean list-string and mixed-type columns.

    For every column of the loaded table:

    * If its first non-missing value is a string starting with ``[``, each
      non-missing cell is parsed with ``ast.literal_eval``. The result is
      stored in a new column named ``f"{ColName.capitalize()}List"``
      (appended at the end) and the original column is dropped. Missing
      cells become ``np.nan``. If any cell cannot be parsed, the column is
      left untouched.
    * Otherwise, if the column holds two or more Python types and its first
      string value starts with a digit, a warning is printed and the column
      is converted with ``pd.to_numeric(..., errors="coerce")`` (values that
      are not numeric become NaN).

    Args:
        File_Path: Path to a ``.csv`` file, or to an ``.xlsx`` workbook.

    Returns:
        pandas.DataFrame: The loaded table with the conversions applied.

    Raises:
        Whatever ``pd.read_csv`` / ``pd.read_excel`` raise for unreadable
        input. Reading ``.xlsx`` needs an Excel engine such as ``openpyxl``
        to be installed; pandas raises ImportError otherwise.
    """
    path_text = str(File_Path)

    # Excel workbooks are binary archives; pd.read_csv cannot parse them.
    if path_text.lower().endswith(".xlsx"):
        data = pd.read_excel(File_Path)
    else:
        data = pd.read_csv(File_Path)

    # Computed outside the f-string: nesting the same quote type inside an
    # f-string is a SyntaxError before Python 3.12, and basename handles the
    # platform's path separators instead of only "\\".
    file_name = os.path.basename(path_text)

    # Iterate over a snapshot because columns are added/dropped in the loop.
    for ColName in list(data.columns):
        column = data[ColName]

        # Inspect the first *present* value so a leading NaN (or a
        # header-only file) neither hides a list column nor raises.
        present = column.dropna()
        if present.empty:
            continue
        first_value = present.iloc[0]

        # List detection is independent of the mixed-type check: a list
        # column containing NaN holds both str and float values.
        if isinstance(first_value, str) and first_value.strip().startswith("["):
            try:
                parsed = column.apply(
                    lambda x: literal_eval(x) if pd.notna(x) else np.nan
                )
            except (ValueError, SyntaxError):
                # Bracketed text that is not a Python literal: keep as-is.
                continue
            data[f"{str(ColName).capitalize()}List"] = parsed
            data.drop(ColName, inplace=True, axis=1)
            continue

        value_types = column.apply(type).unique()
        if len(value_types) < 2:
            continue

        strings = column[column.apply(lambda v: isinstance(v, str))]
        if strings.empty:
            continue

        # Only columns whose first string looks numeric are coerced.
        if not strings.iloc[0].strip()[:1].isnumeric():
            continue

        print(f"WARNING:\nThis Dataset Name ({file_name}) Found {len(value_types)} Mixed DataType in a column called ({ColName})\nPath : {File_Path}")
        print(f"System : This column have These types: {value_types}")
        print("System : Trying to fix the column as a Float to be have only one datatype...")

        data[ColName] = pd.to_numeric(column, errors="coerce")
        print("System : Done!")

    return data
36
+
37
+
38
def ReadAllCSVData(WorkDirectoryPath):
    """Convert every CSV/XLSX file under a folder tree to Parquet.

    Walks ``WorkDirectoryPath`` breadth-first, runs ``ConvertListStrToList``
    on each ``.csv`` / ``.xlsx`` file, and writes the result as a Parquet
    file under ``<WorkDirectoryPath>/All CSV Data is Converted Here``,
    mirroring the source file's relative folder. A file is written only
    when the last column name of the converted frame contains ``"List"``,
    i.e. when a list-string column was converted and appended.

    Args:
        WorkDirectoryPath: Root folder to scan.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Function-scope import so this block needs no new file-level import.
    from collections import deque

    base_output = os.path.join(WorkDirectoryPath, "All CSV Data is Converted Here")
    os.makedirs(base_output, exist_ok=True)
    output_root = os.path.abspath(base_output)

    all_folders = deque([WorkDirectoryPath])
    while all_folders:
        Curr_Path = all_folders.popleft()

        for item_name in os.listdir(Curr_Path):
            Sub_Item_Path = os.path.join(Curr_Path, item_name)

            if os.path.isdir(Sub_Item_Path):
                # The output folder lives inside the scanned tree; never
                # descend into it or previous results get re-processed.
                if os.path.abspath(Sub_Item_Path) != output_root:
                    all_folders.append(Sub_Item_Path)
                continue

            if not os.path.isfile(Sub_Item_Path):
                continue
            if not item_name.lower().endswith((".csv", ".xlsx")):
                continue

            df_converted = ConvertListStrToList(Sub_Item_Path)
            df_converted.reset_index(drop=True, inplace=True)

            # Converted list columns are appended last; guard against a
            # frame with no columns or a non-string column label.
            columns = df_converted.columns
            if len(columns) == 0 or "List" not in str(columns[-1]):
                continue

            rel_path = os.path.relpath(Sub_Item_Path, WorkDirectoryPath)
            # Swap only the final extension. str.replace(".csv", ...) would
            # also rewrite ".csv" inside folder names and would leave
            # ".xlsx" outputs with the wrong extension.
            # NOTE(review): "a.csv" and "a.xlsx" in the same folder now map
            # to the same "a.parquet" - confirm this is acceptable.
            output = os.path.splitext(os.path.join(base_output, rel_path))[0] + ".parquet"

            print(Sub_Item_Path)
            os.makedirs(os.path.dirname(output), exist_ok=True)
            df_converted.to_parquet(output)

            print("-" * 50)
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepcsv
3
+ Version: 0.3.0
4
+ Summary: Automatically walks through folders and subfolders, finds all CSV and XLSX files, detects and fixes data issues, and saves the results as Parquet files while keeping the exact same folder structure.
5
+ Author: Abdullah Bakr
6
+ Requires-Python: >=3.7
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: pandas
10
+ Requires-Dist: pyarrow
11
+ Dynamic: author
12
+ Dynamic: description
13
+ Dynamic: description-content-type
14
+ Dynamic: license-file
15
+ Dynamic: requires-dist
16
+ Dynamic: requires-python
17
+ Dynamic: summary
18
+
19
+ # deepcsv
20
+
21
+ A Python library that automatically walks through folders and subfolders, finds all CSV and XLSX files, detects and fixes data issues, and saves the results as Parquet files while keeping the exact same folder structure.
22
+
23
+ ## Installation
24
+ ```bash
25
+ pip install deepcsv
26
+ ```
27
+
28
+ ## What it does
29
+
30
+ - Walks through all folders and subfolders automatically
31
+ - Finds every CSV and XLSX file
32
+ - Detects columns that contain list strings like `"['item1', 'item2']"` and converts them into real Python arrays for faster performance
33
+ - Detects columns with mixed data types and tries to fix them automatically
34
+ - Warns you when a column has mixed types so you know what was changed
35
+ - Saves the results as Parquet files to preserve the converted data types
36
+
37
+ > **Why Parquet?**
38
+ > CSV files cannot store arrays or preserve data types. Parquet solves this by keeping the exact types after conversion.
39
+
40
+ > **Why arrays instead of Python lists?**
41
+ > Arrays are significantly faster for numerical operations and machine learning workflows.
42
+
43
+ ## Functions
44
+
45
+ ### `ConvertListStrToList(file_path)`
46
+
47
+ Reads a CSV file, converts list strings to arrays, fixes mixed-type columns, and returns a clean DataFrame.
48
+ ```python
49
+ import deepcsv
50
+
51
+ df = deepcsv.ConvertListStrToList("path/to/file.csv")
52
+ ```
53
+
54
+ ### `ReadAllCSVData(path)`
55
+
56
+ Walks through all folders and subfolders, applies `ConvertListStrToList` on every CSV and XLSX file, and saves the results as Parquet files in a new folder called `All CSV Data is Converted Here`.
57
+ ```python
58
+ import deepcsv
59
+
60
+ deepcsv.ReadAllCSVData("path/to/folder")
61
+ ```
62
+
63
+ ## Notes
64
+
65
+ - Only files that contain list string columns are saved as Parquet
66
+ - Mixed-type columns are converted to float automatically when possible
67
+ - Skips NaN values without breaking
68
+ - Requires `pyarrow` for Parquet support
69
+
70
+ ## Requirements
71
+
72
+ - Python >= 3.7
73
+ - pandas
74
+ - pyarrow
@@ -1,4 +1,5 @@
1
1
  LICENSE
2
+ README.md
2
3
  setup.py
3
4
  deepcsv/__init__.py
4
5
  deepcsv/deepcsv.py
@@ -0,0 +1,2 @@
1
+ pandas
2
+ pyarrow
deepcsv-0.3.0/setup.py ADDED
@@ -0,0 +1,13 @@
1
from pathlib import Path

from setuptools import setup, find_packages

# Resolve README.md next to this file so the build does not depend on the
# current working directory, and read it as UTF-8 so the result does not
# depend on the platform's default encoding. read_text also closes the file.
_README_PATH = Path(__file__).resolve().parent / "README.md"

setup(
    name="deepcsv",
    version="0.3.0",
    author="Abdullah Bakr",
    description="Automatically walks through folders and subfolders, finds all CSV and XLSX files, detects and fixes data issues, and saves the results as Parquet files while keeping the exact same folder structure.",
    long_description=_README_PATH.read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    packages=find_packages(),
    install_requires=["pandas", "pyarrow"],
    python_requires=">=3.7",
)
deepcsv-0.1.0/PKG-INFO DELETED
@@ -1,13 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: deepcsv
3
- Version: 0.1.0
4
- Summary: Automatically walks folders and converts list strings to lists in CSV/XLSX files
5
- Author: Abdullah Bakr
6
- Requires-Python: >=3.7
7
- License-File: LICENSE
8
- Requires-Dist: pandas
9
- Dynamic: author
10
- Dynamic: license-file
11
- Dynamic: requires-dist
12
- Dynamic: requires-python
13
- Dynamic: summary
@@ -1,49 +0,0 @@
1
- import os
2
- import pandas as pd
3
- from ast import literal_eval
4
-
5
- def ConvertListStrToList(data):
6
- Data_Col = data.columns
7
- for ColName in Data_Col:
8
- if type(data[ColName][0]) == str and data[ColName][0].startswith("["):
9
- data[f"{ColName.capitalize()}List"] = data[ColName].apply(lambda x : literal_eval(x) if pd.notna(x) else x )
10
- data.drop(ColName,inplace=True,axis=1)
11
-
12
- return data
13
-
14
-
15
- def ReadAllCSVData(WorkDirectoryPath):
16
-
17
- base_output = os.path.join(WorkDirectoryPath, "All CSV Data is Converted Here")
18
- all_folders = [WorkDirectoryPath]
19
-
20
- os.makedirs(base_output,exist_ok=True)
21
-
22
- while True:
23
-
24
- if all_folders:
25
-
26
- Curr_Path = all_folders.pop(0)
27
-
28
- for item_name in os.listdir(Curr_Path):
29
-
30
- Sub_Item_Path = os.path.join(Curr_Path,item_name)
31
-
32
- if os.path.isfile(Sub_Item_Path) and (Sub_Item_Path.endswith(".csv") or Sub_Item_Path.endswith(".xlsx")):
33
- print(Sub_Item_Path)
34
- df = pd.read_csv(Sub_Item_Path)
35
- df_converted = ConvertListStrToList(df)
36
-
37
- rel_path = os.path.relpath(Sub_Item_Path, WorkDirectoryPath)
38
- output = os.path.join(base_output,rel_path)
39
- os.makedirs(os.path.dirname(output),exist_ok=True)
40
-
41
- df_converted.to_csv(output)
42
-
43
-
44
- elif os.path.isdir(Sub_Item_Path):
45
-
46
- all_folders.append(Sub_Item_Path)
47
-
48
- else:
49
- break
@@ -1,13 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: deepcsv
3
- Version: 0.1.0
4
- Summary: Automatically walks folders and converts list strings to lists in CSV/XLSX files
5
- Author: Abdullah Bakr
6
- Requires-Python: >=3.7
7
- License-File: LICENSE
8
- Requires-Dist: pandas
9
- Dynamic: author
10
- Dynamic: license-file
11
- Dynamic: requires-dist
12
- Dynamic: requires-python
13
- Dynamic: summary
@@ -1 +0,0 @@
1
- pandas
deepcsv-0.1.0/setup.py DELETED
@@ -1,11 +0,0 @@
1
- from setuptools import setup, find_packages
2
-
3
- setup(
4
- name="deepcsv",
5
- version="0.1.0",
6
- author="Abdullah Bakr",
7
- description="Automatically walks folders and converts list strings to lists in CSV/XLSX files",
8
- packages=find_packages(),
9
- install_requires=["pandas"],
10
- python_requires=">=3.7",
11
- )
File without changes
File without changes
File without changes