advanced-excel 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- advanced_excel/__init__.py +19 -0
- advanced_excel/blocks.py +171 -0
- advanced_excel/cleaning.py +420 -0
- advanced_excel/columns.py +444 -0
- advanced_excel/core.py +87 -0
- advanced_excel/io.py +167 -0
- advanced_excel/lookup.py +229 -0
- advanced_excel/reshape.py +133 -0
- advanced_excel/rows.py +519 -0
- advanced_excel-2.0.0.dist-info/METADATA +201 -0
- advanced_excel-2.0.0.dist-info/RECORD +14 -0
- advanced_excel-2.0.0.dist-info/WHEEL +5 -0
- advanced_excel-2.0.0.dist-info/licenses/LICENSE +21 -0
- advanced_excel-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
advanced_excel package (standard Python package name).
|
|
3
|
+
|
|
4
|
+
This makes the library fully importable and compatible after `pip install -e .`
|
|
5
|
+
or `pip install advanced-excel`.
|
|
6
|
+
|
|
7
|
+
Recommended import:
|
|
8
|
+
from advanced_excel import AdvancedExcel
|
|
9
|
+
|
|
10
|
+
The main class is AdvancedExcel (CamelCase is standard for classes).
|
|
11
|
+
Implementation is in core.py (the thin class + mixin imports) and the various
|
|
12
|
+
*Mixin modules.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from .core import AdvancedExcel
|
|
16
|
+
from .core import ROW_INDEX, COL_INDEX, DATA, __version__
|
|
17
|
+
|
|
18
|
+
__all__ = ["AdvancedExcel", "ROW_INDEX", "COL_INDEX", "DATA", "__version__"]
|
|
19
|
+
|
advanced_excel/blocks.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
class BlockTableMixin:
    """
    Mixin for detecting and extracting multiple tables / blocks / entities
    inside a single sheet (repeated key sections, "Batch Number" style blocks, etc.).
    This is the core of the "advanced" multi-table Excel handling.

    The host class must provide ``get_all_rows_from_key(df, key_name)``, which is
    used by ``get_dataframe_blocks_by_key_name`` and is not defined in this mixin.
    """

    def get_all_tables(self, _sheet, mincol=3):
        """
        Identify and extract table-like structures from a sheet.

        Rows are scanned top to bottom by position. A run of consecutive rows
        that each hold at least `mincol` non-NaN values is treated as one table;
        the first row with fewer values closes the run. Each run is handed to
        `_getTable`, which promotes its first row to the header.

        Args:
            _sheet (pandas.DataFrame): The sheet to search for tables within.
            mincol (int, optional): The minimum number of non-NaN values required
                for a row to be considered part of a table. Defaults to 3.

        Returns:
            list: A list of pandas DataFrames, one per table found, in sheet
                order. Returns an empty list if no tables are found.
        """
        # Count populated cells per row once and positionally, so the scan uses
        # the same (positional) coordinates that `_getTable` slices with and
        # does not depend on the sheet's index labels.
        filled_per_row = _sheet.notna().sum(axis=1).tolist()

        allTables = []
        init_table = -1  # -1 means "not currently inside a table"

        for irow, filled in enumerate(filled_per_row):
            if filled >= mincol:
                if init_table == -1:
                    init_table = irow
            elif init_table >= 0:
                allTables.append(self._getTable(_sheet, init_table, irow))
                init_table = -1

        # A table that runs to the last row is never closed inside the loop.
        if init_table >= 0:
            allTables.append(self._getTable(_sheet, init_table, _sheet.shape[0]))

        return allTables

    def get_dataframe_blocks_by_key_name(self, df, key_name):
        """
        Split a sheet into row blocks, each starting at an occurrence of a key.

        The start rows come from ``self.get_all_rows_from_key(df, key_name)``;
        each returned item is read through its ``"row_index"`` entry. A block
        spans from one start row up to (not including) the next start row, the
        last block running to the end of the sheet. Columns and then rows that
        are entirely NaN are dropped from each block and its index is reset.

        Args:
            df (pandas.DataFrame): The sheet to split.
            key_name: The key passed on to ``get_all_rows_from_key``.

        Returns:
            list: A list of pandas DataFrames, one per block. Empty if no start
                row is reported.
        """
        starts = [row["row_index"] for row in self.get_all_rows_from_key(df, key_name)]
        ends = starts[1:] + [df.shape[0]]

        return [
            df.iloc[ini:end]
            .dropna(axis=1, how="all")
            .dropna(axis=0, how="all")
            .reset_index(drop=True)
            for ini, end in zip(starts, ends)
        ]

    def get_dataframe_blocks_by_key_column(self, df, key_column):
        """
        Split a DataFrame into column blocks based on a repeated column name.

        Every column whose name equals `key_column` starts a new block. A block
        spans from that column up to (not including) the next matching column,
        the last block running to the last column. Rows and then columns that
        are entirely NaN are dropped from each block and its index is reset.

        Args:
            df (pandas.DataFrame): The DataFrame to split.
            key_column: The column name that marks the beginning of each block.

        Returns:
            list: A list of pandas DataFrames, one per block. Returns an empty
                list if no column is named `key_column`.
        """
        starts = self._getAllColumnsFromKey(df, key_column)
        ends = starts[1:] + [df.shape[1]]

        return [
            df.iloc[:, ini:end]
            .dropna(axis=0, how="all")
            .dropna(axis=1, how="all")
            .reset_index(drop=True)
            for ini, end in zip(starts, ends)
        ]

    def _getAllColumnsFromKey(self, df, key_column):
        """
        Return the positions of the columns whose name equals `key_column`.

        Args:
            df (pandas.DataFrame): The DataFrame to search.
            key_column: The column name to search for.

        Returns:
            list: Integer positions of the matching columns, in column order.
                Empty if no column matches.
        """
        return [i for i, column in enumerate(df.columns) if column == key_column]

    def _getTable(self, _sheet, init_table, end_table):
        """
        Extract a table from a sheet within positional row boundaries.

        The rows `init_table` (inclusive) to `end_table` (exclusive) are sliced
        by position, columns and rows that are entirely NaN are dropped, the
        first remaining row becomes the header (stringified, stripped,
        upper-cased, spaces replaced by underscores) and is removed from the
        data, and the index is reset.

        Args:
            _sheet (pandas.DataFrame): The sheet to extract the table from.
            init_table (int): The starting row position (inclusive).
            end_table (int): The ending row position (exclusive).

        Returns:
            pandas.DataFrame: The extracted table. If nothing is left after
                dropping the all-NaN rows/columns, the empty frame is returned
                as is (there is no row to promote to a header).
        """
        table = (
            _sheet.iloc[init_table:end_table]
            .dropna(axis=1, how="all")
            .dropna(axis=0, how="all")
        )
        if table.empty:
            return table.reset_index(drop=True)

        table.columns = [str(s).strip().upper().replace(" ", "_") for s in table.iloc[0]]
        # Remove the header row by position: the slice keeps the sheet's own
        # index labels, so a label-based drop(0) only works for a table that
        # starts on the sheet's first row.
        return table.iloc[1:].reset_index(drop=True)

    def _headerColumnsAreEmpty(self, columns):
        """
        Check whether every column name starts with "Unnamed:".

        Args:
            columns: Object exposing the pandas ``.str`` accessor over the
                column names (e.g. a Series or Index of strings).

        Returns:
            bool: True if all column names start with "Unnamed:", False otherwise.
        """
        return columns.str.contains("^Unnamed:").all()
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CleaningMixin:
    """
    Mixin for value/string/NaN/header/dtype cleaning and transformation operations.
    Includes replace*, strip, case, rename, numeric conversion helpers, etc.
    """

    @staticmethod
    def _target_columns(df, include, exclude):
        """
        Resolve the include/exclude column arguments shared by the cleaning helpers.

        Args:
            df (pandas.DataFrame): The frame whose columns are being selected.
            include (list or None): Columns to restrict the operation to.
            exclude (list or None): Columns to leave out of the operation.

        Returns:
            list or None: None when both arguments are None (meaning "the whole
                frame"); the `include` columns when that list is non-empty;
                otherwise every column of `df` not in `exclude`. An empty
                `include` with no usable `exclude` selects nothing (empty list).
        """
        if include is None and exclude is None:
            return None
        if include:
            return list(include)
        if exclude:
            return [col for col in df.columns if col not in exclude]
        return []

    @staticmethod
    def _literal_template(text):
        """Escape backslashes so `text` is emitted literally as a regex replacement."""
        return text.replace("\\", "\\\\")

    def replace_nan_in_column_by_nan_strings(self, df, columnName):
        """
        Replace NaN values in one column with the string "NA".

        The column is replaced on `df` itself, so the caller's DataFrame is
        modified.

        Args:
            df (pandas.DataFrame): The DataFrame to modify.
            columnName (str): The name of the column in which to replace NaN values.

        Returns:
            pandas.DataFrame: The same DataFrame object that was passed in.
        """
        # Assign the whole column instead of writing through .loc[:, col]: the
        # filled column may hold strings where the original was numeric, and a
        # .loc write tries to keep the old dtype (an incompatible-dtype setitem).
        df[columnName] = df[columnName].fillna("NA")
        return df

    def replace_nan_strings(self, df):
        """
        Replace string representations of NaN with actual NaN values.

        Cells equal to 'NA_NA', 'NA', 'NAN', 'na_na', 'na' or 'nan' become NaN.

        Args:
            df (pandas.DataFrame): The DataFrame to read.

        Returns:
            pandas.DataFrame: A new DataFrame with the same index and columns.
                It is rebuilt from a single NumPy array, so the per-column
                dtypes of the input are not carried over. The original
                DataFrame is not modified.
        """
        is_placeholder = df.isin(["NA_NA", "NA", "NAN", "na_na", "na", "nan"])
        return pd.DataFrame(
            np.where(is_placeholder, np.nan, df),
            index=df.index,
            columns=df.columns,
        )

    def strip_all(self, df):
        """
        Remove leading and trailing whitespace from all string values.

        Only the outer whitespace is removed; whitespace (including line
        breaks) inside a value is kept.

        Args:
            df (pandas.DataFrame): The DataFrame to read.

        Returns:
            pandas.DataFrame: A new DataFrame with the strings stripped. The
                original DataFrame is not modified.
        """
        # Trim each end separately rather than capturing the middle with `.*?`:
        # `.` does not match a newline, so a capture-based pattern fails to
        # match (and therefore never strips) values containing a line break.
        return df.replace(to_replace=r"^\s+|\s+$", value="", regex=True)

    def remove_all_crlf(self, df, byValue=""):
        """
        Replace carriage return and line feed characters in string values.

        Every `\\r` and `\\n` is replaced by `byValue`. When `byValue` is not
        empty, each doubled occurrence of `byValue` is then replaced by a
        single one (one pass), so a `\\r\\n` pair yields a single `byValue`.

        Args:
            df (pandas.DataFrame): The DataFrame to read.
            byValue (str, optional): The text to put in place of CR/LF
                characters. It is treated literally, not as a regular
                expression. Defaults to ''.

        Returns:
            pandas.DataFrame: A new DataFrame with CR/LF characters removed or
                replaced. The original DataFrame is not modified.
        """
        template = self._literal_template(byValue)
        df = df.replace(to_replace=r"[\r\n]", value=template, regex=True)
        if byValue:
            # Escape the doubled value: used raw, characters such as "|", "."
            # or "+" would be interpreted as regex syntax.
            df = df.replace(to_replace=re.escape(byValue * 2), value=template, regex=True)
        return df

    def replace_by_dictionary(self, df, dictionary, column_to_apply):
        """
        Replace values in one column using a dictionary mapping.

        The micro symbol (µ) is replaced by the letter 'u' before the lookup.
        Values whose lookup yields no result keep their original content
        (including any µ).

        Args:
            df (pandas.DataFrame): The DataFrame to modify.
            dictionary (dict): Maps the values to replace to their replacements.
            column_to_apply (str): The name of the column to apply the replacement to.

        Returns:
            pandas.DataFrame: The same DataFrame object that was passed in; the
                column is written on it in place.
        """
        original = df[column_to_apply]
        mapped = original.str.replace("µ", "u", regex=False).map(dictionary)
        df.loc[:, column_to_apply] = mapped.fillna(original)
        return df

    def replace_all(
        self, df, to_replace, replace_by, columns_to_clean=None, columns_not_to_clean=None
    ):
        """
        Replace every match of a regular expression with another value.

        Column targeting:

        - If both `columns_to_clean` and `columns_not_to_clean` are None, the
          replacement is applied to *all* columns without changing their types.
        - If `columns_to_clean` is a non-empty list, only those columns are
          processed.
        - Otherwise, if `columns_not_to_clean` is a non-empty list, all columns
          *except* those are processed.
        - If a list is given but nothing is selected (e.g. an empty
          `columns_to_clean`), the DataFrame is returned unchanged.

        In the column-targeted cases each processed column is first converted
        to strings (so NaN becomes the text "nan").

        Args:
            df (pandas.DataFrame): The DataFrame to read.
            to_replace (str): The regular expression to search for.
            replace_by (str): The replacement (regex replacement template).
            columns_to_clean (list, optional): Columns to apply the replacement
                to. Defaults to None.
            columns_not_to_clean (list, optional): Columns to *exclude* from
                the replacement. Defaults to None.

        Returns:
            pandas.DataFrame: A DataFrame with the replacements made. The
                original DataFrame is not modified.
        """
        targets = self._target_columns(df, columns_to_clean, columns_not_to_clean)
        if targets is None:
            return df.replace(to_replace=to_replace, value=replace_by, regex=True)
        if not targets:
            return df

        # Work on a copy so the column-targeted path leaves the caller's frame
        # untouched, like the all-columns path does.
        df = df.copy()
        for col in targets:
            df[col] = df[col].astype(str).replace(
                to_replace=to_replace, value=replace_by, regex=True
            )
        return df

    def replace_all_by_list(self, df, to_replace, columns_to_clean=None, columns_not_to_clean=None):
        """
        Apply a list of (pattern, replacement) pairs one after the other.

        Each pair is passed to `replace_all` together with the column
        arguments; the output of one replacement is the input of the next.

        Args:
            df (pandas.DataFrame): The DataFrame to read.
            to_replace (list of tuples): (pattern, replacement) pairs, where
                `pattern` is a regular expression.
            columns_to_clean (list, optional): Columns to apply the
                replacements to. Defaults to None.
            columns_not_to_clean (list, optional): Columns to *exclude* from
                the replacements. Defaults to None.

        Returns:
            pandas.DataFrame: The DataFrame after all replacements.
        """
        for pattern, replacement in to_replace:
            df = self.replace_all(df, pattern, replacement, columns_to_clean, columns_not_to_clean)
        return df

    def replace_spaces_by_separator(
        self, df, separator=";", columns_to_clean=None, columns_not_to_clean=None
    ):
        """
        Replace runs of whitespace with a separator and collapse repeated separators.

        Every run of one or more whitespace characters becomes `separator`;
        afterwards consecutive occurrences of `separator` are reduced to one.
        Column targeting follows the same rules as `replace_all`, including the
        conversion of targeted columns to strings.

        Args:
            df (pandas.DataFrame): The DataFrame to read.
            separator (str, optional): The separator text, treated literally.
                Defaults to ';'.
            columns_to_clean (list, optional): Columns to apply the replacement
                to. Defaults to None.
            columns_not_to_clean (list, optional): Columns to *exclude* from
                the replacement. Defaults to None.

        Returns:
            pandas.DataFrame: A DataFrame with whitespace replaced by the
                separator. The original DataFrame is not modified.
        """
        template = self._literal_template(separator)
        # Escape the separator and group it: a raw "[" + separator + "]+"
        # character class is invalid for "^", "]" or "\".
        separator_run = "(?:" + re.escape(separator) + ")+" if separator else None

        def _normalise(data):
            data = data.replace(to_replace=r"\s+", value=template, regex=True)
            if separator_run is not None:
                data = data.replace(to_replace=separator_run, value=template, regex=True)
            return data

        targets = self._target_columns(df, columns_to_clean, columns_not_to_clean)
        if targets is None:
            return _normalise(df)
        if not targets:
            return df

        df = df.copy()  # keep the caller's frame untouched
        for col in targets:
            df[col] = _normalise(df[col].astype(str))
        return df

    def transform_as_numeric(self, df, columnsToApply=None, columnsToAvoid=None, errors="coerce"):
        """
        Convert columns to numeric type with `pandas.to_numeric`.

        Column targeting:

        - If both `columnsToApply` and `columnsToAvoid` are None, *all* columns
          are converted.
        - If `columnsToApply` is a non-empty list, only those columns are
          converted.
        - Otherwise, if `columnsToAvoid` is a non-empty list, all columns
          *except* those are converted.
        - If nothing is selected, the DataFrame is returned unchanged.

        Each selected column is converted on its own, so it gets its own
        numeric dtype.

        Args:
            df (pandas.DataFrame): The DataFrame to modify.
            columnsToApply (list, optional): Columns to convert. Defaults to None.
            columnsToAvoid (list, optional): Columns to *exclude* from the
                conversion. Defaults to None.
            errors (str, optional): Passed to `pandas.to_numeric`. With
                'coerce' (the default) values that cannot be parsed become NaN.

        Returns:
            pandas.DataFrame: The same DataFrame object that was passed in; the
                converted columns are written on it in place.
        """
        columns_to_convert = self._target_columns(df, columnsToApply, columnsToAvoid)
        if columns_to_convert is None:
            columns_to_convert = list(df.columns)
        if not columns_to_convert:
            return df

        df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors=errors)
        return df

    def fill_columns_with_left_value(self, df, index_row):
        """
        Fill the NaN cells of one row with the nearest non-NaN value to their left.

        The columns of the row are visited left to right. A NaN cell receives
        the last non-NaN value seen so far (None if there is none yet). This is
        often useful for header rows that come from merged Excel cells.

        Args:
            df (pandas.DataFrame): The DataFrame to read.
            index_row (int): The row to fill, addressed after the index has
                been reset to 0..n-1.

        Returns:
            pandas.DataFrame: A new DataFrame (index reset) with the row
                filled. The original DataFrame is not modified.
        """
        df = df.reset_index(drop=True)  # new frame; also makes index_row a plain row number
        left_value = None
        for column in df.columns:
            current_value = df.loc[index_row, column]
            if pd.isnull(current_value):
                df.loc[index_row, column] = left_value
            else:
                left_value = current_value
        return df

    def rename_headers(self, df, dictionary):
        """
        Rename columns using a dictionary mapping.

        Args:
            df (pandas.DataFrame): The DataFrame to modify.
            dictionary (dict): Maps old column names to new column names.

        Returns:
            pandas.DataFrame: The modified DataFrame (the modification is done in place).
        """
        df.rename(columns=dictionary, inplace=True)
        return df

    def rename_headers_by_regexp(self, df, regexp, value=""):
        """
        Rename columns by regular-expression substitution.

        Every match of `regexp` in a column name is replaced by `value`.
        Column labels that are not strings are left unchanged.

        Args:
            df (pandas.DataFrame): The DataFrame to read.
            regexp (str): The regular expression to search for.
            value (str, optional): The replacement value. Defaults to ''.

        Returns:
            pandas.DataFrame: A new DataFrame with the columns renamed. The
                original DataFrame is not modified.
        """
        pattern = re.compile(regexp)

        def _substitute(label):
            # re.sub only accepts strings; numeric labels pass through as is.
            return pattern.sub(value, label) if isinstance(label, str) else label

        return df.rename(columns=_substitute)

    def rename_headers_by_regexp_list(self, df, listOfRegExp, value=""):
        """
        Rename columns by applying several regular-expression substitutions in turn.

        Args:
            df (pandas.DataFrame): The DataFrame to read.
            listOfRegExp (list): Regular expressions, applied in order.
            value (str, optional): The replacement used for every expression.
                Defaults to ''.

        Returns:
            pandas.DataFrame: A new DataFrame with the columns renamed. The
                original DataFrame is not modified.
        """
        for regexp in listOfRegExp:
            df = self.rename_headers_by_regexp(df, regexp, value)
        return df

    def case_headers(self, df, uppercase=True):
        """
        Convert column names to uppercase or lowercase.

        Column labels that are not strings are left unchanged.

        Args:
            df (pandas.DataFrame): The DataFrame to modify.
            uppercase (bool, optional): Uppercase if True, lowercase otherwise.
                Defaults to True.

        Returns:
            pandas.DataFrame: The modified DataFrame (the modification is done in place).
        """
        convert = str.upper if uppercase else str.lower
        # Map label by label: the vectorised .str accessor turns non-string
        # labels into NaN, which would lose those column names.
        df.columns = df.columns.map(
            lambda label: convert(label) if isinstance(label, str) else label
        )
        return df

    def case_column_values(self, df, columnName=None, title=False, uppercase=True):
        """
        Convert the values of one column to title case, uppercase or lowercase.

        NaN values in the column are first replaced by the string "NA" and the
        values are stripped of leading/trailing whitespace. Then title case is
        applied if `title` is True; otherwise uppercase if `uppercase` is True,
        else lowercase. If `columnName` is None nothing is done.

        Args:
            df (pandas.DataFrame): The DataFrame to modify.
            columnName (str, optional): The name of the column to modify.
                Defaults to None.
            title (bool, optional): Whether to convert to title case. Defaults to False.
            uppercase (bool, optional): Whether to convert to uppercase (if
                title is False). Defaults to True.

        Returns:
            pandas.DataFrame: The modified DataFrame (the modification is done in place).
        """
        if columnName is None:
            return df

        df = self.replace_nan_in_column_by_nan_strings(df, columnName)
        values = df[columnName].str.strip()
        if title:
            values = values.str.title()
        elif uppercase:
            values = values.str.upper()
        else:
            values = values.str.lower()
        df[columnName] = values
        return df
|