pyxecm 1.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyxecm might be problematic. Click here for more details.
- pyxecm/__init__.py +6 -4
- pyxecm/avts.py +673 -246
- pyxecm/coreshare.py +686 -467
- pyxecm/customizer/__init__.py +16 -4
- pyxecm/customizer/__main__.py +58 -0
- pyxecm/customizer/api/__init__.py +5 -0
- pyxecm/customizer/api/__main__.py +6 -0
- pyxecm/customizer/api/app.py +914 -0
- pyxecm/customizer/api/auth.py +154 -0
- pyxecm/customizer/api/metrics.py +92 -0
- pyxecm/customizer/api/models.py +13 -0
- pyxecm/customizer/api/payload_list.py +865 -0
- pyxecm/customizer/api/settings.py +103 -0
- pyxecm/customizer/browser_automation.py +332 -139
- pyxecm/customizer/customizer.py +1007 -1130
- pyxecm/customizer/exceptions.py +35 -0
- pyxecm/customizer/guidewire.py +322 -0
- pyxecm/customizer/k8s.py +713 -378
- pyxecm/customizer/log.py +107 -0
- pyxecm/customizer/m365.py +2867 -909
- pyxecm/customizer/nhc.py +1169 -0
- pyxecm/customizer/openapi.py +258 -0
- pyxecm/customizer/payload.py +16817 -7467
- pyxecm/customizer/pht.py +699 -285
- pyxecm/customizer/salesforce.py +516 -342
- pyxecm/customizer/sap.py +58 -41
- pyxecm/customizer/servicenow.py +593 -371
- pyxecm/customizer/settings.py +442 -0
- pyxecm/customizer/successfactors.py +408 -346
- pyxecm/customizer/translate.py +83 -48
- pyxecm/helper/__init__.py +5 -2
- pyxecm/helper/assoc.py +83 -43
- pyxecm/helper/data.py +2406 -870
- pyxecm/helper/logadapter.py +27 -0
- pyxecm/helper/web.py +229 -101
- pyxecm/helper/xml.py +527 -171
- pyxecm/maintenance_page/__init__.py +5 -0
- pyxecm/maintenance_page/__main__.py +6 -0
- pyxecm/maintenance_page/app.py +51 -0
- pyxecm/maintenance_page/settings.py +28 -0
- pyxecm/maintenance_page/static/favicon.avif +0 -0
- pyxecm/maintenance_page/templates/maintenance.html +165 -0
- pyxecm/otac.py +234 -140
- pyxecm/otawp.py +1436 -557
- pyxecm/otcs.py +7716 -3161
- pyxecm/otds.py +2150 -919
- pyxecm/otiv.py +36 -21
- pyxecm/otmm.py +1272 -325
- pyxecm/otpd.py +231 -127
- pyxecm-2.0.0.dist-info/METADATA +145 -0
- pyxecm-2.0.0.dist-info/RECORD +54 -0
- {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info}/WHEEL +1 -1
- pyxecm-1.6.dist-info/METADATA +0 -53
- pyxecm-1.6.dist-info/RECORD +0 -32
- {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info}/top_level.txt +0 -0
pyxecm/helper/data.py
CHANGED
|
@@ -1,74 +1,61 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
This code implements a class called
|
|
6
|
-
to Pandas
|
|
7
|
-
|
|
8
|
-
Class: Payload
|
|
9
|
-
Methods:
|
|
10
|
-
|
|
11
|
-
__init__ : class initializer
|
|
12
|
-
__len__: Lenght of the embedded DataFrame object.
|
|
13
|
-
__str__: Print the DataFrame of the class
|
|
14
|
-
get_data_frame: Get the Pandas DataFrame object
|
|
15
|
-
set_data_frame: Set the Pandas DataFrame object
|
|
16
|
-
append: Append additional data to the data frame.
|
|
17
|
-
|
|
18
|
-
load_json_data: Load JSON data into DataFrame
|
|
19
|
-
save_json_data: Save JSON data from DataFrame to file
|
|
20
|
-
load_excel_data: Load Excel file into DataFrame
|
|
21
|
-
load_csv_data: Load CSV data into DataFrame
|
|
22
|
-
load_directory: Load directory structure into Pandas Data Frame
|
|
23
|
-
|
|
24
|
-
partitionate: Partition a data frame into equally sized partions
|
|
25
|
-
deduplicate: Remove dupclicate rows that have all fields in unique_fields in common
|
|
26
|
-
sort: Sort the data frame based on one or multiple fields.
|
|
27
|
-
flatten: Flatten a sub-dictionary by copying selected fields to the
|
|
28
|
-
parent dictionary.
|
|
29
|
-
explode_and_flatten: Explode a substructure in the Data Frame
|
|
30
|
-
drop_columns: Drop selected columns from the Data Frame
|
|
31
|
-
keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
|
|
32
|
-
cleanse: Cleanse data with regular expressions and upper/lower case conversion.
|
|
33
|
-
filter: Filter the DataFrame based on conditions
|
|
34
|
-
|
|
35
|
-
fill_forward: Fill the missing cells appropriately by carrying forward
|
|
36
|
-
the values from the previous rows where necessary.
|
|
37
|
-
fill_na_in_column: Replace NA values in a column with a defined new default value
|
|
1
|
+
"""Data Module leveraging Pandas to manipulte data sets read for bulk generation of Content Server items.
|
|
2
|
+
|
|
3
|
+
See: https://pandas.pydata.org
|
|
4
|
+
|
|
5
|
+
This code implements a class called "Data" which is a wrapper
|
|
6
|
+
to Pandas data frame.
|
|
38
7
|
"""
|
|
39
8
|
|
|
40
9
|
__author__ = "Dr. Marc Diefenbruch"
|
|
41
|
-
__copyright__ = "Copyright 2024, OpenText"
|
|
10
|
+
__copyright__ = "Copyright (C) 2024-2025, OpenText"
|
|
42
11
|
__credits__ = ["Kai-Philip Gatzweiler"]
|
|
43
12
|
__maintainer__ = "Dr. Marc Diefenbruch"
|
|
44
13
|
__email__ = "mdiefenb@opentext.com"
|
|
45
14
|
|
|
46
|
-
import logging
|
|
47
15
|
import json
|
|
16
|
+
import logging
|
|
48
17
|
import os
|
|
49
18
|
import re
|
|
50
19
|
import threading
|
|
20
|
+
from io import StringIO
|
|
51
21
|
|
|
52
22
|
import pandas as pd
|
|
23
|
+
import requests
|
|
53
24
|
|
|
54
|
-
|
|
25
|
+
default_logger = logging.getLogger("pyxecm.helper.data")
|
|
55
26
|
|
|
56
27
|
|
|
57
28
|
class Data:
|
|
58
29
|
"""Used to automate data loading for the customizer."""
|
|
59
30
|
|
|
31
|
+
logger: logging.Logger = default_logger
|
|
32
|
+
|
|
60
33
|
_df: pd.DataFrame
|
|
61
|
-
_lock = threading.Lock()
|
|
34
|
+
_lock: threading.Lock = threading.Lock()
|
|
62
35
|
|
|
63
|
-
def __init__(
|
|
36
|
+
def __init__(
|
|
37
|
+
self,
|
|
38
|
+
init_data: pd.DataFrame | list = None,
|
|
39
|
+
logger: logging.Logger = default_logger,
|
|
40
|
+
) -> None:
|
|
64
41
|
"""Initialize the Data object.
|
|
65
42
|
|
|
66
43
|
Args:
|
|
67
|
-
init_data (pd.DataFrame | list, optional):
|
|
68
|
-
|
|
69
|
-
|
|
44
|
+
init_data (pd.DataFrame | list, optional):
|
|
45
|
+
Data to initialize the data frame. Can either be
|
|
46
|
+
another data frame (that gets copied) or a list of dictionaries.
|
|
47
|
+
Defaults to None.
|
|
48
|
+
logger (logging.Logger, optional):
|
|
49
|
+
Pass a special logging object. This is optional. If not provided,
|
|
50
|
+
the default logger is used.
|
|
51
|
+
|
|
70
52
|
"""
|
|
71
53
|
|
|
54
|
+
if logger != default_logger:
|
|
55
|
+
self.logger = logger.getChild("data")
|
|
56
|
+
for logfilter in logger.filters:
|
|
57
|
+
self.logger.addFilter(logfilter)
|
|
58
|
+
|
|
72
59
|
if init_data is not None:
|
|
73
60
|
# if a data frame is passed to the constructor we
|
|
74
61
|
# copy its content to the new Data object
|
|
@@ -84,7 +71,7 @@ class Data:
|
|
|
84
71
|
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
85
72
|
self._df: pd.DataFrame = pd.DataFrame([init_data])
|
|
86
73
|
else:
|
|
87
|
-
logger.error("Illegal initialization data for 'Data' class!")
|
|
74
|
+
self.logger.error("Illegal initialization data for 'Data' class!")
|
|
88
75
|
self._df = None
|
|
89
76
|
else:
|
|
90
77
|
self._df = None
|
|
@@ -92,11 +79,14 @@ class Data:
|
|
|
92
79
|
# end method definition
|
|
93
80
|
|
|
94
81
|
def __len__(self) -> int:
|
|
95
|
-
"""
|
|
96
|
-
|
|
82
|
+
"""Return lenght of the embedded Pandas data frame object.
|
|
83
|
+
|
|
84
|
+
This is basically a convenience method.
|
|
97
85
|
|
|
98
86
|
Returns:
|
|
99
|
-
int:
|
|
87
|
+
int:
|
|
88
|
+
Lenght of the data frame.
|
|
89
|
+
|
|
100
90
|
"""
|
|
101
91
|
|
|
102
92
|
if self._df is not None:
|
|
@@ -106,10 +96,12 @@ class Data:
|
|
|
106
96
|
# end method definition
|
|
107
97
|
|
|
108
98
|
def __str__(self) -> str:
|
|
109
|
-
"""Print the
|
|
99
|
+
"""Print the Pandas data frame object.
|
|
110
100
|
|
|
111
101
|
Returns:
|
|
112
|
-
str:
|
|
102
|
+
str:
|
|
103
|
+
String representation.
|
|
104
|
+
|
|
113
105
|
"""
|
|
114
106
|
|
|
115
107
|
# if data frame is initialized we return
|
|
@@ -122,51 +114,72 @@ class Data:
|
|
|
122
114
|
# end method definition
|
|
123
115
|
|
|
124
116
|
def __getitem__(self, column: str) -> pd.Series:
|
|
125
|
-
"""Return the column corresponding to the key from the
|
|
117
|
+
"""Return the column corresponding to the key from the data frame.
|
|
126
118
|
|
|
127
119
|
Args:
|
|
128
|
-
column (str): name of the
|
|
120
|
+
column (str): The name of the data frame column.
|
|
129
121
|
|
|
130
122
|
Returns:
|
|
131
|
-
pd.Series: column of the
|
|
123
|
+
pd.Series: The column of the data frame with the given name.
|
|
124
|
+
|
|
132
125
|
"""
|
|
133
126
|
|
|
134
127
|
return self._df[column]
|
|
135
128
|
|
|
136
129
|
# end method definition
|
|
137
130
|
|
|
138
|
-
def lock(self):
|
|
131
|
+
def lock(self) -> threading.Lock:
|
|
139
132
|
"""Return the threading lock object.
|
|
140
133
|
|
|
141
134
|
Returns:
|
|
142
|
-
|
|
135
|
+
threading.Lock: The threading lock object.
|
|
136
|
+
|
|
143
137
|
"""
|
|
138
|
+
|
|
144
139
|
return self._lock
|
|
145
140
|
|
|
146
141
|
# end method definition
|
|
147
142
|
|
|
148
143
|
def get_data_frame(self) -> pd.DataFrame:
|
|
149
|
-
"""Get the Pandas
|
|
144
|
+
"""Get the Pandas data frame object.
|
|
150
145
|
|
|
151
146
|
Returns:
|
|
152
|
-
pd.DataFrame: Pandas
|
|
147
|
+
pd.DataFrame: The Pandas data frame object.
|
|
148
|
+
|
|
153
149
|
"""
|
|
154
150
|
|
|
155
151
|
return self._df
|
|
156
152
|
|
|
157
153
|
# end method definition
|
|
158
154
|
|
|
159
|
-
def set_data_frame(self, df: pd.DataFrame):
|
|
160
|
-
"""Set the Pandas
|
|
155
|
+
def set_data_frame(self, df: pd.DataFrame) -> None:
|
|
156
|
+
"""Set the Pandas data frame object.
|
|
161
157
|
|
|
162
158
|
Args:
|
|
163
|
-
df (pd.DataFrame): Pandas
|
|
159
|
+
df (pd.DataFrame): The new Pandas data frame object.
|
|
160
|
+
|
|
164
161
|
"""
|
|
165
162
|
|
|
166
163
|
self._df = df
|
|
167
164
|
|
|
168
165
|
# end method definition
|
|
169
166
|
|
|
167
|
+
def get_columns(self) -> list | None:
|
|
168
|
+
"""Get the list of column names of the data frame.
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
list | None:
|
|
172
|
+
The list of column names in the data frame.
|
|
173
|
+
|
|
174
|
+
"""
|
|
175
|
+
|
|
176
|
+
if self._df is None:
|
|
177
|
+
return None
|
|
178
|
+
|
|
179
|
+
return self._df.columns
|
|
180
|
+
|
|
181
|
+
# end method definition
|
|
182
|
+
|
|
170
183
|
def print_info(
|
|
171
184
|
self,
|
|
172
185
|
show_size: bool = True,
|
|
@@ -177,26 +190,40 @@ class Data:
|
|
|
177
190
|
show_sample: bool = False,
|
|
178
191
|
show_statistics: bool = False,
|
|
179
192
|
row_num: int = 10,
|
|
180
|
-
):
|
|
181
|
-
"""Log information about the data frame
|
|
193
|
+
) -> None:
|
|
194
|
+
"""Log information about the data frame.
|
|
182
195
|
|
|
183
196
|
Args:
|
|
184
|
-
show_size (bool, optional):
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
197
|
+
show_size (bool, optional):
|
|
198
|
+
Show size of data frame. Defaults to True.
|
|
199
|
+
show_info (bool, optional):
|
|
200
|
+
Show information for data frame. Defaults to False.
|
|
201
|
+
show_columns (bool, optional):
|
|
202
|
+
Show columns of data frame. Defaults to False.
|
|
203
|
+
show_first (bool, optional):
|
|
204
|
+
Show first N items. Defaults to False. N is defined
|
|
205
|
+
by the row_num parameter.
|
|
206
|
+
show_last (bool, optional):
|
|
207
|
+
Show last N items. Defaults to False. N is defined
|
|
208
|
+
by the row_num parameter.
|
|
209
|
+
show_sample (bool, optional):
|
|
210
|
+
Show N sample items. Defaults to False. N is defined
|
|
211
|
+
by the row_num parameter.
|
|
212
|
+
show_statistics (bool, optional):
|
|
213
|
+
Show data frame statistics. Defaults to False.
|
|
214
|
+
row_num (int, optional):
|
|
215
|
+
Used as the number of rows printed using show_first,
|
|
216
|
+
show_last, show_sample. Default is 10.
|
|
217
|
+
|
|
191
218
|
"""
|
|
192
219
|
|
|
193
220
|
if self._df is None:
|
|
194
|
-
logger.warning("Data
|
|
221
|
+
self.logger.warning("Data frame is not initialized!")
|
|
195
222
|
return
|
|
196
223
|
|
|
197
224
|
if show_size:
|
|
198
|
-
logger.info(
|
|
199
|
-
"Data
|
|
225
|
+
self.logger.info(
|
|
226
|
+
"Data frame has %s row(s) and %s column(s)",
|
|
200
227
|
self._df.shape[0],
|
|
201
228
|
self._df.shape[1],
|
|
202
229
|
)
|
|
@@ -206,39 +233,42 @@ class Data:
|
|
|
206
233
|
self._df.info()
|
|
207
234
|
|
|
208
235
|
if show_columns:
|
|
209
|
-
logger.info("Columns:\n%s", self._df.columns)
|
|
210
|
-
logger.info(
|
|
211
|
-
"Columns with number of
|
|
212
|
-
|
|
213
|
-
logger.info(
|
|
214
|
-
"Columns with number of non-null values:\n%s", self._df.notnull().sum()
|
|
236
|
+
self.logger.info("Columns:\n%s", self._df.columns)
|
|
237
|
+
self.logger.info(
|
|
238
|
+
"Columns with number of NaN values:\n%s",
|
|
239
|
+
self._df.isna().sum(),
|
|
215
240
|
)
|
|
216
|
-
logger.info(
|
|
217
|
-
|
|
218
|
-
|
|
241
|
+
self.logger.info(
|
|
242
|
+
"Columns with number of non-NaN values:\n%s",
|
|
243
|
+
self._df.notna().sum(),
|
|
219
244
|
)
|
|
220
245
|
|
|
221
246
|
if show_first:
|
|
222
247
|
# the default for head is n = 5:
|
|
223
|
-
logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
|
|
248
|
+
self.logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
|
|
224
249
|
|
|
225
250
|
if show_last:
|
|
226
251
|
# the default for tail is n = 5:
|
|
227
|
-
logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
|
|
252
|
+
self.logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
|
|
228
253
|
|
|
229
254
|
if show_sample:
|
|
230
255
|
# the default for sample is n = 1:
|
|
231
|
-
logger.info(
|
|
256
|
+
self.logger.info(
|
|
257
|
+
"%s Sample rows:\n%s",
|
|
258
|
+
str(row_num),
|
|
259
|
+
self._df.sample(n=row_num),
|
|
260
|
+
)
|
|
232
261
|
|
|
233
262
|
if show_statistics:
|
|
234
|
-
logger.info(
|
|
235
|
-
"Description of statistics for data frame:\n%s",
|
|
263
|
+
self.logger.info(
|
|
264
|
+
"Description of statistics for data frame:\n%s",
|
|
265
|
+
self._df.describe(),
|
|
236
266
|
)
|
|
237
|
-
logger.info(
|
|
238
|
-
"Description of statistics for data frame (
|
|
267
|
+
self.logger.info(
|
|
268
|
+
"Description of statistics for data frame (transformed):\n%s",
|
|
239
269
|
self._df.describe().T,
|
|
240
270
|
)
|
|
241
|
-
logger.info(
|
|
271
|
+
self.logger.info(
|
|
242
272
|
"Description of statistics for data frame (objects):\n%s",
|
|
243
273
|
self._df.describe(include="object"),
|
|
244
274
|
)
|
|
@@ -249,10 +279,13 @@ class Data:
|
|
|
249
279
|
"""Append additional data to the data frame.
|
|
250
280
|
|
|
251
281
|
Args:
|
|
252
|
-
add_data (pd.DataFrame | list | dict):
|
|
282
|
+
add_data (pd.DataFrame | list | dict):
|
|
283
|
+
Additional data. Can be pd.DataFrame or list of dicts (or Data).
|
|
253
284
|
|
|
254
285
|
Returns:
|
|
255
|
-
bool:
|
|
286
|
+
bool:
|
|
287
|
+
True = Success, False = Error
|
|
288
|
+
|
|
256
289
|
"""
|
|
257
290
|
|
|
258
291
|
# Does the data frame has already content?
|
|
@@ -264,166 +297,395 @@ class Data:
|
|
|
264
297
|
return True
|
|
265
298
|
elif isinstance(add_data, Data):
|
|
266
299
|
df = add_data.get_data_frame()
|
|
267
|
-
if df:
|
|
300
|
+
if df is not None and not df.empty:
|
|
268
301
|
self._df = pd.concat([self._df, df], ignore_index=True)
|
|
269
302
|
return True
|
|
270
303
|
elif isinstance(add_data, list):
|
|
271
304
|
if add_data:
|
|
272
|
-
df = Data(add_data)
|
|
305
|
+
df = Data(add_data, logger=self.logger)
|
|
273
306
|
self._df = pd.concat(
|
|
274
|
-
[self._df, df.get_data_frame()],
|
|
307
|
+
[self._df, df.get_data_frame()],
|
|
308
|
+
ignore_index=True,
|
|
275
309
|
)
|
|
276
310
|
return True
|
|
277
311
|
elif isinstance(add_data, dict):
|
|
278
312
|
if add_data:
|
|
279
313
|
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
280
|
-
df = Data([add_data])
|
|
314
|
+
df = Data([add_data], logger=self.logger)
|
|
281
315
|
self._df = pd.concat(
|
|
282
|
-
[self._df, df.get_data_frame()],
|
|
316
|
+
[self._df, df.get_data_frame()],
|
|
317
|
+
ignore_index=True,
|
|
283
318
|
)
|
|
284
319
|
return True
|
|
285
320
|
else:
|
|
286
|
-
logger.error("Illegal data type -> '%s'", type(add_data))
|
|
287
|
-
return False
|
|
288
|
-
else: # self._df is None (initial state)
|
|
289
|
-
if isinstance(add_data, pd.DataFrame):
|
|
290
|
-
self._df = add_data
|
|
291
|
-
return True
|
|
292
|
-
elif isinstance(add_data, Data):
|
|
293
|
-
self._df = add_data.get_data_frame()
|
|
294
|
-
return True
|
|
295
|
-
elif isinstance(add_data, list):
|
|
296
|
-
self._df = pd.DataFrame(add_data)
|
|
297
|
-
return True
|
|
298
|
-
elif isinstance(add_data, dict):
|
|
299
|
-
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
300
|
-
self._df = pd.DataFrame([add_data])
|
|
301
|
-
return True
|
|
302
|
-
else:
|
|
303
|
-
logger.error("Illegal data type -> '%s'", type(add_data))
|
|
321
|
+
self.logger.error("Illegal data type -> '%s'", type(add_data))
|
|
304
322
|
return False
|
|
323
|
+
elif isinstance(add_data, pd.DataFrame):
|
|
324
|
+
self._df = add_data
|
|
325
|
+
return True
|
|
326
|
+
elif isinstance(add_data, Data):
|
|
327
|
+
self._df = add_data.get_data_frame()
|
|
328
|
+
return True
|
|
329
|
+
elif isinstance(add_data, list):
|
|
330
|
+
self._df = pd.DataFrame(add_data)
|
|
331
|
+
return True
|
|
332
|
+
elif isinstance(add_data, dict):
|
|
333
|
+
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
334
|
+
self._df = pd.DataFrame([add_data])
|
|
335
|
+
return True
|
|
336
|
+
else:
|
|
337
|
+
self.logger.error("Illegal data type -> '%s'", type(add_data))
|
|
338
|
+
return False
|
|
339
|
+
|
|
340
|
+
# end method definition
|
|
341
|
+
|
|
342
|
+
def merge(
|
|
343
|
+
self,
|
|
344
|
+
merge_data: pd.DataFrame,
|
|
345
|
+
on: str | list[str] | None = None,
|
|
346
|
+
how: str = "inner",
|
|
347
|
+
left_on: str | list[str] | None = None,
|
|
348
|
+
right_on: str | list[str] | None = None,
|
|
349
|
+
left_index: bool = False,
|
|
350
|
+
right_index: bool = False,
|
|
351
|
+
suffixes: tuple[str, str] = ("_x", "_y"),
|
|
352
|
+
indicator: bool = False,
|
|
353
|
+
validate: str | None = None,
|
|
354
|
+
) -> pd.DataFrame | None:
|
|
355
|
+
"""Merge the current DataFrame (_df) with another DataFrame.
|
|
356
|
+
|
|
357
|
+
Args:
|
|
358
|
+
merge_data (pd.DataFrame | Data):
|
|
359
|
+
The DataFrame to merge with.
|
|
360
|
+
on (str | list[str]):
|
|
361
|
+
Column(s) to merge on. Defaults to None.
|
|
362
|
+
how (str, optional):
|
|
363
|
+
Type of merge ('inner', 'outer', 'left', 'right', 'cross'). Defaults to 'inner'.
|
|
364
|
+
left_on (str | list[str] | None, optional):
|
|
365
|
+
Column(s) from self._df to merge on. Defaults to None.
|
|
366
|
+
right_on (str | list[str] | None, optional):
|
|
367
|
+
Column(s) from other DataFrame to merge on. Defaults to None.
|
|
368
|
+
left_index (str | list[str], optional):
|
|
369
|
+
Whether to merge on the index of self._df. Defaults to False.
|
|
370
|
+
right_index (bool, optional):
|
|
371
|
+
Whether to merge on the index of other. Defaults to False.
|
|
372
|
+
suffixes (tuple[str, str]):
|
|
373
|
+
Suffixes for overlapping column names. Defaults to ('_x', '_y').
|
|
374
|
+
indicator (bool, optional):
|
|
375
|
+
If True, adds a column showing the merge source. Defaults to False.
|
|
376
|
+
validate ():
|
|
377
|
+
If provided, checks merge integrity
|
|
378
|
+
('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many'). Defaults to None.
|
|
379
|
+
|
|
380
|
+
Returns:
|
|
381
|
+
The merged DataFrame or None in case of an error.
|
|
382
|
+
|
|
383
|
+
Exceptions:
|
|
384
|
+
ValueError: If `other` is not a DataFrame.
|
|
385
|
+
KeyError: If required columns for merging are missing.
|
|
386
|
+
ValueError: If `validate` check fails.
|
|
387
|
+
|
|
388
|
+
"""
|
|
389
|
+
|
|
390
|
+
if self._df is None or self._df.empty:
|
|
391
|
+
self._df = merge_data
|
|
392
|
+
|
|
393
|
+
if isinstance(merge_data, Data):
|
|
394
|
+
merge_data = merge_data.get_data_frame() # Extract DataFrame from Data instance
|
|
395
|
+
|
|
396
|
+
try:
|
|
397
|
+
return self._df.merge(
|
|
398
|
+
merge_data,
|
|
399
|
+
how=how,
|
|
400
|
+
on=on,
|
|
401
|
+
left_on=left_on,
|
|
402
|
+
right_on=right_on,
|
|
403
|
+
left_index=left_index,
|
|
404
|
+
right_index=right_index,
|
|
405
|
+
suffixes=suffixes,
|
|
406
|
+
indicator=indicator,
|
|
407
|
+
validate=validate,
|
|
408
|
+
)
|
|
409
|
+
except KeyError:
|
|
410
|
+
self.logger.error("Column(s) not found for merging!")
|
|
411
|
+
except ValueError:
|
|
412
|
+
self.logger.error("Invalid merge operation!")
|
|
413
|
+
|
|
414
|
+
return None
|
|
415
|
+
|
|
416
|
+
# end method definition
|
|
417
|
+
|
|
418
|
+
def strip(self, columns: list | None = None, inplace: bool = True) -> pd.DataFrame:
|
|
419
|
+
"""Strip leading and trailing spaces from specified columns in a data frame.
|
|
420
|
+
|
|
421
|
+
Args:
|
|
422
|
+
columns (list | None):
|
|
423
|
+
The list of column names to strip. If None, it strips
|
|
424
|
+
leading and trailing spaces from _all_ string columns.
|
|
425
|
+
inplace (bool, optional):
|
|
426
|
+
If True, the data modification is done in place, i.e.
|
|
427
|
+
modifying the existing data frame of the object.
|
|
428
|
+
If False, the data frame is copied and the copy is modified
|
|
429
|
+
and returned.
|
|
430
|
+
|
|
431
|
+
Returns:
|
|
432
|
+
pd.DataFrame:
|
|
433
|
+
The modified data frame with stripped columns.
|
|
434
|
+
|
|
435
|
+
"""
|
|
436
|
+
|
|
437
|
+
df = self._df.copy() if not inplace else self._df
|
|
438
|
+
|
|
439
|
+
if columns is None:
|
|
440
|
+
# Strip spaces from all string columns
|
|
441
|
+
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
|
|
442
|
+
else:
|
|
443
|
+
# Strip spaces from specified columns
|
|
444
|
+
for col in columns:
|
|
445
|
+
if col in df.columns and df[col].dtype == "object": # Check if the column exists and is of string type
|
|
446
|
+
df[col] = df[col].str.strip()
|
|
447
|
+
|
|
448
|
+
if inplace:
|
|
449
|
+
self._df = df
|
|
450
|
+
|
|
451
|
+
return df
|
|
305
452
|
|
|
306
453
|
# end method definition
|
|
307
454
|
|
|
308
|
-
def load_json_data(
|
|
309
|
-
|
|
455
|
+
def load_json_data(
|
|
456
|
+
self,
|
|
457
|
+
json_path: str,
|
|
458
|
+
convert_dates: bool = False,
|
|
459
|
+
index_column: str | None = None,
|
|
460
|
+
compression: str | None = None,
|
|
461
|
+
) -> bool:
|
|
462
|
+
"""Load JSON data into a Pandas data frame.
|
|
310
463
|
|
|
311
464
|
Args:
|
|
312
|
-
json_path (str):
|
|
313
|
-
|
|
465
|
+
json_path (str):
|
|
466
|
+
The path to the JSON file.
|
|
467
|
+
convert_dates (bool, optional):
|
|
468
|
+
Defines whether or not dates should be converted.
|
|
469
|
+
The default is False = dates are NOT converted.
|
|
470
|
+
index_column (str | None, optional):
|
|
471
|
+
The Name of the column (i.e. JSON data field) that should
|
|
472
|
+
become the index in the loaded data frame.
|
|
473
|
+
compression (str | None):
|
|
474
|
+
Remove a compression:
|
|
475
|
+
* gzip (.gz)
|
|
476
|
+
* bz2 (.bz2)
|
|
477
|
+
* zip (.zip)
|
|
478
|
+
* xz (.xz)
|
|
479
|
+
The value for compression should not include the dot.
|
|
480
|
+
Default is None = no compression.
|
|
481
|
+
|
|
314
482
|
Returns:
|
|
315
483
|
bool: False in case an error occured, True otherwise.
|
|
484
|
+
|
|
316
485
|
"""
|
|
317
486
|
|
|
318
|
-
if
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
487
|
+
if not json_path:
|
|
488
|
+
self.logger.error(
|
|
489
|
+
"You have not specified a JSON path!",
|
|
490
|
+
)
|
|
491
|
+
return False
|
|
492
|
+
|
|
493
|
+
# If compression is enabled the file path should have
|
|
494
|
+
# the matching file name extension:
|
|
495
|
+
if compression:
|
|
496
|
+
compression = compression.lstrip(".") # remove a dot prefix if present
|
|
497
|
+
suffix = "." + compression if compression != "gzip" else "gz"
|
|
498
|
+
if not json_path.endswith(suffix):
|
|
499
|
+
json_path += suffix
|
|
500
|
+
|
|
501
|
+
if not os.path.exists(json_path):
|
|
502
|
+
self.logger.error(
|
|
503
|
+
"Missing JSON file - you have not specified a valid path -> '%s'.",
|
|
504
|
+
json_path,
|
|
505
|
+
)
|
|
506
|
+
return False
|
|
507
|
+
|
|
508
|
+
# Load data from JSON file
|
|
509
|
+
try:
|
|
510
|
+
df = pd.read_json(
|
|
511
|
+
path_or_buf=json_path,
|
|
512
|
+
convert_dates=convert_dates,
|
|
513
|
+
compression=compression,
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
if index_column and index_column not in df.columns:
|
|
517
|
+
self.logger.error(
|
|
518
|
+
"Specified index column -> '%s' not found in the JSON data.",
|
|
519
|
+
index_column,
|
|
340
520
|
)
|
|
341
521
|
return False
|
|
342
|
-
except IOError as e:
|
|
343
|
-
logger.error("An I/O error occurred -> %s", str(e))
|
|
344
|
-
return False
|
|
345
|
-
except json.JSONDecodeError as e:
|
|
346
|
-
logger.error("Error: Unable to decode JSON -> %s", str(e))
|
|
347
|
-
return False
|
|
348
|
-
except ValueError as e:
|
|
349
|
-
logger.error("Invalid JSON input -> %s", str(e))
|
|
350
|
-
return False
|
|
351
|
-
except AttributeError as e:
|
|
352
|
-
logger.error("Unexpected JSON data structure -> %s", str(e))
|
|
353
|
-
return False
|
|
354
|
-
except TypeError as e:
|
|
355
|
-
logger.error("Unexpected JSON data type -> %s", str(e))
|
|
356
|
-
return False
|
|
357
|
-
except KeyError as e:
|
|
358
|
-
logger.error("Missing key in JSON data -> %s", str(e))
|
|
359
|
-
return False
|
|
360
522
|
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
523
|
+
if index_column:
|
|
524
|
+
df = df.set_index(keys=index_column)
|
|
525
|
+
if self._df is None:
|
|
526
|
+
self._df = df
|
|
527
|
+
else:
|
|
528
|
+
self._df = pd.concat([self._df, df])
|
|
529
|
+
self.logger.info(
|
|
530
|
+
"After loading JSON file -> '%s', the data frame has %s row(s) and %s column(s)",
|
|
531
|
+
json_path,
|
|
532
|
+
self._df.shape[0],
|
|
533
|
+
self._df.shape[1],
|
|
534
|
+
)
|
|
535
|
+
except FileNotFoundError:
|
|
536
|
+
self.logger.error(
|
|
537
|
+
"JSON file -> '%s' not found. Please check the file path.",
|
|
538
|
+
json_path,
|
|
539
|
+
)
|
|
540
|
+
return False
|
|
541
|
+
except PermissionError:
|
|
542
|
+
self.logger.error(
|
|
543
|
+
"Missing permission to access the JSON file -> '%s'.",
|
|
364
544
|
json_path,
|
|
365
545
|
)
|
|
366
546
|
return False
|
|
547
|
+
except OSError:
|
|
548
|
+
self.logger.error("An I/O error occurred!")
|
|
549
|
+
return False
|
|
550
|
+
except json.JSONDecodeError:
|
|
551
|
+
self.logger.error(
|
|
552
|
+
"Unable to decode JSON file -> '%s'",
|
|
553
|
+
json_path,
|
|
554
|
+
)
|
|
555
|
+
return False
|
|
556
|
+
except ValueError:
|
|
557
|
+
self.logger.error("Invalid JSON input -> %s", json_path)
|
|
558
|
+
return False
|
|
559
|
+
except AttributeError:
|
|
560
|
+
self.logger.error("Unexpected JSON data structure in file -> %s", json_path)
|
|
561
|
+
return False
|
|
562
|
+
except TypeError:
|
|
563
|
+
self.logger.error("Unexpected JSON data type in file -> %s", json_path)
|
|
564
|
+
return False
|
|
565
|
+
except KeyError:
|
|
566
|
+
self.logger.error("Missing key in JSON data in file -> %s", json_path)
|
|
567
|
+
return False
|
|
568
|
+
|
|
367
569
|
return True
|
|
368
570
|
|
|
369
571
|
# end method definition
|
|
370
572
|
|
|
371
573
|
def save_json_data(
|
|
372
|
-
self,
|
|
574
|
+
self,
|
|
575
|
+
json_path: str,
|
|
576
|
+
orient: str = "records",
|
|
577
|
+
preserve_index: bool = False,
|
|
578
|
+
index_column: str = "index",
|
|
579
|
+
compression: str | None = None,
|
|
373
580
|
) -> bool:
|
|
374
|
-
"""Save JSON data from
|
|
581
|
+
"""Save JSON data from data frame to file.
|
|
375
582
|
|
|
376
583
|
Args:
|
|
377
|
-
json_path (str):
|
|
378
|
-
orient (str, optional):
|
|
379
|
-
|
|
584
|
+
json_path (str): The path to where the JSON file should be safed.
|
|
585
|
+
orient (str, optional):
|
|
586
|
+
The structure of the JSON. Possible values:
|
|
587
|
+
* "records" (this is the default)
|
|
588
|
+
* "columns"
|
|
589
|
+
* "index"
|
|
590
|
+
* "table"
|
|
591
|
+
* "split"
|
|
592
|
+
preserve_index (bool, optional):
|
|
593
|
+
Defines if the index column of the data frame should be exported as well.
|
|
594
|
+
The default is False (index is not exported).
|
|
595
|
+
index_column (str, optional):
|
|
596
|
+
The Name of the column (i.e. JSON data field) that should
|
|
597
|
+
become the index in the loaded data frame. The default is "index".
|
|
598
|
+
compression (str | None):
|
|
599
|
+
Apply a compression:
|
|
600
|
+
* gzip (.gz)
|
|
601
|
+
* bz2 (.bz2)
|
|
602
|
+
* zip (.zip)
|
|
603
|
+
* xz (.xz)
|
|
604
|
+
|
|
380
605
|
Returns:
|
|
381
|
-
bool:
|
|
606
|
+
bool:
|
|
607
|
+
False in case an error occured, True otherwise.
|
|
608
|
+
|
|
382
609
|
"""
|
|
383
610
|
|
|
384
|
-
if
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
611
|
+
if not json_path:
|
|
612
|
+
self.logger.error(
|
|
613
|
+
"You have not specified a JSON path!",
|
|
614
|
+
)
|
|
615
|
+
return False
|
|
616
|
+
|
|
617
|
+
# If compression is enabled the file path should have
|
|
618
|
+
# the matching file name extension:
|
|
619
|
+
if compression:
|
|
620
|
+
suffix = "." + compression if compression != "gzip" else ".gz"
|
|
621
|
+
if not json_path.endswith(suffix):
|
|
622
|
+
json_path += suffix
|
|
623
|
+
|
|
624
|
+
# Save data to JSON file
|
|
625
|
+
try:
|
|
626
|
+
if self._df is not None:
|
|
627
|
+
if not os.path.exists(os.path.dirname(json_path)):
|
|
628
|
+
os.makedirs(os.path.dirname(json_path), exist_ok=True)
|
|
629
|
+
|
|
630
|
+
# index parameter is only allowed if orient has one of the following values:
|
|
631
|
+
if orient in ("columns", "index", "table", "split"):
|
|
632
|
+
self._df.to_json(
|
|
633
|
+
path_or_buf=json_path,
|
|
634
|
+
index=preserve_index,
|
|
635
|
+
orient=orient,
|
|
636
|
+
indent=2,
|
|
637
|
+
compression=compression,
|
|
638
|
+
date_format="iso",
|
|
639
|
+
)
|
|
640
|
+
# In this case we cannot use the index parameter as this would give this error:
|
|
641
|
+
# Value Error -> 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'
|
|
642
|
+
# So we create a new column that preserves the original row IDs from the index. The nasme
|
|
643
|
+
|
|
644
|
+
elif preserve_index:
|
|
645
|
+
df_with_index = self._df.reset_index(
|
|
646
|
+
names=index_column,
|
|
647
|
+
inplace=False,
|
|
648
|
+
)
|
|
649
|
+
df_with_index.to_json(
|
|
650
|
+
path_or_buf=json_path,
|
|
651
|
+
orient=orient,
|
|
652
|
+
indent=2,
|
|
653
|
+
compression=compression,
|
|
654
|
+
date_format="iso",
|
|
655
|
+
)
|
|
403
656
|
else:
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
657
|
+
self._df.to_json(
|
|
658
|
+
path_or_buf=json_path,
|
|
659
|
+
orient=orient,
|
|
660
|
+
indent=2,
|
|
661
|
+
compression=compression,
|
|
662
|
+
date_format="iso",
|
|
663
|
+
)
|
|
664
|
+
else:
|
|
665
|
+
self.logger.warning(
|
|
666
|
+
"Data frame is empty. Cannot write it to JSON file -> '%s'.",
|
|
667
|
+
json_path,
|
|
409
668
|
)
|
|
410
669
|
return False
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
else:
|
|
422
|
-
logger.error(
|
|
423
|
-
"Missing JSON file -> '%s' you have not specified a valid path!",
|
|
670
|
+
except FileNotFoundError:
|
|
671
|
+
self.logger.error(
|
|
672
|
+
"File -> '%s' not found. Please check the file path.",
|
|
673
|
+
json_path,
|
|
674
|
+
)
|
|
675
|
+
return False
|
|
676
|
+
except PermissionError:
|
|
677
|
+
self.logger.error(
|
|
678
|
+
"Permission denied to access the file -> '%s'.",
|
|
424
679
|
json_path,
|
|
425
680
|
)
|
|
426
681
|
return False
|
|
682
|
+
except OSError:
|
|
683
|
+
self.logger.error("An I/O error occurred accessing file -> %s", json_path)
|
|
684
|
+
return False
|
|
685
|
+
except ValueError:
|
|
686
|
+
self.logger.error("Value error!")
|
|
687
|
+
return False
|
|
688
|
+
|
|
427
689
|
return True
|
|
428
690
|
|
|
429
691
|
# end method definition
|
|
@@ -438,27 +700,40 @@ class Data:
|
|
|
438
700
|
names: list | None = None,
|
|
439
701
|
na_values: list | None = None,
|
|
440
702
|
) -> bool:
|
|
441
|
-
"""Load Excel (xlsx) data into
|
|
442
|
-
|
|
703
|
+
"""Load Excel (xlsx) data into Pandas data frame.
|
|
704
|
+
|
|
705
|
+
Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
|
|
706
|
+
read from a local filesystem or URL. Supports an option to read a
|
|
707
|
+
single sheet or a list of sheets.
|
|
443
708
|
|
|
444
709
|
Args:
|
|
445
|
-
xlsx_path (str):
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
710
|
+
xlsx_path (str):
|
|
711
|
+
The path to the Excel file to load.
|
|
712
|
+
sheet_names (list | str | int, optional):
|
|
713
|
+
Name or Index of the sheet in the Excel workbook to load.
|
|
714
|
+
If 'None' then all sheets will be loaded.
|
|
715
|
+
If 0 then first sheet in workbook will be loaded (this is the Default).
|
|
716
|
+
If string then this is interpreted as the name of the sheet to load.
|
|
717
|
+
If a list is passed, this can be a list of index values (int) or
|
|
718
|
+
a list of strings with the sheet names to load.
|
|
719
|
+
usecols (list | str, optional):
|
|
720
|
+
A list of columns to load, specified by general column names in Excel,
|
|
721
|
+
e.g. usecols='B:D', usecols=['A', 'C', 'F']
|
|
722
|
+
skip_rows (int, optional):
|
|
723
|
+
List of rows to skip on top of the sheet (e.g. to not read headlines)
|
|
724
|
+
header (int | None, optional):
|
|
725
|
+
Excel Row (0-indexed) to use for the column labels of the parsed data frame.
|
|
726
|
+
If file contains no header row, then you should explicitly pass header=None.
|
|
727
|
+
Default is 0.
|
|
728
|
+
names (list, optional):
|
|
729
|
+
A list of column names to use. Default is None.
|
|
730
|
+
na_values (list, optional):
|
|
731
|
+
A list of values in the Excel that should become the Pandas NA value.
|
|
732
|
+
|
|
460
733
|
Returns:
|
|
461
|
-
bool:
|
|
734
|
+
bool:
|
|
735
|
+
False in case an error occured, True otherwise.
|
|
736
|
+
|
|
462
737
|
"""
|
|
463
738
|
|
|
464
739
|
if xlsx_path is not None and os.path.exists(xlsx_path):
|
|
@@ -473,16 +748,21 @@ class Data:
|
|
|
473
748
|
names=names,
|
|
474
749
|
na_values=na_values,
|
|
475
750
|
)
|
|
476
|
-
#
|
|
751
|
+
# If multiple sheets from an Excel workbook are loaded,
|
|
477
752
|
# then read_excel() returns a dictionary. The keys are
|
|
478
|
-
# the names of the sheets and the values are the
|
|
479
|
-
#
|
|
753
|
+
# the names of the sheets and the values are the data frames.
|
|
754
|
+
# As this class can only handle one data frame per object,
|
|
755
|
+
# We handle this case by concatenating the different sheets.
|
|
756
|
+
# If you don't want this make sure your Excel workbook has only
|
|
757
|
+
# one sheet or use the "sheet_name" parameter to select the one(s)
|
|
758
|
+
# you want to load.
|
|
480
759
|
if isinstance(df, dict):
|
|
481
|
-
logger.info("Loading multiple Excel sheets from the workbook!")
|
|
760
|
+
self.logger.info("Loading multiple Excel sheets from the workbook!")
|
|
482
761
|
multi_sheet_df = pd.DataFrame()
|
|
483
|
-
for sheet in df
|
|
762
|
+
for sheet in df:
|
|
484
763
|
multi_sheet_df = pd.concat(
|
|
485
|
-
[multi_sheet_df, df[sheet]],
|
|
764
|
+
[multi_sheet_df, df[sheet]],
|
|
765
|
+
ignore_index=True,
|
|
486
766
|
)
|
|
487
767
|
df = multi_sheet_df
|
|
488
768
|
if self._df is None:
|
|
@@ -490,89 +770,127 @@ class Data:
|
|
|
490
770
|
else:
|
|
491
771
|
self._df = pd.concat([self._df, df], ignore_index=True)
|
|
492
772
|
except FileNotFoundError:
|
|
493
|
-
logger.error(
|
|
773
|
+
self.logger.error(
|
|
494
774
|
"Excel file -> '%s' not found. Please check the file path.",
|
|
495
775
|
xlsx_path,
|
|
496
776
|
)
|
|
497
777
|
return False
|
|
498
778
|
except PermissionError:
|
|
499
|
-
logger.error(
|
|
500
|
-
"
|
|
779
|
+
self.logger.error(
|
|
780
|
+
"Missing permission to access the Excel file -> '%s'.",
|
|
781
|
+
xlsx_path,
|
|
501
782
|
)
|
|
502
783
|
return False
|
|
503
|
-
except
|
|
504
|
-
logger.error(
|
|
505
|
-
"An I/O error occurred
|
|
506
|
-
str(e),
|
|
784
|
+
except OSError:
|
|
785
|
+
self.logger.error(
|
|
786
|
+
"An I/O error occurred while reading the Excel file -> '%s'",
|
|
507
787
|
xlsx_path,
|
|
508
788
|
)
|
|
509
789
|
return False
|
|
510
|
-
except ValueError
|
|
511
|
-
logger.error(
|
|
512
|
-
"Invalid Excel input
|
|
790
|
+
except ValueError:
|
|
791
|
+
self.logger.error(
|
|
792
|
+
"Invalid Excel input in file -> '%s'",
|
|
793
|
+
xlsx_path,
|
|
513
794
|
)
|
|
514
795
|
return False
|
|
515
|
-
except AttributeError
|
|
516
|
-
logger.error("Unexpected data structure -> %s",
|
|
796
|
+
except AttributeError:
|
|
797
|
+
self.logger.error("Unexpected data structure in file -> %s", xlsx_path)
|
|
517
798
|
return False
|
|
518
|
-
except TypeError
|
|
519
|
-
logger.error("Unexpected data type -> %s",
|
|
799
|
+
except TypeError:
|
|
800
|
+
self.logger.error("Unexpected data type in file -> %s", xlsx_path)
|
|
520
801
|
return False
|
|
521
|
-
except KeyError
|
|
522
|
-
logger.error("Missing key in Excel data -> %s",
|
|
802
|
+
except KeyError:
|
|
803
|
+
self.logger.error("Missing key in Excel data in file -> %s", xlsx_path)
|
|
523
804
|
return False
|
|
524
805
|
|
|
525
806
|
else:
|
|
526
|
-
logger.error(
|
|
527
|
-
"Missing Excel file -> '%s'
|
|
807
|
+
self.logger.error(
|
|
808
|
+
"Missing Excel file -> '%s'. You have not specified a valid path!",
|
|
528
809
|
xlsx_path,
|
|
529
810
|
)
|
|
530
811
|
return False
|
|
812
|
+
|
|
531
813
|
return True
|
|
532
814
|
|
|
533
815
|
# end method definition
|
|
534
816
|
|
|
535
817
|
def save_excel_data(
|
|
536
|
-
self,
|
|
818
|
+
self,
|
|
819
|
+
excel_path: str,
|
|
820
|
+
sheet_name: str = "Pandas Export",
|
|
821
|
+
index: bool = False,
|
|
822
|
+
columns: list | None = None,
|
|
537
823
|
) -> bool:
|
|
538
|
-
"""
|
|
539
|
-
Save the DataFrame to an Excel file, with robust error handling and logging.
|
|
824
|
+
"""Save the data frame to an Excel file, with robust error handling and logging.
|
|
540
825
|
|
|
541
826
|
Args:
|
|
542
|
-
excel_path (str):
|
|
543
|
-
|
|
544
|
-
|
|
827
|
+
excel_path (str):
|
|
828
|
+
The file path to save the Excel file.
|
|
829
|
+
sheet_name (str):
|
|
830
|
+
The sheet name where data will be saved. Default is 'Sheet1'.
|
|
831
|
+
index (bool, optional):
|
|
832
|
+
Whether to write the row names (index). Default is False.
|
|
833
|
+
columns (list | None, optional):
|
|
834
|
+
A list of column names to write into the excel file.
|
|
835
|
+
|
|
836
|
+
Returns:
|
|
837
|
+
bool:
|
|
838
|
+
True = success, False = error.
|
|
839
|
+
|
|
545
840
|
"""
|
|
841
|
+
|
|
546
842
|
try:
|
|
547
843
|
# Check if the directory exists
|
|
548
844
|
directory = os.path.dirname(excel_path)
|
|
549
845
|
if directory and not os.path.exists(directory):
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
846
|
+
os.makedirs(directory)
|
|
847
|
+
|
|
848
|
+
# Validate columns if provided
|
|
849
|
+
if columns:
|
|
850
|
+
existing_columns = [col for col in columns if col in self._df.columns]
|
|
851
|
+
missing_columns = set(columns) - set(existing_columns)
|
|
852
|
+
if missing_columns:
|
|
853
|
+
self.logger.warning(
|
|
854
|
+
"The following columns do not exist in the data frame and cannot be saved to Excel -> %s",
|
|
855
|
+
", ".join(missing_columns),
|
|
856
|
+
)
|
|
857
|
+
columns = existing_columns
|
|
553
858
|
|
|
554
|
-
# Attempt to save the
|
|
555
|
-
self._df.to_excel(
|
|
556
|
-
|
|
859
|
+
# Attempt to save the data frame to Excel:
|
|
860
|
+
self._df.to_excel(
|
|
861
|
+
excel_path,
|
|
862
|
+
sheet_name=sheet_name,
|
|
863
|
+
index=index,
|
|
864
|
+
columns=columns or None, # Pass None if no columns provided
|
|
865
|
+
)
|
|
866
|
+
self.logger.info(
|
|
867
|
+
"Data frame saved successfully to Excel file -> '%s'.",
|
|
868
|
+
excel_path,
|
|
869
|
+
)
|
|
557
870
|
|
|
558
|
-
except FileNotFoundError
|
|
559
|
-
logger.error(
|
|
871
|
+
except FileNotFoundError:
|
|
872
|
+
self.logger.error(
|
|
873
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
874
|
+
excel_path,
|
|
875
|
+
)
|
|
560
876
|
return False
|
|
561
877
|
except PermissionError:
|
|
562
|
-
logger.error(
|
|
563
|
-
"
|
|
878
|
+
self.logger.error(
|
|
879
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
564
880
|
excel_path,
|
|
565
881
|
)
|
|
566
882
|
return False
|
|
567
|
-
except ValueError
|
|
568
|
-
logger.error(
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
883
|
+
except ValueError:
|
|
884
|
+
self.logger.error(
|
|
885
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
886
|
+
excel_path,
|
|
887
|
+
)
|
|
572
888
|
return False
|
|
573
|
-
except
|
|
574
|
-
|
|
575
|
-
|
|
889
|
+
except OSError:
|
|
890
|
+
self.logger.error(
|
|
891
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
892
|
+
excel_path,
|
|
893
|
+
)
|
|
576
894
|
return False
|
|
577
895
|
|
|
578
896
|
return True
|
|
@@ -580,130 +898,266 @@ class Data:
|
|
|
580
898
|
# end method definition
|
|
581
899
|
|
|
582
900
|
def load_csv_data(
|
|
583
|
-
self,
|
|
901
|
+
self,
|
|
902
|
+
csv_path: str,
|
|
903
|
+
delimiter: str = ",",
|
|
904
|
+
names: list | None = None,
|
|
905
|
+
header: int | None = 0,
|
|
906
|
+
usecols: list | None = None,
|
|
907
|
+
encoding: str = "utf-8",
|
|
584
908
|
) -> bool:
|
|
585
|
-
"""Load CSV (Comma separated values) data into
|
|
909
|
+
"""Load CSV (Comma separated values) data into data frame.
|
|
586
910
|
|
|
587
911
|
Args:
|
|
588
|
-
csv_path (str):
|
|
589
|
-
|
|
590
|
-
|
|
912
|
+
csv_path (str):
|
|
913
|
+
The path to the CSV file.
|
|
914
|
+
delimiter (str, optional, length = 1):
|
|
915
|
+
The character used to delimit values. Default is "," (comma).
|
|
916
|
+
names (list | None, optional):
|
|
917
|
+
The list of column names. This is useful if file does not have a header line
|
|
918
|
+
but just the data.
|
|
919
|
+
header (int | None, optional):
|
|
920
|
+
The index of the header line. Default is 0 (first line). None indicates
|
|
921
|
+
that the file does not have a header line
|
|
922
|
+
usecols (list | None, optional):
|
|
923
|
+
There are three possible list values types:
|
|
924
|
+
1. int:
|
|
925
|
+
These values are treated as column indices for columns to keep
|
|
926
|
+
(first column has index 0).
|
|
927
|
+
2. str:
|
|
928
|
+
The names of the columns to keep. For this to work the file needs
|
|
929
|
+
either a header line (i.e. 'header != None') or the 'names'
|
|
930
|
+
parameter must be specified.
|
|
931
|
+
3. bool:
|
|
932
|
+
The length of the list must match the number of columns. Only
|
|
933
|
+
columns that have a value of True are kept.
|
|
934
|
+
encoding (str, optional):
|
|
935
|
+
The encoding of the file. Default = "utf-8".
|
|
936
|
+
|
|
591
937
|
Returns:
|
|
592
|
-
bool:
|
|
938
|
+
bool:
|
|
939
|
+
False in case an error occured, True otherwise.
|
|
940
|
+
|
|
593
941
|
"""
|
|
594
942
|
|
|
595
|
-
if csv_path
|
|
596
|
-
#
|
|
943
|
+
if csv_path.startswith("http"):
|
|
944
|
+
# Download file from remote location specified by the packageUrl
|
|
945
|
+
# this must be a public place without authentication:
|
|
946
|
+
self.logger.debug("Download CSV file from URL -> '%s'.", csv_path)
|
|
947
|
+
|
|
597
948
|
try:
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
self._df = df
|
|
603
|
-
else:
|
|
604
|
-
self._df = pd.concat([self._df, df])
|
|
605
|
-
except FileNotFoundError:
|
|
606
|
-
logger.error(
|
|
607
|
-
"CSV file -> '%s' not found. Please check the file path.", csv_path
|
|
608
|
-
)
|
|
609
|
-
return False
|
|
610
|
-
except PermissionError:
|
|
611
|
-
logger.error(
|
|
612
|
-
"Permission denied to access the CSV file -> %s.", csv_path
|
|
613
|
-
)
|
|
949
|
+
response = requests.get(url=csv_path, timeout=1200)
|
|
950
|
+
response.raise_for_status()
|
|
951
|
+
except requests.exceptions.HTTPError:
|
|
952
|
+
self.logger.error("HTTP error with -> %s", csv_path)
|
|
614
953
|
return False
|
|
615
|
-
except
|
|
616
|
-
logger.error("
|
|
954
|
+
except requests.exceptions.ConnectionError:
|
|
955
|
+
self.logger.error("Connection error with -> %s", csv_path)
|
|
617
956
|
return False
|
|
618
|
-
except
|
|
619
|
-
logger.error("
|
|
957
|
+
except requests.exceptions.Timeout:
|
|
958
|
+
self.logger.error("Timeout error with -> %s", csv_path)
|
|
620
959
|
return False
|
|
621
|
-
except
|
|
622
|
-
logger.error("
|
|
623
|
-
return False
|
|
624
|
-
except TypeError as e:
|
|
625
|
-
logger.error("Unexpected data type -> %s", str(e))
|
|
626
|
-
return False
|
|
627
|
-
except KeyError as e:
|
|
628
|
-
logger.error("Missing key in CSV data -> %s", str(e))
|
|
960
|
+
except requests.exceptions.RequestException:
|
|
961
|
+
self.logger.error("Request error with -> %s", csv_path)
|
|
629
962
|
return False
|
|
630
963
|
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
"Missing CSV file -> '%s' you have not specified a valid path!",
|
|
964
|
+
self.logger.debug(
|
|
965
|
+
"Successfully downloaded CSV file -> %s; status code -> %s",
|
|
634
966
|
csv_path,
|
|
967
|
+
response.status_code,
|
|
635
968
|
)
|
|
636
|
-
return False
|
|
637
|
-
return True
|
|
638
969
|
|
|
639
|
-
|
|
970
|
+
# Convert bytes to a string using utf-8 and create a file-like object
|
|
971
|
+
csv_file = StringIO(response.content.decode(encoding))
|
|
640
972
|
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
"""Load XML data into DataFrame
|
|
973
|
+
elif os.path.exists(csv_path):
|
|
974
|
+
self.logger.debug("Using local CSV file -> '%s'.", csv_path)
|
|
975
|
+
csv_file = csv_path
|
|
645
976
|
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
"""
|
|
977
|
+
else:
|
|
978
|
+
self.logger.error(
|
|
979
|
+
"Missing CSV file -> '%s' you have not specified a valid path!",
|
|
980
|
+
csv_path,
|
|
981
|
+
)
|
|
982
|
+
return False
|
|
653
983
|
|
|
984
|
+
# Load data from CSV file or buffer
|
|
654
985
|
try:
|
|
655
|
-
df = pd.
|
|
656
|
-
|
|
986
|
+
df = pd.read_csv(
|
|
987
|
+
filepath_or_buffer=csv_file,
|
|
988
|
+
delimiter=delimiter,
|
|
989
|
+
names=names,
|
|
990
|
+
header=header,
|
|
991
|
+
usecols=usecols,
|
|
992
|
+
encoding=encoding,
|
|
993
|
+
skipinitialspace=True,
|
|
994
|
+
)
|
|
657
995
|
if self._df is None:
|
|
658
996
|
self._df = df
|
|
659
997
|
else:
|
|
660
998
|
self._df = pd.concat([self._df, df])
|
|
661
|
-
logger.info("XML file loaded successfully!")
|
|
662
|
-
return True
|
|
663
999
|
except FileNotFoundError:
|
|
664
|
-
|
|
1000
|
+
self.logger.error(
|
|
1001
|
+
"CSV file -> '%s' not found. Please check the file path.",
|
|
1002
|
+
csv_path,
|
|
1003
|
+
)
|
|
665
1004
|
return False
|
|
666
1005
|
except PermissionError:
|
|
667
|
-
logger.error(
|
|
1006
|
+
self.logger.error(
|
|
1007
|
+
"Permission denied to access the CSV file -> '%s'.",
|
|
1008
|
+
csv_path,
|
|
1009
|
+
)
|
|
668
1010
|
return False
|
|
669
|
-
except
|
|
670
|
-
logger.error("An I/O error occurred
|
|
1011
|
+
except OSError:
|
|
1012
|
+
self.logger.error("An I/O error occurred!")
|
|
671
1013
|
return False
|
|
672
|
-
except ValueError
|
|
673
|
-
logger.error("Invalid CSV input -> %s",
|
|
1014
|
+
except ValueError:
|
|
1015
|
+
self.logger.error("Invalid CSV input in file -> %s", csv_path)
|
|
674
1016
|
return False
|
|
675
|
-
except AttributeError
|
|
676
|
-
logger.error("Unexpected data structure -> %s",
|
|
1017
|
+
except AttributeError:
|
|
1018
|
+
self.logger.error("Unexpected data structure in file -> %s", csv_path)
|
|
677
1019
|
return False
|
|
678
|
-
except TypeError
|
|
679
|
-
logger.error("Unexpected data type -> %s",
|
|
1020
|
+
except TypeError:
|
|
1021
|
+
self.logger.error("Unexpected data type in file -> %s", csv_path)
|
|
680
1022
|
return False
|
|
681
|
-
except KeyError
|
|
682
|
-
logger.error("Missing key in CSV data -> %s",
|
|
1023
|
+
except KeyError:
|
|
1024
|
+
self.logger.error("Missing key in CSV data -> %s", csv_path)
|
|
683
1025
|
return False
|
|
684
1026
|
|
|
1027
|
+
return True
|
|
1028
|
+
|
|
685
1029
|
# end method definition
|
|
686
1030
|
|
|
687
|
-
def
|
|
688
|
-
|
|
1031
|
+
def load_xml_data(
|
|
1032
|
+
self,
|
|
1033
|
+
xml_path: str,
|
|
1034
|
+
xpath: str | None = None,
|
|
1035
|
+
xslt_path: str | None = None,
|
|
1036
|
+
encoding: str = "utf-8",
|
|
1037
|
+
) -> bool:
|
|
1038
|
+
"""Load XML data into a Pandas data frame.
|
|
689
1039
|
|
|
690
1040
|
Args:
|
|
691
|
-
|
|
692
|
-
|
|
1041
|
+
xml_path (str):
|
|
1042
|
+
The path to the XML file to load.
|
|
1043
|
+
xpath (str, optional):
|
|
1044
|
+
An XPath to the elements we want to select.
|
|
1045
|
+
xslt_path (str, optional):
|
|
1046
|
+
An XSLT transformation file to convert the XML data.
|
|
1047
|
+
encoding (str, optional):
|
|
1048
|
+
The encoding of the file. Default is UTF-8.
|
|
693
1049
|
|
|
694
1050
|
Returns:
|
|
695
|
-
bool:
|
|
1051
|
+
bool:
|
|
1052
|
+
False in case an error occured, True otherwise.
|
|
1053
|
+
|
|
696
1054
|
"""
|
|
697
1055
|
|
|
698
|
-
|
|
699
|
-
#
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
"The provided path -> '%s' is not a valid directory.", path_to_root
|
|
703
|
-
)
|
|
704
|
-
return False
|
|
1056
|
+
if xml_path.startswith("http"):
|
|
1057
|
+
# Download file from remote location specified by the packageUrl
|
|
1058
|
+
# this must be a public place without authentication:
|
|
1059
|
+
self.logger.debug("Download XML file from URL -> '%s'.", xml_path)
|
|
705
1060
|
|
|
706
|
-
|
|
1061
|
+
try:
|
|
1062
|
+
response = requests.get(url=xml_path, timeout=1200)
|
|
1063
|
+
response.raise_for_status()
|
|
1064
|
+
except requests.exceptions.HTTPError:
|
|
1065
|
+
self.logger.error("HTTP error with -> %s", xml_path)
|
|
1066
|
+
return False
|
|
1067
|
+
except requests.exceptions.ConnectionError:
|
|
1068
|
+
self.logger.error("Connection error with -> %s", xml_path)
|
|
1069
|
+
return False
|
|
1070
|
+
except requests.exceptions.Timeout:
|
|
1071
|
+
self.logger.error("Timeout error with -> %s", xml_path)
|
|
1072
|
+
return False
|
|
1073
|
+
except requests.exceptions.RequestException:
|
|
1074
|
+
self.logger.error("Request error with -> %s", xml_path)
|
|
1075
|
+
return False
|
|
1076
|
+
|
|
1077
|
+
self.logger.debug(
|
|
1078
|
+
"Successfully downloaded XML file -> '%s'; status code -> %s",
|
|
1079
|
+
xml_path,
|
|
1080
|
+
response.status_code,
|
|
1081
|
+
)
|
|
1082
|
+
# Convert bytes to a string using utf-8 and create a file-like object
|
|
1083
|
+
xml_file = StringIO(response.content.decode(encoding))
|
|
1084
|
+
|
|
1085
|
+
elif os.path.exists(xml_path):
|
|
1086
|
+
self.logger.debug("Using local XML file -> '%s'.", xml_path)
|
|
1087
|
+
xml_file = xml_path
|
|
1088
|
+
|
|
1089
|
+
else:
|
|
1090
|
+
self.logger.error(
|
|
1091
|
+
"Missing XML file -> '%s'. You have not specified a valid path or URL!",
|
|
1092
|
+
xml_path,
|
|
1093
|
+
)
|
|
1094
|
+
return False
|
|
1095
|
+
|
|
1096
|
+
# Load data from XML file or buffer
|
|
1097
|
+
try:
|
|
1098
|
+
df = pd.read_xml(
|
|
1099
|
+
path_or_buffer=xml_file,
|
|
1100
|
+
xpath=xpath,
|
|
1101
|
+
stylesheet=xslt_path,
|
|
1102
|
+
encoding=encoding,
|
|
1103
|
+
)
|
|
1104
|
+
# Process the loaded data as needed
|
|
1105
|
+
if self._df is None:
|
|
1106
|
+
self._df = df
|
|
1107
|
+
else:
|
|
1108
|
+
self._df = pd.concat([self._df, df])
|
|
1109
|
+
self.logger.info("XML file -> '%s' loaded successfully!", xml_path)
|
|
1110
|
+
except FileNotFoundError:
|
|
1111
|
+
self.logger.error("XML file -> '%s' not found.", xml_path)
|
|
1112
|
+
return False
|
|
1113
|
+
except PermissionError:
|
|
1114
|
+
self.logger.error(
|
|
1115
|
+
"Missing permission to access the XML file -> '%s'.",
|
|
1116
|
+
xml_path,
|
|
1117
|
+
)
|
|
1118
|
+
return False
|
|
1119
|
+
except OSError:
|
|
1120
|
+
self.logger.error("An I/O error occurred loading from -> %s", xml_path)
|
|
1121
|
+
return False
|
|
1122
|
+
except ValueError:
|
|
1123
|
+
self.logger.error("Invalid XML data in file -> %s", xml_path)
|
|
1124
|
+
return False
|
|
1125
|
+
except AttributeError:
|
|
1126
|
+
self.logger.error("Unexpected data structure in XML file -> %s", xml_path)
|
|
1127
|
+
return False
|
|
1128
|
+
except TypeError:
|
|
1129
|
+
self.logger.error("Unexpected data type in XML file -> %s", xml_path)
|
|
1130
|
+
return False
|
|
1131
|
+
except KeyError:
|
|
1132
|
+
self.logger.error("Missing key in XML file -> %s", xml_path)
|
|
1133
|
+
return False
|
|
1134
|
+
|
|
1135
|
+
return True
|
|
1136
|
+
|
|
1137
|
+
# end method definition
|
|
1138
|
+
|
|
1139
|
+
def load_directory(self, path_to_root: str) -> bool:
|
|
1140
|
+
"""Load directory structure into Pandas data frame.
|
|
1141
|
+
|
|
1142
|
+
Args:
|
|
1143
|
+
path_to_root (str):
|
|
1144
|
+
Path to the root element of the directory structure.
|
|
1145
|
+
|
|
1146
|
+
Returns:
|
|
1147
|
+
bool: True = Success, False = Failure
|
|
1148
|
+
|
|
1149
|
+
"""
|
|
1150
|
+
|
|
1151
|
+
try:
|
|
1152
|
+
# Check if the provided path is a directory
|
|
1153
|
+
if not os.path.isdir(path_to_root):
|
|
1154
|
+
self.logger.error(
|
|
1155
|
+
"The provided path -> '%s' is not a valid directory.",
|
|
1156
|
+
path_to_root,
|
|
1157
|
+
)
|
|
1158
|
+
return False
|
|
1159
|
+
|
|
1160
|
+
# Initialize a list to hold file information
|
|
707
1161
|
data = []
|
|
708
1162
|
|
|
709
1163
|
# Walk through the directory
|
|
@@ -715,55 +1169,88 @@ class Data:
|
|
|
715
1169
|
path_parts = relative_path.split(os.sep)
|
|
716
1170
|
|
|
717
1171
|
# Create a dictionary with the path parts and file details
|
|
718
|
-
entry = {
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
1172
|
+
entry = {"level {}".format(i): part for i, part in enumerate(path_parts[:-1], start=1)}
|
|
1173
|
+
|
|
1174
|
+
entry.update(
|
|
1175
|
+
{
|
|
1176
|
+
"filename": path_parts[-1],
|
|
1177
|
+
"size": file_size,
|
|
1178
|
+
"path": path_parts[1:-1],
|
|
1179
|
+
"relative_path": relative_path,
|
|
1180
|
+
"download_dir": root,
|
|
1181
|
+
},
|
|
1182
|
+
)
|
|
723
1183
|
data.append(entry)
|
|
724
1184
|
|
|
725
|
-
# Create
|
|
1185
|
+
# Create data frame from list of dictionaries:
|
|
726
1186
|
self._df = pd.DataFrame(data)
|
|
727
1187
|
|
|
728
1188
|
# Determine the maximum number of levels
|
|
729
1189
|
max_levels = max((len(entry) - 2 for entry in data), default=0)
|
|
730
1190
|
|
|
731
|
-
# Ensure all entries have the same number of levels
|
|
1191
|
+
# Ensure all entries have the same number of levels:
|
|
732
1192
|
for entry in data:
|
|
733
1193
|
for i in range(1, max_levels + 1):
|
|
734
1194
|
entry.setdefault("level {}".format(i), "")
|
|
735
1195
|
|
|
736
|
-
# Convert to
|
|
1196
|
+
# Convert to data frame again to make sure all columns are consistent:
|
|
737
1197
|
self._df = pd.DataFrame(data)
|
|
738
1198
|
|
|
739
|
-
except NotADirectoryError
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
1199
|
+
except NotADirectoryError:
|
|
1200
|
+
self.logger.error(
|
|
1201
|
+
"Provided path -> '%s' is not a directory!",
|
|
1202
|
+
path_to_root,
|
|
1203
|
+
)
|
|
1204
|
+
return False
|
|
1205
|
+
except FileNotFoundError:
|
|
1206
|
+
self.logger.error(
|
|
1207
|
+
"Provided path -> '%s' does not exist in file system!",
|
|
1208
|
+
path_to_root,
|
|
1209
|
+
)
|
|
1210
|
+
return False
|
|
1211
|
+
except PermissionError:
|
|
1212
|
+
self.logger.error(
|
|
1213
|
+
"Permission error accessing path -> '%s'!",
|
|
1214
|
+
path_to_root,
|
|
1215
|
+
)
|
|
1216
|
+
return False
|
|
745
1217
|
|
|
746
1218
|
return True
|
|
747
1219
|
|
|
748
1220
|
# end method definition
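
A minimal sketch of what load_directory() builds, using plain pandas and the same "level N" entry construction as above. The directory names are invented for illustration and the sketch omits the size/path/download_dir fields:

import pandas as pd

# Hypothetical relative paths as os.walk() + os.path.relpath() would yield them:
relative_paths = ["reports/2023/summary.pdf", "reports/2024/q1.xlsx", "readme.txt"]

data = []
for relative_path in relative_paths:
    path_parts = relative_path.split("/")  # os.sep on the actual platform
    # One "level N" column per folder level, plus the file name:
    entry = {"level {}".format(i): part for i, part in enumerate(path_parts[:-1], start=1)}
    entry.update({"filename": path_parts[-1], "relative_path": relative_path})
    data.append(entry)

df = pd.DataFrame(data)
print(df[["level 1", "filename"]])
# Rows without a deeper level simply get NaN in the missing "level N" columns.
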
|
|
749
1221
|
|
|
750
|
-
def load_xml_directory(
|
|
751
|
-
|
|
1222
|
+
def load_xml_directory(
|
|
1223
|
+
self,
|
|
1224
|
+
path_to_root: str,
|
|
1225
|
+
xpath: str | None = None,
|
|
1226
|
+
xml_files: list | None = None,
|
|
1227
|
+
) -> bool:
|
|
1228
|
+
"""Load XML files from a directory structure into Pandas data frame.
|
|
752
1229
|
|
|
753
1230
|
Args:
|
|
754
|
-
path_to_root (str):
|
|
755
|
-
|
|
756
|
-
xpath (str, optional):
|
|
1231
|
+
path_to_root (str):
|
|
1232
|
+
Path to the root element of the directory structure.
|
|
1233
|
+
xpath (str, optional):
|
|
1234
|
+
XPath to the XML elements we want to select.
|
|
1235
|
+
xml_files (list | None, optional):
|
|
1236
|
+
Names of the XML files to load from the directory.
|
|
757
1237
|
|
|
758
1238
|
Returns:
|
|
759
|
-
bool:
|
|
1239
|
+
bool:
|
|
1240
|
+
True = Success, False = Failure
|
|
1241
|
+
|
|
760
1242
|
"""
|
|
761
1243
|
|
|
1244
|
+
# Establish a default if None is passed via the parameter:
|
|
1245
|
+
if not xml_files:
|
|
1246
|
+
xml_files = ["docovw.xml"]
|
|
1247
|
+
|
|
762
1248
|
try:
|
|
763
1249
|
# Check if the provided path is a directory
|
|
764
1250
|
if not os.path.isdir(path_to_root):
|
|
765
|
-
logger.error(
|
|
766
|
-
"The provided path -> '%s' is not a valid directory.",
|
|
1251
|
+
self.logger.error(
|
|
1252
|
+
"The provided path -> '%s' is not a valid directory.",
|
|
1253
|
+
path_to_root,
|
|
767
1254
|
)
|
|
768
1255
|
return False
|
|
769
1256
|
|
|
@@ -774,36 +1261,223 @@ class Data:
|
|
|
774
1261
|
file_size = os.path.getsize(file_path)
|
|
775
1262
|
file_name = os.path.basename(file_path)
|
|
776
1263
|
|
|
777
|
-
if file_name
|
|
778
|
-
logger.info(
|
|
779
|
-
"Load XML file -> '%s' of size -> %s",
|
|
1264
|
+
if file_name in xml_files:
|
|
1265
|
+
self.logger.info(
|
|
1266
|
+
"Load XML file -> '%s' of size -> %s from -> '%s'...",
|
|
1267
|
+
file_name,
|
|
1268
|
+
file_size,
|
|
1269
|
+
file_path,
|
|
780
1270
|
)
|
|
781
1271
|
success = self.load_xml_data(file_path, xpath=xpath)
|
|
782
1272
|
if success:
|
|
783
|
-
logger.info(
|
|
784
|
-
"Successfully loaded XML file -> '%s'",
|
|
1273
|
+
self.logger.info(
|
|
1274
|
+
"Successfully loaded XML file -> '%s'.",
|
|
1275
|
+
file_path,
|
|
785
1276
|
)
|
|
786
1277
|
|
|
787
|
-
except NotADirectoryError
|
|
788
|
-
logger.error(
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
1278
|
+
except NotADirectoryError:
|
|
1279
|
+
self.logger.error(
|
|
1280
|
+
"Provided path -> '%s' is not a directory",
|
|
1281
|
+
path_to_root,
|
|
1282
|
+
)
|
|
1283
|
+
return False
|
|
1284
|
+
except FileNotFoundError:
|
|
1285
|
+
self.logger.error(
|
|
1286
|
+
"Provided path -> '%s' does not exist in file system!",
|
|
1287
|
+
path_to_root,
|
|
1288
|
+
)
|
|
1289
|
+
return False
|
|
1290
|
+
except PermissionError:
|
|
1291
|
+
self.logger.error(
|
|
1292
|
+
"Missing permission to access path -> '%s'",
|
|
1293
|
+
path_to_root,
|
|
1294
|
+
)
|
|
1295
|
+
return False
|
|
1296
|
+
|
|
1297
|
+
return True
|
|
1298
|
+
|
|
1299
|
+
# end method definition
|
|
1300
|
+
|
|
1301
|
+
def load_web_links(
|
|
1302
|
+
self,
|
|
1303
|
+
url: str,
|
|
1304
|
+
common_data: dict | None = None,
|
|
1305
|
+
pattern: str = r"",
|
|
1306
|
+
) -> list | None:
|
|
1307
|
+
"""Get all linked file URLs on a given web page (url) that are following a given pattern.
|
|
1308
|
+
|
|
1309
|
+
Construct a list of dictionaries based on this. This method is a helper method for load_web() below.
|
|
1310
|
+
|
|
1311
|
+
Args:
|
|
1312
|
+
url (str):
|
|
1313
|
+
The web page URL.
|
|
1314
|
+
common_data (dict | None, optional):
|
|
1315
|
+
Fields that should be added to each dictionary item. Defaults to None.
|
|
1316
|
+
pattern (str, optional):
|
|
1317
|
+
Regular Expression. Defaults to r"".
|
|
1318
|
+
|
|
1319
|
+
Returns:
|
|
1320
|
+
list | None:
|
|
1321
|
+
List of links on the web page that are complying with the given regular expression.
|
|
1322
|
+
|
|
1323
|
+
"""
|
|
1324
|
+
|
|
1325
|
+
try:
|
|
1326
|
+
response = requests.get(url, timeout=300)
|
|
1327
|
+
response.raise_for_status()
|
|
1328
|
+
except requests.RequestException:
|
|
1329
|
+
self.logger.error("Failed to retrieve page at %s", url)
|
|
1330
|
+
return []
|
|
1331
|
+
|
|
1332
|
+
# Find all file links (hyperlinks) on the page (no file extension assumed)
|
|
1333
|
+
# Example filename pattern: "al022023.public.005"
|
|
1334
|
+
file_links = re.findall(r'href="([^"]+)"', response.text)
|
|
1335
|
+
if not file_links:
|
|
1336
|
+
self.logger.warning("No file links found on the web page -> %s", url)
|
|
1337
|
+
return []
|
|
1338
|
+
|
|
1339
|
+
result_list = []
|
|
1340
|
+
base_url = url if url.endswith("/") else url + "/"
|
|
1341
|
+
|
|
1342
|
+
for link in file_links:
|
|
1343
|
+
data = common_data.copy() if common_data else {}
|
|
1344
|
+
|
|
1345
|
+
# Construct the full URL
|
|
1346
|
+
full_url = base_url + link.lstrip("/")
|
|
1347
|
+
|
|
1348
|
+
if pattern:
|
|
1349
|
+
# Filter by expected naming pattern for links
|
|
1350
|
+
match = re.search(pattern, link)
|
|
1351
|
+
if not match:
|
|
1352
|
+
continue
|
|
1353
|
+
|
|
1354
|
+
# Extract and assign groups if they exist
|
|
1355
|
+
# TODO(mdiefenb): these names are currently hard-coded
|
|
1356
|
+
# for the National Hurricane Center Dataset (NHC)
|
|
1357
|
+
if len(match.groups()) >= 1:
|
|
1358
|
+
data["Code"] = match.group(1).upper()
|
|
1359
|
+
if len(match.groups()) >= 2:
|
|
1360
|
+
data["Type"] = match.group(2)
|
|
1361
|
+
if len(match.groups()) >= 3:
|
|
1362
|
+
data["Message ID"] = match.group(3)
|
|
1363
|
+
|
|
1364
|
+
data["URL"] = full_url
|
|
1365
|
+
data["Filename"] = link
|
|
1366
|
+
|
|
1367
|
+
result_list.append(data)
|
|
1368
|
+
|
|
1369
|
+
return result_list
|
|
1370
|
+
|
|
1371
|
+
# end method definition
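
The pattern parameter drives both the filtering of links and the extraction of up to three pieces of metadata from the capture groups. A small sketch with a hypothetical link list and an NHC-style pattern shows how the groups map to "Code", "Type" and "Message ID":

import re

# Hypothetical hrefs as they might appear on an index page:
file_links = ["al022023.public.005", "al022023.discus.003", "index.html"]

# Hypothetical pattern with three capture groups, mirroring the group handling above:
pattern = r"(al\d{6})\.(public|discus)\.(\d{3})"

for link in file_links:
    match = re.search(pattern, link)
    if not match:
        continue  # links that do not match the pattern are skipped
    data = {
        "Code": match.group(1).upper(),
        "Type": match.group(2),
        "Message ID": match.group(3),
        "Filename": link,
    }
    print(data)
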
|
|
1372
|
+
|
|
1373
|
+
def load_web(
|
|
1374
|
+
self,
|
|
1375
|
+
values: list,
|
|
1376
|
+
value_name: str,
|
|
1377
|
+
url_templates: list,
|
|
1378
|
+
special_values: list | None = None,
|
|
1379
|
+
special_url_templates: dict | None = None,
|
|
1380
|
+
pattern: str = r"",
|
|
1381
|
+
) -> bool:
|
|
1382
|
+
"""Traverse years and bulletin types to collect all bulletin URLs.
|
|
1383
|
+
|
|
1384
|
+
Args:
|
|
1385
|
+
values (list):
|
|
1386
|
+
List of values to traverse over.
|
|
1387
|
+
value_name (str):
|
|
1388
|
+
Dictionary key to construct an item in combination with a value from values
|
|
1389
|
+
url_templates (list):
|
|
1390
|
+
URLs to traverse per value. The URLs should contain one {} that is
|
|
1391
|
+
replaced by the current value.
|
|
1392
|
+
special_values (list | None, optional):
|
|
1393
|
+
List of values (a subset of the other values list)
|
|
1394
|
+
that we want to handle in a special way. Defaults to None.
|
|
1395
|
+
special_url_templates (dict | None, optional):
|
|
1396
|
+
URLs for the special values. Defaults to None.
|
|
1397
|
+
The dictionary keys are the special values. The
|
|
1398
|
+
dictionary values are lists of special URLs with placeholders.
|
|
1399
|
+
pattern (str, optional):
|
|
1400
|
+
Regular expression to find the proper links on the page. Defaults to r"".
|
|
1401
|
+
|
|
1402
|
+
Returns:
|
|
1403
|
+
bool:
|
|
1404
|
+
True for success, False in case of an error.
|
|
1405
|
+
|
|
1406
|
+
"""
|
|
1407
|
+
|
|
1408
|
+
result_list = []
|
|
1409
|
+
|
|
1410
|
+
# We have two nested for loops below. The outer one traverses over all placeholder values.
|
|
1411
|
+
# These could be the calendar years, e.g. [2003,...,2024]
|
|
1412
|
+
# The inner for loop traverses over the list of specified URLs. We can have multiple for
|
|
1413
|
+
# each value.
|
|
1414
|
+
|
|
1415
|
+
# Do we have a list of placeholder values we want to iterate over?
|
|
1416
|
+
if values:
|
|
1417
|
+
# Traverse all values in the values list:
|
|
1418
|
+
for value in values:
|
|
1419
|
+
# Do we want a special treatment for this value (e.g. the current year)
|
|
1420
|
+
if special_values and value in special_values:
|
|
1421
|
+
self.logger.info("Processing special value -> '%s'...", value)
|
|
1422
|
+
if value not in special_url_templates and str(value) not in special_url_templates:
|
|
1423
|
+
self.logger.error(
|
|
1424
|
+
"Cannot find key -> '%s' in special URL templates dictionary -> %s! Skipping...",
|
|
1425
|
+
value,
|
|
1426
|
+
str(special_url_templates),
|
|
1427
|
+
)
|
|
1428
|
+
continue
|
|
1429
|
+
# If the dictionary uses string keys then we need to convert the value
|
|
1430
|
+
# to a string as well to avoid key errors:
|
|
1431
|
+
if str(value) in special_url_templates:
|
|
1432
|
+
value = str(value)
|
|
1433
|
+
special_url_template_list = special_url_templates[value]
|
|
1434
|
+
for special_url_template in special_url_template_list:
|
|
1435
|
+
# Now the value is inserted into the placeholder in the URL:
|
|
1436
|
+
special_url = special_url_template.format(value)
|
|
1437
|
+
common_data = {value_name: value} if value_name else None
|
|
1438
|
+
result_list += self.load_web_links(
|
|
1439
|
+
url=special_url,
|
|
1440
|
+
common_data=common_data,
|
|
1441
|
+
pattern=pattern,
|
|
1442
|
+
)
|
|
1443
|
+
else: # normal URLs
|
|
1444
|
+
self.logger.info("Processing value -> '%s'...", value)
|
|
1445
|
+
for url_template in url_templates:
|
|
1446
|
+
# Now the value is inserted into the placeholder in the URL:
|
|
1447
|
+
url = url_template.format(value)
|
|
1448
|
+
common_data = {value_name: value} if value_name else None
|
|
1449
|
+
result_list += self.load_web_links(
|
|
1450
|
+
url=url,
|
|
1451
|
+
common_data=common_data,
|
|
1452
|
+
pattern=pattern,
|
|
1453
|
+
)
|
|
1454
|
+
else:
|
|
1455
|
+
for url_template in url_templates:
|
|
1456
|
+
url = url_template
|
|
1457
|
+
result_list += self.load_web_links(
|
|
1458
|
+
url=url,
|
|
1459
|
+
common_data=None,
|
|
1460
|
+
pattern=pattern,
|
|
1461
|
+
)
|
|
1462
|
+
|
|
1463
|
+
# Add the data list to the data frame:
|
|
1464
|
+
self.append(result_list)
|
|
793
1465
|
|
|
794
1466
|
return True
|
|
795
1467
|
|
|
796
1468
|
# end method definition
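
A usage sketch for load_web(), assuming the Data class is importable from pyxecm.helper.data with a default constructor; the URLs, the year range and the pattern are hypothetical placeholders:

from pyxecm.helper.data import Data  # assumed import path

data = Data()  # assumed default constructor
data.load_web(
    values=list(range(2020, 2025)),                       # e.g. calendar years
    value_name="Year",                                     # becomes a field in each collected row
    url_templates=["https://example.com/archive/{}/"],     # {} is replaced by the current year
    special_values=[2024],                                 # values handled via their own templates
    special_url_templates={2024: ["https://example.com/current/{}/"]},
    pattern=r"(al\d{6})\.(public)\.(\d{3})",               # hypothetical link pattern
)
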
|
|
797
1469
|
|
|
798
1470
|
def partitionate(self, number: int) -> list:
|
|
799
|
-
"""Partition a data frame into equally sized
|
|
800
|
-
partions
|
|
1471
|
+
"""Partition a data frame into equally sized partitions.
|
|
801
1472
|
|
|
802
1473
|
Args:
|
|
803
|
-
number (int):
|
|
1474
|
+
number (int):
|
|
1475
|
+
The number of desired partitions.
|
|
804
1476
|
|
|
805
1477
|
Returns:
|
|
806
|
-
list:
|
|
1478
|
+
list:
|
|
1479
|
+
A list of created partitions.
|
|
1480
|
+
|
|
807
1481
|
"""
|
|
808
1482
|
|
|
809
1483
|
# Calculate the approximate size of each partition
|
|
@@ -817,24 +1491,20 @@ class Data:
|
|
|
817
1491
|
number = 1
|
|
818
1492
|
remainder = 0
|
|
819
1493
|
|
|
820
|
-
logger.info(
|
|
821
|
-
"Data
|
|
1494
|
+
self.logger.info(
|
|
1495
|
+
"Data frame has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
|
|
822
1496
|
str(size),
|
|
823
1497
|
str(number),
|
|
824
1498
|
str(partition_size),
|
|
825
1499
|
str(remainder),
|
|
826
1500
|
)
|
|
827
1501
|
|
|
828
|
-
# Initialize a list to store partitions
|
|
1502
|
+
# Initialize a list to store partitions:
|
|
829
1503
|
partitions = []
|
|
830
1504
|
start_index = 0
|
|
831
1505
|
|
|
832
|
-
# Slice the
|
|
1506
|
+
# Slice the data frame into equally sized partitions:
|
|
833
1507
|
for i in range(number):
|
|
834
|
-
# start_index = i * partition_size
|
|
835
|
-
# end_index = (i + 1) * partition_size if i < number - 1 else None
|
|
836
|
-
# partition = self._df.iloc[start_index:end_index]
|
|
837
|
-
# partitions.append(partition)
|
|
838
1508
|
# Calculate the end index for this partition
|
|
839
1509
|
end_index = start_index + partition_size + (1 if i < remainder else 0)
|
|
840
1510
|
partition = self._df.iloc[start_index:end_index]
|
|
@@ -849,34 +1519,44 @@ class Data:
|
|
|
849
1519
|
"""Partition a data frame based on equal values in a specified column.
|
|
850
1520
|
|
|
851
1521
|
Args:
|
|
852
|
-
column_name (str):
|
|
1522
|
+
column_name (str):
|
|
1523
|
+
The column name to partition by.
|
|
853
1524
|
|
|
854
1525
|
Returns:
|
|
855
|
-
list | None:
|
|
1526
|
+
list | None:
|
|
1527
|
+
List of partitions or None in case of an error (e.g. column name does not exist).
|
|
1528
|
+
|
|
856
1529
|
"""
|
|
857
1530
|
|
|
858
1531
|
if column_name not in self._df.columns:
|
|
859
|
-
logger.error(
|
|
860
|
-
"
|
|
1532
|
+
self.logger.error(
|
|
1533
|
+
"Cannot partitionate by column -> '%s'. Column does not exist in the data frame. Data frame has these columns -> %s",
|
|
861
1534
|
column_name,
|
|
862
1535
|
str(self._df.columns),
|
|
863
1536
|
)
|
|
864
1537
|
return None
|
|
865
1538
|
|
|
866
|
-
# Separate rows with NaN or None values in the specified column
|
|
1539
|
+
# Separate rows with NaN or None values in the specified column:
|
|
867
1540
|
nan_partitions = self._df[self._df[column_name].isna()]
|
|
1541
|
+
|
|
1542
|
+
# Keep only rows where the specified column has valid (non-NaN) values:
|
|
868
1543
|
non_nan_df = self._df.dropna(subset=[column_name])
|
|
869
1544
|
|
|
870
|
-
# Group by the specified column
|
|
1545
|
+
# Group the non-NaN DataFrame by the specified column's values:
|
|
871
1546
|
grouped = non_nan_df.groupby(column_name)
|
|
1547
|
+
|
|
1548
|
+
# Create a list of partitions (DataFrames) for each unique value in the column:
|
|
872
1549
|
partitions = [group for _, group in grouped]
|
|
873
1550
|
|
|
874
|
-
# Add each row with NaN
|
|
875
|
-
|
|
876
|
-
|
|
1551
|
+
# Add each row with NaN/None as its own partition
|
|
1552
|
+
# iterrows() returns each row as a Series. To convert it back to a DataFrame:
|
|
1553
|
+
# 1. .to_frame() turns the Series into a DataFrame, but with the original column names as rows.
|
|
1554
|
+
# 2. .T (transpose) flips it back, turning the original row into a proper DataFrame row.
|
|
1555
|
+
# This ensures that even rows with NaN values are treated as DataFrame partitions.
|
|
1556
|
+
partitions.extend([row.to_frame().T for _, row in nan_partitions.iterrows()])
|
|
877
1557
|
|
|
878
|
-
logger.info(
|
|
879
|
-
"Data
|
|
1558
|
+
self.logger.info(
|
|
1559
|
+
"Data frame has been partitioned into -> %s partitions based on the values in column -> '%s'...",
|
|
880
1560
|
str(len(partitions)),
|
|
881
1561
|
column_name,
|
|
882
1562
|
)
|
|
@@ -886,18 +1566,19 @@ class Data:
|
|
|
886
1566
|
# end method definition
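
A pandas-only sketch of the grouping behaviour described above: non-NaN rows are grouped by the column value, and each NaN row becomes its own single-row partition.

import pandas as pd

df = pd.DataFrame(
    {"owner": ["alice", "bob", "alice", None], "doc": ["a.pdf", "b.pdf", "c.pdf", "d.pdf"]}
)

# Group the rows with a valid value in the partition column:
non_nan_df = df.dropna(subset=["owner"])
partitions = [group for _, group in non_nan_df.groupby("owner")]

# Add every NaN row as its own single-row partition (Series -> DataFrame via to_frame().T):
nan_rows = df[df["owner"].isna()]
partitions.extend([row.to_frame().T for _, row in nan_rows.iterrows()])

print(len(partitions))  # 3: one for 'alice', one for 'bob', one for the NaN row
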
|
|
887
1567
|
|
|
888
1568
|
def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
|
|
889
|
-
"""Remove dupclicate rows that have all fields in
|
|
890
|
-
unique_fields in common.
|
|
1569
|
+
"""Remove dupclicate rows that have all fields in unique_fields in common.
|
|
891
1570
|
|
|
892
1571
|
Args:
|
|
893
|
-
unique_fields (list):
|
|
894
|
-
|
|
895
|
-
inplace (bool, optional):
|
|
896
|
-
|
|
1572
|
+
unique_fields (list):
|
|
1573
|
+
Defines the fields for which we want a unique combination.
|
|
1574
|
+
inplace (bool, optional):
|
|
1575
|
+
True if the deduplication happens in-place. Defaults to True.
|
|
1576
|
+
|
|
897
1577
|
Returns:
|
|
898
|
-
pd.DataFrame
|
|
899
|
-
|
|
900
|
-
|
|
1578
|
+
pd.DataFrame:
|
|
1579
|
+
If inplace is False then a new deduplicated data frame is returned.
|
|
1580
|
+
Otherwise the object is modified in place and self._df is returned.
|
|
1581
|
+
|
|
901
1582
|
"""
|
|
902
1583
|
|
|
903
1584
|
if inplace:
|
|
@@ -911,34 +1592,38 @@ class Data:
|
|
|
911
1592
|
|
|
912
1593
|
# end method definition
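
The in-place branch essentially delegates to pandas drop_duplicates() on the given subset; a minimal sketch of that core step:

import pandas as pd

df = pd.DataFrame({"Code": ["AL02", "AL02", "AL03"], "Type": ["public", "public", "public"]})

# Keep only one row per unique ("Code", "Type") combination - the essence of deduplicate():
deduplicated = df.drop_duplicates(subset=["Code", "Type"])
print(len(deduplicated))  # 2
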
|
|
913
1594
|
|
|
914
|
-
def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
|
|
915
|
-
"""Sort the data frame based on one or multiple fields
|
|
916
|
-
|
|
1595
|
+
def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame | None:
|
|
1596
|
+
"""Sort the data frame based on one or multiple fields.
|
|
1597
|
+
|
|
1598
|
+
Sorting can either happen in place or return a new data frame
|
|
1599
|
+
(i.e. not modifying self._df).
|
|
917
1600
|
|
|
918
1601
|
Args:
|
|
919
|
-
sort_fields (list):
|
|
920
|
-
|
|
921
|
-
|
|
1602
|
+
sort_fields (list):
|
|
1603
|
+
The columns / fields to be used for sorting.
|
|
1604
|
+
inplace (bool, optional):
|
|
1605
|
+
If the sorting should be inplace, i.e. modifying self._df.
|
|
1606
|
+
Defaults to True.
|
|
1607
|
+
|
|
922
1608
|
Returns:
|
|
923
|
-
pd.DataFrame
|
|
1609
|
+
pd.DataFrame | None:
|
|
1610
|
+
New data frame (if inplace = False) or self._df (if inplace = True).
|
|
1611
|
+
None in case of an error.
|
|
1612
|
+
|
|
924
1613
|
"""
|
|
925
1614
|
|
|
926
1615
|
if self._df is None:
|
|
927
1616
|
return None
|
|
928
1617
|
|
|
929
1618
|
if not all(sort_field in self._df.columns for sort_field in sort_fields):
|
|
930
|
-
logger.warning(
|
|
931
|
-
"Not all of the given sort fields -> %s do exist in the
|
|
1619
|
+
self.logger.warning(
|
|
1620
|
+
"Not all of the given sort fields -> %s do exist in the data frame.",
|
|
932
1621
|
str(sort_fields),
|
|
933
1622
|
)
|
|
934
|
-
# Reduce the sort fields to those that really exist in the
|
|
935
|
-
sort_fields = [
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
if sort_field in self._df.columns
|
|
939
|
-
]
|
|
940
|
-
logger.warning(
|
|
941
|
-
"Only these given sort fields -> %s do exist as columns in the Data Frame.",
|
|
1623
|
+
# Reduce the sort fields to those that really exist in the data frame:
|
|
1624
|
+
sort_fields = [sort_field for sort_field in sort_fields if sort_field in self._df.columns]
|
|
1625
|
+
self.logger.warning(
|
|
1626
|
+
"Only these given sort fields -> %s do exist as columns in the data frame.",
|
|
942
1627
|
str(sort_fields),
|
|
943
1628
|
)
|
|
944
1629
|
|
|
@@ -953,156 +1638,278 @@ class Data:
|
|
|
953
1638
|
|
|
954
1639
|
# end method definition
|
|
955
1640
|
|
|
956
|
-
def flatten(
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
1641
|
+
def flatten(self, parent_field: str, flatten_fields: list, concatenator: str = "_") -> None:
|
|
1642
|
+
"""Flatten a sub-dictionary by copying selected fields to the parent dictionary.
|
|
1643
|
+
|
|
1644
|
+
This is e.g. useful to then de-duplicate a data frame.
|
|
1645
|
+
Flattening a data frame makes sense in situations where a column used
|
|
1646
|
+
to have a list of dictionaries and got "exploded" (see explode_and_flatten()
|
|
1647
|
+
method below). In this case the column has dictionary values that can then
|
|
1648
|
+
be flattened.
|
|
964
1649
|
|
|
965
1650
|
Args:
|
|
966
|
-
parent_field (str):
|
|
967
|
-
|
|
968
|
-
|
|
1651
|
+
parent_field (str):
|
|
1652
|
+
Name prefix of the new column in the data frame. The flattened field
|
|
1653
|
+
names are added with a leading underscore.
|
|
1654
|
+
flatten_fields (list):
|
|
1655
|
+
Fields in the dictionary of the source column that are copied
|
|
1656
|
+
as new columns into the data frame.
|
|
1657
|
+
concatenator (str, optional):
|
|
1658
|
+
Character or string used to concatenate the parent field with the flattened field
|
|
1659
|
+
to create a unique name.
|
|
1660
|
+
|
|
969
1661
|
"""
|
|
970
1662
|
|
|
1663
|
+
# First do a sanity check if the data frame is not yet initialized.
|
|
1664
|
+
if self._df is None:
|
|
1665
|
+
self.logger.error(
|
|
1666
|
+
"The data frame is not initialized or empty. Cannot flatten field(s) -> '%s' in the data frame.",
|
|
1667
|
+
flatten_fields,
|
|
1668
|
+
)
|
|
1669
|
+
return
|
|
1670
|
+
|
|
1671
|
+
if parent_field not in self._df.columns:
|
|
1672
|
+
self.logger.warning(
|
|
1673
|
+
"The parent field -> '%s' cannot be flattened as it doesn't exist as column in the data frame!",
|
|
1674
|
+
parent_field,
|
|
1675
|
+
)
|
|
1676
|
+
return
|
|
1677
|
+
|
|
971
1678
|
for flatten_field in flatten_fields:
|
|
972
|
-
flat_field = parent_field +
|
|
1679
|
+
flat_field = parent_field + concatenator + flatten_field
|
|
973
1680
|
# The following expression generates a new column in the
|
|
974
1681
|
# data frame with the name of 'flat_field'.
|
|
975
|
-
# In the
|
|
1682
|
+
# In the lambda function x is a dictionary that includes the subvalues
|
|
976
1683
|
# and it returns the value of the given flatten field
|
|
977
1684
|
# (if it exists, otherwise None). So x is self._df[parent_field], i.e.
|
|
978
1685
|
# what the lambda function gets 'applied' on.
|
|
979
1686
|
self._df[flat_field] = self._df[parent_field].apply(
|
|
980
|
-
lambda x, sub_field=flatten_field: (
|
|
981
|
-
x.get(sub_field, None) if isinstance(x, dict) else None
|
|
982
|
-
)
|
|
1687
|
+
lambda x, sub_field=flatten_field: (x.get(sub_field, None) if isinstance(x, dict) else None),
|
|
983
1688
|
)
|
|
984
1689
|
|
|
985
1690
|
# end method definition
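
A sketch of the flattening mechanism with plain pandas: a dict-valued column (hypothetical name "properties") is turned into new "<parent>_<key>" columns via apply, just as the method does.

import pandas as pd

df = pd.DataFrame({"properties": [{"size": 10, "owner": "alice"}, {"size": 20}, None]})

# Same mechanism as flatten(parent_field="properties", flatten_fields=["size", "owner"]):
for flatten_field in ["size", "owner"]:
    flat_field = "properties" + "_" + flatten_field
    df[flat_field] = df["properties"].apply(
        lambda x, sub_field=flatten_field: x.get(sub_field, None) if isinstance(x, dict) else None
    )

print(df[["properties_size", "properties_owner"]])
# Missing keys and non-dict values (like the None row) simply become None.
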
|
|
986
1691
|
|
|
987
1692
|
def explode_and_flatten(
|
|
988
1693
|
self,
|
|
989
|
-
|
|
1694
|
+
explode_fields: str | list,
|
|
990
1695
|
flatten_fields: list | None = None,
|
|
991
1696
|
make_unique: bool = False,
|
|
992
1697
|
reset_index: bool = False,
|
|
993
1698
|
split_string_to_list: bool = False,
|
|
994
1699
|
separator: str = ";,",
|
|
995
|
-
) -> pd.DataFrame:
|
|
996
|
-
"""Explode a substructure in the
|
|
1700
|
+
) -> pd.DataFrame | None:
|
|
1701
|
+
"""Explode a substructure in the Pandas data frame.
|
|
997
1702
|
|
|
998
1703
|
Args:
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
flatten_fields (list):
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1704
|
+
explode_fields (str | list):
|
|
1705
|
+
Field(s) to explode. Each field to explode should have a list structure.
|
|
1706
|
+
Exploding multiple columns at once is possible. This delivers
|
|
1707
|
+
a very different result compared to exploding one column after the other!
|
|
1708
|
+
flatten_fields (list):
|
|
1709
|
+
Fields in the exploded substructure to include
|
|
1710
|
+
in the main dictionaries for easier processing.
|
|
1711
|
+
make_unique (bool, optional):
|
|
1712
|
+
If True, deduplicate the exploded data frame.
|
|
1713
|
+
reset_index (bool, False):
|
|
1714
|
+
If True, then the index is reset, False = Index is not reset.
|
|
1715
|
+
split_string_to_list (bool, optional):
|
|
1716
|
+
If True, split string values in the column(s) to explode into lists using the given separator.
|
|
1717
|
+
separator (str, optional):
|
|
1718
|
+
Characters used to split the string values in the given column into a list.
|
|
1719
|
+
|
|
1009
1720
|
Returns:
|
|
1010
|
-
pd.DataFrame
|
|
1721
|
+
pd.DataFrame | None:
|
|
1722
|
+
Pointer to the Pandas data frame.
|
|
1723
|
+
|
|
1011
1724
|
"""
|
|
1012
1725
|
|
|
1013
|
-
def update_column(row):
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1726
|
+
def update_column(row: pd.Series, sub: str) -> str:
|
|
1727
|
+
"""Extract the value of a sub-column from a nested dictionary within a Pandas Series.
|
|
1728
|
+
|
|
1729
|
+
Args:
|
|
1730
|
+
row (pd.Series):
|
|
1731
|
+
A row from the data frame.
|
|
1732
|
+
sub (str):
|
|
1733
|
+
The sub-column name to extract.
|
|
1734
|
+
|
|
1735
|
+
Returns:
|
|
1736
|
+
str:
|
|
1737
|
+
The value of the sub-column, or an empty string if not found.
|
|
1738
|
+
|
|
1739
|
+
"""
|
|
1740
|
+
|
|
1741
|
+
if isinstance(row, dict) and sub in row:
|
|
1742
|
+
return row[sub]
|
|
1743
|
+
return ""
|
|
1744
|
+
|
|
1745
|
+
# end def update_column()
|
|
1746
|
+
|
|
1747
|
+
def string_to_list(value: str) -> list:
|
|
1748
|
+
"""Convert a string to a list by splitting it using a specified separator.
|
|
1749
|
+
|
|
1750
|
+
If the input is already a list, it is returned as-is. If the input is `None` or a missing value,
|
|
1751
|
+
an empty list is returned. Otherwise, the string is split into a list of substrings using
|
|
1752
|
+
the given separator. Leading and trailing spaces in the resulting substrings are removed.
|
|
1753
|
+
|
|
1754
|
+
Args:
|
|
1755
|
+
value (str):
|
|
1756
|
+
The input string to be converted into a list. Can also be a list, `None`,
|
|
1757
|
+
or a missing value (e.g., NaN).
|
|
1758
|
+
|
|
1759
|
+
Returns:
|
|
1760
|
+
list:
|
|
1761
|
+
A list of substrings if the input is a string, or an empty list if the input
|
|
1762
|
+
is `None` or a missing value. If the input is already a list, it is returned unchanged.
|
|
1763
|
+
|
|
1764
|
+
"""
|
|
1765
|
+
|
|
1766
|
+
# Check if the value is already a list; if so, return it directly
|
|
1767
|
+
if isinstance(value, list):
|
|
1768
|
+
return value
|
|
1769
|
+
|
|
1770
|
+
# If the value is None or a missing value (e.g., NaN), return an empty list
|
|
1771
|
+
if not value or pd.isna(value):
|
|
1772
|
+
return []
|
|
1773
|
+
|
|
1774
|
+
# Use a regular expression to split the string by the separator
|
|
1775
|
+
# and remove leading/trailing spaces from each resulting substring
|
|
1776
|
+
return_list = re.split(rf"[{separator}]\s*", str(value))
|
|
1030
1777
|
|
|
1031
1778
|
return return_list
|
|
1032
1779
|
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1780
|
+
# end def string_to_list()
|
|
1781
|
+
|
|
1782
|
+
#
|
|
1783
|
+
# Start of main method:
|
|
1784
|
+
#
|
|
1785
|
+
|
|
1786
|
+
# First do a sanity check if the data frame is not yet initialized.
|
|
1787
|
+
if self._df is None:
|
|
1788
|
+
self.logger.error(
|
|
1789
|
+
"The data frame is not initialized or empty. Cannot explode data frame.",
|
|
1790
|
+
)
|
|
1791
|
+
return None
|
|
1792
|
+
|
|
1793
|
+
# Next do a sanity check for the given explode_field. It should
|
|
1794
|
+
# either be a string (single column name) or a list (multiple column names):
|
|
1795
|
+
if isinstance(explode_fields, list):
|
|
1796
|
+
self.logger.info("Exploding list of columns -> %s", str(explode_fields))
|
|
1797
|
+
elif isinstance(explode_fields, str):
|
|
1798
|
+
self.logger.info("Exploding single column -> '%s'", explode_fields)
|
|
1037
1799
|
else:
|
|
1038
|
-
logger.error(
|
|
1039
|
-
"Illegal explode field(s) data type
|
|
1800
|
+
self.logger.error(
|
|
1801
|
+
"Illegal explode field(s) data type -> %s. Explode field must either be a string or a list of strings.",
|
|
1802
|
+
type(explode_fields),
|
|
1040
1803
|
)
|
|
1041
1804
|
return self._df
|
|
1042
1805
|
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1806
|
+
# Ensure explode_fields is a list for uniform processing:
|
|
1807
|
+
if isinstance(explode_fields, str):
|
|
1808
|
+
explode_fields = [explode_fields]
|
|
1809
|
+
|
|
1810
|
+
# Process nested field names with '.'
|
|
1811
|
+
processed_fields = []
|
|
1812
|
+
for field in explode_fields:
|
|
1813
|
+
# The "." indicates that the column has dictionary values:
|
|
1814
|
+
if "." in field:
|
|
1815
|
+
main, sub = field.split(".", 1)
|
|
1816
|
+
if main not in self._df.columns:
|
|
1817
|
+
self.logger.error(
|
|
1818
|
+
"The column -> '%s' does not exist in the data frame! Cannot explode it. Data frame has these columns -> %s",
|
|
1819
|
+
main,
|
|
1820
|
+
str(self._df.columns.tolist()),
|
|
1821
|
+
)
|
|
1822
|
+
continue
|
|
1823
|
+
|
|
1824
|
+
# Use update_column to extract the dictionary key specified by the sub value:
|
|
1825
|
+
self.logger.info(
|
|
1826
|
+
"Extracting dictionary value for key -> '%s' from column -> '%s'.",
|
|
1827
|
+
sub,
|
|
1828
|
+
main,
|
|
1829
|
+
)
|
|
1830
|
+
self._df[main] = self._df[main].apply(update_column, args=(sub,))
|
|
1831
|
+
processed_fields.append(main)
|
|
1832
|
+
else:
|
|
1833
|
+
processed_fields.append(field)
|
|
1834
|
+
|
|
1835
|
+
# Verify all processed fields exist in the data frame:
|
|
1836
|
+
missing_columns = [col for col in processed_fields if col not in self._df.columns]
|
|
1837
|
+
if missing_columns:
|
|
1838
|
+
self.logger.error(
|
|
1839
|
+
"The following columns are missing in the data frame and cannot be exploded -> %s. Data frame has these columns -> %s",
|
|
1840
|
+
missing_columns,
|
|
1841
|
+
str(self._df.columns.tolist()),
|
|
1842
|
+
)
|
|
1843
|
+
return self._df
|
|
1844
|
+
|
|
1845
|
+
# Handle splitting strings into lists if required:
|
|
1846
|
+
if split_string_to_list:
|
|
1847
|
+
for field in processed_fields:
|
|
1848
|
+
self.logger.info(
|
|
1849
|
+
"Splitting strings in column -> '%s' into lists using separator -> '%s'",
|
|
1850
|
+
field,
|
|
1058
1851
|
separator,
|
|
1059
1852
|
)
|
|
1060
1853
|
# Apply the function to convert the string values in the column (given by the name in explode_field) to lists
|
|
1061
1854
|
# The string_to_list() sub-method above also considers the separator parameter.
|
|
1062
|
-
self._df[
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1855
|
+
self._df[field] = self._df[field].apply(string_to_list)
|
|
1856
|
+
|
|
1857
|
+
# Explode all specified columns at once.
|
|
1858
|
+
# explode() can either take a string field or a list of fields.
|
|
1859
|
+
# It is VERY important to do the explosion of multiple columns together -
|
|
1860
|
+
# otherwise we get combinatorial explosion. Explosion of multiple columns 1-by-1
|
|
1861
|
+
# is VERY different from doing the explosion together!
|
|
1862
|
+
self.logger.info("Validated column(s) to explode -> %s", processed_fields)
|
|
1863
|
+
try:
|
|
1864
|
+
self._df = self._df.explode(
|
|
1865
|
+
column=processed_fields,
|
|
1866
|
+
ignore_index=reset_index,
|
|
1867
|
+
)
|
|
1068
1868
|
except ValueError:
|
|
1069
|
-
logger.error(
|
|
1070
|
-
"
|
|
1869
|
+
self.logger.error(
|
|
1870
|
+
"Error exploding columns -> %s",
|
|
1871
|
+
processed_fields,
|
|
1071
1872
|
)
|
|
1873
|
+
return self._df
|
|
1072
1874
|
|
|
1073
1875
|
if flatten_fields:
|
|
1074
|
-
|
|
1876
|
+
# Ensure that flatten() is called for each exploded column
|
|
1877
|
+
for field in processed_fields:
|
|
1878
|
+
self.flatten(parent_field=field, flatten_fields=flatten_fields)
|
|
1075
1879
|
|
|
1880
|
+
# Deduplicate rows if required
|
|
1076
1881
|
if make_unique:
|
|
1077
1882
|
self._df.drop_duplicates(subset=flatten_fields, inplace=True)
|
|
1078
1883
|
|
|
1884
|
+
# Reset index explicitly if not handled during explode
|
|
1079
1885
|
if reset_index:
|
|
1080
|
-
self._df.reset_index(inplace=True)
|
|
1886
|
+
self._df.reset_index(drop=True, inplace=True)
|
|
1081
1887
|
|
|
1082
1888
|
return self._df
|
|
1083
1889
|
|
|
1084
1890
|
# end method definition
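
The comments above stress exploding multiple list columns together rather than one after the other; a small pandas sketch shows why the results differ:

import pandas as pd

df = pd.DataFrame({"a": [[1, 2]], "b": [["x", "y"]]})

# Exploding both columns together keeps the positional pairing (1/x, 2/y) -> 2 rows:
together = df.explode(column=["a", "b"], ignore_index=True)
print(len(together))  # 2

# Exploding one column after the other builds the cartesian product -> 4 rows:
one_by_one = df.explode("a").explode("b")
print(len(one_by_one))  # 4
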
|
|
1085
1891
|
|
|
1086
1892
|
def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
|
|
1087
|
-
"""Drop selected columns from the
|
|
1893
|
+
"""Drop selected columns from the Pandas data frame.
|
|
1088
1894
|
|
|
1089
1895
|
Args:
|
|
1090
|
-
column_names (list):
|
|
1091
|
-
|
|
1092
|
-
|
|
1896
|
+
column_names (list):
|
|
1897
|
+
The list of column names to drop.
|
|
1898
|
+
inplace (bool, optional):
|
|
1899
|
+
Whether or not the dropping should be inplace, i.e. modifying self._df.
|
|
1900
|
+
Defaults to True.
|
|
1901
|
+
|
|
1093
1902
|
Returns:
|
|
1094
|
-
pd.DataFrame:
|
|
1903
|
+
pd.DataFrame:
|
|
1904
|
+
New data frame (if inplace = False) or self._df (if inplace = True)
|
|
1905
|
+
|
|
1095
1906
|
"""
|
|
1096
1907
|
|
|
1097
1908
|
if not all(column_name in self._df.columns for column_name in column_names):
|
|
1098
|
-
# Reduce the column names to those that really exist in the
|
|
1099
|
-
column_names = [
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
if column_name in self._df.columns
|
|
1103
|
-
]
|
|
1104
|
-
logger.warning(
|
|
1105
|
-
"Reduce to these columns -> %s that do exist in the Data Frame.",
|
|
1909
|
+
# Reduce the column names to those that really exist in the data frame:
|
|
1910
|
+
column_names = [column_name for column_name in column_names if column_name in self._df.columns]
|
|
1911
|
+
self.logger.info(
|
|
1912
|
+
"Drop columns -> %s from the data frame.",
|
|
1106
1913
|
str(column_names),
|
|
1107
1914
|
)
|
|
1108
1915
|
|
|
@@ -1116,25 +1923,26 @@ class Data:
|
|
|
1116
1923
|
# end method definition
|
|
1117
1924
|
|
|
1118
1925
|
def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
|
|
1119
|
-
"""Keep only selected columns
|
|
1926
|
+
"""Keep only selected columns in the data frame. Drop the rest.
|
|
1120
1927
|
|
|
1121
1928
|
Args:
|
|
1122
|
-
column_names (list):
|
|
1123
|
-
|
|
1124
|
-
|
|
1929
|
+
column_names (list):
|
|
1930
|
+
A list of column names to keep.
|
|
1931
|
+
inplace (bool, optional):
|
|
1932
|
+
If the keeping should be inplace, i.e. modifying self._df.
|
|
1933
|
+
Defaults to True.
|
|
1934
|
+
|
|
1125
1935
|
Returns:
|
|
1126
|
-
pd.DataFrame:
|
|
1936
|
+
pd.DataFrame:
|
|
1937
|
+
New data frame (if inplace = False) or self._df (if inplace = True).
|
|
1938
|
+
|
|
1127
1939
|
"""
|
|
1128
1940
|
|
|
1129
1941
|
if not all(column_name in self._df.columns for column_name in column_names):
|
|
1130
|
-
# Reduce the column names to those that really exist in the
|
|
1131
|
-
column_names = [
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
if column_name in self._df.columns
|
|
1135
|
-
]
|
|
1136
|
-
logger.warning(
|
|
1137
|
-
"Reduce to these columns -> %s that do exist in the Data Frame.",
|
|
1942
|
+
# Reduce the column names to those that really exist in the data frame:
|
|
1943
|
+
column_names = [column_name for column_name in column_names if column_name in self._df.columns]
|
|
1944
|
+
self.logger.info(
|
|
1945
|
+
"Reduce columns to keep to these columns -> %s that do exist in the data frame.",
|
|
1138
1946
|
column_names,
|
|
1139
1947
|
)
|
|
1140
1948
|
|
|
@@ -1152,272 +1960,797 @@ class Data:
|
|
|
1152
1960
|
|
|
1153
1961
|
# end method definition
|
|
1154
1962
|
|
|
1155
|
-
def
|
|
1156
|
-
"""
|
|
1963
|
+
def rename_column(self, old_column_name: str, new_column_name: str) -> bool:
|
|
1964
|
+
"""Rename a data frame column.
|
|
1965
|
+
|
|
1966
|
+
Args:
|
|
1967
|
+
old_column_name (str):
|
|
1968
|
+
The old name of the column.
|
|
1969
|
+
new_column_name (str):
|
|
1970
|
+
The new name of the column.
|
|
1971
|
+
|
|
1972
|
+
Returns:
|
|
1973
|
+
bool:
|
|
1974
|
+
True = Success, False = Error
|
|
1975
|
+
|
|
1976
|
+
"""
|
|
1977
|
+
|
|
1978
|
+
if self._df is None:
|
|
1979
|
+
return False
|
|
1980
|
+
|
|
1981
|
+
if old_column_name not in self._df.columns:
|
|
1982
|
+
self.logger.error(
|
|
1983
|
+
"Cannot rename column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
1984
|
+
old_column_name,
|
|
1985
|
+
str(self._df.columns),
|
|
1986
|
+
)
|
|
1987
|
+
return False
|
|
1988
|
+
|
|
1989
|
+
if new_column_name in self._df.columns:
|
|
1990
|
+
self.logger.error(
|
|
1991
|
+
"Cannot rename column -> '%s' to -> '%s'. New name does already exist as column in the data frame! Data frame has these columns -> %s",
|
|
1992
|
+
old_column_name,
|
|
1993
|
+
new_column_name,
|
|
1994
|
+
str(self._df.columns),
|
|
1995
|
+
)
|
|
1996
|
+
return False
|
|
1997
|
+
|
|
1998
|
+
self._df.rename(columns={old_column_name: new_column_name}, inplace=True)
|
|
1999
|
+
|
|
2000
|
+
return True
|
|
2001
|
+
|
|
2002
|
+
# end method definition
|
|
2003
|
+
|
|
2004
|
+
def is_dict_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
|
|
2005
|
+
"""Safely checks if a column predominantly contains dictionary-like objects.
|
|
2006
|
+
|
|
2007
|
+
Args:
|
|
2008
|
+
column (pd.Series):
|
|
2009
|
+
The pandas Series (column) to check.
|
|
2010
|
+
threshold (float, optional):
|
|
2011
|
+
0.0 < threshold <= 1.0. Float representation of the percentage.
|
|
2012
|
+
Default = 0.5 (50%).
|
|
2013
|
+
|
|
2014
|
+
Returns:
|
|
2015
|
+
bool:
|
|
2016
|
+
True if the column contains mostly dictionary-like objects, False otherwise.
|
|
2017
|
+
|
|
2018
|
+
"""
|
|
2019
|
+
|
|
2020
|
+
if not isinstance(column, pd.Series):
|
|
2021
|
+
self.logger.error(
|
|
2022
|
+
"Expected Pandas series, but got -> %s",
|
|
2023
|
+
str(type(column)),
|
|
2024
|
+
)
|
|
2025
|
+
return False
|
|
2026
|
+
if not 0.0 < threshold <= 1.0:
|
|
2027
|
+
self.logger.error(
|
|
2028
|
+
"Threshold must be between 0.0 and 1.0, but got -> %s",
|
|
2029
|
+
str(threshold),
|
|
2030
|
+
)
|
|
2031
|
+
return False
|
|
2032
|
+
|
|
2033
|
+
# Drop null values (NaN or None) and check types of remaining values
|
|
2034
|
+
non_null_values = column.dropna()
|
|
2035
|
+
dict_count = non_null_values.apply(lambda x: isinstance(x, dict)).sum()
|
|
2036
|
+
|
|
2037
|
+
# If more than threshold % of non-null values are dictionaries, return True.
|
|
2038
|
+
# Else return False.
|
|
2039
|
+
return dict_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
|
|
2040
|
+
|
|
2041
|
+
# end method definition
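
The threshold check compares the share of dictionary values among the non-null values of the column; a short sketch of the same computation:

import pandas as pd

column = pd.Series([{"a": 1}, {"b": 2}, None, "not a dict"])

non_null_values = column.dropna()
dict_count = non_null_values.apply(lambda x: isinstance(x, dict)).sum()

# 2 of 3 non-null values are dictionaries -> ratio ~0.67 exceeds the 0.5 default threshold:
print(dict_count / len(non_null_values) > 0.5)  # True
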
|
|
2042
|
+
|
|
2043
|
+
def is_list_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
|
|
2044
|
+
"""Safely checks if a column predominantly contains list-like objects.
|
|
2045
|
+
|
|
2046
|
+
Args:
|
|
2047
|
+
column (pd.Series):
|
|
2048
|
+
The pandas Series (column) to check.
|
|
2049
|
+
threshold (float, optional):
|
|
2050
|
+
0.0 < threshold <= 1.0. Float representation of the percentage. Default = 0.5 (50%).
|
|
2051
|
+
|
|
2052
|
+
Returns:
|
|
2053
|
+
bool:
|
|
2054
|
+
True if the column contains list-like objects, False otherwise.
|
|
2055
|
+
|
|
2056
|
+
"""
|
|
2057
|
+
|
|
2058
|
+
if not isinstance(column, pd.Series):
|
|
2059
|
+
self.logger.error(
|
|
2060
|
+
"Expected pandas series, but got -> %s",
|
|
2061
|
+
str(type(column)),
|
|
2062
|
+
)
|
|
2063
|
+
return False
|
|
2064
|
+
if not 0.0 < threshold <= 1.0:
|
|
2065
|
+
self.logger.error(
|
|
2066
|
+
"Threshold must be between 0.0 and 1.0, but got -> %s",
|
|
2067
|
+
str(threshold),
|
|
2068
|
+
)
|
|
2069
|
+
return False
|
|
2070
|
+
|
|
2071
|
+
# Drop null values (NaN or None) and check types of remaining values
|
|
2072
|
+
non_null_values = column.dropna()
|
|
2073
|
+
list_count = non_null_values.apply(lambda x: isinstance(x, list)).sum()
|
|
2074
|
+
|
|
2075
|
+
# If more than threshold % of non-null values are lists, return True.
|
|
2076
|
+
# Else return False.
|
|
2077
|
+
return list_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
|
|
2078
|
+
|
|
2079
|
+
# end method definition
|
|
2080
|
+
|
|
2081
|
+
def is_string_column(self, column: pd.Series) -> bool:
|
|
2082
|
+
"""Determine if a Pandas series predominantly contains string values, ignoring NaN values.
|
|
1157
2083
|
|
|
1158
2084
|
Args:
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
cleansings = {
|
|
1167
|
-
"airportName": {
|
|
1168
|
-
"upper": true
|
|
1169
|
-
"replacements" : {
|
|
1170
|
-
"-": " ", # replace hypen with space
|
|
1171
|
-
",\s*": " ", # remove commas followed by on or more spaces with a single space
|
|
1172
|
-
"\s+$": "", # remove trailing spaces at the end of the name
|
|
1173
|
-
"^\s+": "", # remove spaces at the beginning of the name
|
|
1174
|
-
}
|
|
1175
|
-
"length": 10
|
|
1176
|
-
}
|
|
1177
|
-
"airportId": {
|
|
1178
|
-
"upper": true
|
|
1179
|
-
"replacements" : {
|
|
1180
|
-
"K(.{3})": "\1", # if the airport has 4 charters and starts with a 'K' we remove the 'K'
|
|
1181
|
-
"\/": "", # remove forward slashes - this helps to have consistency with N/A, NA, n/a, na
|
|
1182
|
-
}
|
|
1183
|
-
}
|
|
1184
|
-
}
|
|
2085
|
+
column (pd.Series):
|
|
2086
|
+
The Pandas Series to check.
|
|
2087
|
+
|
|
2088
|
+
Returns:
|
|
2089
|
+
bool:
|
|
2090
|
+
True if all non-NaN values in the column are strings, False otherwise.
|
|
2091
|
+
|
|
1185
2092
|
"""
|
|
1186
2093
|
|
|
1187
|
-
#
|
|
2094
|
+
# Drop NaN values and check if remaining values are strings
|
|
2095
|
+
return column.dropna().map(lambda x: isinstance(x, str)).all()
|
|
2096
|
+
|
|
2097
|
+
# end method definition
|
|
2098
|
+
|
|
2099
|
+
def cleanse(self, cleansings: dict) -> None:
|
|
2100
|
+
"""Cleanse data with regular expressions and upper/lower case conversions.
|
|
2101
|
+
|
|
2102
|
+
Args:
|
|
2103
|
+
cleansings (dict):
|
|
2104
|
+
Dictionary with keys that equal the column names.
|
|
2105
|
+
The dictionary values are dictionaries themselves with
|
|
2106
|
+
these fields:
|
|
2107
|
+
* replacements (dict): regular expression patterns (keys) and replacement strings (values)
|
|
2108
|
+
* upper (bool, optional, default = False): change the value to uppercase
|
|
2109
|
+
* lower (bool, optional, default = False): change the value to lowercase
|
|
2110
|
+
* capitalize (bool, optional, default = False) - first character upper case, rest lower-case
|
|
2111
|
+
* title (bool, optional, default = False) - first character of each word upper case
|
|
2112
|
+
* length (int, optional, default = 0): truncate to max length
|
|
2113
|
+
|
|
2114
|
+
"""
|
|
2115
|
+
|
|
2116
|
+
# Iterate over each column in the cleansing dictionary
|
|
1188
2117
|
for column, cleansing in cleansings.items():
|
|
1189
|
-
#
|
|
1190
|
-
|
|
2118
|
+
# Read the cleansing parameters:
|
|
2119
|
+
replacements = cleansing.get("replacements", {})
|
|
2120
|
+
upper = cleansing.get("upper", False)
|
|
2121
|
+
lower = cleansing.get("lower", False)
|
|
2122
|
+
capitalize = cleansing.get("capitalize", False)
|
|
2123
|
+
title = cleansing.get("title", False)
|
|
2124
|
+
length = cleansing.get("length", 0)
|
|
2125
|
+
|
|
2126
|
+
# Handle dict columns - we expect the column name to separate
|
|
2127
|
+
# main field from sub field using a dot syntax (e.g., "column.subfield")
|
|
1191
2128
|
if "." in column:
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
2129
|
+
column, dict_key = column.split(".", 1)
|
|
2130
|
+
if column not in self._df.columns:
|
|
2131
|
+
self.logger.error(
|
|
2132
|
+
"Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
2133
|
+
column,
|
|
2134
|
+
str(self._df.columns),
|
|
2135
|
+
)
|
|
1195
2136
|
continue
|
|
1196
|
-
#
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
2137
|
+
# Apply cleansing to dictionary values in the main column
|
|
2138
|
+
self.logger.info(
|
|
2139
|
+
"Cleansing for column -> '%s' has a subfield -> '%s' configured. Do cleansing for dictionary items with key -> '%s'...",
|
|
2140
|
+
column,
|
|
2141
|
+
dict_key,
|
|
2142
|
+
dict_key,
|
|
2143
|
+
)
|
|
2144
|
+
self._df[column] = self._df[column].apply(
|
|
2145
|
+
lambda x,
|
|
2146
|
+
dict_key=dict_key,
|
|
2147
|
+
replacements=replacements,
|
|
2148
|
+
upper=upper,
|
|
2149
|
+
lower=lower,
|
|
2150
|
+
capitalize=capitalize,
|
|
2151
|
+
title=title,
|
|
2152
|
+
length=length: self._cleanse_subfield(
|
|
1200
2153
|
data=x,
|
|
1201
|
-
|
|
1202
|
-
replacements=
|
|
1203
|
-
upper=
|
|
1204
|
-
lower=
|
|
1205
|
-
|
|
1206
|
-
|
|
2154
|
+
dict_key=dict_key,
|
|
2155
|
+
replacements=replacements,
|
|
2156
|
+
upper=upper,
|
|
2157
|
+
lower=lower,
|
|
2158
|
+
capitalize=capitalize,
|
|
2159
|
+
title=title,
|
|
2160
|
+
length=length,
|
|
2161
|
+
),
|
|
1207
2162
|
)
|
|
1208
|
-
|
|
1209
|
-
|
|
2163
|
+
# end if "." in column
|
|
2164
|
+
else: # the else case handles strings and list columns
|
|
2165
|
+
if column not in self._df.columns:
|
|
2166
|
+
self.logger.error(
|
|
2167
|
+
"Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
2168
|
+
column,
|
|
2169
|
+
str(self._df.columns),
|
|
2170
|
+
)
|
|
1210
2171
|
continue
|
|
1211
2172
|
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
self.
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
|
|
1240
|
-
# Wrap with word boundaries for whole-word matching
|
|
1241
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1242
|
-
self._df[column] = self._df[column].str.replace(
|
|
1243
|
-
pat=regex_pattern, repl=replacement, regex=True
|
|
2173
|
+
# Handle string columns:
|
|
2174
|
+
if self.is_string_column(self._df[column]):
|
|
2175
|
+
# Apply cleansing operations on string column
|
|
2176
|
+
self.logger.info(
|
|
2177
|
+
"Column -> '%s' has string values. Do cleansing for string values...",
|
|
2178
|
+
column,
|
|
2179
|
+
)
|
|
2180
|
+
self._df[column] = self._df[column].apply(
|
|
2181
|
+
lambda x,
|
|
2182
|
+
replacements=replacements,
|
|
2183
|
+
upper=upper,
|
|
2184
|
+
lower=lower,
|
|
2185
|
+
capitalize=capitalize,
|
|
2186
|
+
title=title,
|
|
2187
|
+
length=length: (
|
|
2188
|
+
self._apply_string_cleansing(
|
|
2189
|
+
value=x,
|
|
2190
|
+
replacements=replacements,
|
|
2191
|
+
upper=upper,
|
|
2192
|
+
lower=lower,
|
|
2193
|
+
capitalize=capitalize,
|
|
2194
|
+
title=title,
|
|
2195
|
+
length=length,
|
|
2196
|
+
)
|
|
2197
|
+
if isinstance(x, str)
|
|
2198
|
+
else x
|
|
2199
|
+
),
|
|
1244
2200
|
)
|
|
1245
2201
|
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
self.
|
|
1251
|
-
|
|
2202
|
+
# Handle list columns:
|
|
2203
|
+
elif self.is_list_column(self._df[column]):
|
|
2204
|
+
# Handle list-like columns for this we iterate over each list item
|
|
2205
|
+
# and apply the cleansing by calling _apply_string_cleansing() for item:
|
|
2206
|
+
self.logger.info(
|
|
2207
|
+
"Column -> '%s' has list values. Do cleansing for each list item...",
|
|
2208
|
+
column,
|
|
2209
|
+
)
|
|
2210
|
+
self._df[column] = self._df[column].apply(
|
|
2211
|
+
lambda x,
|
|
2212
|
+
replacements=replacements,
|
|
2213
|
+
upper=upper,
|
|
2214
|
+
lower=lower,
|
|
2215
|
+
capitalize=capitalize,
|
|
2216
|
+
title=title,
|
|
2217
|
+
length=length: (
|
|
2218
|
+
[
|
|
2219
|
+
(
|
|
2220
|
+
self._apply_string_cleansing(
|
|
2221
|
+
value=item,
|
|
2222
|
+
replacements=replacements,
|
|
2223
|
+
upper=upper,
|
|
2224
|
+
lower=lower,
|
|
2225
|
+
capitalize=capitalize,
|
|
2226
|
+
title=title,
|
|
2227
|
+
length=length,
|
|
2228
|
+
)
|
|
2229
|
+
if isinstance(
|
|
2230
|
+
item,
|
|
2231
|
+
str,
|
|
2232
|
+
) # we just change string list items
|
|
2233
|
+
else item
|
|
2234
|
+
)
|
|
2235
|
+
for item in x
|
|
2236
|
+
]
|
|
2237
|
+
if isinstance(x, list)
|
|
2238
|
+
else x
|
|
2239
|
+
),
|
|
2240
|
+
)
|
|
2241
|
+
|
|
2242
|
+
else:
|
|
2243
|
+
self.logger.error(
|
|
2244
|
+
"Column -> '%s' is not a string, list, or dict-like column. Skipping cleansing...",
|
|
2245
|
+
column,
|
|
2246
|
+
)
|
|
2247
|
+
# end else handling strings and lists
|
|
2248
|
+
# for column, cleansing in cleansings.items()
|
|
2249
|
+
|
|
2250
|
+
# end method definition
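
A hypothetical cleansings dictionary illustrating the supported keys; the column names are invented for illustration, and the call assumes the Data class is importable from pyxecm.helper.data with a default constructor:

from pyxecm.helper.data import Data  # assumed import path

cleansings = {
    # plain string column (hypothetical name):
    "airportName": {
        "upper": True,                # convert to upper case first
        "replacements": {"-": " "},   # then replace hyphens with spaces
        "length": 30,                 # finally truncate to 30 characters
    },
    # dict column: cleanse only the value stored under the key 'code' (hypothetical):
    "airport.code": {
        "upper": True,
    },
}

data = Data()  # assumed default constructor
# ... load some data into the frame first, e.g. via load_json_data() or load_directory() ...
data.cleanse(cleansings=cleansings)
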
|
|
2251
|
+
|
|
2252
|
+
def _cleanse_dictionary(
|
|
2253
|
+
self,
|
|
2254
|
+
data: dict,
|
|
2255
|
+
dict_key: str,
|
|
2256
|
+
replacements: dict[str, str],
|
|
2257
|
+
upper: bool,
|
|
2258
|
+
lower: bool,
|
|
2259
|
+
capitalize: bool = False,
|
|
2260
|
+
title: bool = False,
|
|
2261
|
+
length: int = 0,
|
|
2262
|
+
) -> dict:
|
|
2263
|
+
"""Cleanse dictionary data within a single column value that has a given key.
|
|
2264
|
+
|
|
2265
|
+
Args:
|
|
2266
|
+
data (dict):
|
|
2267
|
+
The column dictionary value.
|
|
2268
|
+
dict_key (str):
|
|
2269
|
+
The dictionary key whose value should be cleansed in the row to cleanse.
|
|
2270
|
+
replacements (dict):
|
|
2271
|
+
Dictionary of regex replacements to apply to the subfield value.
|
|
2272
|
+
upper (bool):
|
|
2273
|
+
If True, convert value in subfield to upper-case.
|
|
2274
|
+
lower (bool):
|
|
2275
|
+
If True, convert value in subfield to lower-case.
|
|
2276
|
+
capitalize (bool, optional):
|
|
2277
|
+
If True, capitalize the first letter of the subfield value.
|
|
2278
|
+
title (bool, optional):
|
|
2279
|
+
If True, title-case the subfield value.
|
|
2280
|
+
length (int, optional):
|
|
2281
|
+
The maximum length for the subfield value.
|
|
2282
|
+
|
|
2283
|
+
Returns:
|
|
2284
|
+
dict:
|
|
2285
|
+
The updated data with the cleansing applied to the dictionary item with the given key.
|
|
2286
|
+
|
|
2287
|
+
"""
|
|
2288
|
+
|
|
2289
|
+
if pd.isna(data):
|
|
2290
|
+
return data
|
|
2291
|
+
|
|
2292
|
+
if dict_key not in data:
|
|
2293
|
+
self.logger.warning(
|
|
2294
|
+
"The dictionary key -> '%s' (field) is not in the data frame row! Cleansing skipped!",
|
|
2295
|
+
dict_key,
|
|
2296
|
+
)
|
|
2297
|
+
return data
|
|
2298
|
+
|
|
2299
|
+
# 1. Read the value to be cleansed from the data dict:
|
|
2300
|
+
value = data[dict_key]
|
|
2301
|
+
|
|
2302
|
+
# 2. Apply string operations based on the type of the value (str, list, or dict)
|
|
2303
|
+
|
|
2304
|
+
if isinstance(value, str):
|
|
2305
|
+
# If the value is a string, apply the string operations directly
|
|
2306
|
+
value: str = self._apply_string_cleansing(
|
|
2307
|
+
value=value,
|
|
2308
|
+
replacements=replacements,
|
|
2309
|
+
upper=upper,
|
|
2310
|
+
lower=lower,
|
|
2311
|
+
capitalize=capitalize,
|
|
2312
|
+
title=title,
|
|
2313
|
+
length=length,
|
|
2314
|
+
)
|
|
2315
|
+
elif isinstance(value, list):
|
|
2316
|
+
# If the value is a list, apply string operations to each element
|
|
2317
|
+
value: list = [
|
|
2318
|
+
(
|
|
2319
|
+
self._apply_string_cleansing(
|
|
2320
|
+
value=item,
|
|
2321
|
+
replacements=replacements,
|
|
2322
|
+
upper=upper,
|
|
2323
|
+
lower=lower,
|
|
2324
|
+
capitalize=capitalize,
|
|
2325
|
+
title=title,
|
|
2326
|
+
length=length,
|
|
2327
|
+
)
|
|
2328
|
+
if isinstance(item, str)
|
|
2329
|
+
else item
|
|
2330
|
+
)
|
|
2331
|
+
for item in value
|
|
2332
|
+
]
|
|
2333
|
+
elif isinstance(value, dict):
|
|
2334
|
+
# If the value is a dictionary, apply string operations to each value
|
|
2335
|
+
value: dict = {
|
|
2336
|
+
k: (
|
|
2337
|
+
self._apply_string_cleansing(
|
|
2338
|
+
value=v,
|
|
2339
|
+
replacements=replacements,
|
|
2340
|
+
upper=upper,
|
|
2341
|
+
lower=lower,
|
|
2342
|
+
capitalize=capitalize,
|
|
2343
|
+
title=title,
|
|
2344
|
+
length=length,
|
|
1252
2345
|
)
|
|
2346
|
+
if isinstance(v, str)
|
|
2347
|
+
else v
|
|
2348
|
+
)
|
|
2349
|
+
for k, v in value.items()
|
|
2350
|
+
}
|
|
2351
|
+
|
|
2352
|
+
# 3. Write back the cleansed value to the data dict:
|
|
2353
|
+
data[dict_key] = value
|
|
1253
2354
|
|
|
1254
|
-
|
|
2355
|
+
return data
|
|
1255
2356
|
|
|
1256
2357
|
# end method definition
|
|
1257
2358
|
|
|
1258
2359
|
def _cleanse_subfield(
|
|
1259
2360
|
self,
|
|
1260
|
-
data:
|
|
1261
|
-
|
|
1262
|
-
replacements: dict,
|
|
2361
|
+
data: dict | list,
|
|
2362
|
+
dict_key: str,
|
|
2363
|
+
replacements: dict[str, str],
|
|
1263
2364
|
upper: bool,
|
|
1264
2365
|
lower: bool,
|
|
2366
|
+
capitalize: bool = False,
|
|
2367
|
+
title: bool = False,
|
|
1265
2368
|
length: int = 0,
|
|
1266
|
-
) ->
|
|
1267
|
-
"""
|
|
2369
|
+
) -> dict | list:
|
|
2370
|
+
"""Cleanse subfield data within a single column value.
|
|
2371
|
+
|
|
2372
|
+
This is NOT a pd.Series but either a dictionary or a list of dictionaries.
|
|
1268
2373
|
|
|
1269
2374
|
Args:
|
|
1270
|
-
data (
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
2375
|
+
data (dict | list):
|
|
2376
|
+
The column value. Can be a dictionary or a list of dictionaries
|
|
2377
|
+
dict_key (str):
|
|
2378
|
+
The dictionary key whose value should be cleansed in the data to cleanse.
|
|
2379
|
+
replacements (dict):
|
|
2380
|
+
Dictionary of regex replacements to apply to the subfield value.
|
|
2381
|
+
upper (bool):
|
|
2382
|
+
If True, convert value in subfield to upper-case.
|
|
2383
|
+
lower (bool):
|
|
2384
|
+
If True, convert value in subfield to lower-case.
|
|
2385
|
+
capitalize (bool, optional):
|
|
2386
|
+
If True, capitalize the first letter of the subfield value.
|
|
2387
|
+
title (bool, optional):
|
|
2388
|
+
If True, title-case the subfield value.
|
|
2389
|
+
length (int, optional):
|
|
2390
|
+
The maximum length for the subfield value.
|
|
2391
|
+
|
|
1276
2392
|
Returns:
|
|
1277
|
-
|
|
2393
|
+
dict | list:
|
|
2394
|
+
The updated data with the cleansing applied to the subfield.
|
|
2395
|
+
|
|
1278
2396
|
"""
|
|
1279
2397
|
|
|
1280
2398
|
if isinstance(data, list):
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
for regex_pattern, replacement in replacements.items():
|
|
1293
|
-
if replacement:
|
|
1294
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1295
|
-
item[sub_field] = re.sub(
|
|
1296
|
-
regex_pattern, replacement, item[sub_field]
|
|
1297
|
-
)
|
|
1298
|
-
if length > 0:
|
|
1299
|
-
item[sub_field] = item[sub_field][:length]
|
|
1300
|
-
data[i] = item
|
|
1301
|
-
elif isinstance(data, dict):
|
|
1302
|
-
# If data is a dictionary, apply cleansing directly to the subfield
|
|
1303
|
-
if sub_field in data and not pd.isnull(data[sub_field]):
|
|
1304
|
-
if upper:
|
|
1305
|
-
data[sub_field] = data[sub_field].upper()
|
|
1306
|
-
elif lower:
|
|
1307
|
-
data[sub_field] = data[sub_field].lower()
|
|
1308
|
-
for regex_pattern, replacement in replacements.items():
|
|
1309
|
-
if replacement:
|
|
1310
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1311
|
-
data[sub_field] = re.sub(
|
|
1312
|
-
regex_pattern, replacement, data[sub_field]
|
|
2399
|
+
data = [
|
|
2400
|
+
(
|
|
2401
|
+
self._cleanse_dictionary(
|
|
2402
|
+
data=item,
|
|
2403
|
+
dict_key=dict_key,
|
|
2404
|
+
replacements=replacements,
|
|
2405
|
+
upper=upper,
|
|
2406
|
+
lower=lower,
|
|
2407
|
+
capitalize=capitalize,
|
|
2408
|
+
title=title,
|
|
2409
|
+
length=length,
|
|
1313
2410
|
)
|
|
1314
|
-
|
|
1315
|
-
|
|
2411
|
+
if item is not None and dict_key in item and not pd.isna(item[dict_key])
|
|
2412
|
+
else item
|
|
2413
|
+
)
|
|
2414
|
+
for item in data
|
|
2415
|
+
]
|
|
2416
|
+
elif isinstance(data, dict):
|
|
2417
|
+
data = self._cleanse_dictionary(
|
|
2418
|
+
data=data,
|
|
2419
|
+
dict_key=dict_key,
|
|
2420
|
+
replacements=replacements,
|
|
2421
|
+
upper=upper,
|
|
2422
|
+
lower=lower,
|
|
2423
|
+
capitalize=capitalize,
|
|
2424
|
+
title=title,
|
|
2425
|
+
length=length,
|
|
2426
|
+
)
|
|
2427
|
+
|
|
1316
2428
|
return data
|
|
1317
2429
|
|
|
1318
2430
|
# end method definition
|
|
1319
2431
|
|
|
1320
|
-
def
|
|
1321
|
-
|
|
2432
|
+
def _apply_string_cleansing(
|
|
2433
|
+
self,
|
|
2434
|
+
value: str,
|
|
2435
|
+
replacements: dict[str, str],
|
|
2436
|
+
upper: bool,
|
|
2437
|
+
lower: bool,
|
|
2438
|
+
capitalize: bool,
|
|
2439
|
+
title: bool,
|
|
2440
|
+
length: int,
|
|
2441
|
+
) -> str | None:
|
|
2442
|
+
"""Apply string operations (upper, lower, capitalize, title-case, replacements) to a string.
|
|
2443
|
+
|
|
2444
|
+
Args:
|
|
2445
|
+
value (str):
|
|
2446
|
+
The string value to which the operations will be applied.
|
|
2447
|
+
replacements (dict[str, str]):
|
|
2448
|
+
A dictionary of regular expression patterns (keys) and replacement strings (values) to apply to the string.
|
|
2449
|
+
upper (bool):
|
|
2450
|
+
If True, convert the string to uppercase.
|
|
2451
|
+
lower (bool):
|
|
2452
|
+
If True, convert the string to lowercase.
|
|
2453
|
+
capitalize (bool):
|
|
2454
|
+
If True, capitalize the first letter of the string and lowercase the rest. Default is False.
|
|
2455
|
+
title (bool):
|
|
2456
|
+
If True, convert the string to title-case (first letter of each word is capitalized). Default is False.
|
|
2457
|
+
length (int):
|
|
2458
|
+
If greater than 0, truncate the string to this length. Default is 0 (no truncation).
|
|
2459
|
+
|
|
2460
|
+
Returns:
|
|
2461
|
+
str | None:
|
|
2462
|
+
The updated string with all the applied operations. None in case an error occurred.
|
|
2463
|
+
|
|
2464
|
+
Example:
|
|
2465
|
+
value = "hello world"
|
|
2466
|
+
replacements = {r"world": "there"}
|
|
2467
|
+
upper = True
|
|
2468
|
+
length = 5
|
|
2469
|
+
|
|
2470
|
+
result = _apply_string_cleansing(value, replacements, upper, length=length)
|
|
2471
|
+
# result would be "HELLO"
|
|
2472
|
+
|
|
2473
|
+
"""
|
|
2474
|
+
|
|
2475
|
+
if not isinstance(
|
|
2476
|
+
value,
|
|
2477
|
+
str,
|
|
2478
|
+
): # Only apply string operations if the value is a string
|
|
2479
|
+
return None
|
|
2480
|
+
|
|
2481
|
+
if upper:
|
|
2482
|
+
value = value.upper()
|
|
2483
|
+
if lower:
|
|
2484
|
+
value = value.lower()
|
|
2485
|
+
if capitalize:
|
|
2486
|
+
value = value.capitalize()
|
|
2487
|
+
if title:
|
|
2488
|
+
value = value.title()
|
|
2489
|
+
|
|
2490
|
+
# Handle regex replacements
|
|
2491
|
+
for regex_pattern, replacement in replacements.items():
|
|
2492
|
+
if regex_pattern:
|
|
2493
|
+
# Check if the pattern does NOT contain any regex special characters
|
|
2494
|
+
# (excluding dot and ampersand) and ONLY then use \b ... \b
|
|
2495
|
+
# Special regexp characters include: ^ $ * + ? ( ) | [ ] { } \
|
|
2496
|
+
if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
|
|
2497
|
+
# Wrap with word boundaries for whole-word matching
|
|
2498
|
+
# \b is a word boundary anchor in regular expressions.
|
|
2499
|
+
# It matches a position where one side is a word character
|
|
2500
|
+
# (like a letter or digit) and the other side is a non-word character
|
|
2501
|
+
# (like whitespace or punctuation). It's used to match whole words.
|
|
2502
|
+
# We want to have this to e.g. not replace "INT" with "INTERNATIONAL"
|
|
2503
|
+
# if the word is already "INTERNATIONAL". It is important
|
|
2504
|
+
# that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
|
|
2505
|
+
# a regular expression but just a normal string.
|
|
2506
|
+
# TODO: we may reconsider if re.escape() is required or not:
|
|
2507
|
+
regex_pattern = re.escape(regex_pattern)
|
|
2508
|
+
regex_pattern = rf"\b{regex_pattern}\b"
|
|
2509
|
+
try:
|
|
2510
|
+
value = re.sub(regex_pattern, replacement, value)
|
|
2511
|
+
except re.error:
|
|
2512
|
+
self.logger.error(
|
|
2513
|
+
"Invalid regex pattern -> '%s' in replacement processing!",
|
|
2514
|
+
regex_pattern,
|
|
2515
|
+
)
|
|
2516
|
+
continue
|
|
2517
|
+
|
|
2518
|
+
# Truncate to the specified length, starting from index 0
|
|
2519
|
+
if 0 < length < len(value):
|
|
2520
|
+
value = value[:length]
|
|
2521
|
+
|
|
2522
|
+
return value
|
|
2523
|
+
|
|
2524
|
+
# end method definition
|
|
2525
|
+
|
|
2526
|
+
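The replacement loop added above only wraps a pattern in `\b` word boundaries (after `re.escape()`) when the pattern contains no regex metacharacters, so plain strings are matched as whole words while real regular expressions pass through untouched. A minimal, standalone sketch of that rule — not code from the package, with invented patterns and values:

```python
import re

# Invented replacement table: a plain string and a genuine regular expression.
replacements = {"INT": "INTERNATIONAL", r" \d{4}$": ""}

def cleanse(value: str) -> str:
    for pattern, replacement in replacements.items():
        # Plain strings (no regex metacharacters) get word boundaries, regexes are used as-is.
        if not re.search(r"[\\^$*+?()|[\]{}]", pattern):
            pattern = rf"\b{re.escape(pattern)}\b"
        value = re.sub(pattern, replacement, value)
    return value

print(cleanse("ACME INT 2024"))            # -> ACME INTERNATIONAL
print(cleanse("ACME INTERNATIONAL 2024"))  # -> ACME INTERNATIONAL (no double replacement)
```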
+    def filter(
+        self,
+        conditions: list,
+        inplace: bool = True,
+        reset_index: bool = True,
+    ) -> pd.DataFrame | None:
+        """Filter the data frame based on (multiple) conditions.

         Args:
-            conditions (list):
-
-
-
-
-
-
-
-
-
-
+            conditions (list):
+                Conditions are a list of dictionaries with 3 items:
+                * field (str): The name of a column in the data frame
+                * value (str or list):
+                    Expected value (filter criterium).
+                    If it is a list then one of the list elements must match the field value (OR)
+                * equal (bool):
+                    Whether to test for equal or non-equal. If not specified equal is treated as True.
+                * regex (bool):
+                    This flag controls if the value is interpreted as a
+                    regular expression. If there is no regex item in the
+                    dictionary then the default is False (= values is NOT regex).
+                * enabled (bool):
+                    True or False. The filter is only applied if 'enabled = True'
+                If there are multiple conditions in the list each has to evaluate to True (AND)
+            inplace (bool, optional):
+                Defines if the self._df is modified (inplace) or just
+                a new data frame is returned. Defaults to True.
+            reset_index (bool, optional):
+                Filter removes rows. If filter_index = True then the numbering
+                of the index is newly calculated
+
         Returns:
-            pd.DataFrame
+            pd.DataFrame | None:
+                A new data frame or pointer to self._df (depending on the value of 'inplace').
+                None in case of an error.
+
         """

         if self._df is None:
-            logger.error("
+            self.logger.error("Data frame is not initialized.")
             return None

         if self._df.empty:
-            logger.error("
+            self.logger.error("Data frame is empty.")
             return None

-        #
-        #
+        # First filtered_df is the full data frame.
+        # Then it is subsequentially reduced by each condition
         # at the end it is just those rows that match all conditions.
-        filtered_df = self._df
+        filtered_df = self._df if inplace else self._df.copy()
+
+        def list_matches(row: list, values: list) -> bool:
+            """Check if any item in the 'values' list is present in the given 'row' list.
+
+            Args:
+                row (list):
+                    A list of items from the data frame column.
+                values (list):
+                    A list of values to check for in the 'row'.
+
+            Returns:
+                bool:
+                    True if any item in 'values' is found in 'row', otherwise False.
+
+            """
+
+            return any(item in values for item in row)
+
+        def dict_matches(row: dict, key: str, values: list) -> bool:
+            """Check if the value for the dictionary 'key' is in 'values'.

-
+            Args:
+                row (dict):
+                    A dictionary from the data frame column.
+                key (str):
+                    The key to lookup in the dictionary.
+                values (list):
+                    A list of values to check for in the 'row'.
+
+            Returns:
+                bool:
+                    True, if the value for the dictionary key is in 'values', otherwise False.
+
+            """
+
+            if not row or key not in row:
+                return False
+
+            return row[key] in values
+
+        # We traverse a list of conditions. Each condition must evaluate to True
         # otherwise the current workspace or document (i.e. the data set for these objects)
-        # will be skipped.
+        # will be skipped.
         for condition in conditions:
+            # Check if the condition is enabled. If 'enabled' is not
+            # in the condition dict then we assume it is enabled.
+            if not condition.get("enabled", True):
+                continue
             field = condition.get("field", None)
             if not field:
-                logger.error(
+                self.logger.error(
+                    "Missing value for filter condition 'field' in payload!",
+                )
                 continue
+            if "." in field:
+                field, sub = field.split(".", 1)
+            else:
+                sub = None
+
             if field not in self._df.columns:
-                logger.warning(
-                    "Filter condition field -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
+                self.logger.warning(
+                    "Filter condition field -> '%s' does not exist as column in the data frame! Data frame has these columns -> %s",
                     field,
                     str(self._df.columns),
                 )
-                continue  # Skip filtering for columns not present in
+                continue  # Skip filtering for columns not present in data frame
+
+            regex = condition.get("regex", False)
+            # We need the column to be of type string if we want to use regular expressions
+            # so if the column is not yet a string we convert the column to string:
+            if regex and filtered_df[field].dtype != "object":
+                # Change type of column to string:
+                filtered_df[field] = filtered_df[field].astype(str)
+                filtered_df[field] = filtered_df[field].fillna("")
+
             value = condition.get("value", None)
-            if
-
-
+            if value is None:
+                # Support alternative syntax using plural.
+                value = condition.get("values", None)
+                if value is None:
+                    self.logger.error(
+                        "Missing filter value(s) for filter condition field -> '%s'!",
+                        field,
                     )
                     continue
-            regex = condition.get("regex", False)
-
-            logger.info(
-                "Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
-                filtered_df.shape[0],
-                filtered_df.shape[1],
-                str(condition),
-            )
-
-            filtered_dfs = []

             # if a single string is passed as value we put
             # it into an 1-item list to simplify the following code:
             if not isinstance(value, list):
                 value = [value]

-            #
-
-
-
-
-
-
-
+            # If all values in the condition are strings then we
+            # want the column also to be of type string:
+            if all(isinstance(v, str) for v in value):
+                # Change type of column to string:
+                # filtered_df[field] = filtered_df[field].astype(str)
+                # filtered_df[field] = filtered_df[field].fillna("").astype(str)
+                # filtered_df[field] = filtered_df[field].fillna("")
+
+                # When inplace == True, filtered_df is just a reference to self._df.
+                # Using .loc[:, field] ensures that Pandas updates the column correctly in self._df.
+                # When inplace == False, filtered_df is a full copy (self._df.copy() above),
+                # so modifications remain in filtered_df.
+                # .loc[:, field] ensures no SettingWithCopyWarning, since filtered_df is now a separate DataFrame.
+                filtered_df.loc[:, field] = filtered_df[field].fillna("").astype(str)
+
+            self.logger.info(
+                "Data frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
+                str(filtered_df.shape[0]),
+                str(filtered_df.shape[1]),
+                str(condition),
+            )
+
+            # Check if the column is boolean
+            if pd.api.types.is_bool_dtype(filtered_df[field]):
+                # Convert string representations of booleans to actual booleans
+                value = [v.lower() in ["true", "1"] if isinstance(v, str) else bool(v) for v in value]
+
+            # Do we want to test for equalitiy or non-equality?
+            # For lists equality means: value is in the list
+            # For lists non-equality means: value is NOT in the list
+            test_for_equal = condition.get("equal", True)
+
+            # Check if the column contains only lists (every non-empty element in the column is a list).
+            # `filtered_df[field]`: Access the column with the name specified in 'field'.
+            # `.dropna()`: Drop None or NaN rows for the test.
+            # `.apply(lambda x: isinstance(x, list))`: For each element in the column, check if it is a list.
+            # `.all()`: Ensure that all elements in the column satisfy the condition of being a list.
+            if filtered_df[field].dropna().apply(lambda x: isinstance(x, list)).all():
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].apply(list_matches, values=value)]
+                else:
+                    filtered_df = filtered_df[filtered_df[field].apply(list_matches, values=value)]
+            # Check if the column contains only dictionaries (every non-empty element in the column is a dict).
+            # `filtered_df[field]`: Access the column with the name specified in 'field'.
+            # `.dropna()`: Drop None or NaN rows for the test.
+            # `.apply(lambda x: isinstance(x, dict))`: For each element in the column, check if it is a dict.
+            # `.all()`: Ensure that all elements in the column satisfy the condition of being a dictionary.
+            elif filtered_df[field].dropna().apply(lambda x: isinstance(x, dict)).all():
+                if not sub:
+                    self.logger.error(
+                        "Filtering on dictionary values need a key. This needs to be provided with 'field.key' syntax!",
                     )
+                    continue
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].apply(dict_matches, key=sub, values=value)]
                 else:
-
-
-
-
-
-
-
-
-
-
-                        str(
-
-
+                    filtered_df = filtered_df[filtered_df[field].apply(dict_matches, key=sub, values=value)]
+            # Check if the column has boolean values:
+            elif pd.api.types.is_bool_dtype(filtered_df[field]):
+                # For a boolean filter we can drop NA values:
+                filtered_df = filtered_df.dropna(subset=[field])
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].isin(value)]
+                else:
+                    filtered_df = filtered_df[filtered_df[field].isin(value)]
+            elif not regex:
+                if pd.api.types.is_string_dtype(filtered_df[field]):
+                    filtered_df[field] = filtered_df[field].str.strip()
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].isin(value)]
+                else:
+                    filtered_df = filtered_df[filtered_df[field].isin(value)]
             else:
-                #
-
-
-
-
-
+                # Create a pure boolean pd.Series as a filter criterium:
+                regex_condition = filtered_df[field].str.contains(
+                    "|".join(value),
+                    regex=True,
+                    na=False,
+                )
+                # Apply the boolean pd.Series named 'regex_condition' as
+                # a filter - either non-negated or negated (using ~):
+                filtered_df = filtered_df[~regex_condition] if not test_for_equal else filtered_df[regex_condition]
+
+            self.logger.info(
+                "Data frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
+                str(filtered_df.shape[0]),
+                str(filtered_df.shape[1]),
                 str(condition),
             )
         # end for condition
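For orientation, a hypothetical `conditions` payload matching the docstring above — column names and values are invented, and `data` stands for an instance of this `Data` class:

```python
conditions = [
    {"field": "status", "value": ["active", "pending"]},       # OR across the list values
    {"field": "type", "value": "workspace", "equal": False},   # keep rows that do NOT match
    {"field": "name", "value": r"^Project\s", "regex": True},  # regular expression match
    {"field": "owner.id", "value": "4711", "enabled": False},  # disabled, therefore skipped
]

# All enabled conditions must hold (AND); inplace=False leaves self._df untouched:
# filtered = data.filter(conditions=conditions, inplace=False)
```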
@@ -1425,23 +2758,29 @@ class Data:
         if inplace:
             self._df = filtered_df

+            if reset_index:
+                self._df.reset_index(inplace=True, drop=True)
+
         return filtered_df

     # end method definition

-    def fill_na_in_column(self, column_name: str, default_value: str | int):
-        """Replace NA values in a column with a defined new default value
+    def fill_na_in_column(self, column_name: str, default_value: str | int) -> None:
+        """Replace NA values in a column with a defined new default value.

         Args:
-            column_name (str):
-
+            column_name (str):
+                The name of the column in the data frame.
+            default_value (str | int):
+                The value to replace NA with.
+
         """

         if column_name in self._df.columns:
             self._df[column_name] = self._df[column_name].fillna(value=default_value)
         else:
-            logger.error(
-                "Cannot replace NA values as column -> '%s' does not exist in the
+            self.logger.error(
+                "Cannot replace NA values as column -> '%s' does not exist in the data frame! Available columns -> %s",
                 column_name,
                 str(self._df.columns),
             )
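A small illustration of what `fill_na_in_column()` does to a single column; the column name and default value are invented, and plain pandas mirrors the internal `fillna()` call:

```python
import pandas as pd

df = pd.DataFrame({"department": ["IT", None, "HR"]})
df["department"] = df["department"].fillna(value="Unknown")  # same operation the method applies to self._df
print(df["department"].tolist())  # ['IT', 'Unknown', 'HR']
```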
@@ -1449,16 +2788,19 @@ class Data:
     # end method definition

     def fill_forward(self, inplace: bool) -> pd.DataFrame:
-        """Fill the missing cells appropriately by carrying forward
-
-
-
+        """Fill the missing cells appropriately by carrying forward the values from the previous rows where necessary.
+
+        This has applications if a hierarchy is represented by
+        nested cells e.g. in an Excel sheet.

         Args:
-            inplace (bool):
+            inplace (bool):
+                Should the modification happen inplace or not.

         Returns:
-            pd.DataFrame:
+            pd.DataFrame:
+                The resulting data frame.
+
         """

         # To convert an Excel representation of a folder structure with nested
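The reworded docstring describes forward-filling nested hierarchy cells (e.g. from an Excel export). A sketch of the effect with plain pandas — the assumption that the method relies on a forward fill such as `ffill()` is mine, and the sample data is invented:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "level1": ["Folder A", None, None, "Folder B"],
        "level2": ["Sub 1", "Sub 2", "Sub 3", "Sub 4"],
    }
)
print(df.ffill())
#      level1 level2
# 0  Folder A  Sub 1
# 1  Folder A  Sub 2
# 2  Folder A  Sub 3
# 3  Folder B  Sub 4
```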
@@ -1471,70 +2813,137 @@ class Data:
     # end method definition

     def lookup_value(
-        self,
-
-
+        self,
+        lookup_column: str,
+        lookup_value: str,
+        separator: str = "|",
+        single_row: bool = True,
+    ) -> pd.Series | pd.DataFrame | None:
+        """Lookup row(s) that includes a lookup value in the value of a given column.

         Args:
-            lookup_column (str):
-
-
+            lookup_column (str):
+                The name of the column to search in.
+            lookup_value (str):
+                The value to search for.
+            separator (str):
+                The string list delimiter / separator. The pipe symbol | is the default
+                as it is unlikely to appear in a normal string (other than a plain comma).
+                The separator is NOT looked for in the lookup_value but in the column that
+                is given by lookup_column!
+            single_row (bool, optional):
+                This defines if we just return the first matching row if multiple matching rows
+                are found. Default is True (= single row).

         Returns:
-            pd.Series |
+            pd.Series | pd.DataFrame | None:
+                Data frame (multiple rows) or Series (row) that matches the lookup value.
+                None if no match was found.
+
         """

-        # Use the `apply` function to filter rows where the lookup value matches a
-
-
+        # Use the `apply` function to filter rows where the lookup value matches a
+        # whole item in the separator-divided list:
+        def match_lookup_value(string_list: str | None) -> bool:
+            """Check if the lookup value is in a string list.
+
+            For this the string list is converted to a python
+            list. A separator is used for the splitting.

             Args:
-                string_list (str):
+                string_list (str):
+                    Delimiter-separated string list like "a, b, c" or "a | b | c"

             Returns:
-                bool:
+                bool:
+                    True if lookup_value is equal to one of the delimiter-separated terms.
+
             """
+
+            if pd.isna(string_list):  # Handle None/NaN safely
+                return False
+
             # Ensure that the string is a string
             string_list = str(string_list)

-            return lookup_value in [
-                item.strip() for item in string_list.split(separator)
-            ]
+            return lookup_value in [item.strip() for item in string_list.split(separator)]

-
+        # end method definition

         if self._df is None:
             return None

+        df = self._df
+
         if lookup_column not in self._df.columns:
-            logger.error(
-                "
+            self.logger.error(
+                "Cannot lookup value in column -> '%s'. Column does not exist in the data frame! Data frame has these columns -> %s",
                 lookup_column,
                 str(self._df.columns),
             )
             return None

         # Fill NaN or None values in the lookup column with empty strings
-        df[lookup_column] = df[lookup_column].fillna("")
+        # df[lookup_column] = df[lookup_column].fillna("")
+
+        # Use the `apply` function to filter rows where the lookup value is in row cell
+        # of column given by lookup_column. match_lookup_value() is called with
+        # the content of the individual cell contents:
+        matched_rows = df[df[lookup_column].apply(match_lookup_value)]
+
+        # If nothing was found we return None:
+        if matched_rows.empty:
+            return None

-        #
-
+        # If it is OK to have multiple matches (= multiple rows = pd.DataFrame).
+        # We can just return the matched_rows now which should be a pd.DataFrame:
+        if not single_row:
+            return matched_rows
+
+        # Check if more than one row matches, and log a warning if so
+        if len(matched_rows) > 1:
+            self.logger.warning(
+                "More than one match found for lookup value -> '%s' in column -> '%s'. Returning the first match.",
+                lookup_value,
+                lookup_column,
+            )

         # Return the first matched row, if any
-
-        return matched_row.iloc[0]
+        return matched_rows.iloc[0]

-
+    # end method definition
+
+    def set_value(self, column: str, value, condition: pd.Series | None = None) -> None:  # noqa: ANN001
+        """Set the value in the data frame based on a condition.
+
+        Args:
+            column (str):
+                The name of the column.
+            value (Any):
+                The value to set for those rows that fulfill the condition.
+            condition (pd.Series, optional):
+                This should be a boolean Series where each element is True or False,
+                representing rows in the data frame that meet a certain condition.
+                If None is provided then ALL rows get the 'value' in the given
+                column.
+
+        """
+
+        if condition is None:
+            self._df[column] = value  # Set value unconditionally
+        else:
+            self._df.loc[condition, column] = value  # Set value based on condition

     # end method definition

     def add_column(
         self,
-        source_column: str,
-        reg_exp: str,
         new_column: str,
-
-
+        data_type: str = "string",
+        source_column: str = "",
+        reg_exp: str = "",
+        prefix: str = "",
+        suffix: str = "",
         length: int | None = None,
         group_chars: int | None = None,
         group_separator: str = ".",
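A hypothetical usage of the `lookup_value()` and `set_value()` methods shown above; the column names, values and the `data` instance are invented:

```python
# Find the row whose "aliases" cell contains the whole item "HR-0815" in a "|"-separated list:
row = data.lookup_value(lookup_column="aliases", lookup_value="HR-0815", separator="|")
if row is not None:
    print(row["name"])

# Set "synced" to True for all rows (condition=None) or only for rows matching a boolean Series:
data.set_value(column="synced", value=True)
data.set_value(column="synced", value=False, condition=data.get_data_frame()["name"] == "Archive")
```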
@@ -1543,26 +2952,78 @@ class Data:
         """Add additional column to the data frame.

         Args:
-
-
-
-
-
-
-
-
-
-
+            new_column (str):
+                The name of the column to add.
+            data_type (str, optional):
+                The data type of the new column.
+            source_column (str, optional):
+                The name of the source column.
+            reg_exp (str, optional):
+                A regular expression to apply on the content of the source column.
+            prefix (str, optional):
+                Prefix to add in front of the value. Defaults to "".
+            suffix (str, optional):
+                Suffix to add at the end of the value. Defaults to "".
+            length (int | None, optional):
+                Length to reduce to. Defaults to None (= unlimited).
+            group_chars (int | None, optional):
+                Group the resulting string in characters of group_chars. Defaults to None.
+                Usable e.g. for thousand seperator "."
+            group_separator (str, optional):
+                Separator string for the grouping. Defaults to ".".
+            group_remove_leading_zero (bool, optional):
+                Remove leading zeros from the groups. Defaults to True.

         Returns:
-            bool:
+            bool:
+                True = Success, False = Failure
+
         """

         if self._df is None:
             return False

+        # Check that the new column does not yet exist
+        if new_column in self._df.columns:
+            self.logger.error(
+                "New column -> '%s' does already exist in data frame! Cannot add it. Data frame has these columns -> %s",
+                new_column,
+                str(self._df.columns),
+            )
+            return False
+
+        # first we handle the very simple case to not have
+        # a source column but just add an empty new column:
+        if not source_column:
+            self._df[new_column] = pd.Series(dtype=data_type)
+            return True
+
+        # Check if the source column exists
+        if source_column not in self._df.columns:
+            self.logger.error(
+                "Source column -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
+                source_column,
+                str(self._df.columns),
+            )
+            return False
+
+        # Validate the regex pattern
+        try:
+            re.compile(reg_exp)  # Check if the pattern is a valid regex
+        except re.error:
+            self.logger.error(
+                "Invalid regular expression -> %s. Cannot extract data for new column -> '%s'!",
+                reg_exp,
+                new_column,
+            )
+            return False
+
+        # Ensure the source column is of type string (convert it, if necessary)
+        if self._df[source_column].dtype != "object":
+            self._df[source_column] = self._df[source_column].astype(str)
+
         # Use str.extract to apply the regular expression to the source column
-        # and then assign this modified
+        # and then assign this modified column to the variable "extracted":
         extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)

         # Limit the result to the specified length
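A hypothetical call of the extended `add_column()` signature documented above — the column names and the regular expression are invented:

```python
success = data.add_column(
    new_column="cost_center",
    data_type="string",
    source_column="description",
    reg_exp=r"CC-(\d+)",  # the extracted capture group becomes the new column value
    prefix="CC ",
    length=9,
)
```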
@@ -1571,9 +3032,9 @@ class Data:

         if group_chars is not None:

-            def process_grouping(x):
+            def process_grouping(x) -> str | None:  # noqa: ANN001
                 if pd.isna(x):
-                    return
+                    return None
                 # Split into groups
                 groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
                 if group_remove_leading_zero:
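A standalone sketch of the grouping idea `process_grouping()` implements: split a string into fixed-size chunks and join them with a separator (e.g. as a thousands separator). The helper below is illustrative only; the real code additionally strips leading zeros and handles NaN:

```python
def group_digits(x: str, group_chars: int = 3, group_separator: str = ".") -> str:
    groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
    return group_separator.join(groups)

print(group_digits("20240001"))  # -> 202.400.01
```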
@@ -1594,21 +3055,36 @@ class Data:

     # end method definition

-    def convert_to_lists(self, columns: list, delimiter: str = ","):
-        """
-
+    def convert_to_lists(self, columns: list, delimiter: str = ",") -> None:
+        """Intelligently convert string values to list values, in defined data frame columns.
+
+        The delimiter to separate values in the string value can be configured.
+        The method is ignoring delimiters that are inside quotes.

         Args:
-            columns (list):
-
-            delimiter (str, optional):
+            columns (list):
+                The name of the columns whose values should be converted to lists.
+            delimiter (str, optional):
+                Character that delimits list items. Defaults to ",".

         Returns:
             None. self._df is modified in place.
+
         """

         # Regex to split by the delimiter, ignoring those inside quotes or double quotes
-        def split_string_ignoring_quotes(s, delimiter):
+        def split_string_ignoring_quotes(s: str, delimiter: str) -> list:
+            """Split a string into a list at positions that have a delimiter character.
+
+            Args:
+                s (str): the string to split
+                delimiter (str): The single character that is used for splitting.
+
+            Returns:
+                A list of splitted values.
+
+            """
+
             # Escaping the delimiter in case it's a special regex character
             delimiter = re.escape(delimiter)
             # Match quoted strings and unquoted delimiters separately
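A minimal, self-contained sketch of the quote-aware splitting described above — the regular expression here is an illustrative variant, not necessarily the one used in `split_string_ignoring_quotes()`:

```python
import re

def split_ignoring_quotes(s: str, delimiter: str = ",") -> list:
    d = re.escape(delimiter)
    # Split on the delimiter only when an even number of double quotes follows (i.e. outside quotes).
    parts = re.split(rf'{d}(?=(?:[^"]*"[^"]*")*[^"]*$)', s)
    return [p.strip() for p in parts]

print(split_ignoring_quotes('red, "blue, navy", green'))
# -> ['red', '"blue, navy"', 'green']
```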
@@ -1617,27 +3093,84 @@ class Data:

         for col in columns:
             self._df[col] = self._df[col].apply(
-                lambda x: (
-
-
-
-
+                lambda x: (split_string_ignoring_quotes(x, delimiter) if isinstance(x, str) and delimiter in x else x),
+            )
+
+    # end method definition
+
+    def add_column_concat(
+        self,
+        source_columns: list,
+        new_column: str,
+        concat_char: str = "",
+        upper: bool = False,
+        lower: bool = False,
+        capitalize: bool = False,
+        title: bool = False,
+    ) -> None:
+        """Add a column as a concatenation of the values of multiple source columns.
+
+        Args:
+            source_columns (list):
+                The column names the list values are taken from.
+            new_column (str):
+                The name of the new column.
+            concat_char (str, optional):
+                Character to insert between the concatenated values. Default is "".
+            upper (bool, optional):
+                Convert result to uppercase if True.
+            lower (bool, optional):
+                Convert result to lowercase if True.
+            capitalize (bool, optional):
+                Capitalize the result if True.
+            title (bool, optional):
+                Convert result to title case if True.
+
+        Returns:
+            None. self._df is modified in place.
+
+        """
+
+        def concatenate(row: pd.Series) -> str:
+            # Comprehension to create a list from all source column values:
+            concatenated = concat_char.join(
+                [str(row[col]) for col in source_columns if pd.notna(row[col])],
             )

+            # Apply case transformations based on parameters
+            if upper:
+                concatenated = concatenated.upper()
+            elif lower:
+                concatenated = concatenated.lower()
+            elif capitalize:
+                concatenated = concatenated.capitalize()
+            elif title:
+                concatenated = concatenated.title()
+
+        # end method definition
+
+        self._df[new_column] = self._df.apply(concatenate, axis=1)
+
     # end method definition

-    def add_column_list(self, source_columns: list, new_column: str):
-        """Add a column with list objects.
-
+    def add_column_list(self, source_columns: list, new_column: str) -> None:
+        """Add a column with list objects.
+
+        The list items are taken from a list of source columns (row by row).

         Args:
-            source_columns (list):
-
+            source_columns (list):
+                The column names the list values are taken from.
+            new_column (str):
+                The name of the new column.
+
         Returns:
             None. self._df is modified in place.
+
         """

-        def create_list(row):
+        def create_list(row: pd.Series) -> list:
+            # Comprehension to create a list from all source column values:
             return [row[col] for col in source_columns]

         self._df[new_column] = self._df.apply(create_list, axis=1)
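Two hypothetical calls of the helpers added above (column names are invented):

```python
# Concatenate first and last name into a new upper-case column:
data.add_column_concat(
    source_columns=["first_name", "last_name"],
    new_column="full_name",
    concat_char=" ",
    upper=True,
)

# Collect several phone columns into one list-valued column:
data.add_column_list(
    source_columns=["phone_private", "phone_work"],
    new_column="phones",
)
```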
@@ -1645,87 +3178,90 @@ class Data:
     # end method definition

     def add_column_table(
-        self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        source_columns: list,
+        new_column: str,
+        delimiter: str = ",",
+    ) -> None:
+        """Add a column with tabular objects (list of dictionaries).
+
+        The source columns should include lists. The resulting dictionary
+        keys are the column names for the source columns.
+
+        Example (["X", "Y"] are the source_columns, "Table" is the new_column):
+            X[1] = [1, 2, 3] # row 1
+            Y[1] = ["A", "B", "C"] # row 1
+            X[2] = [4, 5, 6] # row 2
+            Y[2] = ["D", "E", "F"] # row 2
+
+            Table[1] = [
+                {
+                    "X": "1"
+                    "Y": "A"
+                },
+                {
+                    "X": "2"
+                    "Y": "B"
+                }
+                {
+                    "X": "3"
+                    "Y": "C"
+                }
+            ]
+            Table[2] = [
+                {
+                    "X": "4"
+                    "Y": "D"
+                },
+                {
+                    "X": "5"
+                    "Y": "E"
+                }
+                {
+                    "X": "6"
+                    "Y": "F"
+                }
+            ]

         Args:
-            source_columns (list):
-
-
+            source_columns (list):
+                The column names the list values are taken from.
+            new_column (str):
+                The name of the new column.
+            delimiter (str, optional):
+                Character that delimits list items. Defaults to ",".

         Returns:
             None. self._df is modified in place.
+
         """

         # Call the convert_to_lists method to ensure the columns are converted
         self.convert_to_lists(columns=source_columns, delimiter=delimiter)

         # Sub-method to pad lists to the same length
-        def pad_list(lst: list, max_len: int):
+        def pad_list(lst: list, max_len: int) -> list:
             return lst + [None] * (max_len - len(lst))

-        def create_table(row) -> list:
-            max_len = max(
-                len(row[col]) if isinstance(row[col], list) else 1
-                for col in source_columns
-            )
+        def create_table(row: pd.Series) -> list:
+            max_len = max(len(row[col]) if isinstance(row[col], list) else 1 for col in source_columns)

-            # Pad lists to the maximum length, leave
+            # Pad lists to the maximum length, leave scalar values as they are
             for col in source_columns:
                 if isinstance(row[col], list):
                     row[col] = pad_list(row[col], max_len)
+                elif not pd.isna(row[col]):
+                    row[col] = [
+                        row[col],
+                    ] * max_len  # Repeat scalar value to match the max length
                 else:
-
-
-
-
-                else:
-                    row[col] = [None] * max_len
-            # Create a list of dictionaries for each row
-            table = []
-            for i in range(max_len):
-                table.append({col: row[col][i] for col in source_columns})
+                    row[col] = [None] * max_len
+            # Create a list of dictionaries for each row:
+            table = [{col: row[col][i] for col in source_columns} for i in range(max_len)]
+
             return table

-        # Apply the function to create a new column with
+        # Apply the function to create a new column with table values:
         self._df[new_column] = self._df.apply(create_table, axis=1)

     # end method definition
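Finally, a hypothetical call of `add_column_table()` that follows the docstring example above (source columns "X" and "Y", new column "Table"):

```python
data.add_column_table(source_columns=["X", "Y"], new_column="Table", delimiter=",")
# Each "Table" cell now holds a list of {"X": ..., "Y": ...} dictionaries built from the
# padded list values of the source columns, as illustrated in the docstring.
```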