pyxecm 1.5-py3-none-any.whl → 2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pyxecm might be problematic.
- pyxecm/__init__.py +6 -2
- pyxecm/avts.py +1492 -0
- pyxecm/coreshare.py +1075 -960
- pyxecm/customizer/__init__.py +16 -4
- pyxecm/customizer/__main__.py +58 -0
- pyxecm/customizer/api/__init__.py +5 -0
- pyxecm/customizer/api/__main__.py +6 -0
- pyxecm/customizer/api/app.py +914 -0
- pyxecm/customizer/api/auth.py +154 -0
- pyxecm/customizer/api/metrics.py +92 -0
- pyxecm/customizer/api/models.py +13 -0
- pyxecm/customizer/api/payload_list.py +865 -0
- pyxecm/customizer/api/settings.py +103 -0
- pyxecm/customizer/browser_automation.py +332 -139
- pyxecm/customizer/customizer.py +1075 -1057
- pyxecm/customizer/exceptions.py +35 -0
- pyxecm/customizer/guidewire.py +322 -0
- pyxecm/customizer/k8s.py +787 -338
- pyxecm/customizer/log.py +107 -0
- pyxecm/customizer/m365.py +3424 -2270
- pyxecm/customizer/nhc.py +1169 -0
- pyxecm/customizer/openapi.py +258 -0
- pyxecm/customizer/payload.py +18201 -7030
- pyxecm/customizer/pht.py +1047 -210
- pyxecm/customizer/salesforce.py +836 -727
- pyxecm/customizer/sap.py +58 -41
- pyxecm/customizer/servicenow.py +851 -383
- pyxecm/customizer/settings.py +442 -0
- pyxecm/customizer/successfactors.py +408 -346
- pyxecm/customizer/translate.py +83 -48
- pyxecm/helper/__init__.py +5 -2
- pyxecm/helper/assoc.py +98 -38
- pyxecm/helper/data.py +2482 -742
- pyxecm/helper/logadapter.py +27 -0
- pyxecm/helper/web.py +229 -101
- pyxecm/helper/xml.py +528 -172
- pyxecm/maintenance_page/__init__.py +5 -0
- pyxecm/maintenance_page/__main__.py +6 -0
- pyxecm/maintenance_page/app.py +51 -0
- pyxecm/maintenance_page/settings.py +28 -0
- pyxecm/maintenance_page/static/favicon.avif +0 -0
- pyxecm/maintenance_page/templates/maintenance.html +165 -0
- pyxecm/otac.py +234 -140
- pyxecm/otawp.py +2689 -0
- pyxecm/otcs.py +12344 -7547
- pyxecm/otds.py +3166 -2219
- pyxecm/otiv.py +36 -21
- pyxecm/otmm.py +1363 -296
- pyxecm/otpd.py +231 -127
- pyxecm-2.0.0.dist-info/METADATA +145 -0
- pyxecm-2.0.0.dist-info/RECORD +54 -0
- {pyxecm-1.5.dist-info → pyxecm-2.0.0.dist-info}/WHEEL +1 -1
- pyxecm-1.5.dist-info/METADATA +0 -51
- pyxecm-1.5.dist-info/RECORD +0 -30
- {pyxecm-1.5.dist-info → pyxecm-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {pyxecm-1.5.dist-info → pyxecm-2.0.0.dist-info}/top_level.txt +0 -0
pyxecm/helper/data.py
CHANGED
@@ -1,74 +1,61 @@
-"""
-
-
-
-This code implements a class called
-to Pandas
-
-Class: Payload
-Methods:
-
-__init__ : class initializer
-__len__: Lenght of the embedded DataFrame object.
-__str__: Print the DataFrame of the class
-get_data_frame: Get the Pandas DataFrame object
-set_data_frame: Set the Pandas DataFrame object
-append: Append additional data to the data frame.
-
-load_json_data: Load JSON data into DataFrame
-save_json_data: Save JSON data from DataFrame to file
-load_excel_data: Load Excel file into DataFrame
-load_csv_data: Load CSV data into DataFrame
-load_directory: Load directory structure into Pandas Data Frame
-
-partitionate: Partition a data frame into equally sized partions
-deduplicate: Remove dupclicate rows that have all fields in unique_fields in common
-sort: Sort the data frame based on one or multiple fields.
-flatten: Flatten a sub-dictionary by copying selected fields to the
-parent dictionary.
-explode_and_flatten: Explode a substructure in the Data Frame
-drop_columns: Drop selected columns from the Data Frame
-keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
-cleanse: Cleanse data with regular expressions and upper/lower case conversion.
-filter: Filter the DataFrame based on conditions
-
-fill_forward: Fill the missing cells appropriately by carrying forward
-the values from the previous rows where necessary.
-fill_na_in_column: Replace NA values in a column with a defined new default value
+"""Data Module leveraging Pandas to manipulte data sets read for bulk generation of Content Server items.
+
+See: https://pandas.pydata.org
+
+This code implements a class called "Data" which is a wrapper
+to Pandas data frame.
 """
 
 __author__ = "Dr. Marc Diefenbruch"
-__copyright__ = "Copyright 2024, OpenText"
+__copyright__ = "Copyright (C) 2024-2025, OpenText"
 __credits__ = ["Kai-Philip Gatzweiler"]
 __maintainer__ = "Dr. Marc Diefenbruch"
 __email__ = "mdiefenb@opentext.com"
 
-import logging
 import json
+import logging
 import os
 import re
 import threading
+from io import StringIO
 
 import pandas as pd
+import requests
 
-
+default_logger = logging.getLogger("pyxecm.helper.data")
 
 
 class Data:
     """Used to automate data loading for the customizer."""
 
+    logger: logging.Logger = default_logger
+
     _df: pd.DataFrame
-    _lock = threading.Lock()
+    _lock: threading.Lock = threading.Lock()
 
-    def __init__(
+    def __init__(
+        self,
+        init_data: pd.DataFrame | list = None,
+        logger: logging.Logger = default_logger,
+    ) -> None:
         """Initialize the Data object.
 
         Args:
-            init_data (pd.DataFrame | list, optional):
-
-
+            init_data (pd.DataFrame | list, optional):
+                Data to initialize the data frame. Can either be
+                another data frame (that gets copied) or a list of dictionaries.
+                Defaults to None.
+            logger (logging.Logger, optional):
+                Pass a special logging object. This is optional. If not provided,
+                the default logger is used.
+
         """
 
+        if logger != default_logger:
+            self.logger = logger.getChild("data")
+            for logfilter in logger.filters:
+                self.logger.addFilter(logfilter)
+
         if init_data is not None:
             # if a data frame is passed to the constructor we
             # copy its content to the new Data object
@@ -84,7 +71,7 @@ class Data:
                 # it is important to wrap the dict in a list to avoid that more than 1 row is created
                 self._df: pd.DataFrame = pd.DataFrame([init_data])
             else:
-                logger.error("Illegal initialization data for 'Data' class!")
+                self.logger.error("Illegal initialization data for 'Data' class!")
                 self._df = None
         else:
             self._df = None
@@ -92,11 +79,14 @@ class Data:
     # end method definition
 
     def __len__(self) -> int:
-        """
-
+        """Return lenght of the embedded Pandas data frame object.
+
+        This is basically a convenience method.
 
         Returns:
-            int:
+            int:
+                Lenght of the data frame.
+
         """
 
         if self._df is not None:
@@ -106,10 +96,12 @@ class Data:
     # end method definition
 
     def __str__(self) -> str:
-        """Print the
+        """Print the Pandas data frame object.
 
         Returns:
-            str:
+            str:
+                String representation.
+
         """
 
         # if data frame is initialized we return
@@ -121,38 +113,73 @@ class Data:
 
     # end method definition
 
-    def
+    def __getitem__(self, column: str) -> pd.Series:
+        """Return the column corresponding to the key from the data frame.
+
+        Args:
+            column (str): The name of the data frame column.
+
+        Returns:
+            pd.Series: The column of the data frame with the given name.
+
+        """
+
+        return self._df[column]
+
+    # end method definition
+
+    def lock(self) -> threading.Lock:
         """Return the threading lock object.
 
         Returns:
-
+            threading.Lock: The threading lock object.
+
         """
+
         return self._lock
 
     # end method definition
 
     def get_data_frame(self) -> pd.DataFrame:
-        """Get the Pandas
+        """Get the Pandas data frame object.
 
         Returns:
-            pd.DataFrame: Pandas
+            pd.DataFrame: The Pandas data frame object.
+
         """
 
        return self._df
 
     # end method definition
 
-    def set_data_frame(self, df: pd.DataFrame):
-        """Set the Pandas
+    def set_data_frame(self, df: pd.DataFrame) -> None:
+        """Set the Pandas data frame object.
 
         Args:
-            df (pd.DataFrame): Pandas
+            df (pd.DataFrame): The new Pandas data frame object.
+
         """
 
         self._df = df
 
     # end method definition
 
+    def get_columns(self) -> list | None:
+        """Get the list of column names of the data frame.
+
+        Returns:
+            list | None:
+                The list of column names in the data frame.
+
+        """
+
+        if self._df is None:
+            return None
+
+        return self._df.columns
+
+    # end method definition
+
     def print_info(
         self,
         show_size: bool = True,
@@ -163,26 +190,40 @@ class Data:
         show_sample: bool = False,
         show_statistics: bool = False,
         row_num: int = 10,
-    ):
-        """Log information about the data frame
+    ) -> None:
+        """Log information about the data frame.
 
         Args:
-            show_size (bool, optional):
-
-
-
-
-
-
+            show_size (bool, optional):
+                Show size of data frame. Defaults to True.
+            show_info (bool, optional):
+                Show information for data frame. Defaults to False.
+            show_columns (bool, optional):
+                Show columns of data frame. Defaults to False.
+            show_first (bool, optional):
+                Show first N items. Defaults to False. N is defined
+                by the row_num parameter.
+            show_last (bool, optional):
+                Show last N items. Defaults to False. N is defined
+                by the row_num parameter.
+            show_sample (bool, optional):
+                Show N sample items. Defaults to False. N is defined
+                by the row_num parameter.
+            show_statistics (bool, optional):
+                Show data frame statistics. Defaults to False.
+            row_num (int, optional):
+                Used as the number of rows printed using show_first,
+                show_last, show_sample. Default is 10.
+
         """
 
         if self._df is None:
-            logger.warning("Data
+            self.logger.warning("Data frame is not initialized!")
             return
 
         if show_size:
-            logger.info(
-                "Data
+            self.logger.info(
+                "Data frame has %s row(s) and %s column(s)",
                 self._df.shape[0],
                 self._df.shape[1],
             )
@@ -192,39 +233,42 @@ class Data:
             self._df.info()
 
         if show_columns:
-            logger.info("Columns:\n%s", self._df.columns)
-            logger.info(
-                "Columns with number of
+            self.logger.info("Columns:\n%s", self._df.columns)
+            self.logger.info(
+                "Columns with number of NaN values:\n%s",
+                self._df.isna().sum(),
             )
-            logger.info(
-                "Columns with number of non-
-
-            logger.info("Columns with number of NaN values:\n%s", self._df.isna().sum())
-            logger.info(
-                "Columns with number of non-NaN values:\n%s", self._df.notna().sum()
+            self.logger.info(
+                "Columns with number of non-NaN values:\n%s",
+                self._df.notna().sum(),
             )
 
         if show_first:
             # the default for head is n = 5:
-            logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
+            self.logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
 
         if show_last:
             # the default for tail is n = 5:
-            logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
+            self.logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
 
         if show_sample:
             # the default for sample is n = 1:
-            logger.info(
+            self.logger.info(
+                "%s Sample rows:\n%s",
+                str(row_num),
+                self._df.sample(n=row_num),
+            )
 
         if show_statistics:
-            logger.info(
-                "Description of statistics for data frame:\n%s",
+            self.logger.info(
+                "Description of statistics for data frame:\n%s",
+                self._df.describe(),
             )
-            logger.info(
-                "Description of statistics for data frame (
+            self.logger.info(
+                "Description of statistics for data frame (transformed):\n%s",
                 self._df.describe().T,
             )
-            logger.info(
+            self.logger.info(
                 "Description of statistics for data frame (objects):\n%s",
                 self._df.describe(include="object"),
             )
@@ -235,10 +279,13 @@ class Data:
         """Append additional data to the data frame.
 
         Args:
-            add_data (pd.DataFrame | list | dict):
+            add_data (pd.DataFrame | list | dict):
+                Additional data. Can be pd.DataFrame or list of dicts (or Data).
 
         Returns:
-            bool:
+            bool:
+                True = Success, False = Error
+
         """
 
         # Does the data frame has already content?
@@ -250,164 +297,395 @@ class Data:
                 return True
             elif isinstance(add_data, Data):
                 df = add_data.get_data_frame()
-                if df:
+                if df is not None and not df.empty:
                     self._df = pd.concat([self._df, df], ignore_index=True)
                 return True
             elif isinstance(add_data, list):
                 if add_data:
-                    df = Data(add_data)
+                    df = Data(add_data, logger=self.logger)
                     self._df = pd.concat(
-                        [self._df, df.get_data_frame()],
+                        [self._df, df.get_data_frame()],
+                        ignore_index=True,
                     )
                 return True
             elif isinstance(add_data, dict):
                 if add_data:
                     # it is important to wrap the dict in a list to avoid that more than 1 row is created
-                    df = Data([add_data])
+                    df = Data([add_data], logger=self.logger)
                     self._df = pd.concat(
-                        [self._df, df.get_data_frame()],
+                        [self._df, df.get_data_frame()],
+                        ignore_index=True,
                     )
                 return True
             else:
-                logger.error("Illegal data type -> '%s'", type(add_data))
-                return False
-        else: # self._df is None (initial state)
-            if isinstance(add_data, pd.DataFrame):
-                self._df = add_data
-                return True
-            elif isinstance(add_data, Data):
-                self._df = add_data.get_data_frame()
-                return True
-            elif isinstance(add_data, list):
-                self._df = pd.DataFrame(add_data)
-                return True
-            elif isinstance(add_data, dict):
-                # it is important to wrap the dict in a list to avoid that more than 1 row is created
-                self._df = pd.DataFrame([add_data])
-                return True
-            else:
-                logger.error("Illegal data type -> '%s'", type(add_data))
+                self.logger.error("Illegal data type -> '%s'", type(add_data))
                 return False
+        elif isinstance(add_data, pd.DataFrame):
+            self._df = add_data
+            return True
+        elif isinstance(add_data, Data):
+            self._df = add_data.get_data_frame()
+            return True
+        elif isinstance(add_data, list):
+            self._df = pd.DataFrame(add_data)
+            return True
+        elif isinstance(add_data, dict):
+            # it is important to wrap the dict in a list to avoid that more than 1 row is created
+            self._df = pd.DataFrame([add_data])
+            return True
+        else:
+            self.logger.error("Illegal data type -> '%s'", type(add_data))
+            return False
+
+    # end method definition
+
+    def merge(
+        self,
+        merge_data: pd.DataFrame,
+        on: str | list[str] | None = None,
+        how: str = "inner",
+        left_on: str | list[str] | None = None,
+        right_on: str | list[str] | None = None,
+        left_index: bool = False,
+        right_index: bool = False,
+        suffixes: tuple[str, str] = ("_x", "_y"),
+        indicator: bool = False,
+        validate: str | None = None,
+    ) -> pd.DataFrame | None:
+        """Merge the current DataFrame (_df) with another DataFrame.
+
+        Args:
+            merge_data (pd.DataFrame | Data):
+                The DataFrame to merge with.
+            on (str | list[str]):
+                Column(s) to merge on. Defaults to None.
+            how (str, optional):
+                Type of merge ('inner', 'outer', 'left', 'right', 'cross'). Defaults to 'inner'.
+            left_on (str | list[str] | None, optional):
+                Column(s) from self._df to merge on. Defaults to None.
+            right_on (str | list[str] | None, optional):
+                Column(s) from other DataFrame to merge on. Defaults to None.
+            left_index (str | list[str], optional):
+                Whether to merge on the index of self._df. Defaults to False.
+            right_index (bool, optional):
+                Whether to merge on the index of other. Defaults to False.
+            suffixes (tuple[str, str]):
+                Suffixes for overlapping column names. Defaults to ('_x', '_y').
+            indicator (bool, optional):
+                If True, adds a column showing the merge source. Defaults to False.
+            validate ():
+                If provided, checks merge integrity
+                ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many'). Defaults to None.
+
+        Returns:
+            The merged DataFrame or None in case of an error.
+
+        Exceptions:
+            ValueError: If `other` is not a DataFrame.
+            KeyError: If required columns for merging are missing.
+            ValueError: If `validate` check fails.
+
+        """
+
+        if self._df is None or self._df.empty:
+            self._df = merge_data
+
+        if isinstance(merge_data, Data):
+            merge_data = merge_data.get_data_frame()  # Extract DataFrame from Data instance
+
+        try:
+            return self._df.merge(
+                merge_data,
+                how=how,
+                on=on,
+                left_on=left_on,
+                right_on=right_on,
+                left_index=left_index,
+                right_index=right_index,
+                suffixes=suffixes,
+                indicator=indicator,
+                validate=validate,
+            )
+        except KeyError:
+            self.logger.error("Column(s) not found for merging!")
+        except ValueError:
+            self.logger.error("Invalid merge operation!")
+
+        return None
+
+    # end method definition
+
+    def strip(self, columns: list | None = None, inplace: bool = True) -> pd.DataFrame:
+        """Strip leading and trailing spaces from specified columns in a data frame.
+
+        Args:
+            columns (list | None):
+                The list of column names to strip. If None, it strips
+                leading and trailing spaces from _all_ string columns.
+            inplace (bool, optional):
+                If True, the data modification is done in place, i.e.
+                modifying the existing data frame of the object.
+                If False, the data frame is copied and the copy is modified
+                and returned.
+
+        Returns:
+            pd.DataFrame:
+                The modified data frame with stripped columns.
+
+        """
+
+        df = self._df.copy() if not inplace else self._df
+
+        if columns is None:
+            # Strip spaces from all string columns
+            df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
+        else:
+            # Strip spaces from specified columns
+            for col in columns:
+                if col in df.columns and df[col].dtype == "object":  # Check if the column exists and is of string type
+                    df[col] = df[col].str.strip()
+
+        if inplace:
+            self._df = df
+
+        return df
 
     # end method definition
 
-    def load_json_data(
-
+    def load_json_data(
+        self,
+        json_path: str,
+        convert_dates: bool = False,
+        index_column: str | None = None,
+        compression: str | None = None,
+    ) -> bool:
+        """Load JSON data into a Pandas data frame.
 
         Args:
-            json_path (str):
-
+            json_path (str):
+                The path to the JSON file.
+            convert_dates (bool, optional):
+                Defines whether or not dates should be converted.
+                The default is False = dates are NOT converted.
+            index_column (str | None, optional):
+                The Name of the column (i.e. JSON data field) that should
+                become the index in the loaded data frame.
+            compression (str | None):
+                Remove a compression:
+                * gzip (.gz)
+                * bz2 (.bz2)
+                * zip (.zip)
+                * xz (.xz)
+                The value for compression should not include the dot.
+                Default is None = no compression.
+
         Returns:
             bool: False in case an error occured, True otherwise.
+
         """
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not json_path:
+            self.logger.error(
+                "You have not specified a JSON path!",
+            )
+            return False
+
+        # If compression is enabled the file path should have
+        # the matching file name extension:
+        if compression:
+            compression = compression.lstrip(".")  # remove a dot prefix if present
+            suffix = "." + compression if compression != "gzip" else "gz"
+            if not json_path.endswith(suffix):
+                json_path += suffix
+
+        if not os.path.exists(json_path):
+            self.logger.error(
+                "Missing JSON file - you have not specified a valid path -> '%s'.",
+                json_path,
+            )
+            return False
+
+        # Load data from JSON file
+        try:
+            df = pd.read_json(
+                path_or_buf=json_path,
+                convert_dates=convert_dates,
+                compression=compression,
+            )
+
+            if index_column and index_column not in df.columns:
+                self.logger.error(
+                    "Specified index column -> '%s' not found in the JSON data.",
+                    index_column,
                 )
                 return False
-        except PermissionError:
-            logger.error("Permission denied to access the file -> %s.", json_path)
-            return False
-        except IOError as e:
-            logger.error("An I/O error occurred -> %s", str(e))
-            return False
-        except json.JSONDecodeError as e:
-            logger.error("Error: Unable to decode JSON -> %s", str(e))
-            return False
-        except ValueError as e:
-            logger.error("Invalid JSON input -> %s", str(e))
-            return False
-        except AttributeError as e:
-            logger.error("Unexpected data structure -> %s", str(e))
-            return False
-        except TypeError as e:
-            logger.error("Unexpected data type -> %s", str(e))
-            return False
-        except KeyError as e:
-            logger.error("Missing key in JSON data -> %s", str(e))
-            return False
 
-
-
-
+            if index_column:
+                df = df.set_index(keys=index_column)
+            if self._df is None:
+                self._df = df
+            else:
+                self._df = pd.concat([self._df, df])
+            self.logger.info(
+                "After loading JSON file -> '%s', the data frame has %s row(s) and %s column(s)",
+                json_path,
+                self._df.shape[0],
+                self._df.shape[1],
+            )
+        except FileNotFoundError:
+            self.logger.error(
+                "JSON file -> '%s' not found. Please check the file path.",
+                json_path,
+            )
+            return False
+        except PermissionError:
+            self.logger.error(
+                "Missing permission to access the JSON file -> '%s'.",
                 json_path,
             )
             return False
+        except OSError:
+            self.logger.error("An I/O error occurred!")
+            return False
+        except json.JSONDecodeError:
+            self.logger.error(
+                "Unable to decode JSON file -> '%s'",
+                json_path,
+            )
+            return False
+        except ValueError:
+            self.logger.error("Invalid JSON input -> %s", json_path)
+            return False
+        except AttributeError:
+            self.logger.error("Unexpected JSON data structure in file -> %s", json_path)
+            return False
+        except TypeError:
+            self.logger.error("Unexpected JSON data type in file -> %s", json_path)
+            return False
+        except KeyError:
+            self.logger.error("Missing key in JSON data in file -> %s", json_path)
+            return False
+
         return True
 
     # end method definition
 
     def save_json_data(
-        self,
+        self,
+        json_path: str,
+        orient: str = "records",
+        preserve_index: bool = False,
+        index_column: str = "index",
+        compression: str | None = None,
     ) -> bool:
-        """Save JSON data from
+        """Save JSON data from data frame to file.
 
         Args:
-            json_path (str):
-            orient (str, optional):
-
+            json_path (str): The path to where the JSON file should be safed.
+            orient (str, optional):
+                The structure of the JSON. Possible values:
+                * "records" (this is the default)
+                * "columns"
+                * "index"
+                * "table"
+                * "split"
+            preserve_index (bool, optional):
+                Defines if the index column of the data frame should be exported as well.
+                The default is False (index is not exported).
+            index_column (str, optional):
+                The Name of the column (i.e. JSON data field) that should
+                become the index in the loaded data frame. The default is "index".
+            compression (str | None):
+                Apply a compression:
+                * gzip (.gz)
+                * bz2 (.bz2)
+                * zip (.zip)
+                * xz (.xz)
+
         Returns:
-            bool:
+            bool:
+                False in case an error occured, True otherwise.
+
         """
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not json_path:
+            self.logger.error(
+                "You have not specified a JSON path!",
+            )
+            return False
+
+        # If compression is enabled the file path should have
+        # the matching file name extension:
+        if compression:
+            suffix = "." + compression if compression != "gzip" else ".gz"
+            if not json_path.endswith(suffix):
+                json_path += suffix
+
+        # Save data to JSON file
+        try:
+            if self._df is not None:
+                if not os.path.exists(os.path.dirname(json_path)):
+                    os.makedirs(os.path.dirname(json_path), exist_ok=True)
+
+                # index parameter is only allowed if orient has one of the following values:
+                if orient in ("columns", "index", "table", "split"):
+                    self._df.to_json(
+                        path_or_buf=json_path,
+                        index=preserve_index,
+                        orient=orient,
+                        indent=2,
+                        compression=compression,
+                        date_format="iso",
+                    )
+                # In this case we cannot use the index parameter as this would give this error:
+                # Value Error -> 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'
+                # So we create a new column that preserves the original row IDs from the index. The nasme
+
+                elif preserve_index:
+                    df_with_index = self._df.reset_index(
+                        names=index_column,
+                        inplace=False,
+                    )
+                    df_with_index.to_json(
+                        path_or_buf=json_path,
+                        orient=orient,
+                        indent=2,
+                        compression=compression,
+                        date_format="iso",
+                    )
                 else:
-
-
-
-
-
+                    self._df.to_json(
+                        path_or_buf=json_path,
+                        orient=orient,
+                        indent=2,
+                        compression=compression,
+                        date_format="iso",
+                    )
+            else:
+                self.logger.warning(
+                    "Data frame is empty. Cannot write it to JSON file -> '%s'.",
+                    json_path,
                 )
                 return False
-
-
-
-
-
-
-
-
-
-
-        else:
-            logger.error(
-                "Missing JSON file -> '%s' you have not specified a valid path!",
+        except FileNotFoundError:
+            self.logger.error(
+                "File -> '%s' not found. Please check the file path.",
+                json_path,
+            )
+            return False
+        except PermissionError:
+            self.logger.error(
+                "Permission denied to access the file -> '%s'.",
                 json_path,
             )
             return False
+        except OSError:
+            self.logger.error("An I/O error occurred accessing file -> %s", json_path)
+            return False
+        except ValueError:
+            self.logger.error("Value error!")
+            return False
+
         return True
 
     # end method definition
@@ -422,27 +700,40 @@ class Data:
         names: list | None = None,
         na_values: list | None = None,
     ) -> bool:
-        """Load Excel (xlsx) data into
-
+        """Load Excel (xlsx) data into Pandas data frame.
+
+        Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
+        read from a local filesystem or URL. Supports an option to read a
+        single sheet or a list of sheets.
 
         Args:
-            xlsx_path (str):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            xlsx_path (str):
+                The path to the Excel file to load.
+            sheet_names (list | str | int, optional):
+                Name or Index of the sheet in the Excel workbook to load.
+                If 'None' then all sheets will be loaded.
+                If 0 then first sheet in workbook will be loaded (this is the Default).
+                If string then this is interpreted as the name of the sheet to load.
+                If a list is passed, this can be a list of index values (int) or
+                a list of strings with the sheet names to load.
+            usecols (list | str, optional):
+                A list of columns to load, specified by general column names in Excel,
+                e.g. usecols='B:D', usecols=['A', 'C', 'F']
+            skip_rows (int, optional):
+                List of rows to skip on top of the sheet (e.g. to not read headlines)
+            header (int | None, optional):
+                Excel Row (0-indexed) to use for the column labels of the parsed data frame.
+                If file contains no header row, then you should explicitly pass header=None.
+                Default is 0.
+            names (list, optional):
+                A list of column names to use. Default is None.
+            na_values (list, optional):
+                A list of values in the Excel that should become the Pandas NA value.
+
         Returns:
-            bool:
+            bool:
+                False in case an error occured, True otherwise.
+
         """
 
         if xlsx_path is not None and os.path.exists(xlsx_path):
@@ -457,16 +748,21 @@ class Data:
                 names=names,
                 na_values=na_values,
             )
-            #
+            # If multiple sheets from an Excel workbook are loaded,
             # then read_excel() returns a dictionary. The keys are
-            # the names of the sheets and the values are the
-            #
+            # the names of the sheets and the values are the data frames.
+            # As this class can only handle one data frame per object,
+            # We handle this case by concatenating the different sheets.
+            # If you don't want this make sure your Excel workbook has only
+            # one sheet or use the "sheet_name" parameter to select the one(s)
+            # you want to load.
             if isinstance(df, dict):
-                logger.info("Loading multiple Excel sheets from the workbook!")
+                self.logger.info("Loading multiple Excel sheets from the workbook!")
                 multi_sheet_df = pd.DataFrame()
-                for sheet in df
+                for sheet in df:
                     multi_sheet_df = pd.concat(
-                        [multi_sheet_df, df[sheet]],
+                        [multi_sheet_df, df[sheet]],
+                        ignore_index=True,
                     )
                 df = multi_sheet_df
             if self._df is None:
@@ -474,199 +770,390 @@ class Data:
             else:
                 self._df = pd.concat([self._df, df], ignore_index=True)
         except FileNotFoundError:
-            logger.error(
-                "
+            self.logger.error(
+                "Excel file -> '%s' not found. Please check the file path.",
+                xlsx_path,
             )
             return False
         except PermissionError:
-            logger.error(
+            self.logger.error(
+                "Missing permission to access the Excel file -> '%s'.",
+                xlsx_path,
+            )
             return False
-        except
-            logger.error(
+        except OSError:
+            self.logger.error(
+                "An I/O error occurred while reading the Excel file -> '%s'",
+                xlsx_path,
+            )
             return False
-        except ValueError
-            logger.error(
+        except ValueError:
+            self.logger.error(
+                "Invalid Excel input in file -> '%s'",
+                xlsx_path,
+            )
             return False
-        except AttributeError
-            logger.error("Unexpected data structure -> %s",
+        except AttributeError:
+            self.logger.error("Unexpected data structure in file -> %s", xlsx_path)
             return False
-        except TypeError
-            logger.error("Unexpected data type -> %s",
+        except TypeError:
+            self.logger.error("Unexpected data type in file -> %s", xlsx_path)
             return False
-        except KeyError
-            logger.error("Missing key in Excel data -> %s",
+        except KeyError:
+            self.logger.error("Missing key in Excel data in file -> %s", xlsx_path)
             return False
 
         else:
-            logger.error(
-                "Missing Excel file -> '%s'
+            self.logger.error(
+                "Missing Excel file -> '%s'. You have not specified a valid path!",
                 xlsx_path,
             )
             return False
+
         return True
 
     # end method definition
 
     def save_excel_data(
-        self,
+        self,
+        excel_path: str,
+        sheet_name: str = "Pandas Export",
+        index: bool = False,
+        columns: list | None = None,
     ) -> bool:
-        """
-        Save the DataFrame to an Excel file, with robust error handling and logging.
+        """Save the data frame to an Excel file, with robust error handling and logging.
 
         Args:
-            excel_path (str):
-
-
+            excel_path (str):
+                The file path to save the Excel file.
+            sheet_name (str):
+                The sheet name where data will be saved. Default is 'Sheet1'.
+            index (bool, optional):
+                Whether to write the row names (index). Default is False.
+            columns (list | None, optional):
+                A list of column names to write into the excel file.
+
+        Returns:
+            bool:
+                True = success, False = error.
+
         """
+
         try:
             # Check if the directory exists
             directory = os.path.dirname(excel_path)
             if directory and not os.path.exists(directory):
-
-
-
+                os.makedirs(directory)
+
+            # Validate columns if provided
+            if columns:
+                existing_columns = [col for col in columns if col in self._df.columns]
+                missing_columns = set(columns) - set(existing_columns)
+                if missing_columns:
+                    self.logger.warning(
+                        "The following columns do not exist in the data frame and cannot be saved to Excel -> %s",
+                        ", ".join(missing_columns),
+                    )
+                    columns = existing_columns
 
-            # Attempt to save the
-            self._df.to_excel(
-
+            # Attempt to save the data frame to Excel:
+            self._df.to_excel(
+                excel_path,
+                sheet_name=sheet_name,
+                index=index,
+                columns=columns or None,  # Pass None if no columns provided
+            )
+            self.logger.info(
+                "Data frame saved successfully to Excel file -> '%s'.",
+                excel_path,
+            )
 
-        except FileNotFoundError
-            logger.error(
+        except FileNotFoundError:
+            self.logger.error(
+                "Cannot write data frame to Excel file -> '%s'",
+                excel_path,
+            )
             return False
         except PermissionError:
-            logger.error(
-                "
+            self.logger.error(
+                "Cannot write data frame to Excel file -> '%s'",
                 excel_path,
             )
             return False
-        except ValueError
-            logger.error(
-
-
-
+        except ValueError:
+            self.logger.error(
+                "Cannot write data frame to Excel file -> '%s'",
+                excel_path,
+            )
             return False
-        except
-
-
+        except OSError:
+            self.logger.error(
+                "Cannot write data frame to Excel file -> '%s'",
+                excel_path,
+            )
             return False
 
         return True
 
     # end method definition
 
-    def load_csv_data(
-
+    def load_csv_data(
+        self,
+        csv_path: str,
+        delimiter: str = ",",
+        names: list | None = None,
+        header: int | None = 0,
+        usecols: list | None = None,
+        encoding: str = "utf-8",
+    ) -> bool:
+        """Load CSV (Comma separated values) data into data frame.
 
         Args:
-            csv_path (str):
+            csv_path (str):
+                The path to the CSV file.
+            delimiter (str, optional, length = 1):
+                The character used to delimit values. Default is "," (comma).
+            names (list | None, optional):
+                The list of column names. This is useful if file does not have a header line
+                but just the data.
+            header (int | None, optional):
+                The index of the header line. Default is 0 (first line). None indicates
+                that the file does not have a header line
+            usecols (list | None, optional):
+                There are three possible list values types:
+                1. int:
+                   These values are treated as column indices for columns to keep
+                   (first column has index 0).
+                2. str:
+                   The names of the columns to keep. For this to work the file needs
+                   either a header line (i.e. 'header != None') or the 'names'
+                   parameter must be specified.
+                3. bool:
+                   The length of the list must match the number of columns. Only
+                   columns that have a value of True are kept.
+            encoding (str, optional):
+                The encoding of the file. Default = "utf-8".
+
         Returns:
-            bool:
+            bool:
+                False in case an error occured, True otherwise.
+
         """
 
-        if csv_path
-            #
+        if csv_path.startswith("http"):
+            # Download file from remote location specified by the packageUrl
+            # this must be a public place without authentication:
+            self.logger.debug("Download CSV file from URL -> '%s'.", csv_path)
+
             try:
-
-
-
-
-                self._df = pd.concat([self._df, df])
-            except FileNotFoundError:
-                logger.error(
-                    "File -> '%s' not found. Please check the file path.", csv_path
-                )
-                return False
-            except PermissionError:
-                logger.error("Permission denied to access the file -> %s.", csv_path)
+                response = requests.get(url=csv_path, timeout=1200)
+                response.raise_for_status()
+            except requests.exceptions.HTTPError:
+                self.logger.error("HTTP error with -> %s", csv_path)
                 return False
-            except
-                logger.error("
+            except requests.exceptions.ConnectionError:
+                self.logger.error("Connection error with -> %s", csv_path)
                 return False
-            except
-                logger.error("
+            except requests.exceptions.Timeout:
+                self.logger.error("Timeout error with -> %s", csv_path)
                 return False
-            except
-                logger.error("
-                return False
-            except TypeError as e:
-                logger.error("Unexpected data type -> %s", str(e))
-                return False
-            except KeyError as e:
-                logger.error("Missing key in CSV data -> %s", str(e))
+            except requests.exceptions.RequestException:
+                self.logger.error("Request error with -> %s", csv_path)
                 return False
 
-
-
-                "Missing CSV file -> '%s' you have not specified a valid path!",
+            self.logger.debug(
+                "Successfully downloaded CSV file -> %s; status code -> %s",
                 csv_path,
+                response.status_code,
             )
-            return False
-        return True
 
-
+            # Convert bytes to a string using utf-8 and create a file-like object
+            csv_file = StringIO(response.content.decode(encoding))
 
-
-
-
-        """Load XML data into DataFrame
+        elif os.path.exists(csv_path):
+            self.logger.debug("Using local CSV file -> '%s'.", csv_path)
+            csv_file = csv_path
 
-
-
-
-
-
-
-        """
+        else:
+            self.logger.error(
+                "Missing CSV file -> '%s' you have not specified a valid path!",
+                csv_path,
+            )
+            return False
 
+        # Load data from CSV file or buffer
         try:
-            df = pd.
-
+            df = pd.read_csv(
+                filepath_or_buffer=csv_file,
+                delimiter=delimiter,
+                names=names,
+                header=header,
+                usecols=usecols,
+                encoding=encoding,
+                skipinitialspace=True,
+            )
            if self._df is None:
                 self._df = df
             else:
                 self._df = pd.concat([self._df, df])
-            logger.info("XML file loaded successfully!")
-            return True
         except FileNotFoundError:
-
+            self.logger.error(
+                "CSV file -> '%s' not found. Please check the file path.",
+                csv_path,
+            )
             return False
         except PermissionError:
-            logger.error(
+            self.logger.error(
+                "Permission denied to access the CSV file -> '%s'.",
+                csv_path,
+            )
             return False
-        except
-            logger.error("An I/O error occurred
+        except OSError:
+            self.logger.error("An I/O error occurred!")
             return False
-        except ValueError
-            logger.error("Invalid CSV input -> %s",
+        except ValueError:
+            self.logger.error("Invalid CSV input in file -> %s", csv_path)
            return False
-        except AttributeError
-            logger.error("Unexpected data structure -> %s",
+        except AttributeError:
+            self.logger.error("Unexpected data structure in file -> %s", csv_path)
             return False
-        except TypeError
-            logger.error("Unexpected data type -> %s",
+        except TypeError:
+            self.logger.error("Unexpected data type in file -> %s", csv_path)
             return False
-        except KeyError
-            logger.error("Missing key in CSV data -> %s",
+        except KeyError:
+            self.logger.error("Missing key in CSV data -> %s", csv_path)
             return False
 
+        return True
+
     # end method definition
 
-    def
-
+    def load_xml_data(
+        self,
+        xml_path: str,
+        xpath: str | None = None,
+        xslt_path: str | None = None,
+        encoding: str = "utf-8",
+    ) -> bool:
+        """Load XML data into a Pandas data frame.
 
         Args:
-
-
+            xml_path (str):
+                The path to the XML file to load.
+            xpath (str, optional):
+                An XPath to the elements we want to select.
+            xslt_path (str, optional):
+                An XSLT transformation file to convert the XML data.
+            encoding (str, optional):
+                The encoding of the file. Default is UTF-8.
+
+        Returns:
+            bool:
+                False in case an error occured, True otherwise.
+
+        """
+
+        if xml_path.startswith("http"):
+            # Download file from remote location specified by the packageUrl
+            # this must be a public place without authentication:
+            self.logger.debug("Download XML file from URL -> '%s'.", xml_path)
+
+            try:
+                response = requests.get(url=xml_path, timeout=1200)
+                response.raise_for_status()
+            except requests.exceptions.HTTPError:
+                self.logger.error("HTTP error with -> %s", xml_path)
+                return False
+            except requests.exceptions.ConnectionError:
+                self.logger.error("Connection error with -> %s", xml_path)
+                return False
+            except requests.exceptions.Timeout:
+                self.logger.error("Timeout error with -> %s", xml_path)
+                return False
+            except requests.exceptions.RequestException:
+                self.logger.error("Request error with -> %s", xml_path)
+                return False
+
+            self.logger.debug(
+                "Successfully downloaded XML file -> '%s'; status code -> %s",
+                xml_path,
+                response.status_code,
+            )
+            # Convert bytes to a string using utf-8 and create a file-like object
+            xml_file = StringIO(response.content.decode(encoding))
+
+        elif os.path.exists(xml_path):
+            self.logger.debug("Using local XML file -> '%s'.", xml_path)
+            xml_file = xml_path
+
+        else:
+            self.logger.error(
+                "Missing XML file -> '%s'. You have not specified a valid path or URL!",
+                xml_path,
+            )
+            return False
+
+        # Load data from XML file or buffer
+        try:
+            df = pd.read_xml(
+                path_or_buffer=xml_file,
+                xpath=xpath,
+                stylesheet=xslt_path,
+                encoding=encoding,
+            )
+            # Process the loaded data as needed
+            if self._df is None:
+                self._df = df
+            else:
+                self._df = pd.concat([self._df, df])
+            self.logger.info("XML file -> '%s' loaded successfully!", xml_path)
+        except FileNotFoundError:
+            self.logger.error("XML file -> '%s' not found.", xml_path)
+            return False
+        except PermissionError:
+            self.logger.error(
+                "Missing permission to access the XML file -> '%s'.",
+                xml_path,
+            )
+            return False
+        except OSError:
+            self.logger.error("An I/O error occurred loading from -> %s", xml_path)
+            return False
+        except ValueError:
+            self.logger.error("Invalid XML data in file -> %s", xml_path)
+            return False
+        except AttributeError:
+            self.logger.error("Unexpected data structure in XML file -> %s", xml_path)
+            return False
+        except TypeError:
+            self.logger.error("Unexpected data type in XML file -> %s", xml_path)
+            return False
+        except KeyError:
+            self.logger.error("Missing key in XML file -> %s", xml_path)
+            return False
+
+        return True
+
+    # end method definition
+
+    def load_directory(self, path_to_root: str) -> bool:
+        """Load directory structure into Pandas data frame.
+
+        Args:
+            path_to_root (str):
+                Path to the root element of the directory structure.
 
         Returns:
             bool: True = Success, False = Failure
+
         """
 
         try:
             # Check if the provided path is a directory
             if not os.path.isdir(path_to_root):
-                logger.error(
-                    "The provided path -> '%s' is not a valid directory.",
+                self.logger.error(
+                    "The provided path -> '%s' is not a valid directory.",
+                    path_to_root,
                 )
                 return False
 
@@ -682,55 +1169,88 @@ class Data:
                     path_parts = relative_path.split(os.sep)

                     # Create a dictionary with the path parts and file details
-                    entry = {
-
-
-
-
+                    entry = {"level {}".format(i): part for i, part in enumerate(path_parts[:-1], start=1)}
+
+                    entry.update(
+                        {
+                            "filename": path_parts[-1],
+                            "size": file_size,
+                            "path": path_parts[1:-1],
+                            "relative_path": relative_path,
+                            "download_dir": root,
+                        },
+                    )
                     data.append(entry)

-            # Create
+            # Create data frame from list of dictionaries:
             self._df = pd.DataFrame(data)

             # Determine the maximum number of levels
             max_levels = max((len(entry) - 2 for entry in data), default=0)

-            # Ensure all entries have the same number of levels
+            # Ensure all entries have the same number of levels:
             for entry in data:
                 for i in range(1, max_levels + 1):
                     entry.setdefault("level {}".format(i), "")

-            # Convert to
+            # Convert to data frame again to make sure all columns are consistent:
             self._df = pd.DataFrame(data)

-        except NotADirectoryError
-
-
-
-
-
+        except NotADirectoryError:
+            self.logger.error(
+                "Provided path -> '%s' is not a directory!",
+                path_to_root,
+            )
+            return False
+        except FileNotFoundError:
+            self.logger.error(
+                "Provided path -> '%s' does not exist in file system!",
+                path_to_root,
+            )
+            return False
+        except PermissionError:
+            self.logger.error(
+                "Permission error accessing path -> '%s'!",
+                path_to_root,
+            )
+            return False

         return True

     # end method definition

-    def load_xml_directory(
-
+    def load_xml_directory(
+        self,
+        path_to_root: str,
+        xpath: str | None = None,
+        xml_files: list | None = None,
+    ) -> bool:
+        """Load XML files from a directory structure into Pandas data frame.

         Args:
-            path_to_root (str):
-
-            xpath (str, optional):
+            path_to_root (str):
+                Path to the root element of the directory structure.
+            xpath (str, optional):
+                XPath to the XML elements we want to select.
+            xml_files (list | None, optional):
+                Names of the XML files to load from the directory.

         Returns:
-            bool:
+            bool:
+                True = Success, False = Failure
+
         """

+        # Establish a default if None is passed via the parameter:
+        if not xml_files:
+            xml_files = ["docovw.xml"]
+
         try:
             # Check if the provided path is a directory
             if not os.path.isdir(path_to_root):
-                logger.error(
-                    "The provided path -> '%s' is not a valid directory.",
+                self.logger.error(
+                    "The provided path -> '%s' is not a valid directory.",
+                    path_to_root,
                 )
                 return False

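Usage sketch for the load_directory() / load_xml_directory() APIs shown above (illustrative only, not part of the package diff). It assumes Data() can be constructed without arguments and that get_data_frame() returns the wrapped DataFrame; the import path is inferred from the file location pyxecm/helper/data.py:

    from pyxecm.helper.data import Data

    data = Data()
    if data.load_directory(path_to_root="/tmp/downloads"):
        df = data.get_data_frame()
        # One row per file with "level 1".."level N", "filename", "size",
        # "path", "relative_path" and "download_dir" columns:
        print(df.columns.tolist())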
@@ -741,36 +1261,223 @@ class Data:
                     file_size = os.path.getsize(file_path)
                     file_name = os.path.basename(file_path)

-                    if file_name
-                        logger.info(
-                            "Load XML file -> '%s' of size -> %s",
+                    if file_name in xml_files:
+                        self.logger.info(
+                            "Load XML file -> '%s' of size -> %s from -> '%s'...",
+                            file_name,
+                            file_size,
+                            file_path,
                         )
                         success = self.load_xml_data(file_path, xpath=xpath)
                         if success:
-                            logger.info(
-                                "Successfully loaded XML file -> '%s'",
+                            self.logger.info(
+                                "Successfully loaded XML file -> '%s'.",
+                                file_path,
                             )

-        except NotADirectoryError
-            logger.error(
-
-
-
-
+        except NotADirectoryError:
+            self.logger.error(
+                "Provided path -> '%s' is not a directory",
+                path_to_root,
+            )
+            return False
+        except FileNotFoundError:
+            self.logger.error(
+                "Provided path -> '%s' does not exist in file system!",
+                path_to_root,
+            )
+            return False
+        except PermissionError:
+            self.logger.error(
+                "Missing permission to access path -> '%s'",
+                path_to_root,
+            )
+            return False
+
+        return True
+
+    # end method definition
+
+    def load_web_links(
+        self,
+        url: str,
+        common_data: dict | None = None,
+        pattern: str = r"",
+    ) -> list | None:
+        """Get all linked file URLs on a given web page (url) that are following a given pattern.
+
+        Construct a list of dictionaries based on this. This method is a helper method for load_web() below.
+
+        Args:
+            url (str):
+                The web page URL.
+            common_data (dict | None, optional):
+                Fields that should be added to each dictionary item. Defaults to None.
+            pattern (str, optional):
+                Regular Expression. Defaults to r"".
+
+        Returns:
+            list | None:
+                List of links on the web page that are complying with the given regular expression.
+
+        """
+
+        try:
+            response = requests.get(url, timeout=300)
+            response.raise_for_status()
+        except requests.RequestException:
+            self.logger.error("Failed to retrieve page at %s", url)
+            return []
+
+        # Find all file links (hyperlinks) on the page (no file extension assumed)
+        # Example filename pattern: "al022023.public.005"
+        file_links = re.findall(r'href="([^"]+)"', response.text)
+        if not file_links:
+            self.logger.warning("No file links found on the web page -> %s", url)
+            return []
+
+        result_list = []
+        base_url = url if url.endswith("/") else url + "/"
+
+        for link in file_links:
+            data = common_data.copy() if common_data else {}
+
+            # Construct the full URL
+            full_url = base_url + link.lstrip("/")
+
+            if pattern:
+                # Filter by expected naming pattern for links
+                match = re.search(pattern, link)
+                if not match:
+                    continue
+
+                # Extract and assign groups if they exist
+                # TODO(mdiefenb): these names are currently hard-coded
+                # for the National Hurricane Center Dataset (NHC)
+                if len(match.groups()) >= 1:
+                    data["Code"] = match.group(1).upper()
+                if len(match.groups()) >= 2:
+                    data["Type"] = match.group(2)
+                if len(match.groups()) >= 3:
+                    data["Message ID"] = match.group(3)
+
+            data["URL"] = full_url
+            data["Filename"] = link
+
+            result_list.append(data)
+
+        return result_list
+
+    # end method definition
+
+    def load_web(
+        self,
+        values: list,
+        value_name: str,
+        url_templates: list,
+        special_values: list | None = None,
+        special_url_templates: dict | None = None,
+        pattern: str = r"",
+    ) -> bool:
+        """Traverse years and bulletin types to collect all bulletin URLs.
+
+        Args:
+            values (list):
+                List of values to travers over
+            value_name (str):
+                Dictionary key to construct an item in combination with a value from values
+            url_templates (list):
+                URLs to travers per value. The URLs should contain one {} that is
+                replace by the current value.
+            special_values (list | None, optional):
+                List of vales (a subset of the other values list)
+                that we want to handle in a special way. Defaults to None.
+            special_url_templates (dict | None, optional):
+                URLs for the special values. Defaults to None.
+                The dictionary keys are the special values. The
+                dictionary values are lists of special URLs with placeholders.
+            pattern (str, optional):
+                Regular expression to find the proper links on the page. Defaults to r"".
+
+        Returns:
+            bool:
+                True for success, False in case of an error.
+
+        """
+
+        result_list = []
+
+        # We have two nested for loops below. The out traverses over all placeholder values.
+        # These could be the calendar years, e.g. [2003,...,2024]
+        # The inner for loop traverses over the list of specified URLs. We can have multiple for
+        # each value.
+
+        # Do we have a list of placeholder values we want to iterate over?
+        if values:
+            # Traverse all values in the values list:
+            for value in values:
+                # Do we want a special treatment for this value (e.g. the current year)
+                if value in special_values:
+                    self.logger.info("Processing special value -> '%s'...", value)
+                    if value not in special_url_templates and str(value) not in special_url_templates:
+                        self.logger.error(
+                            "Cannot find key -> '%s' in special URL templates dictionary -> %s! Skipping...",
+                            value,
+                            str(special_url_templates),
+                        )
+                        continue
+                    # If the dictionary uses string keys then we need to convert the value
+                    # to a string as well to avoid key errors:
+                    if str(value) in special_url_templates:
+                        value = str(value)
+                    special_url_template_list = special_url_templates[value]
+                    for special_url_template in special_url_template_list:
+                        # Now the value is inserted into the placeholder in the URL:
+                        special_url = special_url_template.format(value)
+                        common_data = {value_name: value} if value_name else None
+                        result_list += self.load_web_links(
+                            url=special_url,
+                            common_data=common_data,
+                            pattern=pattern,
+                        )
+                else:  # normal URLs
+                    self.logger.info("Processing value -> '%s'...", value)
+                    for url_template in url_templates:
+                        # Now the value is inserted into the placeholder in the URL:
+                        url = url_template.format(value)
+                        common_data = {value_name: value} if value_name else None
+                        result_list += self.load_web_links(
+                            url=url,
+                            common_data=common_data,
+                            pattern=pattern,
+                        )
+        else:
+            for url_template in url_templates:
+                url = url_template.format(value)
+                result_list += self.load_web_links(
+                    url=url,
+                    common_data=None,
+                    pattern=pattern,
+                )
+
+        # Add the data list to the data frame:
+        self.append(result_list)

         return True

     # end method definition

     def partitionate(self, number: int) -> list:
-        """Partition a data frame into equally sized
-        partions
+        """Partition a data frame into equally sized partitions.

         Args:
-            number (int):
+            number (int):
+                The number of desired partitions.

         Returns:
-            list:
+            list:
+                A list of created partitions.
+
         """

         # Calculate the approximate size of each partition
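Usage sketch for load_web() / load_web_links() shown above (illustrative only, not part of the diff; the URL and regular expression are made up). A pattern with capture groups fills the hard-coded "Code", "Type" and "Message ID" keys, and value_name adds one extra column per traversed value:

    from pyxecm.helper.data import Data

    data = Data()
    data.load_web(
        values=list(range(2003, 2025)),
        value_name="Year",
        url_templates=["https://example.com/archive/{}/"],  # hypothetical URL
        special_values=[],
        special_url_templates={},
        pattern=r"(al\d{6})\.(public)\.(\d{3})",
    )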
@@ -784,24 +1491,20 @@ class Data:
             number = 1
             remainder = 0

-        logger.info(
-            "Data
+        self.logger.info(
+            "Data frame has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
             str(size),
             str(number),
             str(partition_size),
             str(remainder),
         )

-        # Initialize a list to store partitions
+        # Initialize a list to store partitions:
         partitions = []
         start_index = 0

-        # Slice the
+        # Slice the data frame into equally sized partitions:
         for i in range(number):
-            # start_index = i * partition_size
-            # end_index = (i + 1) * partition_size if i < number - 1 else None
-            # partition = self._df.iloc[start_index:end_index]
-            # partitions.append(partition)
             # Calculate the end index for this partition
             end_index = start_index + partition_size + (1 if i < remainder else 0)
             partition = self._df.iloc[start_index:end_index]
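Sketch of the partition arithmetic implemented above (illustrative only, not part of the diff): the first `remainder` partitions receive one extra row, so 10 rows split into 3 partitions yields sizes 4, 3 and 3. set_data_frame() is assumed to assign the wrapped DataFrame:

    import pandas as pd
    from pyxecm.helper.data import Data

    data = Data()
    data.set_data_frame(pd.DataFrame({"x": range(10)}))
    print([len(part) for part in data.partitionate(3)])  # [4, 3, 3]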
@@ -816,34 +1519,44 @@ class Data:
         """Partition a data frame based on equal values in a specified column.

         Args:
-            column_name (str):
+            column_name (str):
+                The column name to partition by.

         Returns:
-            list
+            list | None:
+                List of partitions or None in case of an error (e.g. column name does not exist).
+
         """

         if column_name not in self._df.columns:
-            logger.error(
-                "
+            self.logger.error(
+                "Cannot partitionate by column -> '%s'. Column does not exist in the data frame. Data frame has these columns -> %s",
                 column_name,
                 str(self._df.columns),
             )
             return None

-        # Separate rows with NaN or None values in the specified column
+        # Separate rows with NaN or None values in the specified column:
         nan_partitions = self._df[self._df[column_name].isna()]
+
+        # Keep only rows where the specified column has valid (non-NaN) values:
         non_nan_df = self._df.dropna(subset=[column_name])

-        # Group by the specified column
+        # Group the non-NaN DataFrame by the specified column's values:
         grouped = non_nan_df.groupby(column_name)
+
+        # Create a list of partitions (DataFrames) for each unique value in the column:
         partitions = [group for _, group in grouped]

-        # Add each row with NaN
-
-
+        # Add each row with NaN/None as its own partition
+        # iterrows() returns each row as a Series. To convert it back to a DataFrame:
+        # 1. .to_frame() turns the Series into a DataFrame, but with the original column names as rows.
+        # 2. .T (transpose) flips it back, turning the original row into a proper DataFrame row.
+        # This ensures that even rows with NaN values are treated as DataFrame partitions.
         partitions.extend([row.to_frame().T for _, row in nan_partitions.iterrows()])

-        logger.info(
-            "Data
+        self.logger.info(
+            "Data frame has been partitioned into -> %s partitions based on the values in column -> '%s'...",
             str(len(partitions)),
             column_name,
         )
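The column-based partitioning above boils down to the following pandas pattern (illustrative sketch, not part of the diff; the method name itself is outside this hunk): group the non-NaN rows by the column and append every NaN row as its own single-row partition:

    import pandas as pd

    df = pd.DataFrame({"team": ["A", "A", "B", None], "value": [1, 2, 3, 4]})
    partitions = [group for _, group in df.dropna(subset=["team"]).groupby("team")]
    partitions.extend(row.to_frame().T for _, row in df[df["team"].isna()].iterrows())
    print(len(partitions))  # 3: the "A" rows, the "B" rows, and the NaN row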
@@ -853,18 +1566,19 @@ class Data:
     # end method definition

     def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
-        """Remove dupclicate rows that have all fields in
-        unique_fields in common.
+        """Remove dupclicate rows that have all fields in unique_fields in common.

         Args:
-            unique_fields (list):
-
-            inplace (bool, optional):
-
+            unique_fields (list):
+                Defines the fields for which we want a unique combination for.
+            inplace (bool, optional):
+                True if the deduplication happens in-place. Defaults to True.
+
         Returns:
-            pd.DataFrame
-
-
+            pd.DataFrame:
+                If inplace is False than a new deduplicatd data frame is returned.
+                Otherwise the object is modified in place and self._df is returned.
+
         """

         if inplace:
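Usage sketch for deduplicate() above (illustrative only, not part of the diff): rows are dropped when all fields listed in unique_fields match:

    import pandas as pd
    from pyxecm.helper.data import Data

    data = Data()
    data.set_data_frame(pd.DataFrame({"name": ["a", "a", "b"], "dept": [1, 1, 2]}))
    data.deduplicate(unique_fields=["name", "dept"])  # keeps two of the three rows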
@@ -878,34 +1592,38 @@ class Data:

     # end method definition

-    def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
-        """Sort the data frame based on one or multiple fields
-
+    def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame | None:
+        """Sort the data frame based on one or multiple fields.
+
+        Sorting can be either in place or return it as a new data frame
+        (e.g. not modifying self._df).

         Args:
-            sort_fields (list):
-
-
+            sort_fields (list):
+                The columns / fields to be used for sorting.
+            inplace (bool, optional):
+                If the sorting should be inplace, i.e. modifying self._df.
+                Defaults to True.
+
         Returns:
-            pd.DataFrame
+            pd.DataFrame | None:
+                New data frame (if inplace = False) or self._df (if inplace = True).
+                None in case of an error.
+
         """

         if self._df is None:
             return None

         if not all(sort_field in self._df.columns for sort_field in sort_fields):
-            logger.warning(
-                "Not all of the given sort fields -> %s do exist in the
+            self.logger.warning(
+                "Not all of the given sort fields -> %s do exist in the data frame.",
                 str(sort_fields),
             )
-            # Reduce the sort fields to those that really exist in the
-            sort_fields = [
-
-
-                if sort_field in self._df.columns
-            ]
-            logger.warning(
-                "Only these given sort fields -> %s do exist as columns in the Data Frame.",
+            # Reduce the sort fields to those that really exist in the data frame:
+            sort_fields = [sort_field for sort_field in sort_fields if sort_field in self._df.columns]
+            self.logger.warning(
+                "Only these given sort fields -> %s do exist as columns in the data frame.",
                 str(sort_fields),
             )

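Usage sketch for sort() above (illustrative only, not part of the diff): sort fields that do not exist as columns are silently dropped from the sort key:

    import pandas as pd
    from pyxecm.helper.data import Data

    data = Data()
    data.set_data_frame(pd.DataFrame({"dept": [2, 1], "name": ["b", "a"]}))
    data.sort(sort_fields=["dept", "name"])                    # modifies the wrapped frame
    copy_df = data.sort(sort_fields=["name"], inplace=False)   # returns a new frame instead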
@@ -920,138 +1638,278 @@ class Data:

     # end method definition

-    def flatten(
-
-
-
-
-
-
-
+    def flatten(self, parent_field: str, flatten_fields: list, concatenator: str = "_") -> None:
+        """Flatten a sub-dictionary by copying selected fields to the parent dictionary.
+
+        This is e.g. useful for then de-duplicate a data frame.
+        To flatten a data frame makes sense in situation when a column used
+        to have a list of dictionaries and got "exploded" (see explode_and_flatten()
+        method below). In this case the column as dictionary values that then can
+        be flattened.

         Args:
-            parent_field (str):
-
-
+            parent_field (str):
+                Name prefix of the new column in the data frame. The flattened field
+                names are added with a leading underscore.
+            flatten_fields (list):
+                Fields in the dictionary of the source column that are copied
+                as new columns into the data frame.
+            concatenator (str, optional):
+                Character or string used to concatenate the parent field with the flattened field
+                to create a unique name.
+
         """

+        # First do a sanity check if the data frame is not yet initialized.
+        if self._df is None:
+            self.logger.error(
+                "The data frame is not initialized or empty. Cannot flatten field(s) -> '%s' in the data frame.",
+                flatten_fields,
+            )
+            return
+
+        if parent_field not in self._df.columns:
+            self.logger.warning(
+                "The parent field -> '%s' cannot be flattened as it doesn't exist as column in the data frame!",
+                parent_field,
+            )
+            return
+
         for flatten_field in flatten_fields:
-            flat_field = parent_field +
+            flat_field = parent_field + concatenator + flatten_field
             # The following expression generates a new column in the
             # data frame with the name of 'flat_field'.
-            # In the
+            # In the lambda function x is a dictionary that includes the subvalues
             # and it returns the value of the given flatten field
             # (if it exists, otherwise None). So x is self._df[parent_field], i.e.
             # what the lambda function gets 'applied' on.
             self._df[flat_field] = self._df[parent_field].apply(
-                lambda x, sub_field=flatten_field: (
-                    x.get(sub_field, None) if isinstance(x, dict) else None
-                )
+                lambda x, sub_field=flatten_field: (x.get(sub_field, None) if isinstance(x, dict) else None),
             )

     # end method definition

     def explode_and_flatten(
         self,
-
+        explode_fields: str | list,
         flatten_fields: list | None = None,
         make_unique: bool = False,
         reset_index: bool = False,
         split_string_to_list: bool = False,
-
-
+        separator: str = ";,",
+    ) -> pd.DataFrame | None:
+        """Explode a substructure in the Pandas data frame.

         Args:
-
-
-
-
-            flatten_fields (list):
-
-
-
+            explode_fields (str | list):
+                Field(s) to explode. Each field to explode should have a list structure.
+                Exploding multiple columns at once is possible. This delivers
+                a very different result compared to exploding one column after the other!
+            flatten_fields (list):
+                Fields in the exploded substructure to include
+                in the main dictionaries for easier processing.
+            make_unique (bool, optional):
+                If True, deduplicate the exploded data frame.
+            reset_index (bool, False):
+                If True, then the index is reset, False = Index is not reset.
+            split_string_to_list (bool, optional):
+                If True flatten the exploded data frame.
+            separator (str, optional):
+                Characters used to split the string values in the given column into a list.
+
         Returns:
-            pd.DataFrame
+            pd.DataFrame | None:
+                Pointer to the Pandas data frame.
+
         """

-        def update_column(row):
-
-
-
-
-
-
-
-
-
+        def update_column(row: pd.Series, sub: str) -> str:
+            """Extract the value of a sub-column from a nested dictionary within a Pandas Series.
+
+            Args:
+                row (pd.Series):
+                    A row from the data frame.
+                sub (str):
+                    The sub-column name to extract.
+
+            Returns:
+                str:
+                    The value of the sub-column, or an empty string if not found.
+
+            """
+
+            if isinstance(row, dict) and sub in row:
+                return row[sub]
+            return ""
+
+        # end def update_column()
+
+        def string_to_list(value: str) -> list:
+            """Convert a string to a list by splitting it using a specified separator.
+
+            If the input is already a list, it is returned as-is. If the input is `None` or a missing value,
+            an empty list is returned. Otherwise, the string is split into a list of substrings using
+            the given separator. Leading and trailing spaces in the resulting substrings are removed.
+
+            Args:
+                value (str):
+                    The input string to be converted into a list. Can also be a list, `None`,
+                    or a missing value (e.g., NaN).
+
+            Returns:
+                list:
+                    A list of substrings if the input is a string, or an empty list if the input
+                    is `None` or a missing value. If the input is already a list, it is returned unchanged.
+
+            """
+
+            # Check if the value is already a list; if so, return it directly
+            if isinstance(value, list):
+                return value
+
+            # If the value is None or a missing value (e.g., NaN), return an empty list
+            if not value or pd.isna(value):
                 return []
-            # Use regular expression to split by comma, semicolon, or comma followed by space
-            return re.split(r"[;,]\s*", str(string))

-
-
-
-
+            # Use a regular expression to split the string by the separator
+            # and remove leading/trailing spaces from each resulting substring
+            return_list = re.split(rf"[{separator}]\s*", str(value))
+
+            return return_list
+
+        # end def string_to_list()
+
+        #
+        # Start of main method:
+        #
+
+        # First do a sanity check if the data frame is not yet initialized.
+        if self._df is None:
+            self.logger.error(
+                "The data frame is not initialized or empty. Cannot explode data frame.",
+            )
+            return None
+
+        # Next do a sanity check for the given explode_field. It should
+        # either be a string (single column name) or a list (multiple column names):
+        if isinstance(explode_fields, list):
+            self.logger.info("Exploding list of columns -> %s", str(explode_fields))
+        elif isinstance(explode_fields, str):
+            self.logger.info("Exploding single column -> '%s'", explode_fields)
         else:
-            logger.error(
-                "Illegal explode field(s) data type
+            self.logger.error(
+                "Illegal explode field(s) data type -> %s. Explode field must either be a string or a list of strings.",
+                type(explode_fields),
             )
             return self._df

-
-
-
+        # Ensure explode_fields is a list for uniform processing:
+        if isinstance(explode_fields, str):
+            explode_fields = [explode_fields]
+
+        # Process nested field names with '.'
+        processed_fields = []
+        for field in explode_fields:
+            # The "." indicates that the column has dictionary values:
+            if "." in field:
+                main, sub = field.split(".", 1)
+                if main not in self._df.columns:
+                    self.logger.error(
+                        "The column -> '%s' does not exist in the data frame! Cannot explode it. Data frame has these columns -> %s",
+                        main,
+                        str(self._df.columns.tolist()),
+                    )
+                    continue
+
+                # Use update_column to extract the dictionary key specified by the sub value:
+                self.logger.info(
+                    "Extracting dictionary value for key -> '%s' from column -> '%s'.",
+                    sub,
+                    main,
+                )
+                self._df[main] = self._df[main].apply(update_column, args=(sub,))
+                processed_fields.append(main)
+            else:
+                processed_fields.append(field)
+
+        # Verify all processed fields exist in the data frame:
+        missing_columns = [col for col in processed_fields if col not in self._df.columns]
+        if missing_columns:
+            self.logger.error(
+                "The following columns are missing in the data frame and cannot be exploded -> %s. Data frame has these columns -> %s",
+                missing_columns,
+                str(self._df.columns.tolist()),
+            )
+            return self._df

+        # Handle splitting strings into lists if required:
+        if split_string_to_list:
+            for field in processed_fields:
+                self.logger.info(
+                    "Splitting strings in column -> '%s' into lists using separator -> '%s'",
+                    field,
+                    separator,
+                )
+                # Apply the function to convert the string values in the column (give by the name in explode_field) to lists
+                # The string_to_list() sub-method above also considers the separator parameter.
+                self._df[field] = self._df[field].apply(string_to_list)
+
+        # Explode all specified columns at once.
+        # explode() can either take a string field or a list of fields.
+        # # It is VERY important to do the explosion of multiple columns together -
+        # otherwise we get combinatorial explosion. Explosion of multiple columns 1-by-1
+        # is VERY different from doing the explosion together!
+        self.logger.info("Validated column(s) to explode -> %s", processed_fields)
         try:
-
-
-
-
-            sub = explode_field.split(".")[1]
-            self._df[main] = self._df[main].apply(update_column)
-            explode_field = main
-            # Explode the field that has list values
-            self._df = self._df.explode(column=explode_field)
-        except KeyError:
-            logger.error("Column -> '%s' not found in Data Frame!", str(explode_field))
+            self._df = self._df.explode(
+                column=processed_fields,
+                ignore_index=reset_index,
+            )
         except ValueError:
-            logger.error(
-                "
+            self.logger.error(
+                "Error exploding columns -> %s",
+                processed_fields,
             )
+            return self._df

         if flatten_fields:
-
+            # Ensure that flatten() is called for each exploded column
+            for field in processed_fields:
+                self.flatten(parent_field=field, flatten_fields=flatten_fields)

+        # Deduplicate rows if required
         if make_unique:
             self._df.drop_duplicates(subset=flatten_fields, inplace=True)

+        # Reset index explicitly if not handled during explode
         if reset_index:
-            self._df.reset_index(inplace=True)
+            self._df.reset_index(drop=True, inplace=True)

         return self._df

     # end method definition

     def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
-        """Drop selected columns from the
+        """Drop selected columns from the Pandas data frame.

         Args:
-            column_names (list):
-
-
+            column_names (list):
+                The list of column names to drop.
+            inplace (bool, optional):
+                Whether or not the dropping should be inplace, i.e. modifying self._df.
+                Defaults to True.
+
         Returns:
-            pd.DataFrame:
+            pd.DataFrame:
+                New data frame (if inplace = False) or self._df (if inplace = True)
+
         """

         if not all(column_name in self._df.columns for column_name in column_names):
-            # Reduce the column names to those that really exist in the
-            column_names = [
-
-
-                if column_name in self._df.columns
-            ]
-            logger.warning(
-                "Reduce to these columns -> %s that do exist in the Data Frame.",
+            # Reduce the column names to those that really exist in the data frame:
+            column_names = [column_name for column_name in column_names if column_name in self._df.columns]
+            self.logger.info(
+                "Drop columns -> %s from the data frame.",
                 str(column_names),
             )

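Usage sketch for explode_and_flatten() above (illustrative only, not part of the diff): a column holding lists of dictionaries is exploded into one row per list element and then flattened into "<column>_<key>" columns via flatten():

    import pandas as pd
    from pyxecm.helper.data import Data

    data = Data()
    data.set_data_frame(
        pd.DataFrame(
            {
                "user": ["u1", "u2"],
                "groups": [
                    [{"id": 1, "name": "admin"}],
                    [{"id": 2, "name": "sales"}, {"id": 3, "name": "hr"}],
                ],
            },
        ),
    )
    # Three rows afterwards, plus flattened "groups_id" and "groups_name" columns:
    data.explode_and_flatten(explode_fields="groups", flatten_fields=["id", "name"], reset_index=True)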
@@ -1065,25 +1923,26 @@ class Data:
     # end method definition

     def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
-        """Keep only selected columns
+        """Keep only selected columns in the data frame. Drop the rest.

         Args:
-            column_names (list):
-
-
+            column_names (list):
+                A list of column names to keep.
+            inplace (bool, optional):
+                If the keeping should be inplace, i.e. modifying self._df.
+                Defaults to True.
+
         Returns:
-            pd.DataFrame:
+            pd.DataFrame:
+                New data frame (if inplace = False) or self._df (if inplace = True).
+
         """

         if not all(column_name in self._df.columns for column_name in column_names):
-            # Reduce the column names to those that really exist in the
-            column_names = [
-
-
-                if column_name in self._df.columns
-            ]
-            logger.warning(
-                "Reduce to these columns -> %s that do exist in the Data Frame.",
+            # Reduce the column names to those that really exist in the data frame:
+            column_names = [column_name for column_name in column_names if column_name in self._df.columns]
+            self.logger.info(
+                "Reduce columns to keep to these columns -> %s that do exist in the data frame.",
                 column_names,
             )

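keep_columns() above is the counterpart of drop_columns() from the previous hunk; both silently reduce the given names to those that actually exist. A minimal sketch, continuing the `data` object from the explode_and_flatten() example above (illustrative only, not part of the diff):

    data.keep_columns(column_names=["user", "groups_name"])          # drops every other column in place
    data.drop_columns(column_names=["groups_name", "no_such_column"])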
@@ -1101,262 +1960,797 @@ class Data:
|
|
|
1101
1960
|
|
|
1102
1961
|
# end method definition
|
|
1103
1962
|
|
|
1104
|
-
def
|
|
1105
|
-
"""
|
|
1963
|
+
def rename_column(self, old_column_name: str, new_column_name: str) -> bool:
|
|
1964
|
+
"""Rename a data frame column.
|
|
1106
1965
|
|
|
1107
1966
|
Args:
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1967
|
+
old_column_name (str):
|
|
1968
|
+
The old name of the column.
|
|
1969
|
+
new_column_name (str):
|
|
1970
|
+
The new name of the column.
|
|
1971
|
+
|
|
1972
|
+
Returns:
|
|
1973
|
+
bool:
|
|
1974
|
+
True = Success, False = Error
|
|
1975
|
+
|
|
1976
|
+
"""
|
|
1977
|
+
|
|
1978
|
+
if self._df is None:
|
|
1979
|
+
return False
|
|
1980
|
+
|
|
1981
|
+
if old_column_name not in self._df.columns:
|
|
1982
|
+
self.logger.error(
|
|
1983
|
+
"Cannot rename column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
1984
|
+
old_column_name,
|
|
1985
|
+
str(self._df.columns),
|
|
1986
|
+
)
|
|
1987
|
+
return False
|
|
1988
|
+
|
|
1989
|
+
if new_column_name in self._df.columns:
|
|
1990
|
+
self.logger.error(
|
|
1991
|
+
"Cannot rename column -> '%s' to -> '%s'. New name does already exist as column in the data frame! Data frame has these columns -> %s",
|
|
1992
|
+
old_column_name,
|
|
1993
|
+
new_column_name,
|
|
1994
|
+
str(self._df.columns),
|
|
1995
|
+
)
|
|
1996
|
+
return False
|
|
1997
|
+
|
|
1998
|
+
self._df.rename(columns={old_column_name: new_column_name}, inplace=True)
|
|
1999
|
+
|
|
2000
|
+
return True
|
|
2001
|
+
|
|
2002
|
+
# end method definition
|
|
2003
|
+
|
|
2004
|
+
def is_dict_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
|
|
2005
|
+
"""Safely checks if a column predominantly contains dictionary-like objects.
|
|
2006
|
+
|
|
2007
|
+
Args:
|
|
2008
|
+
column (pd.Series):
|
|
2009
|
+
The pandas Series (column) to check.
|
|
2010
|
+
threshold (float, optional):
|
|
2011
|
+
0.0 < threshold <= 1.0. Float representation of the percentage.
|
|
2012
|
+
Default = 0.5 (50%).
|
|
2013
|
+
|
|
2014
|
+
Returns:
|
|
2015
|
+
bool:
|
|
2016
|
+
True if the column contains mostly dictionary-like objects, False otherwise.
|
|
2017
|
+
|
|
1134
2018
|
"""
|
|
1135
2019
|
|
|
1136
|
-
|
|
2020
|
+
if not isinstance(column, pd.Series):
|
|
2021
|
+
self.logger.error(
|
|
2022
|
+
"Expected Pandas series, but got -> %s",
|
|
2023
|
+
str(type(column)),
|
|
2024
|
+
)
|
|
2025
|
+
return False
|
|
2026
|
+
if not 0.0 < threshold <= 1.0:
|
|
2027
|
+
self.logger.error(
|
|
2028
|
+
"Threshold must be between 0.0 and 1.0, but got -> %s",
|
|
2029
|
+
str(threshold),
|
|
2030
|
+
)
|
|
2031
|
+
return False
|
|
2032
|
+
|
|
2033
|
+
# Drop null values (NaN or None) and check types of remaining values
|
|
2034
|
+
non_null_values = column.dropna()
|
|
2035
|
+
dict_count = non_null_values.apply(lambda x: isinstance(x, dict)).sum()
|
|
2036
|
+
|
|
2037
|
+
# If more than threshold % of non-null values are dictionaries, return True.
|
|
2038
|
+
# Else return False.
|
|
2039
|
+
return dict_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
|
|
2040
|
+
|
|
2041
|
+
# end method definition
|
|
2042
|
+
|
|
2043
|
+
def is_list_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
|
|
2044
|
+
"""Safely checks if a column predominantly contains list-like objects.
|
|
2045
|
+
|
|
2046
|
+
Args:
|
|
2047
|
+
column (pd.Series):
|
|
2048
|
+
The pandas Series (column) to check.
|
|
2049
|
+
threshold (float, optional):
|
|
2050
|
+
0.0 < threshold <= 1.0. Float representation of the percentage. Default = 0.5 (50%).
|
|
2051
|
+
|
|
2052
|
+
Returns:
|
|
2053
|
+
bool:
|
|
2054
|
+
True if the column contains list-like objects, False otherwise.
|
|
2055
|
+
|
|
2056
|
+
"""
|
|
2057
|
+
|
|
2058
|
+
if not isinstance(column, pd.Series):
|
|
2059
|
+
self.logger.error(
|
|
2060
|
+
"Expected pandas series, but got -> %s",
|
|
2061
|
+
str(type(column)),
|
|
2062
|
+
)
|
|
2063
|
+
return False
|
|
2064
|
+
if not 0.0 < threshold <= 1.0:
|
|
2065
|
+
self.logger.error(
|
|
2066
|
+
"Threshold must be between 0.0 and 1.0, but got -> %s",
|
|
2067
|
+
str(threshold),
|
|
2068
|
+
)
|
|
2069
|
+
return False
|
|
2070
|
+
|
|
2071
|
+
# Drop null values (NaN or None) and check types of remaining values
|
|
2072
|
+
non_null_values = column.dropna()
|
|
2073
|
+
list_count = non_null_values.apply(lambda x: isinstance(x, list)).sum()
|
|
2074
|
+
|
|
2075
|
+
# If more than threshold % of non-null values are lists, return True.
|
|
2076
|
+
# Else return False.
|
|
2077
|
+
return list_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
|
|
2078
|
+
|
|
2079
|
+
# end method definition
|
|
2080
|
+
|
|
2081
|
+
def is_string_column(self, column: pd.Series) -> bool:
|
|
2082
|
+
"""Determine if a Pandas series predominantly contains string values, ignoring NaN values.
|
|
2083
|
+
|
|
2084
|
+
Args:
|
|
2085
|
+
column (pd.Series):
|
|
2086
|
+
The Pandas Series to check.
|
|
2087
|
+
|
|
2088
|
+
Returns:
|
|
2089
|
+
bool:
|
|
2090
|
+
True if all non-NaN values in the column are strings, False otherwise.
|
|
2091
|
+
|
|
2092
|
+
"""
|
|
2093
|
+
|
|
2094
|
+
# Drop NaN values and check if remaining values are strings
|
|
2095
|
+
return column.dropna().map(lambda x: isinstance(x, str)).all()
|
|
2096
|
+
|
|
2097
|
+
# end method definition
|
|
2098
|
+
|
|
2099
|
+
def cleanse(self, cleansings: dict) -> None:
|
|
2100
|
+
"""Cleanse data with regular expressions and upper/lower case conversions.
|
|
2101
|
+
|
|
2102
|
+
Args:
|
|
2103
|
+
cleansings (dict):
|
|
2104
|
+
Dictionary with keys that equal the column names.
|
|
2105
|
+
The dictionary values are dictionaries themselves with
|
|
2106
|
+
these fields:
|
|
2107
|
+
* replacements (dict): name of a column in the data frame
|
|
2108
|
+
* upper (bool, optional, default = False): change the value to uppercase
|
|
2109
|
+
* lower (bool, optional, default = False): change the value to lowercase
|
|
2110
|
+
* capitalize (bool, optional, default = False) - first character upper case, rest lower-case
|
|
2111
|
+
* title (bool, optional, default = False) - first character of each word upper case
|
|
2112
|
+
* length (int, optional, default = 0): truncate to max length
|
|
2113
|
+
|
|
2114
|
+
"""
|
|
2115
|
+
|
|
2116
|
+
# Iterate over each column in the cleansing dictionary
|
|
1137
2117
|
for column, cleansing in cleansings.items():
|
|
1138
|
-
#
|
|
1139
|
-
|
|
2118
|
+
# Read the cleansing parameters:
|
|
2119
|
+
replacements = cleansing.get("replacements", {})
|
|
2120
|
+
upper = cleansing.get("upper", False)
|
|
2121
|
+
lower = cleansing.get("lower", False)
|
|
2122
|
+
capitalize = cleansing.get("capitalize", False)
|
|
2123
|
+
title = cleansing.get("title", False)
|
|
2124
|
+
length = cleansing.get("length", 0)
|
|
2125
|
+
|
|
2126
|
+
# Handle dict columns - we expect the column name to seperate
|
|
2127
|
+
# main field from sub field using a dot syntax (e.g., "column.subfield")
|
|
1140
2128
|
if "." in column:
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
2129
|
+
column, dict_key = column.split(".")
|
|
2130
|
+
if column not in self._df.columns:
|
|
2131
|
+
self.logger.error(
|
|
2132
|
+
"Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
2133
|
+
column,
|
|
2134
|
+
str(self._df.columns),
|
|
2135
|
+
)
|
|
1144
2136
|
continue
|
|
1145
|
-
#
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
2137
|
+
# Apply cleansing to dictionary values in the main column
|
|
2138
|
+
self.logger.info(
|
|
2139
|
+
"Cleansing for column -> '%s' has a subfield -> '%s' configured. Do cleansing for dictionary items with key -> '%s'...",
|
|
2140
|
+
column,
|
|
2141
|
+
dict_key,
|
|
2142
|
+
dict_key,
|
|
2143
|
+
)
|
|
2144
|
+
self._df[column] = self._df[column].apply(
|
|
2145
|
+
lambda x,
|
|
2146
|
+
dict_key=dict_key,
|
|
2147
|
+
replacements=replacements,
|
|
2148
|
+
upper=upper,
|
|
2149
|
+
lower=lower,
|
|
2150
|
+
capitalize=capitalize,
|
|
2151
|
+
title=title,
|
|
2152
|
+
length=length: self._cleanse_subfield(
|
|
1149
2153
|
data=x,
|
|
1150
|
-
|
|
1151
|
-
replacements=
|
|
1152
|
-
upper=
|
|
1153
|
-
lower=
|
|
1154
|
-
|
|
1155
|
-
|
|
2154
|
+
dict_key=dict_key,
|
|
2155
|
+
replacements=replacements,
|
|
2156
|
+
upper=upper,
|
|
2157
|
+
lower=lower,
|
|
2158
|
+
capitalize=capitalize,
|
|
2159
|
+
title=title,
|
|
2160
|
+
length=length,
|
|
2161
|
+
),
|
|
1156
2162
|
)
|
|
1157
|
-
|
|
1158
|
-
|
|
2163
|
+
# end if "." in column
|
|
2164
|
+
else: # the else case handles strings and list columns
|
|
2165
|
+
if column not in self._df.columns:
|
|
2166
|
+
self.logger.error(
|
|
2167
|
+
"Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
2168
|
+
column,
|
|
2169
|
+
str(self._df.columns),
|
|
2170
|
+
)
|
|
1159
2171
|
continue
|
|
1160
2172
|
|
|
1161
|
-
|
|
2173
|
+
# Handle string columns:
|
|
2174
|
+
if self.is_string_column(self._df[column]):
|
|
2175
|
+
# Apply cleansing operations on string column
|
|
2176
|
+
self.logger.info(
|
|
2177
|
+
"Column -> '%s' has string values. Do cleansing for string values...",
|
|
2178
|
+
column,
|
|
2179
|
+
)
|
|
2180
|
+
self._df[column] = self._df[column].apply(
|
|
2181
|
+
lambda x,
|
|
2182
|
+
replacements=replacements,
|
|
2183
|
+
upper=upper,
|
|
2184
|
+
lower=lower,
|
|
2185
|
+
capitalize=capitalize,
|
|
2186
|
+
title=title,
|
|
2187
|
+
length=length: (
|
|
2188
|
+
self._apply_string_cleansing(
|
|
2189
|
+
value=x,
|
|
2190
|
+
replacements=replacements,
|
|
2191
|
+
upper=upper,
|
|
2192
|
+
lower=lower,
|
|
2193
|
+
capitalize=capitalize,
|
|
2194
|
+
title=title,
|
|
2195
|
+
length=length,
|
|
2196
|
+
)
|
|
2197
|
+
if isinstance(x, str)
|
|
2198
|
+
else x
|
|
2199
|
+
),
|
|
2200
|
+
)
|
|
1162
2201
|
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
2202
|
+
# Handle list columns:
|
|
2203
|
+
elif self.is_list_column(self._df[column]):
|
|
2204
|
+
# Handle list-like columns for this we iterate over each list item
|
|
2205
|
+
# and apply the cleansing by calling _apply_string_cleansing() for item:
|
|
2206
|
+
self.logger.info(
|
|
2207
|
+
"Column -> '%s' has list values. Do cleansing for each list item...",
|
|
2208
|
+
column,
|
|
2209
|
+
)
|
|
2210
|
+
self._df[column] = self._df[column].apply(
|
|
2211
|
+
lambda x,
|
|
2212
|
+
replacements=replacements,
|
|
2213
|
+
upper=upper,
|
|
2214
|
+
lower=lower,
|
|
2215
|
+
capitalize=capitalize,
|
|
2216
|
+
title=title,
|
|
2217
|
+
length=length: (
|
|
2218
|
+
[
|
|
2219
|
+
(
|
|
2220
|
+
self._apply_string_cleansing(
|
|
2221
|
+
value=item,
|
|
2222
|
+
replacements=replacements,
|
|
2223
|
+
upper=upper,
|
|
2224
|
+
lower=lower,
|
|
2225
|
+
capitalize=capitalize,
|
|
2226
|
+
title=title,
|
|
2227
|
+
length=length,
|
|
2228
|
+
)
|
|
2229
|
+
if isinstance(
|
|
2230
|
+
item,
|
|
2231
|
+
str,
|
|
2232
|
+
) # we just change string list items
|
|
2233
|
+
else item
|
|
2234
|
+
)
|
|
2235
|
+
for item in x
|
|
2236
|
+
]
|
|
2237
|
+
if isinstance(x, list)
|
|
2238
|
+
else x
|
|
2239
|
+
),
|
|
2240
|
+
)
|
|
1167
2241
|
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
# if replacement:
|
|
1173
|
-
# \b is a word boundary anchor in regular expressions.
|
|
1174
|
-
# It matches a position where one side is a word character
|
|
1175
|
-
# (like a letter or digit) and the other side is a non-word character
|
|
1176
|
-
# (like whitespace or punctuation). It's often used to match whole words.
|
|
1177
|
-
# regex_pattern = rf"\b{regex_pattern}\b"
|
|
1178
|
-
# self._df[column] = self._df[column].replace(
|
|
1179
|
-
# regex=regex_pattern, value=replacement
|
|
1180
|
-
# )
|
|
1181
|
-
self._df[column] = self._df[column].str.replace(
|
|
1182
|
-
pat=regex_pattern, repl=replacement, regex=True
|
|
2242
|
+
else:
|
|
2243
|
+
self.logger.error(
|
|
2244
|
+
"Column -> '%s' is not a string, list, or dict-like column. Skipping cleansing...",
|
|
2245
|
+
column,
|
|
1183
2246
|
)
|
|
2247
|
+
# end else handling strings and lists
|
|
2248
|
+
# for column, cleansing in cleansings.items()
|
|
2249
|
+
|
|
2250
|
+
# end method definition
|
|
2251
|
+
|
|
2252
|
+
def _cleanse_dictionary(
|
|
2253
|
+
self,
|
|
2254
|
+
data: dict,
|
|
2255
|
+
dict_key: str,
|
|
2256
|
+
replacements: dict[str, str],
|
|
2257
|
+
upper: bool,
|
|
2258
|
+
lower: bool,
|
|
2259
|
+
capitalize: bool = False,
|
|
2260
|
+
title: bool = False,
|
|
2261
|
+
length: int = 0,
|
|
2262
|
+
) -> dict:
|
|
2263
|
+
"""Cleanse dictionary data within a single column value that has a given key.
|
|
2264
|
+
|
|
2265
|
+
Args:
|
|
2266
|
+
data (dict):
|
|
2267
|
+
The column dictionary value.
|
|
2268
|
+
dict_key (str):
|
|
2269
|
+
The dictionary key whose value should be cleansed in the row to cleanse.
|
|
2270
|
+
replacements (dict):
|
|
2271
|
+
Dictionary of regex replacements to apply to the subfield value.
|
|
2272
|
+
upper (bool):
|
|
2273
|
+
If True, convert value in subfield to upper-case.
|
|
2274
|
+
lower (bool):
|
|
2275
|
+
If True, convert value in subfield to lower-case.
|
|
2276
|
+
capitalize (bool, optional):
|
|
2277
|
+
If True, capitalize the first letter of the subfield value.
|
|
2278
|
+
title (bool, optional):
|
|
2279
|
+
If True, title-case the subfield value.
|
|
2280
|
+
length (int, optional):
|
|
2281
|
+
The maximum length for the subfield value.
|
|
2282
|
+
|
|
2283
|
+
Returns:
|
|
2284
|
+
dict:
|
|
2285
|
+
The updated data with the cleansing applied to the dictionary item with the given key.
|
|
2286
|
+
|
|
2287
|
+
"""
|
|
2288
|
+
|
|
2289
|
+
if pd.isna(data):
|
|
2290
|
+
return data
|
|
1184
2291
|
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
2292
|
+
if dict_key not in data:
|
|
2293
|
+
self.logger.warning(
|
|
2294
|
+
"The dictionary key -> '%s' (field) is not in the data frame row! Cleansing skipped!",
|
|
2295
|
+
dict_key,
|
|
2296
|
+
)
|
|
2297
|
+
return data
|
|
2298
|
+
|
|
2299
|
+
# 1. Read the value to be cleansed from the data dict:
|
|
2300
|
+
value = data[dict_key]
|
|
2301
|
+
|
|
2302
|
+
# 2. Apply string operations based on the type of the value (str, list, or dict)
|
|
2303
|
+
|
|
2304
|
+
if isinstance(value, str):
|
|
2305
|
+
# If the value is a string, apply the string operations directly
|
|
2306
|
+
value: str = self._apply_string_cleansing(
|
|
2307
|
+
value=value,
|
|
2308
|
+
replacements=replacements,
|
|
2309
|
+
upper=upper,
|
|
2310
|
+
lower=lower,
|
|
2311
|
+
capitalize=capitalize,
|
|
2312
|
+
title=title,
|
|
2313
|
+
length=length,
|
|
2314
|
+
)
|
|
2315
|
+
elif isinstance(value, list):
|
|
2316
|
+
# If the value is a list, apply string operations to each element
|
|
2317
|
+
value: list = [
|
|
2318
|
+
(
|
|
2319
|
+
self._apply_string_cleansing(
|
|
2320
|
+
value=item,
|
|
2321
|
+
replacements=replacements,
|
|
2322
|
+
upper=upper,
|
|
2323
|
+
lower=lower,
|
|
2324
|
+
capitalize=capitalize,
|
|
2325
|
+
title=title,
|
|
2326
|
+
length=length,
|
|
1191
2327
|
)
|
|
2328
|
+
if isinstance(item, str)
|
|
2329
|
+
else item
|
|
2330
|
+
)
|
|
2331
|
+
for item in value
|
|
2332
|
+
]
|
|
2333
|
+
elif isinstance(value, dict):
|
|
2334
|
+
# If the value is a dictionary, apply string operations to each value
|
|
2335
|
+
value: dict = {
|
|
2336
|
+
k: (
|
|
2337
|
+
self._apply_string_cleansing(
|
|
2338
|
+
value=v,
|
|
2339
|
+
replacements=replacements,
|
|
2340
|
+
upper=upper,
|
|
2341
|
+
lower=lower,
|
|
2342
|
+
capitalize=capitalize,
|
|
2343
|
+
title=title,
|
|
2344
|
+
length=length,
|
|
2345
|
+
)
|
|
2346
|
+
if isinstance(v, str)
|
|
2347
|
+
else v
|
|
2348
|
+
)
|
|
2349
|
+
for k, v in value.items()
|
|
2350
|
+
}
|
|
1192
2351
|
|
|
1193
|
-
|
|
2352
|
+
# 3. Write back the cleansed value to the data dict:
|
|
2353
|
+
data[dict_key] = value
|
|
2354
|
+
|
|
2355
|
+
return data
|
|
1194
2356
|
|
|
1195
2357
|
# end method definition
|
|
1196
2358
|
|
|
1197
2359
|
def _cleanse_subfield(
|
|
1198
2360
|
self,
|
|
1199
|
-
data:
|
|
1200
|
-
|
|
1201
|
-
replacements: dict,
|
|
2361
|
+
data: dict | list,
|
|
2362
|
+
dict_key: str,
|
|
2363
|
+
replacements: dict[str, str],
|
|
1202
2364
|
upper: bool,
|
|
1203
2365
|
lower: bool,
|
|
2366
|
+
capitalize: bool = False,
|
|
2367
|
+
title: bool = False,
|
|
1204
2368
|
length: int = 0,
|
|
1205
|
-
) ->
|
|
1206
|
-
"""
|
|
2369
|
+
) -> dict | list:
|
|
2370
|
+
"""Cleanse subfield data within a single column value.
|
|
2371
|
+
|
|
2372
|
+
This is NOT a pd.Series but either a dictionary or a list of dictionaries.
|
|
1207
2373
|
|
|
1208
2374
|
Args:
|
|
1209
|
-
data (
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
2375
|
+
data (dict | list):
|
|
2376
|
+
The column value. Can be a dictionary or a list of dictionaries
|
|
2377
|
+
dict_key (str):
|
|
2378
|
+
The dictionary key whose value should be cleansed in the data to cleanse.
|
|
2379
|
+
replacements (dict):
|
|
2380
|
+
Dictionary of regex replacements to apply to the subfield value.
|
|
2381
|
+
upper (bool):
|
|
2382
|
+
If True, convert value in subfield to upper-case.
|
|
2383
|
+
lower (bool):
|
|
2384
|
+
If True, convert value in subfield to lower-case.
|
|
2385
|
+
capitalize (bool, optional):
|
|
2386
|
+
If True, capitalize the first letter of the subfield value.
|
|
2387
|
+
title (bool, optional):
|
|
2388
|
+
If True, title-case the subfield value.
|
|
2389
|
+
length (int, optional):
|
|
2390
|
+
The maximum length for the subfield value.
|
|
2391
|
+
|
|
1215
2392
|
Returns:
|
|
1216
|
-
|
|
2393
|
+
dict | list:
|
|
2394
|
+
The updated data with the cleansing applied to the subfield.
|
|
2395
|
+
|
|
1217
2396
|
"""
|
|
1218
2397
|
|
|
1219
2398
|
if isinstance(data, list):
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
for regex_pattern, replacement in replacements.items():
|
|
1232
|
-
if replacement:
|
|
1233
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1234
|
-
item[sub_field] = re.sub(
|
|
1235
|
-
regex_pattern, replacement, item[sub_field]
|
|
1236
|
-
)
|
|
1237
|
-
if length > 0:
|
|
1238
|
-
item[sub_field] = item[sub_field][:length]
|
|
1239
|
-
data[i] = item
|
|
1240
|
-
elif isinstance(data, dict):
|
|
1241
|
-
# If data is a dictionary, apply cleansing directly to the subfield
|
|
1242
|
-
if sub_field in data and not pd.isnull(data[sub_field]):
|
|
1243
|
-
if upper:
|
|
1244
|
-
data[sub_field] = data[sub_field].upper()
|
|
1245
|
-
elif lower:
|
|
1246
|
-
data[sub_field] = data[sub_field].lower()
|
|
1247
|
-
for regex_pattern, replacement in replacements.items():
|
|
1248
|
-
if replacement:
|
|
1249
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1250
|
-
data[sub_field] = re.sub(
|
|
1251
|
-
regex_pattern, replacement, data[sub_field]
|
|
2399
|
+
data = [
|
|
2400
|
+
(
|
|
2401
|
+
self._cleanse_dictionary(
|
|
2402
|
+
data=item,
|
|
2403
|
+
dict_key=dict_key,
|
|
2404
|
+
replacements=replacements,
|
|
2405
|
+
upper=upper,
|
|
2406
|
+
lower=lower,
|
|
2407
|
+
capitalize=capitalize,
|
|
2408
|
+
title=title,
|
|
2409
|
+
length=length,
|
|
1252
2410
|
)
|
|
1253
|
-
|
|
1254
|
-
|
|
2411
|
+
if item is not None and dict_key in item and not pd.isna(item[dict_key])
|
|
2412
|
+
else item
|
|
2413
|
+
)
|
|
2414
|
+
for item in data
|
|
2415
|
+
]
|
|
2416
|
+
elif isinstance(data, dict):
|
|
2417
|
+
data = self._cleanse_dictionary(
|
|
2418
|
+
data=data,
|
|
2419
|
+
dict_key=dict_key,
|
|
2420
|
+
replacements=replacements,
|
|
2421
|
+
upper=upper,
|
|
2422
|
+
lower=lower,
|
|
2423
|
+
capitalize=capitalize,
|
|
2424
|
+
title=title,
|
|
2425
|
+
length=length,
|
|
2426
|
+
)
|
|
2427
|
+
|
|
1255
2428
|
return data
|
|
1256
2429
|
|
|
1257
2430
|
# end method definition
|
|
1258
2431
|
|
|
1259
|
-
def
|
|
1260
|
-
|
|
2432
|
+
def _apply_string_cleansing(
|
|
2433
|
+
self,
|
|
2434
|
+
value: str,
|
|
2435
|
+
replacements: dict[str, str],
|
|
2436
|
+
upper: bool,
|
|
2437
|
+
lower: bool,
|
|
2438
|
+
capitalize: bool,
|
|
2439
|
+
title: bool,
|
|
2440
|
+
length: int,
|
|
2441
|
+
) -> str | None:
|
|
2442
|
+
"""Apply string operations (upper, lower, capitalize, title-case, replacements) to a string.
|
|
1261
2443
|
|
|
1262
2444
|
Args:
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
2445
|
+
value (str):
|
|
2446
|
+
The string value to which the operations will be applied.
|
|
2447
|
+
replacements (dict[str, str]):
|
|
2448
|
+
A dictionary of regular expression patterns (keys) and replacement strings (values) to apply to the string.
|
|
2449
|
+
upper (bool):
|
|
2450
|
+
If True, convert the string to uppercase.
|
|
2451
|
+
lower (bool):
|
|
2452
|
+
If True, convert the string to lowercase.
|
|
2453
|
+
capitalize (bool):
|
|
2454
|
+
If True, capitalize the first letter of the string and lowercase the rest. Default is False.
|
|
2455
|
+
title (bool):
|
|
2456
|
+
If True, convert the string to title-case (first letter of each word is capitalized). Default is False.
|
|
2457
|
+
length (int):
|
|
2458
|
+
If greater than 0, truncate the string to this length. Default is 0 (no truncation).
|
|
2459
|
+
|
|
1274
2460
|
Returns:
|
|
1275
|
-
|
|
2461
|
+
str | None:
|
|
2462
|
+
The updated string with all the applied operations. None in case an error occured.
|
|
2463
|
+
|
|
2464
|
+
Example:
|
|
2465
|
+
value = "hello world"
|
|
2466
|
+
replacements = {r"world": "there"}
|
|
2467
|
+
upper = True
|
|
2468
|
+
length = 5
|
|
2469
|
+
|
|
2470
|
+
result = _apply_string_cleansing(value, replacements, upper, length=length)
|
|
2471
|
+
# result would be "HELLO"
|
|
2472
|
+
|
|
2473
|
+
"""
|
|
2474
|
+
|
|
2475
|
+
if not isinstance(
|
|
2476
|
+
value,
|
|
2477
|
+
str,
|
|
2478
|
+
): # Only apply string operations if the value is a string
|
|
2479
|
+
return None
|
|
2480
|
+
|
|
2481
|
+
if upper:
|
|
2482
|
+
value = value.upper()
|
|
2483
|
+
if lower:
|
|
2484
|
+
value = value.lower()
|
|
2485
|
+
if capitalize:
|
|
2486
|
+
value = value.capitalize()
|
|
2487
|
+
if title:
|
|
2488
|
+
value = value.title()
|
|
2489
|
+
|
|
2490
|
+
# Handle regex replacements
|
|
2491
|
+
for regex_pattern, replacement in replacements.items():
|
|
2492
|
+
if regex_pattern:
|
|
2493
|
+
# Check if the pattern does NOT contain any regex special characters
|
|
2494
|
+
# (excluding dot and ampersand) and ONLY then use \b ... \b
|
|
2495
|
+
# Special regexp characters include: ^ $ * + ? ( ) | [ ] { } \
|
|
2496
|
+
if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
|
|
2497
|
+
# Wrap with word boundaries for whole-word matching
|
|
2498
|
+
# \b is a word boundary anchor in regular expressions.
|
|
2499
|
+
# It matches a position where one side is a word character
|
|
2500
|
+
# (like a letter or digit) and the other side is a non-word character
|
|
2501
|
+
# (like whitespace or punctuation). It's used to match whole words.
|
|
2502
|
+
# We want to have this to e.g. not replace "INT" with "INTERNATIONAL"
|
|
2503
|
+
# if the word is already "INTERNATIONAL". It is important
|
|
2504
|
+
# that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
|
|
2505
|
+
# a regular expression but just a normal string.
|
|
2506
|
+
# TODO: we may reconsider if re.escape() is required or not:
|
|
2507
|
+
regex_pattern = re.escape(regex_pattern)
|
|
2508
|
+
regex_pattern = rf"\b{regex_pattern}\b"
|
|
2509
|
+
try:
|
|
2510
|
+
value = re.sub(regex_pattern, replacement, value)
|
|
2511
|
+
except re.error:
|
|
2512
|
+
self.logger.error(
|
|
2513
|
+
"Invalid regex pattern -> '%s' in replacement processing!",
|
|
2514
|
+
regex_pattern,
|
|
2515
|
+
)
|
|
2516
|
+
continue
|
|
2517
|
+
|
|
2518
|
+
# Truncate to the specified length, starting from index 0
|
|
2519
|
+
if 0 < length < len(value):
|
|
2520
|
+
value = value[:length]
|
|
2521
|
+
|
|
2522
|
+
return value
|
|
2523
|
+
|
|
2524
|
+
# end method definition
|
|
2525
|
+
|
|
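A minimal usage sketch of the cleansing order implemented above (case conversion first, then whole-word regex replacements, then truncation). Here `data` stands for an assumed Data instance and the sample value and replacement map are invented for illustration; the private helper is called directly only to show the effect:

# upper=True first gives "ACME INT DIVISION"; the plain pattern "INT" is wrapped in \b...\b,
# so only the whole word is replaced -> "ACME INTERNATIONAL DIVISION"; length=20 then truncates.
result = data._apply_string_cleansing(
    value="acme INT division",
    replacements={"INT": "INTERNATIONAL"},
    upper=True, lower=False, capitalize=False, title=False,
    length=20,
)
# result: "ACME INTERNATIONAL D"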
2526
|
+
def filter(
|
|
2527
|
+
self,
|
|
2528
|
+
conditions: list,
|
|
2529
|
+
inplace: bool = True,
|
|
2530
|
+
reset_index: bool = True,
|
|
2531
|
+
) -> pd.DataFrame | None:
|
|
2532
|
+
"""Filter the data frame based on (multiple) conditions.
|
|
2533
|
+
|
|
2534
|
+
Args:
|
|
2535
|
+
conditions (list):
|
|
2536
|
+
Conditions are a list of dictionaries with the following items:
|
|
2537
|
+
* field (str): The name of a column in the data frame
|
|
2538
|
+
* value (str or list):
|
|
2539
|
+
Expected value (filter criterion).
|
|
2540
|
+
If it is a list then one of the list elements must match the field value (OR)
|
|
2541
|
+
* equal (bool):
|
|
2542
|
+
Whether to test for equality or inequality. If not specified, equal is treated as True.
|
|
2543
|
+
* regex (bool):
|
|
2544
|
+
This flag controls if the value is interpreted as a
|
|
2545
|
+
regular expression. If there is no regex item in the
|
|
2546
|
+
dictionary, then the default is False (= value is NOT a regex).
|
|
2547
|
+
* enabled (bool):
|
|
2548
|
+
True or False. The filter is only applied if 'enabled = True'
|
|
2549
|
+
If there are multiple conditions in the list, each has to evaluate to True (AND).
|
|
2550
|
+
inplace (bool, optional):
|
|
2551
|
+
Defines whether self._df is modified (in place) or just
|
|
2552
|
+
a new data frame is returned. Defaults to True.
|
|
2553
|
+
reset_index (bool, optional):
|
|
2554
|
+
Filtering removes rows. If reset_index = True then the numbering
|
|
2555
|
+
of the index is newly calculated
|
|
2556
|
+
|
|
2557
|
+
Returns:
|
|
2558
|
+
pd.DataFrame | None:
|
|
2559
|
+
A new data frame or pointer to self._df (depending on the value of 'inplace').
|
|
2560
|
+
None in case of an error.
|
|
2561
|
+
|
|
1276
2562
|
"""
|
|
1277
2563
|
|
|
1278
2564
|
if self._df is None:
|
|
1279
|
-
logger.error("
|
|
2565
|
+
self.logger.error("Data frame is not initialized.")
|
|
1280
2566
|
return None
|
|
1281
2567
|
|
|
1282
2568
|
if self._df.empty:
|
|
1283
|
-
logger.error("
|
|
2569
|
+
self.logger.error("Data frame is empty.")
|
|
1284
2570
|
return None
|
|
1285
2571
|
|
|
1286
|
-
#
|
|
1287
|
-
#
|
|
2572
|
+
# First filtered_df is the full data frame.
|
|
2573
|
+
# Then it is subsequently reduced by each condition
|
|
1288
2574
|
# at the end it is just those rows that match all conditions.
|
|
1289
|
-
filtered_df = self._df
|
|
2575
|
+
filtered_df = self._df if inplace else self._df.copy()
|
|
2576
|
+
|
|
2577
|
+
def list_matches(row: list, values: list) -> bool:
|
|
2578
|
+
"""Check if any item in the 'values' list is present in the given 'row' list.
|
|
2579
|
+
|
|
2580
|
+
Args:
|
|
2581
|
+
row (list):
|
|
2582
|
+
A list of items from the data frame column.
|
|
2583
|
+
values (list):
|
|
2584
|
+
A list of values to check for in the 'row'.
|
|
2585
|
+
|
|
2586
|
+
Returns:
|
|
2587
|
+
bool:
|
|
2588
|
+
True if any item in 'values' is found in 'row', otherwise False.
|
|
2589
|
+
|
|
2590
|
+
"""
|
|
2591
|
+
|
|
2592
|
+
return any(item in values for item in row)
|
|
2593
|
+
|
|
2594
|
+
def dict_matches(row: dict, key: str, values: list) -> bool:
|
|
2595
|
+
"""Check if the value for the dictionary 'key' is in 'values'.
|
|
2596
|
+
|
|
2597
|
+
Args:
|
|
2598
|
+
row (dict):
|
|
2599
|
+
A dictionary from the data frame column.
|
|
2600
|
+
key (str):
|
|
2601
|
+
The key to lookup in the dictionary.
|
|
2602
|
+
values (list):
|
|
2603
|
+
A list of values to check for in the 'row'.
|
|
1290
2604
|
|
|
1291
|
-
|
|
2605
|
+
Returns:
|
|
2606
|
+
bool:
|
|
2607
|
+
True, if the value for the dictionary key is in 'values', otherwise False.
|
|
2608
|
+
|
|
2609
|
+
"""
|
|
2610
|
+
|
|
2611
|
+
if not row or key not in row:
|
|
2612
|
+
return False
|
|
2613
|
+
|
|
2614
|
+
return row[key] in values
|
|
2615
|
+
|
|
2616
|
+
# We traverse a list of conditions. Each condition must evaluate to True
|
|
1292
2617
|
# otherwise the current workspace or document (i.e. the data set for these objects)
|
|
1293
|
-
# will be skipped.
|
|
2618
|
+
# will be skipped.
|
|
1294
2619
|
for condition in conditions:
|
|
2620
|
+
# Check if the condition is enabled. If 'enabled' is not
|
|
2621
|
+
# in the condition dict then we assume it is enabled.
|
|
2622
|
+
if not condition.get("enabled", True):
|
|
2623
|
+
continue
|
|
1295
2624
|
field = condition.get("field", None)
|
|
1296
2625
|
if not field:
|
|
1297
|
-
logger.error(
|
|
2626
|
+
self.logger.error(
|
|
2627
|
+
"Missing value for filter condition 'field' in payload!",
|
|
2628
|
+
)
|
|
1298
2629
|
continue
|
|
2630
|
+
if "." in field:
|
|
2631
|
+
field, sub = field.split(".", 1)
|
|
2632
|
+
else:
|
|
2633
|
+
sub = None
|
|
2634
|
+
|
|
1299
2635
|
if field not in self._df.columns:
|
|
1300
|
-
logger.warning(
|
|
1301
|
-
"Filter condition field -> %s does not exist as column in data frame! Data frame has these columns -> %s",
|
|
2636
|
+
self.logger.warning(
|
|
2637
|
+
"Filter condition field -> '%s' does not exist as column in the data frame! Data frame has these columns -> %s",
|
|
1302
2638
|
field,
|
|
1303
2639
|
str(self._df.columns),
|
|
1304
2640
|
)
|
|
1305
|
-
continue # Skip filtering for columns not present in
|
|
2641
|
+
continue # Skip filtering for columns not present in data frame
|
|
2642
|
+
|
|
2643
|
+
regex = condition.get("regex", False)
|
|
2644
|
+
# We need the column to be of type string if we want to use regular expressions
|
|
2645
|
+
# so if the column is not yet a string we convert the column to string:
|
|
2646
|
+
if regex and filtered_df[field].dtype != "object":
|
|
2647
|
+
# Change type of column to string:
|
|
2648
|
+
filtered_df[field] = filtered_df[field].astype(str)
|
|
2649
|
+
filtered_df[field] = filtered_df[field].fillna("")
|
|
2650
|
+
|
|
1306
2651
|
value = condition.get("value", None)
|
|
1307
|
-
if
|
|
1308
|
-
|
|
1309
|
-
|
|
2652
|
+
if value is None:
|
|
2653
|
+
# Support alternative syntax using plural.
|
|
2654
|
+
value = condition.get("values", None)
|
|
2655
|
+
if value is None:
|
|
2656
|
+
self.logger.error(
|
|
2657
|
+
"Missing filter value(s) for filter condition field -> '%s'!",
|
|
2658
|
+
field,
|
|
1310
2659
|
)
|
|
1311
2660
|
continue
|
|
1312
|
-
regex = condition.get("regex", False)
|
|
1313
|
-
|
|
1314
|
-
logger.info(
|
|
1315
|
-
"Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
|
|
1316
|
-
filtered_df.shape[0],
|
|
1317
|
-
filtered_df.shape[1],
|
|
1318
|
-
str(condition),
|
|
1319
|
-
)
|
|
1320
|
-
|
|
1321
|
-
filtered_dfs = []
|
|
1322
2661
|
|
|
1323
2662
|
# if a single string is passed as value we put
|
|
1324
2663
|
# it into an 1-item list to simplify the following code:
|
|
1325
2664
|
if not isinstance(value, list):
|
|
1326
2665
|
value = [value]
|
|
1327
2666
|
|
|
1328
|
-
#
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
2667
|
+
# If all values in the condition are strings then we
|
|
2668
|
+
# want the column also to be of type string:
|
|
2669
|
+
if all(isinstance(v, str) for v in value):
|
|
2670
|
+
# Change type of column to string:
|
|
2671
|
+
# filtered_df[field] = filtered_df[field].astype(str)
|
|
2672
|
+
# filtered_df[field] = filtered_df[field].fillna("").astype(str)
|
|
2673
|
+
# filtered_df[field] = filtered_df[field].fillna("")
|
|
2674
|
+
|
|
2675
|
+
# When inplace == True, filtered_df is just a reference to self._df.
|
|
2676
|
+
# Using .loc[:, field] ensures that Pandas updates the column correctly in self._df.
|
|
2677
|
+
# When inplace == False, filtered_df is a full copy (self._df.copy() above),
|
|
2678
|
+
# so modifications remain in filtered_df.
|
|
2679
|
+
# .loc[:, field] ensures no SettingWithCopyWarning, since filtered_df is now a separate DataFrame.
|
|
2680
|
+
filtered_df.loc[:, field] = filtered_df[field].fillna("").astype(str)
|
|
2681
|
+
|
|
2682
|
+
self.logger.info(
|
|
2683
|
+
"Data frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
|
|
2684
|
+
str(filtered_df.shape[0]),
|
|
2685
|
+
str(filtered_df.shape[1]),
|
|
2686
|
+
str(condition),
|
|
2687
|
+
)
|
|
2688
|
+
|
|
2689
|
+
# Check if the column is boolean
|
|
2690
|
+
if pd.api.types.is_bool_dtype(filtered_df[field]):
|
|
2691
|
+
# Convert string representations of booleans to actual booleans
|
|
2692
|
+
value = [v.lower() in ["true", "1"] if isinstance(v, str) else bool(v) for v in value]
|
|
2693
|
+
|
|
2694
|
+
# Do we want to test for equality or non-equality?
|
|
2695
|
+
# For lists equality means: value is in the list
|
|
2696
|
+
# For lists non-equality means: value is NOT in the list
|
|
2697
|
+
test_for_equal = condition.get("equal", True)
|
|
2698
|
+
|
|
2699
|
+
# Check if the column contains only lists (every non-empty element in the column is a list).
|
|
2700
|
+
# `filtered_df[field]`: Access the column with the name specified in 'field'.
|
|
2701
|
+
# `.dropna()`: Drop None or NaN rows for the test.
|
|
2702
|
+
# `.apply(lambda x: isinstance(x, list))`: For each element in the column, check if it is a list.
|
|
2703
|
+
# `.all()`: Ensure that all elements in the column satisfy the condition of being a list.
|
|
2704
|
+
if filtered_df[field].dropna().apply(lambda x: isinstance(x, list)).all():
|
|
2705
|
+
if not test_for_equal:
|
|
2706
|
+
filtered_df = filtered_df[~filtered_df[field].apply(list_matches, values=value)]
|
|
2707
|
+
else:
|
|
2708
|
+
filtered_df = filtered_df[filtered_df[field].apply(list_matches, values=value)]
|
|
2709
|
+
# Check if the column contains only dictionaries (every non-empty element in the column is a dict).
|
|
2710
|
+
# `filtered_df[field]`: Access the column with the name specified in 'field'.
|
|
2711
|
+
# `.dropna()`: Drop None or NaN rows for the test.
|
|
2712
|
+
# `.apply(lambda x: isinstance(x, dict))`: For each element in the column, check if it is a dict.
|
|
2713
|
+
# `.all()`: Ensure that all elements in the column satisfy the condition of being a dictionary.
|
|
2714
|
+
elif filtered_df[field].dropna().apply(lambda x: isinstance(x, dict)).all():
|
|
2715
|
+
if not sub:
|
|
2716
|
+
self.logger.error(
|
|
2717
|
+
"Filtering on dictionary values need a key. This needs to be provided with 'field.key' syntax!",
|
|
1336
2718
|
)
|
|
2719
|
+
continue
|
|
2720
|
+
if not test_for_equal:
|
|
2721
|
+
filtered_df = filtered_df[~filtered_df[field].apply(dict_matches, key=sub, values=value)]
|
|
1337
2722
|
else:
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
str(
|
|
1350
|
-
|
|
1351
|
-
|
|
2723
|
+
filtered_df = filtered_df[filtered_df[field].apply(dict_matches, key=sub, values=value)]
|
|
2724
|
+
# Check if the column has boolean values:
|
|
2725
|
+
elif pd.api.types.is_bool_dtype(filtered_df[field]):
|
|
2726
|
+
# For a boolean filter we can drop NA values:
|
|
2727
|
+
filtered_df = filtered_df.dropna(subset=[field])
|
|
2728
|
+
if not test_for_equal:
|
|
2729
|
+
filtered_df = filtered_df[~filtered_df[field].isin(value)]
|
|
2730
|
+
else:
|
|
2731
|
+
filtered_df = filtered_df[filtered_df[field].isin(value)]
|
|
2732
|
+
elif not regex:
|
|
2733
|
+
if pd.api.types.is_string_dtype(filtered_df[field]):
|
|
2734
|
+
filtered_df[field] = filtered_df[field].str.strip()
|
|
2735
|
+
if not test_for_equal:
|
|
2736
|
+
filtered_df = filtered_df[~filtered_df[field].isin(value)]
|
|
2737
|
+
else:
|
|
2738
|
+
filtered_df = filtered_df[filtered_df[field].isin(value)]
|
|
1352
2739
|
else:
|
|
1353
|
-
#
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
2740
|
+
# Create a pure boolean pd.Series as a filter criterion:
|
|
2741
|
+
regex_condition = filtered_df[field].str.contains(
|
|
2742
|
+
"|".join(value),
|
|
2743
|
+
regex=True,
|
|
2744
|
+
na=False,
|
|
2745
|
+
)
|
|
2746
|
+
# Apply the boolean pd.Series named 'regex_condition' as
|
|
2747
|
+
# a filter - either non-negated or negated (using ~):
|
|
2748
|
+
filtered_df = filtered_df[~regex_condition] if not test_for_equal else filtered_df[regex_condition]
|
|
2749
|
+
|
|
2750
|
+
self.logger.info(
|
|
2751
|
+
"Data frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
|
|
2752
|
+
str(filtered_df.shape[0]),
|
|
2753
|
+
str(filtered_df.shape[1]),
|
|
1360
2754
|
str(condition),
|
|
1361
2755
|
)
|
|
1362
2756
|
# end for condition
|
|
@@ -1364,23 +2758,29 @@ class Data:
|
|
|
1364
2758
|
if inplace:
|
|
1365
2759
|
self._df = filtered_df
|
|
1366
2760
|
|
|
2761
|
+
if reset_index:
|
|
2762
|
+
self._df.reset_index(inplace=True, drop=True)
|
|
2763
|
+
|
|
1367
2764
|
return filtered_df
|
|
1368
2765
|
|
|
1369
2766
|
# end method definition
|
|
1370
2767
|
|
|
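A hedged sketch of the condition syntax accepted by filter(); `data` is an assumed Data instance and all column names are invented for illustration:

conditions = [
    {"field": "status", "values": ["active", "pending"]},       # OR within the value list
    {"field": "name", "value": r"^Project .*", "regex": True},  # regular expression match
    {"field": "owner.id", "value": "4711"},                     # dictionary column via 'field.key'
    {"field": "archived", "value": False, "equal": True},       # boolean column
]
# All conditions must hold (AND); with inplace=False the wrapped data frame is left untouched.
filtered = data.filter(conditions=conditions, inplace=False)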
1371
|
-
def fill_na_in_column(self, column_name: str, default_value: str | int):
|
|
1372
|
-
"""Replace NA values in a column with a defined new default value
|
|
2768
|
+
def fill_na_in_column(self, column_name: str, default_value: str | int) -> None:
|
|
2769
|
+
"""Replace NA values in a column with a defined new default value.
|
|
1373
2770
|
|
|
1374
2771
|
Args:
|
|
1375
|
-
column_name (str):
|
|
1376
|
-
|
|
2772
|
+
column_name (str):
|
|
2773
|
+
The name of the column in the data frame.
|
|
2774
|
+
default_value (str | int):
|
|
2775
|
+
The value to replace NA with.
|
|
2776
|
+
|
|
1377
2777
|
"""
|
|
1378
2778
|
|
|
1379
2779
|
if column_name in self._df.columns:
|
|
1380
2780
|
self._df[column_name] = self._df[column_name].fillna(value=default_value)
|
|
1381
2781
|
else:
|
|
1382
|
-
logger.error(
|
|
1383
|
-
"Cannot replace NA values as column -> '%s' does not exist in the
|
|
2782
|
+
self.logger.error(
|
|
2783
|
+
"Cannot replace NA values as column -> '%s' does not exist in the data frame! Available columns -> %s",
|
|
1384
2784
|
column_name,
|
|
1385
2785
|
str(self._df.columns),
|
|
1386
2786
|
)
|
|
@@ -1388,16 +2788,19 @@ class Data:
|
|
|
1388
2788
|
# end method definition
|
|
1389
2789
|
|
|
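For illustration, the call below (on an assumed Data instance `data` with an invented 'country' column) boils down to the pandas fillna() shown in the method body:

data.fill_na_in_column(column_name="country", default_value="unknown")
# equivalent effect: self._df["country"] = self._df["country"].fillna(value="unknown")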
1390
2790
|
def fill_forward(self, inplace: bool) -> pd.DataFrame:
|
|
1391
|
-
"""Fill the missing cells appropriately by carrying forward
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
2791
|
+
"""Fill the missing cells appropriately by carrying forward the values from the previous rows where necessary.
|
|
2792
|
+
|
|
2793
|
+
This has applications if a hierarchy is represented by
|
|
2794
|
+
nested cells e.g. in an Excel sheet.
|
|
1395
2795
|
|
|
1396
2796
|
Args:
|
|
1397
|
-
inplace (bool):
|
|
2797
|
+
inplace (bool):
|
|
2798
|
+
Should the modification happen inplace or not.
|
|
1398
2799
|
|
|
1399
2800
|
Returns:
|
|
1400
|
-
pd.DataFrame:
|
|
2801
|
+
pd.DataFrame:
|
|
2802
|
+
The resulting data frame.
|
|
2803
|
+
|
|
1401
2804
|
"""
|
|
1402
2805
|
|
|
1403
2806
|
# To convert an Excel representation of a folder structure with nested
|
|
@@ -1410,67 +2813,137 @@ class Data:
|
|
|
1410
2813
|
# end method definition
|
|
1411
2814
|
|
|
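The body of fill_forward() is largely elided in this hunk; as a rough sketch of the behaviour the docstring describes, a nested Excel-style hierarchy is carried downward much like pandas ffill() does (illustrative data only, not the actual implementation):

import pandas as pd

df = pd.DataFrame({"level1": ["A", None, None], "level2": ["x", "y", None]})
df = df.ffill()   # presumed core idea; level1 becomes A, A, A and level2 becomes x, y, y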
1412
2815
|
def lookup_value(
|
|
1413
|
-
self,
|
|
1414
|
-
|
|
1415
|
-
|
|
2816
|
+
self,
|
|
2817
|
+
lookup_column: str,
|
|
2818
|
+
lookup_value: str,
|
|
2819
|
+
separator: str = "|",
|
|
2820
|
+
single_row: bool = True,
|
|
2821
|
+
) -> pd.Series | pd.DataFrame | None:
|
|
2822
|
+
"""Lookup row(s) that includes a lookup value in the value of a given column.
|
|
1416
2823
|
|
|
1417
2824
|
Args:
|
|
1418
|
-
lookup_column (str):
|
|
1419
|
-
|
|
1420
|
-
|
|
2825
|
+
lookup_column (str):
|
|
2826
|
+
The name of the column to search in.
|
|
2827
|
+
lookup_value (str):
|
|
2828
|
+
The value to search for.
|
|
2829
|
+
separator (str):
|
|
2830
|
+
The string list delimiter / separator. The pipe symbol | is the default
|
|
2831
|
+
as it is unlikely to appear in a normal string (unlike a plain comma).
|
|
2832
|
+
The separator is NOT looked for in the lookup_value but in the column that
|
|
2833
|
+
is given by lookup_column!
|
|
2834
|
+
single_row (bool, optional):
|
|
2835
|
+
This defines if we just return the first matching row if multiple matching rows
|
|
2836
|
+
are found. Default is True (= single row).
|
|
1421
2837
|
|
|
1422
2838
|
Returns:
|
|
1423
|
-
pd.Series |
|
|
2839
|
+
pd.Series | pd.DataFrame | None:
|
|
2840
|
+
Data frame (multiple rows) or Series (row) that matches the lookup value.
|
|
2841
|
+
None if no match was found.
|
|
2842
|
+
|
|
1424
2843
|
"""
|
|
1425
2844
|
|
|
1426
|
-
# Use the `apply` function to filter rows where the lookup value matches a
|
|
1427
|
-
|
|
1428
|
-
|
|
2845
|
+
# Use the `apply` function to filter rows where the lookup value matches a
|
|
2846
|
+
# whole item in the separator-divided list:
|
|
2847
|
+
def match_lookup_value(string_list: str | None) -> bool:
|
|
2848
|
+
"""Check if the lookup value is in a string list.
|
|
2849
|
+
|
|
2850
|
+
For this the string list is converted to a Python
|
|
2851
|
+
list. A separator is used for the splitting.
|
|
1429
2852
|
|
|
1430
2853
|
Args:
|
|
1431
|
-
string_list (str):
|
|
2854
|
+
string_list (str):
|
|
2855
|
+
Delimiter-separated string list like "a, b, c" or "a | b | c"
|
|
1432
2856
|
|
|
1433
2857
|
Returns:
|
|
1434
|
-
bool:
|
|
2858
|
+
bool:
|
|
2859
|
+
True if lookup_value is equal to one of the delimiter-separated terms.
|
|
2860
|
+
|
|
1435
2861
|
"""
|
|
1436
|
-
return lookup_value in [
|
|
1437
|
-
item.strip() for item in string_list.split(separator)
|
|
1438
|
-
]
|
|
1439
2862
|
|
|
1440
|
-
|
|
2863
|
+
if pd.isna(string_list): # Handle None/NaN safely
|
|
2864
|
+
return False
|
|
2865
|
+
|
|
2866
|
+
# Ensure that the string is a string
|
|
2867
|
+
string_list = str(string_list)
|
|
2868
|
+
|
|
2869
|
+
return lookup_value in [item.strip() for item in string_list.split(separator)]
|
|
2870
|
+
|
|
2871
|
+
# end method definition
|
|
1441
2872
|
|
|
1442
2873
|
if self._df is None:
|
|
1443
2874
|
return None
|
|
1444
2875
|
|
|
2876
|
+
df = self._df
|
|
2877
|
+
|
|
1445
2878
|
if lookup_column not in self._df.columns:
|
|
1446
|
-
logger.error(
|
|
1447
|
-
"
|
|
2879
|
+
self.logger.error(
|
|
2880
|
+
"Cannot lookup value in column -> '%s'. Column does not exist in the data frame! Data frame has these columns -> %s",
|
|
1448
2881
|
lookup_column,
|
|
1449
2882
|
str(self._df.columns),
|
|
1450
2883
|
)
|
|
1451
2884
|
return None
|
|
1452
2885
|
|
|
1453
2886
|
# Fill NaN or None values in the lookup column with empty strings
|
|
1454
|
-
df[lookup_column] = df[lookup_column].fillna("")
|
|
2887
|
+
# df[lookup_column] = df[lookup_column].fillna("")
|
|
2888
|
+
|
|
2889
|
+
# Use the `apply` function to filter rows where the lookup value is in row cell
|
|
2890
|
+
# of column given by lookup_column. match_lookup_value() is called with
|
|
2891
|
+
# the content of the individual cell contents:
|
|
2892
|
+
matched_rows = df[df[lookup_column].apply(match_lookup_value)]
|
|
1455
2893
|
|
|
1456
|
-
#
|
|
1457
|
-
|
|
2894
|
+
# If nothing was found we return None:
|
|
2895
|
+
if matched_rows.empty:
|
|
2896
|
+
return None
|
|
2897
|
+
|
|
2898
|
+
# If it is OK to have multiple matches (= multiple rows = pd.DataFrame).
|
|
2899
|
+
# We can just return the matched_rows now which should be a pd.DataFrame:
|
|
2900
|
+
if not single_row:
|
|
2901
|
+
return matched_rows
|
|
2902
|
+
|
|
2903
|
+
# Check if more than one row matches, and log a warning if so
|
|
2904
|
+
if len(matched_rows) > 1:
|
|
2905
|
+
self.logger.warning(
|
|
2906
|
+
"More than one match found for lookup value -> '%s' in column -> '%s'. Returning the first match.",
|
|
2907
|
+
lookup_value,
|
|
2908
|
+
lookup_column,
|
|
2909
|
+
)
|
|
1458
2910
|
|
|
1459
2911
|
# Return the first matched row, if any
|
|
1460
|
-
|
|
1461
|
-
return matched_row.iloc[0]
|
|
2912
|
+
return matched_rows.iloc[0]
|
|
1462
2913
|
|
|
1463
|
-
|
|
2914
|
+
# end method definition
|
|
2915
|
+
|
|
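A usage sketch for lookup_value(); `data` is an assumed Data instance and the 'synonyms' and 'name' columns are invented. Note that the separator is applied to the column content, not to the lookup value:

# A 'synonyms' cell like "ACME | ACME Corp | ACME Inc" matches the lookup value "ACME Corp":
row = data.lookup_value(lookup_column="synonyms", lookup_value="ACME Corp", separator="|")
if row is not None:
    print(row["name"])   # with single_row=True (the default) a single pd.Series is returned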
2916
|
+
def set_value(self, column: str, value, condition: pd.Series | None = None) -> None: # noqa: ANN001
|
|
2917
|
+
"""Set the value in the data frame based on a condition.
|
|
2918
|
+
|
|
2919
|
+
Args:
|
|
2920
|
+
column (str):
|
|
2921
|
+
The name of the column.
|
|
2922
|
+
value (Any):
|
|
2923
|
+
The value to set for those rows that fulfill the condition.
|
|
2924
|
+
condition (pd.Series, optional):
|
|
2925
|
+
This should be a boolean Series where each element is True or False,
|
|
2926
|
+
representing rows in the data frame that meet a certain condition.
|
|
2927
|
+
If None is provided then ALL rows get the 'value' in the given
|
|
2928
|
+
column.
|
|
2929
|
+
|
|
2930
|
+
"""
|
|
2931
|
+
|
|
2932
|
+
if condition is None:
|
|
2933
|
+
self._df[column] = value # Set value unconditionally
|
|
2934
|
+
else:
|
|
2935
|
+
self._df.loc[condition, column] = value # Set value based on condition
|
|
1464
2936
|
|
|
1465
2937
|
# end method definition
|
|
1466
2938
|
|
|
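A short sketch of conditional updates with set_value(); `data` is an assumed Data instance, `df` its wrapped DataFrame, and the column names are invented:

condition = df["status"] == "done"               # boolean pd.Series, one entry per row
data.set_value(column="processed", value=True, condition=condition)
data.set_value(column="reviewed", value=False)   # condition=None updates ALL rows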
1467
2939
|
def add_column(
|
|
1468
2940
|
self,
|
|
1469
|
-
source_column: str,
|
|
1470
|
-
reg_exp: str,
|
|
1471
2941
|
new_column: str,
|
|
1472
|
-
|
|
1473
|
-
|
|
2942
|
+
data_type: str = "string",
|
|
2943
|
+
source_column: str = "",
|
|
2944
|
+
reg_exp: str = "",
|
|
2945
|
+
prefix: str = "",
|
|
2946
|
+
suffix: str = "",
|
|
1474
2947
|
length: int | None = None,
|
|
1475
2948
|
group_chars: int | None = None,
|
|
1476
2949
|
group_separator: str = ".",
|
|
@@ -1479,24 +2952,78 @@ class Data:
|
|
|
1479
2952
|
"""Add additional column to the data frame.
|
|
1480
2953
|
|
|
1481
2954
|
Args:
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
2955
|
+
new_column (str):
|
|
2956
|
+
The name of the column to add.
|
|
2957
|
+
data_type (str, optional):
|
|
2958
|
+
The data type of the new column.
|
|
2959
|
+
source_column (str, optional):
|
|
2960
|
+
The name of the source column.
|
|
2961
|
+
reg_exp (str, optional):
|
|
2962
|
+
A regular expression to apply on the content of the source column.
|
|
2963
|
+
prefix (str, optional):
|
|
2964
|
+
Prefix to add in front of the value. Defaults to "".
|
|
2965
|
+
suffix (str, optional):
|
|
2966
|
+
Suffix to add at the end of the value. Defaults to "".
|
|
2967
|
+
length (int | None, optional):
|
|
2968
|
+
Length to reduce to. Defaults to None (= unlimited).
|
|
2969
|
+
group_chars (int | None, optional):
|
|
2970
|
+
Group the resulting string in characters of group_chars. Defaults to None.
|
|
2971
|
+
Usable e.g. for a thousands separator "."
|
|
2972
|
+
group_separator (str, optional):
|
|
2973
|
+
Separator string for the grouping. Defaults to ".".
|
|
2974
|
+
group_remove_leading_zero (bool, optional):
|
|
2975
|
+
Remove leading zeros from the groups. Defaults to True.
|
|
1491
2976
|
|
|
1492
2977
|
Returns:
|
|
1493
|
-
bool:
|
|
2978
|
+
bool:
|
|
2979
|
+
True = Success, False = Failure
|
|
2980
|
+
|
|
1494
2981
|
"""
|
|
1495
2982
|
|
|
1496
2983
|
if self._df is None:
|
|
1497
2984
|
return False
|
|
1498
2985
|
|
|
2986
|
+
# Check that the new column does not yet exist
|
|
2987
|
+
if new_column in self._df.columns:
|
|
2988
|
+
self.logger.error(
|
|
2989
|
+
"New column -> '%s' does already exist in data frame! Cannot add it. Data frame has these columns -> %s",
|
|
2990
|
+
new_column,
|
|
2991
|
+
str(self._df.columns),
|
|
2992
|
+
)
|
|
2993
|
+
return False
|
|
2994
|
+
|
|
2995
|
+
# First we handle the very simple case of not having
|
|
2996
|
+
# a source column and just adding an empty new column:
|
|
2997
|
+
if not source_column:
|
|
2998
|
+
self._df[new_column] = pd.Series(dtype=data_type)
|
|
2999
|
+
return True
|
|
3000
|
+
|
|
3001
|
+
# Check if the source column exists
|
|
3002
|
+
if source_column not in self._df.columns:
|
|
3003
|
+
self.logger.error(
|
|
3004
|
+
"Source column -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
|
|
3005
|
+
source_column,
|
|
3006
|
+
str(self._df.columns),
|
|
3007
|
+
)
|
|
3008
|
+
return False
|
|
3009
|
+
|
|
3010
|
+
# Validate the regex pattern
|
|
3011
|
+
try:
|
|
3012
|
+
re.compile(reg_exp) # Check if the pattern is a valid regex
|
|
3013
|
+
except re.error:
|
|
3014
|
+
self.logger.error(
|
|
3015
|
+
"Invalid regular expression -> %s. Cannot extract data for new column -> '%s'!",
|
|
3016
|
+
reg_exp,
|
|
3017
|
+
new_column,
|
|
3018
|
+
)
|
|
3019
|
+
return False
|
|
3020
|
+
|
|
3021
|
+
# Ensure the source column is of type string (convert it, if necessary)
|
|
3022
|
+
if self._df[source_column].dtype != "object":
|
|
3023
|
+
self._df[source_column] = self._df[source_column].astype(str)
|
|
3024
|
+
|
|
1499
3025
|
# Use str.extract to apply the regular expression to the source column
|
|
3026
|
+
# and then assign this modified column to the variable "extracted":
|
|
1500
3027
|
extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)
|
|
1501
3028
|
|
|
1502
3029
|
# Limit the result to the specified length
|
|
@@ -1505,9 +3032,9 @@ class Data:
|
|
|
1505
3032
|
|
|
1506
3033
|
if group_chars is not None:
|
|
1507
3034
|
|
|
1508
|
-
def process_grouping(x):
|
|
3035
|
+
def process_grouping(x) -> str | None: # noqa: ANN001
|
|
1509
3036
|
if pd.isna(x):
|
|
1510
|
-
return
|
|
3037
|
+
return None
|
|
1511
3038
|
# Split into groups
|
|
1512
3039
|
groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
|
|
1513
3040
|
if group_remove_leading_zero:
|
|
@@ -1525,3 +3052,216 @@ class Data:
|
|
|
1525
3052
|
self._df[new_column] = extracted
|
|
1526
3053
|
|
|
1527
3054
|
return True
|
|
3055
|
+
|
|
3056
|
+
# end method definition
|
|
3057
|
+
|
|
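A hedged example of add_column(); `data` is an assumed Data instance, the 'order_id' column is invented, and the grouped output format follows the parameter descriptions above (the grouping code itself is partly elided in this hunk):

# Extract the digits from values like "ORD-123456-X" and group them in blocks of 3,
# e.g. "123456" -> "123.456":
data.add_column(
    new_column="order_number",
    source_column="order_id",
    reg_exp=r"(\d+)",
    group_chars=3,
    group_separator=".",
)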
3058
|
+
def convert_to_lists(self, columns: list, delimiter: str = ",") -> None:
|
|
3059
|
+
"""Intelligently convert string values to list values, in defined data frame columns.
|
|
3060
|
+
|
|
3061
|
+
The delimiter to separate values in the string value can be configured.
|
|
3062
|
+
The method is ignoring delimiters that are inside quotes.
|
|
3063
|
+
|
|
3064
|
+
Args:
|
|
3065
|
+
columns (list):
|
|
3066
|
+
The name of the columns whose values should be converted to lists.
|
|
3067
|
+
delimiter (str, optional):
|
|
3068
|
+
Character that delimits list items. Defaults to ",".
|
|
3069
|
+
|
|
3070
|
+
Returns:
|
|
3071
|
+
None. self._df is modified in place.
|
|
3072
|
+
|
|
3073
|
+
"""
|
|
3074
|
+
|
|
3075
|
+
# Regex to split by the delimiter, ignoring those inside quotes or double quotes
|
|
3076
|
+
def split_string_ignoring_quotes(s: str, delimiter: str) -> list:
|
|
3077
|
+
"""Split a string into a list at positions that have a delimiter character.
|
|
3078
|
+
|
|
3079
|
+
Args:
|
|
3080
|
+
s (str): the string to split
|
|
3081
|
+
delimiter (str): The single character that is used for splitting.
|
|
3082
|
+
|
|
3083
|
+
Returns:
|
|
3084
|
+
A list of split values.
|
|
3085
|
+
|
|
3086
|
+
"""
|
|
3087
|
+
|
|
3088
|
+
# Escaping the delimiter in case it's a special regex character
|
|
3089
|
+
delimiter = re.escape(delimiter)
|
|
3090
|
+
# Match quoted strings and unquoted delimiters separately
|
|
3091
|
+
pattern = rf'(?:"[^"]*"|\'[^\']*\'|[^{delimiter}]+)'
|
|
3092
|
+
return re.findall(pattern, s)
|
|
3093
|
+
|
|
3094
|
+
for col in columns:
|
|
3095
|
+
self._df[col] = self._df[col].apply(
|
|
3096
|
+
lambda x: (split_string_ignoring_quotes(x, delimiter) if isinstance(x, str) and delimiter in x else x),
|
|
3097
|
+
)
|
|
3098
|
+
|
|
3099
|
+
# end method definition
|
|
3100
|
+
|
|
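To illustrate the quote-aware splitting above (invented 'tags' column, assumed Data instance `data`):

# A cell containing the string 'red,green,"blue,navy"' becomes the list
# ['red', 'green', '"blue,navy"'] - the comma inside the quotes is not used for splitting.
data.convert_to_lists(columns=["tags"], delimiter=",")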
3101
|
+
def add_column_concat(
|
|
3102
|
+
self,
|
|
3103
|
+
source_columns: list,
|
|
3104
|
+
new_column: str,
|
|
3105
|
+
concat_char: str = "",
|
|
3106
|
+
upper: bool = False,
|
|
3107
|
+
lower: bool = False,
|
|
3108
|
+
capitalize: bool = False,
|
|
3109
|
+
title: bool = False,
|
|
3110
|
+
) -> None:
|
|
3111
|
+
"""Add a column as a concatenation of the values of multiple source columns.
|
|
3112
|
+
|
|
3113
|
+
Args:
|
|
3114
|
+
source_columns (list):
|
|
3115
|
+
The column names the list values are taken from.
|
|
3116
|
+
new_column (str):
|
|
3117
|
+
The name of the new column.
|
|
3118
|
+
concat_char (str, optional):
|
|
3119
|
+
Character to insert between the concatenated values. Default is "".
|
|
3120
|
+
upper (bool, optional):
|
|
3121
|
+
Convert result to uppercase if True.
|
|
3122
|
+
lower (bool, optional):
|
|
3123
|
+
Convert result to lowercase if True.
|
|
3124
|
+
capitalize (bool, optional):
|
|
3125
|
+
Capitalize the result if True.
|
|
3126
|
+
title (bool, optional):
|
|
3127
|
+
Convert result to title case if True.
|
|
3128
|
+
|
|
3129
|
+
Returns:
|
|
3130
|
+
None. self._df is modified in place.
|
|
3131
|
+
|
|
3132
|
+
"""
|
|
3133
|
+
|
|
3134
|
+
def concatenate(row: pd.Series) -> str:
|
|
3135
|
+
# Comprehension to create a list from all source column values:
|
|
3136
|
+
concatenated = concat_char.join(
|
|
3137
|
+
[str(row[col]) for col in source_columns if pd.notna(row[col])],
|
|
3138
|
+
)
|
|
3139
|
+
|
|
3140
|
+
# Apply case transformations based on parameters
|
|
3141
|
+
if upper:
|
|
3142
|
+
concatenated = concatenated.upper()
|
|
3143
|
+
elif lower:
|
|
3144
|
+
concatenated = concatenated.lower()
|
|
3145
|
+
elif capitalize:
|
|
3146
|
+
concatenated = concatenated.capitalize()
|
|
3147
|
+
elif title:
|
|
3148
|
+
concatenated = concatenated.title()
|
|
3149
|
+
return concatenated
|
|
3150
|
+
# end method definition
|
|
3151
|
+
|
|
3152
|
+
self._df[new_column] = self._df.apply(concatenate, axis=1)
|
|
3153
|
+
|
|
3154
|
+
# end method definition
|
|
3155
|
+
|
|
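A usage sketch for add_column_concat() with invented name columns on an assumed Data instance `data`:

# "jane" + " " + "doe" per row, then title-cased -> "Jane Doe" in the new 'full_name' column:
data.add_column_concat(
    source_columns=["first_name", "last_name"],
    new_column="full_name",
    concat_char=" ",
    title=True,
)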
3156
|
+
def add_column_list(self, source_columns: list, new_column: str) -> None:
|
|
3157
|
+
"""Add a column with list objects.
|
|
3158
|
+
|
|
3159
|
+
The list items are taken from a list of source columns (row by row).
|
|
3160
|
+
|
|
3161
|
+
Args:
|
|
3162
|
+
source_columns (list):
|
|
3163
|
+
The column names the list values are taken from.
|
|
3164
|
+
new_column (str):
|
|
3165
|
+
The name of the new column.
|
|
3166
|
+
|
|
3167
|
+
Returns:
|
|
3168
|
+
None. self._df is modified in place.
|
|
3169
|
+
|
|
3170
|
+
"""
|
|
3171
|
+
|
|
3172
|
+
def create_list(row: pd.Series) -> list:
|
|
3173
|
+
# Comprehension to create a list from all source column values:
|
|
3174
|
+
return [row[col] for col in source_columns]
|
|
3175
|
+
|
|
3176
|
+
self._df[new_column] = self._df.apply(create_list, axis=1)
|
|
3177
|
+
|
|
3178
|
+
# end method definition
|
|
3179
|
+
|
|
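A usage sketch for add_column_list() with invented address columns on an assumed Data instance `data`:

data.add_column_list(source_columns=["street", "city", "zip"], new_column="address")
# each 'address' cell now holds a list such as ["Main St 1", "Springfield", "12345"]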
3180
|
+
def add_column_table(
|
|
3181
|
+
self,
|
|
3182
|
+
source_columns: list,
|
|
3183
|
+
new_column: str,
|
|
3184
|
+
delimiter: str = ",",
|
|
3185
|
+
) -> None:
|
|
3186
|
+
"""Add a column with tabular objects (list of dictionaries).
|
|
3187
|
+
|
|
3188
|
+
The source columns should include lists. The resulting dictionary
|
|
3189
|
+
keys are the column names for the source columns.
|
|
3190
|
+
|
|
3191
|
+
Example (["X", "Y"] are the source_columns, "Table" is the new_column):
|
|
3192
|
+
X[1] = [1, 2, 3] # row 1
|
|
3193
|
+
Y[1] = ["A", "B", "C"] # row 1
|
|
3194
|
+
X[2] = [4, 5, 6] # row 2
|
|
3195
|
+
Y[2] = ["D", "E", "F"] # row 2
|
|
3196
|
+
|
|
3197
|
+
Table[1] = [
|
|
3198
|
+
{
|
|
3199
|
+
"X": "1"
|
|
3200
|
+
"Y": "A"
|
|
3201
|
+
},
|
|
3202
|
+
{
|
|
3203
|
+
"X": "2"
|
|
3204
|
+
"Y": "B"
|
|
3205
|
+
}
|
|
3206
|
+
{
|
|
3207
|
+
"X": "3"
|
|
3208
|
+
"Y": "C"
|
|
3209
|
+
}
|
|
3210
|
+
]
|
|
3211
|
+
Table[2] = [
|
|
3212
|
+
{
|
|
3213
|
+
"X": "4"
|
|
3214
|
+
"Y": "D"
|
|
3215
|
+
},
|
|
3216
|
+
{
|
|
3217
|
+
"X": "5"
|
|
3218
|
+
"Y": "E"
|
|
3219
|
+
}
|
|
3220
|
+
{
|
|
3221
|
+
"X": "6"
|
|
3222
|
+
"Y": "F"
|
|
3223
|
+
}
|
|
3224
|
+
]
|
|
3225
|
+
|
|
3226
|
+
Args:
|
|
3227
|
+
source_columns (list):
|
|
3228
|
+
The column names the list values are taken from.
|
|
3229
|
+
new_column (str):
|
|
3230
|
+
The name of the new column.
|
|
3231
|
+
delimiter (str, optional):
|
|
3232
|
+
Character that delimits list items. Defaults to ",".
|
|
3233
|
+
|
|
3234
|
+
Returns:
|
|
3235
|
+
None. self._df is modified in place.
|
|
3236
|
+
|
|
3237
|
+
"""
|
|
3238
|
+
|
|
3239
|
+
# Call the convert_to_lists method to ensure the columns are converted
|
|
3240
|
+
self.convert_to_lists(columns=source_columns, delimiter=delimiter)
|
|
3241
|
+
|
|
3242
|
+
# Sub-method to pad lists to the same length
|
|
3243
|
+
def pad_list(lst: list, max_len: int) -> list:
|
|
3244
|
+
return lst + [None] * (max_len - len(lst))
|
|
3245
|
+
|
|
3246
|
+
def create_table(row: pd.Series) -> list:
|
|
3247
|
+
max_len = max(len(row[col]) if isinstance(row[col], list) else 1 for col in source_columns)
|
|
3248
|
+
|
|
3249
|
+
# Pad lists to the maximum length, leave scalar values as they are
|
|
3250
|
+
for col in source_columns:
|
|
3251
|
+
if isinstance(row[col], list):
|
|
3252
|
+
row[col] = pad_list(row[col], max_len)
|
|
3253
|
+
elif not pd.isna(row[col]):
|
|
3254
|
+
row[col] = [
|
|
3255
|
+
row[col],
|
|
3256
|
+
] * max_len # Repeat scalar value to match the max length
|
|
3257
|
+
else:
|
|
3258
|
+
row[col] = [None] * max_len
|
|
3259
|
+
# Create a list of dictionaries for each row:
|
|
3260
|
+
table = [{col: row[col][i] for col in source_columns} for i in range(max_len)]
|
|
3261
|
+
|
|
3262
|
+
return table
|
|
3263
|
+
|
|
3264
|
+
# Apply the function to create a new column with table values:
|
|
3265
|
+
self._df[new_column] = self._df.apply(create_table, axis=1)
|
|
3266
|
+
|
|
3267
|
+
# end method definition
|
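A usage sketch mirroring the docstring example of add_column_table() above; `data` is an assumed Data instance whose frame has the list-valued columns "X" and "Y":

data.add_column_table(source_columns=["X", "Y"], new_column="Table", delimiter=",")
# For X = [1, 2, 3] and Y = ["A", "B", "C"] in one row, the new 'Table' cell becomes
# [{"X": 1, "Y": "A"}, {"X": 2, "Y": "B"}, {"X": 3, "Y": "C"}] (values keep their source types).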