pyxecm 1.3.0__py3-none-any.whl → 1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pyxecm might be problematic.
- pyxecm/__init__.py +3 -0
- pyxecm/coreshare.py +2636 -0
- pyxecm/customizer/__init__.py +6 -0
- pyxecm/customizer/browser_automation.py +231 -56
- pyxecm/customizer/customizer.py +466 -235
- pyxecm/customizer/k8s.py +49 -27
- pyxecm/customizer/m365.py +1183 -263
- pyxecm/customizer/payload.py +13854 -5368
- pyxecm/customizer/pht.py +503 -0
- pyxecm/customizer/salesforce.py +1782 -0
- pyxecm/customizer/sap.py +5 -5
- pyxecm/customizer/servicenow.py +1221 -0
- pyxecm/customizer/successfactors.py +1056 -0
- pyxecm/customizer/translate.py +2 -2
- pyxecm/helper/__init__.py +2 -0
- pyxecm/helper/assoc.py +27 -7
- pyxecm/helper/data.py +1527 -0
- pyxecm/helper/web.py +189 -25
- pyxecm/helper/xml.py +244 -40
- pyxecm/otac.py +311 -25
- pyxecm/otcs.py +3866 -1103
- pyxecm/otds.py +397 -150
- pyxecm/otiv.py +1 -1
- pyxecm/otmm.py +808 -0
- pyxecm/otpd.py +17 -12
- {pyxecm-1.3.0.dist-info → pyxecm-1.5.dist-info}/METADATA +4 -1
- pyxecm-1.5.dist-info/RECORD +30 -0
- {pyxecm-1.3.0.dist-info → pyxecm-1.5.dist-info}/WHEEL +1 -1
- pyxecm-1.3.0.dist-info/RECORD +0 -23
- {pyxecm-1.3.0.dist-info → pyxecm-1.5.dist-info}/LICENSE +0 -0
- {pyxecm-1.3.0.dist-info → pyxecm-1.5.dist-info}/top_level.txt +0 -0
pyxecm/helper/data.py
ADDED
@@ -0,0 +1,1527 @@
"""
|
|
2
|
+
Data Module to implement functions to leverage Pandas to
|
|
3
|
+
manipulte data structures read for bulk generation of Extended ECM items.
|
|
4
|
+
|
|
5
|
+
This code implements a class called data which is referring
|
|
6
|
+
to Pandas DataFrame.
|
|
7
|
+
|
|
8
|
+
Class: Payload
|
|
9
|
+
Methods:
|
|
10
|
+
|
|
11
|
+
__init__ : class initializer
|
|
12
|
+
__len__: Lenght of the embedded DataFrame object.
|
|
13
|
+
__str__: Print the DataFrame of the class
|
|
14
|
+
get_data_frame: Get the Pandas DataFrame object
|
|
15
|
+
set_data_frame: Set the Pandas DataFrame object
|
|
16
|
+
append: Append additional data to the data frame.
|
|
17
|
+
|
|
18
|
+
load_json_data: Load JSON data into DataFrame
|
|
19
|
+
save_json_data: Save JSON data from DataFrame to file
|
|
20
|
+
load_excel_data: Load Excel file into DataFrame
|
|
21
|
+
load_csv_data: Load CSV data into DataFrame
|
|
22
|
+
load_directory: Load directory structure into Pandas Data Frame
|
|
23
|
+
|
|
24
|
+
partitionate: Partition a data frame into equally sized partions
|
|
25
|
+
deduplicate: Remove dupclicate rows that have all fields in unique_fields in common
|
|
26
|
+
sort: Sort the data frame based on one or multiple fields.
|
|
27
|
+
flatten: Flatten a sub-dictionary by copying selected fields to the
|
|
28
|
+
parent dictionary.
|
|
29
|
+
explode_and_flatten: Explode a substructure in the Data Frame
|
|
30
|
+
drop_columns: Drop selected columns from the Data Frame
|
|
31
|
+
keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
|
|
32
|
+
cleanse: Cleanse data with regular expressions and upper/lower case conversion.
|
|
33
|
+
filter: Filter the DataFrame based on conditions
|
|
34
|
+
|
|
35
|
+
fill_forward: Fill the missing cells appropriately by carrying forward
|
|
36
|
+
the values from the previous rows where necessary.
|
|
37
|
+
fill_na_in_column: Replace NA values in a column with a defined new default value
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
__author__ = "Dr. Marc Diefenbruch"
|
|
41
|
+
__copyright__ = "Copyright 2024, OpenText"
|
|
42
|
+
__credits__ = ["Kai-Philip Gatzweiler"]
|
|
43
|
+
__maintainer__ = "Dr. Marc Diefenbruch"
|
|
44
|
+
__email__ = "mdiefenb@opentext.com"
|
|
45
|
+
|
|
46
|
+
import logging
|
|
47
|
+
import json
|
|
48
|
+
import os
|
|
49
|
+
import re
|
|
50
|
+
import threading
|
|
51
|
+
|
|
52
|
+
import pandas as pd
|
|
53
|
+
|
|
54
|
+
logger = logging.getLogger("pyxecm.helper.data")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Data:
|
|
58
|
+
"""Used to automate data loading for the customizer."""
|
|
59
|
+
|
|
60
|
+
_df: pd.DataFrame
|
|
61
|
+
_lock = threading.Lock()
|
|
62
|
+
|
|
63
|
+
def __init__(self, init_data: pd.DataFrame | list = None):
|
|
64
|
+
"""Initialize the Data object.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
init_data (pd.DataFrame | list, optional): Data to initialize the data frame. Can either be
|
|
68
|
+
another data frame (that gets copied) or a list of dictionaries.
|
|
69
|
+
Defaults to None.
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
if init_data is not None:
|
|
73
|
+
# if a data frame is passed to the constructor we
|
|
74
|
+
# copy its content to the new Data object
|
|
75
|
+
|
|
76
|
+
if isinstance(init_data, pd.DataFrame):
|
|
77
|
+
self._df: pd.DataFrame = init_data.copy()
|
|
78
|
+
elif isinstance(init_data, Data):
|
|
79
|
+
if init_data.get_data_frame() is not None:
|
|
80
|
+
self._df: pd.DataFrame = init_data.get_data_frame().copy()
|
|
81
|
+
elif isinstance(init_data, list):
|
|
82
|
+
self._df: pd.DataFrame = pd.DataFrame(init_data)
|
|
83
|
+
elif isinstance(init_data, dict):
|
|
84
|
+
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
85
|
+
self._df: pd.DataFrame = pd.DataFrame([init_data])
|
|
86
|
+
else:
|
|
87
|
+
logger.error("Illegal initialization data for 'Data' class!")
|
|
88
|
+
self._df = None
|
|
89
|
+
else:
|
|
90
|
+
self._df = None
|
|
91
|
+
|
|
92
|
+
# end method definition
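For orientation, a minimal usage sketch of the constructor (illustrative only, not part of the released file; the sample records are made up):

    from pyxecm.helper.data import Data

    # a list of dictionaries becomes one row per dictionary
    data = Data([{"name": "foo", "size": 1}, {"name": "bar", "size": 2}])

    # a single dictionary is wrapped in a list internally, so exactly one row is created
    single = Data({"name": "baz", "size": 3})

    print(len(data), len(single))  # 2 1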
|
|
93
|
+
|
|
94
|
+
def __len__(self) -> int:
|
|
95
|
+
"""Lenght of the embedded DataFrame object.
|
|
96
|
+
This is basically a convenience method.
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
int: Lenght of the DataFrame
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
if self._df is not None:
|
|
103
|
+
return len(self._df)
|
|
104
|
+
return 0
|
|
105
|
+
|
|
106
|
+
# end method definition
|
|
107
|
+
|
|
108
|
+
def __str__(self) -> str:
|
|
109
|
+
"""Print the DataFrame of the class.
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
str: String representation.
|
|
113
|
+
"""
|
|
114
|
+
|
|
115
|
+
# if data frame is initialized we return
|
|
116
|
+
# the string representation of pd.DataFrame
|
|
117
|
+
if self._df is not None:
|
|
118
|
+
return str(self._df)
|
|
119
|
+
|
|
120
|
+
        # return an empty string if the data frame is not initialized (avoids infinite recursion)
        return ""
|
|
121
|
+
|
|
122
|
+
# end method definition
|
|
123
|
+
|
|
124
|
+
def lock(self):
|
|
125
|
+
"""Return the threading lock object.
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
_type_: threading lock object
|
|
129
|
+
"""
|
|
130
|
+
return self._lock
|
|
131
|
+
|
|
132
|
+
# end method definition
|
|
133
|
+
|
|
134
|
+
def get_data_frame(self) -> pd.DataFrame:
|
|
135
|
+
"""Get the Pandas DataFrame object
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
pd.DataFrame: Pandas DataFrame object
|
|
139
|
+
"""
|
|
140
|
+
|
|
141
|
+
return self._df
|
|
142
|
+
|
|
143
|
+
# end method definition
|
|
144
|
+
|
|
145
|
+
def set_data_frame(self, df: pd.DataFrame):
|
|
146
|
+
"""Set the Pandas DataFrame object
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
df (pd.DataFrame): Pandas DataFrame object
|
|
150
|
+
"""
|
|
151
|
+
|
|
152
|
+
self._df = df
|
|
153
|
+
|
|
154
|
+
# end method definition
|
|
155
|
+
|
|
156
|
+
def print_info(
|
|
157
|
+
self,
|
|
158
|
+
show_size: bool = True,
|
|
159
|
+
show_info: bool = False,
|
|
160
|
+
show_columns: bool = False,
|
|
161
|
+
show_first: bool = False,
|
|
162
|
+
show_last: bool = False,
|
|
163
|
+
show_sample: bool = False,
|
|
164
|
+
show_statistics: bool = False,
|
|
165
|
+
row_num: int = 10,
|
|
166
|
+
):
|
|
167
|
+
"""Log information about the data frame
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
show_size (bool, optional): Show size of data frame. Defaults to True.
|
|
171
|
+
show_info (bool, optional): Show information for data frame. Defaults to False.
|
|
172
|
+
show_columns (bool, optional): Show columns of data frame. Defaults to False.
|
|
173
|
+
show_first (bool, optional): Show first 10 items. Defaults to False.
|
|
174
|
+
show_last (bool, optional): Show last 10 items. Defaults to False.
|
|
175
|
+
show_sample (bool, optional): Show 10 sample items. Defaults to False.
|
|
176
|
+
            show_statistics (bool, optional): Show data frame statistics. Defaults to False.
            row_num (int, optional): Number of rows for show_first, show_last and show_sample. Defaults to 10.
|
|
177
|
+
"""
|
|
178
|
+
|
|
179
|
+
if self._df is None:
|
|
180
|
+
logger.warning("Data Frame is not initialized!")
|
|
181
|
+
return
|
|
182
|
+
|
|
183
|
+
if show_size:
|
|
184
|
+
logger.info(
|
|
185
|
+
"Data Frame has %s row(s) and %s column(s)",
|
|
186
|
+
self._df.shape[0],
|
|
187
|
+
self._df.shape[1],
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
if show_info:
|
|
191
|
+
# df.info() can not easily be embedded into a string
|
|
192
|
+
self._df.info()
|
|
193
|
+
|
|
194
|
+
if show_columns:
|
|
195
|
+
logger.info("Columns:\n%s", self._df.columns)
|
|
196
|
+
logger.info(
|
|
197
|
+
"Columns with number of null values:\n%s", self._df.isnull().sum()
|
|
198
|
+
)
|
|
199
|
+
logger.info(
|
|
200
|
+
"Columns with number of non-null values:\n%s", self._df.notnull().sum()
|
|
201
|
+
)
|
|
202
|
+
logger.info("Columns with number of NaN values:\n%s", self._df.isna().sum())
|
|
203
|
+
logger.info(
|
|
204
|
+
"Columns with number of non-NaN values:\n%s", self._df.notna().sum()
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
if show_first:
|
|
208
|
+
# the default for head is n = 5:
|
|
209
|
+
logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
|
|
210
|
+
|
|
211
|
+
if show_last:
|
|
212
|
+
# the default for tail is n = 5:
|
|
213
|
+
logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
|
|
214
|
+
|
|
215
|
+
if show_sample:
|
|
216
|
+
# the default for sample is n = 1:
|
|
217
|
+
logger.info("%s Sample rows:\n%s", str(row_num), self._df.sample(n=row_num))
|
|
218
|
+
|
|
219
|
+
if show_statistics:
|
|
220
|
+
logger.info(
|
|
221
|
+
"Description of statistics for data frame:\n%s", self._df.describe()
|
|
222
|
+
)
|
|
223
|
+
logger.info(
|
|
224
|
+
"Description of statistics for data frame (Transformed):\n%s",
|
|
225
|
+
self._df.describe().T,
|
|
226
|
+
)
|
|
227
|
+
logger.info(
|
|
228
|
+
"Description of statistics for data frame (objects):\n%s",
|
|
229
|
+
self._df.describe(include="object"),
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
# end method definition
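A hedged example of how print_info() could be called; it assumes the standard logging module has been configured so that the module logger actually emits output (the sample rows are invented):

    import logging

    from pyxecm.helper.data import Data

    logging.basicConfig(level=logging.INFO)

    data = Data([{"city": "Berlin"}, {"city": "Paris"}, {"city": "Rome"}])
    # logs the size, the column overview and the first 2 rows
    data.print_info(show_columns=True, show_first=True, row_num=2)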
|
|
233
|
+
|
|
234
|
+
def append(self, add_data: pd.DataFrame | list | dict) -> bool:
|
|
235
|
+
"""Append additional data to the data frame.
|
|
236
|
+
|
|
237
|
+
Args:
|
|
238
|
+
add_data (pd.DataFrame | list | dict): Additional data. Can be pd.DataFrame or list of dicts (or Data)
|
|
239
|
+
|
|
240
|
+
Returns:
|
|
241
|
+
bool: True = Success, False = Error
|
|
242
|
+
"""
|
|
243
|
+
|
|
244
|
+
        # Does the data frame already have content?
|
|
245
|
+
# Then we need to concat / append. Otherwise
|
|
246
|
+
# we just initialize self._df
|
|
247
|
+
if self._df is not None:
|
|
248
|
+
if isinstance(add_data, pd.DataFrame):
|
|
249
|
+
self._df = pd.concat([self._df, add_data], ignore_index=True)
|
|
250
|
+
return True
|
|
251
|
+
elif isinstance(add_data, Data):
|
|
252
|
+
df = add_data.get_data_frame()
|
|
253
|
+
                # a DataFrame must not be evaluated as a plain boolean - check explicitly
                if df is not None and not df.empty:
|
|
254
|
+
self._df = pd.concat([self._df, df], ignore_index=True)
|
|
255
|
+
return True
|
|
256
|
+
elif isinstance(add_data, list):
|
|
257
|
+
if add_data:
|
|
258
|
+
df = Data(add_data)
|
|
259
|
+
self._df = pd.concat(
|
|
260
|
+
[self._df, df.get_data_frame()], ignore_index=True
|
|
261
|
+
)
|
|
262
|
+
return True
|
|
263
|
+
elif isinstance(add_data, dict):
|
|
264
|
+
if add_data:
|
|
265
|
+
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
266
|
+
df = Data([add_data])
|
|
267
|
+
self._df = pd.concat(
|
|
268
|
+
[self._df, df.get_data_frame()], ignore_index=True
|
|
269
|
+
)
|
|
270
|
+
return True
|
|
271
|
+
else:
|
|
272
|
+
logger.error("Illegal data type -> '%s'", type(add_data))
|
|
273
|
+
return False
|
|
274
|
+
else: # self._df is None (initial state)
|
|
275
|
+
if isinstance(add_data, pd.DataFrame):
|
|
276
|
+
self._df = add_data
|
|
277
|
+
return True
|
|
278
|
+
elif isinstance(add_data, Data):
|
|
279
|
+
self._df = add_data.get_data_frame()
|
|
280
|
+
return True
|
|
281
|
+
elif isinstance(add_data, list):
|
|
282
|
+
self._df = pd.DataFrame(add_data)
|
|
283
|
+
return True
|
|
284
|
+
elif isinstance(add_data, dict):
|
|
285
|
+
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
286
|
+
self._df = pd.DataFrame([add_data])
|
|
287
|
+
return True
|
|
288
|
+
else:
|
|
289
|
+
logger.error("Illegal data type -> '%s'", type(add_data))
|
|
290
|
+
return False
|
|
291
|
+
|
|
292
|
+
# end method definition
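A short sketch of append() with two of the supported input types (sample values are invented, continuing the earlier sketch):

    data = Data([{"id": 1}])
    data.append({"id": 2})                # dict -> one additional row
    data.append([{"id": 3}, {"id": 4}])   # list of dicts -> two additional rows
    print(len(data))  # 4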
|
|
293
|
+
|
|
294
|
+
def load_json_data(self, json_path: str, convert_dates: bool = False) -> bool:
|
|
295
|
+
"""Load JSON data into DataFrame
|
|
296
|
+
|
|
297
|
+
Args:
|
|
298
|
+
json_path (str): Path to the JSON file.
|
|
299
|
+
convert_dates (bool, optional): whether or not dates should be converted
|
|
300
|
+
Returns:
|
|
301
|
+
            bool: False in case an error occurred, True otherwise.
|
|
302
|
+
"""
|
|
303
|
+
|
|
304
|
+
if json_path is not None and os.path.exists(json_path):
|
|
305
|
+
# Load data from JSON file
|
|
306
|
+
try:
|
|
307
|
+
df = pd.read_json(path_or_buf=json_path, convert_dates=convert_dates)
|
|
308
|
+
if self._df is None:
|
|
309
|
+
self._df = df
|
|
310
|
+
else:
|
|
311
|
+
self._df = pd.concat([self._df, df])
|
|
312
|
+
logger.info(
|
|
313
|
+
"After loading -> '%s' the Data Frame has %s row(s) and %s column(s)",
|
|
314
|
+
json_path,
|
|
315
|
+
self._df.shape[0],
|
|
316
|
+
self._df.shape[1],
|
|
317
|
+
)
|
|
318
|
+
except FileNotFoundError:
|
|
319
|
+
logger.error(
|
|
320
|
+
"File -> %s not found. Please check the file path.", json_path
|
|
321
|
+
)
|
|
322
|
+
return False
|
|
323
|
+
except PermissionError:
|
|
324
|
+
logger.error("Permission denied to access the file -> %s.", json_path)
|
|
325
|
+
return False
|
|
326
|
+
except IOError as e:
|
|
327
|
+
logger.error("An I/O error occurred -> %s", str(e))
|
|
328
|
+
return False
|
|
329
|
+
except json.JSONDecodeError as e:
|
|
330
|
+
logger.error("Error: Unable to decode JSON -> %s", str(e))
|
|
331
|
+
return False
|
|
332
|
+
except ValueError as e:
|
|
333
|
+
logger.error("Invalid JSON input -> %s", str(e))
|
|
334
|
+
return False
|
|
335
|
+
except AttributeError as e:
|
|
336
|
+
logger.error("Unexpected data structure -> %s", str(e))
|
|
337
|
+
return False
|
|
338
|
+
except TypeError as e:
|
|
339
|
+
logger.error("Unexpected data type -> %s", str(e))
|
|
340
|
+
return False
|
|
341
|
+
except KeyError as e:
|
|
342
|
+
logger.error("Missing key in JSON data -> %s", str(e))
|
|
343
|
+
return False
|
|
344
|
+
|
|
345
|
+
else:
|
|
346
|
+
logger.error(
|
|
347
|
+
"Missing JSON file - you have not specified a valid path -> %s.",
|
|
348
|
+
json_path,
|
|
349
|
+
)
|
|
350
|
+
return False
|
|
351
|
+
return True
|
|
352
|
+
|
|
353
|
+
# end method definition
|
|
354
|
+
|
|
355
|
+
def save_json_data(
|
|
356
|
+
self, json_path: str, orient: str = "records", preserve_index: bool = False
|
|
357
|
+
) -> bool:
|
|
358
|
+
"""Save JSON data from DataFrame to file
|
|
359
|
+
|
|
360
|
+
Args:
|
|
361
|
+
json_path (str): Path to the JSON file.
|
|
362
|
+
orient (str, optional): Structure of the JSON
|
|
363
|
+
preserve_index (bool, optional)
|
|
364
|
+
Returns:
|
|
365
|
+
            bool: False in case an error occurred, True otherwise.
|
|
366
|
+
"""
|
|
367
|
+
|
|
368
|
+
if json_path is not None and os.path.exists(os.path.dirname(json_path)):
|
|
369
|
+
# Load data from JSON file
|
|
370
|
+
try:
|
|
371
|
+
if self._df is not None:
|
|
372
|
+
# index parameter is only allowed if orient has one of the following values:
|
|
373
|
+
if (
|
|
374
|
+
orient == "columns"
|
|
375
|
+
or orient == "index"
|
|
376
|
+
or orient == "table"
|
|
377
|
+
or orient == "split"
|
|
378
|
+
):
|
|
379
|
+
self._df.to_json(
|
|
380
|
+
path_or_buf=json_path,
|
|
381
|
+
index=preserve_index,
|
|
382
|
+
orient=orient,
|
|
383
|
+
indent=2,
|
|
384
|
+
)
|
|
385
|
+
else:
|
|
386
|
+
self._df.to_json(path_or_buf=json_path, orient=orient, indent=2)
|
|
387
|
+
else:
|
|
388
|
+
logger.warning("Data Frame is empty. Cannot write it to JSON")
|
|
389
|
+
return False
|
|
390
|
+
except FileNotFoundError:
|
|
391
|
+
logger.error(
|
|
392
|
+
"File -> '%s' not found. Please check the file path.", json_path
|
|
393
|
+
)
|
|
394
|
+
return False
|
|
395
|
+
except PermissionError:
|
|
396
|
+
logger.error("Permission denied to access the file -> '%s'.", json_path)
|
|
397
|
+
return False
|
|
398
|
+
except IOError as e:
|
|
399
|
+
logger.error("An I/O error occurred -> %s", str(e))
|
|
400
|
+
return False
|
|
401
|
+
except ValueError as e:
|
|
402
|
+
logger.error("Value Error -> %s", str(e))
|
|
403
|
+
return False
|
|
404
|
+
|
|
405
|
+
else:
|
|
406
|
+
logger.error(
|
|
407
|
+
"Missing JSON file -> '%s' you have not specified a valid path!",
|
|
408
|
+
json_path,
|
|
409
|
+
)
|
|
410
|
+
return False
|
|
411
|
+
return True
|
|
412
|
+
|
|
413
|
+
# end method definition
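A small JSON round trip using save_json_data() and load_json_data(); the file path is hypothetical:

    data = Data([{"id": 1, "name": "foo"}])
    if data.save_json_data(json_path="/tmp/items.json", orient="records"):
        restored = Data()
        restored.load_json_data(json_path="/tmp/items.json")
        print(len(restored))  # 1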
|
|
414
|
+
|
|
415
|
+
def load_excel_data(
|
|
416
|
+
self,
|
|
417
|
+
xlsx_path: str,
|
|
418
|
+
sheet_names: str | list | None = 0,
|
|
419
|
+
usecols: str | list | None = None,
|
|
420
|
+
skip_rows: int | None = None,
|
|
421
|
+
header: int | None = 0,
|
|
422
|
+
names: list | None = None,
|
|
423
|
+
na_values: list | None = None,
|
|
424
|
+
) -> bool:
|
|
425
|
+
"""Load Excel (xlsx) data into DataFrame. Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
|
|
426
|
+
read from a local filesystem or URL. Supports an option to read a single sheet or a list of sheets.
|
|
427
|
+
|
|
428
|
+
Args:
|
|
429
|
+
xlsx_path (str): Path to the Excel file.
|
|
430
|
+
sheet_names (list | str | int, optional): Name or Index of the sheet in the Excel workbook to load.
|
|
431
|
+
If 'None' then all sheets will be loaded.
|
|
432
|
+
If 0 then first sheet in workbook will be loaded (this is the Default)
|
|
433
|
+
If string then this is interpreted as the name of the sheet to load.
|
|
434
|
+
If a list is passed, this can be a list of index values (int) or
|
|
435
|
+
a list of strings with the sheet names to load.
|
|
436
|
+
usecols (list | str, optional): List of columns to load, specified by general column names in Excel,
|
|
437
|
+
e.g. usecols='B:D', usecols=['A', 'C', 'F']
|
|
438
|
+
skip_rows (int, optional): List of rows to skip on top of the sheet (e.g. to not read headlines)
|
|
439
|
+
header (int | None, optional): Excel Row (0-indexed) to use for the column labels of the parsed DataFrame.
|
|
440
|
+
If file contains no header row, then you should explicitly pass header=None.
|
|
441
|
+
Default is 0.
|
|
442
|
+
names (list): List of column names to use. Default is None
|
|
443
|
+
na_values (list, optional): List of values in the Excel that should become the Pandas NA value.
|
|
444
|
+
Returns:
|
|
445
|
+
            bool: False in case an error occurred, True otherwise.
|
|
446
|
+
"""
|
|
447
|
+
|
|
448
|
+
if xlsx_path is not None and os.path.exists(xlsx_path):
|
|
449
|
+
# Load data from Excel file
|
|
450
|
+
try:
|
|
451
|
+
df = pd.read_excel(
|
|
452
|
+
io=xlsx_path,
|
|
453
|
+
sheet_name=sheet_names,
|
|
454
|
+
usecols=usecols,
|
|
455
|
+
skiprows=skip_rows,
|
|
456
|
+
header=header,
|
|
457
|
+
names=names,
|
|
458
|
+
na_values=na_values,
|
|
459
|
+
)
|
|
460
|
+
# if multiple sheets from an Excel workbook are loaded,
|
|
461
|
+
# then read_excel() returns a dictionary. The keys are
|
|
462
|
+
# the names of the sheets and the values are the Data Frames.
|
|
463
|
+
# we handle this case as follows:
|
|
464
|
+
if isinstance(df, dict):
|
|
465
|
+
logger.info("Loading multiple Excel sheets from the workbook!")
|
|
466
|
+
multi_sheet_df = pd.DataFrame()
|
|
467
|
+
for sheet in df.keys():
|
|
468
|
+
multi_sheet_df = pd.concat(
|
|
469
|
+
[multi_sheet_df, df[sheet]], ignore_index=True
|
|
470
|
+
)
|
|
471
|
+
df = multi_sheet_df
|
|
472
|
+
if self._df is None:
|
|
473
|
+
self._df = df
|
|
474
|
+
else:
|
|
475
|
+
self._df = pd.concat([self._df, df], ignore_index=True)
|
|
476
|
+
except FileNotFoundError:
|
|
477
|
+
logger.error(
|
|
478
|
+
"File -> '%s' not found. Please check the file path.", xlsx_path
|
|
479
|
+
)
|
|
480
|
+
return False
|
|
481
|
+
except PermissionError:
|
|
482
|
+
logger.error("Permission denied to access the file -> '%s'.", xlsx_path)
|
|
483
|
+
return False
|
|
484
|
+
except IOError as e:
|
|
485
|
+
logger.error("An I/O error occurred -> %s", str(e))
|
|
486
|
+
return False
|
|
487
|
+
except ValueError as e:
|
|
488
|
+
logger.error("Invalid Excel input -> %s", str(e))
|
|
489
|
+
return False
|
|
490
|
+
except AttributeError as e:
|
|
491
|
+
logger.error("Unexpected data structure -> %s", str(e))
|
|
492
|
+
return False
|
|
493
|
+
except TypeError as e:
|
|
494
|
+
logger.error("Unexpected data type -> %s", str(e))
|
|
495
|
+
return False
|
|
496
|
+
except KeyError as e:
|
|
497
|
+
logger.error("Missing key in Excel data -> %s", str(e))
|
|
498
|
+
return False
|
|
499
|
+
|
|
500
|
+
else:
|
|
501
|
+
logger.error(
|
|
502
|
+
"Missing Excel file -> '%s' you have not specified a valid path!",
|
|
503
|
+
xlsx_path,
|
|
504
|
+
)
|
|
505
|
+
return False
|
|
506
|
+
return True
|
|
507
|
+
|
|
508
|
+
# end method definition
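An illustrative call of load_excel_data(); the workbook path, sheet name and column range are assumptions for the sketch:

    data = Data()
    ok = data.load_excel_data(
        xlsx_path="/tmp/airports.xlsx",
        sheet_names="Airports",   # a single sheet selected by name
        usecols="B:D",            # only columns B..D
        skip_rows=1,              # skip one headline row
        na_values=["N/A", "n/a"],
    )
    if ok:
        data.print_info(show_first=True)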
|
|
509
|
+
|
|
510
|
+
def save_excel_data(
|
|
511
|
+
self, excel_path: str, sheet_name: str = "Pandas Export", index: bool = False
|
|
512
|
+
) -> bool:
|
|
513
|
+
"""
|
|
514
|
+
Save the DataFrame to an Excel file, with robust error handling and logging.
|
|
515
|
+
|
|
516
|
+
Args:
|
|
517
|
+
excel_path (str): The file path to save the Excel file.
|
|
518
|
+
            sheet_name (str): The sheet name where data will be saved. Default is 'Pandas Export'.
|
|
519
|
+
index: Whether to write the row names (index). Default is False.
|
|
520
|
+
"""
|
|
521
|
+
try:
|
|
522
|
+
# Check if the directory exists
|
|
523
|
+
directory = os.path.dirname(excel_path)
|
|
524
|
+
if directory and not os.path.exists(directory):
|
|
525
|
+
raise FileNotFoundError(
|
|
526
|
+
"The directory -> '%s' does not exist." % directory
|
|
527
|
+
)
|
|
528
|
+
|
|
529
|
+
# Attempt to save the DataFrame to Excel
|
|
530
|
+
self._df.to_excel(excel_path, sheet_name=sheet_name, index=index)
|
|
531
|
+
logger.info("Data saved successfully to -> %s", excel_path)
|
|
532
|
+
|
|
533
|
+
except FileNotFoundError as e:
|
|
534
|
+
logger.error("Error: %s", e)
|
|
535
|
+
return False
|
|
536
|
+
except PermissionError:
|
|
537
|
+
logger.error(
|
|
538
|
+
"Error: Permission denied. You do not have permission to write to '%s'.",
|
|
539
|
+
excel_path,
|
|
540
|
+
)
|
|
541
|
+
return False
|
|
542
|
+
except ValueError as ve:
|
|
543
|
+
logger.error("Error: Invalid data for Excel format -> %s", ve)
|
|
544
|
+
return False
|
|
545
|
+
except OSError as oe:
|
|
546
|
+
logger.error("Error: OS error occurred while saving file -> %s", oe)
|
|
547
|
+
return False
|
|
548
|
+
except Exception as e:
|
|
549
|
+
# Catch-all for any other unexpected errors
|
|
550
|
+
logger.error("An unexpected error occurred -> %s", e)
|
|
551
|
+
return False
|
|
552
|
+
|
|
553
|
+
return True
|
|
554
|
+
|
|
555
|
+
# end method definition
|
|
556
|
+
|
|
557
|
+
def load_csv_data(self, csv_path: str) -> bool:
|
|
558
|
+
"""Load CSV (Comma separated values) data into DataFrame
|
|
559
|
+
|
|
560
|
+
Args:
|
|
561
|
+
csv_path (str): Path to the CSV file.
|
|
562
|
+
Returns:
|
|
563
|
+
            bool: False in case an error occurred, True otherwise.
|
|
564
|
+
"""
|
|
565
|
+
|
|
566
|
+
if csv_path is not None and os.path.exists(csv_path):
|
|
567
|
+
# Load data from CSV file
|
|
568
|
+
try:
|
|
569
|
+
df = pd.read_csv(csv_path)
|
|
570
|
+
if self._df is None:
|
|
571
|
+
self._df = df
|
|
572
|
+
else:
|
|
573
|
+
self._df = pd.concat([self._df, df])
|
|
574
|
+
except FileNotFoundError:
|
|
575
|
+
logger.error(
|
|
576
|
+
"File -> '%s' not found. Please check the file path.", csv_path
|
|
577
|
+
)
|
|
578
|
+
return False
|
|
579
|
+
except PermissionError:
|
|
580
|
+
logger.error("Permission denied to access the file -> %s.", csv_path)
|
|
581
|
+
return False
|
|
582
|
+
except IOError as e:
|
|
583
|
+
logger.error("An I/O error occurred -> %s", str(e))
|
|
584
|
+
return False
|
|
585
|
+
except ValueError as e:
|
|
586
|
+
logger.error("Invalid CSV input -> %s", str(e))
|
|
587
|
+
return False
|
|
588
|
+
except AttributeError as e:
|
|
589
|
+
logger.error("Unexpected data structure -> %s", str(e))
|
|
590
|
+
return False
|
|
591
|
+
except TypeError as e:
|
|
592
|
+
logger.error("Unexpected data type -> %s", str(e))
|
|
593
|
+
return False
|
|
594
|
+
except KeyError as e:
|
|
595
|
+
logger.error("Missing key in CSV data -> %s", str(e))
|
|
596
|
+
return False
|
|
597
|
+
|
|
598
|
+
else:
|
|
599
|
+
logger.error(
|
|
600
|
+
"Missing CSV file -> '%s' you have not specified a valid path!",
|
|
601
|
+
csv_path,
|
|
602
|
+
)
|
|
603
|
+
return False
|
|
604
|
+
return True
|
|
605
|
+
|
|
606
|
+
# end method definition
|
|
607
|
+
|
|
608
|
+
def load_xml_data(
|
|
609
|
+
self, xml_path: str, xpath: str | None = None, xslt_path: str | None = None
|
|
610
|
+
) -> bool:
|
|
611
|
+
"""Load XML data into DataFrame
|
|
612
|
+
|
|
613
|
+
Args:
|
|
614
|
+
xml_path (str): Path to the XML file.
|
|
615
|
+
xpath (str, optional): XPath to the elements we want to select
|
|
616
|
+
xslt_path (str, optional): XSLT transformation file
|
|
617
|
+
Returns:
|
|
618
|
+
            bool: False in case an error occurred, True otherwise.
|
|
619
|
+
"""
|
|
620
|
+
|
|
621
|
+
try:
|
|
622
|
+
df = pd.read_xml(path_or_buffer=xml_path, xpath=xpath, stylesheet=xslt_path)
|
|
623
|
+
# Process the loaded data as needed
|
|
624
|
+
if self._df is None:
|
|
625
|
+
self._df = df
|
|
626
|
+
else:
|
|
627
|
+
self._df = pd.concat([self._df, df])
|
|
628
|
+
logger.info("XML file loaded successfully!")
|
|
629
|
+
return True
|
|
630
|
+
except FileNotFoundError:
|
|
631
|
+
print("File not found.")
|
|
632
|
+
return False
|
|
633
|
+
except PermissionError:
|
|
634
|
+
logger.error("Permission denied to access the file -> %s.", xml_path)
|
|
635
|
+
return False
|
|
636
|
+
except IOError as e:
|
|
637
|
+
logger.error("An I/O error occurred -> %s", str(e))
|
|
638
|
+
return False
|
|
639
|
+
except ValueError as e:
|
|
640
|
+
logger.error("Invalid CSV input -> %s", str(e))
|
|
641
|
+
return False
|
|
642
|
+
except AttributeError as e:
|
|
643
|
+
logger.error("Unexpected data structure -> %s", str(e))
|
|
644
|
+
return False
|
|
645
|
+
except TypeError as e:
|
|
646
|
+
logger.error("Unexpected data type -> %s", str(e))
|
|
647
|
+
return False
|
|
648
|
+
except KeyError as e:
|
|
649
|
+
logger.error("Missing key in CSV data -> %s", str(e))
|
|
650
|
+
return False
|
|
651
|
+
|
|
652
|
+
# end method definition
|
|
653
|
+
|
|
654
|
+
def load_directory(self, path_to_root: str) -> bool:
|
|
655
|
+
"""Load directory structure into Pandas Data Frame
|
|
656
|
+
|
|
657
|
+
Args:
|
|
658
|
+
path_to_root (str): Path to the root element of the
|
|
659
|
+
directory structure
|
|
660
|
+
|
|
661
|
+
Returns:
|
|
662
|
+
bool: True = Success, False = Failure
|
|
663
|
+
"""
|
|
664
|
+
|
|
665
|
+
try:
|
|
666
|
+
# Check if the provided path is a directory
|
|
667
|
+
if not os.path.isdir(path_to_root):
|
|
668
|
+
logger.error(
|
|
669
|
+
"The provided path -> '%s' is not a valid directory.", path_to_root
|
|
670
|
+
)
|
|
671
|
+
return False
|
|
672
|
+
|
|
673
|
+
# Initialize a list to hold file information
|
|
674
|
+
data = []
|
|
675
|
+
|
|
676
|
+
# Walk through the directory
|
|
677
|
+
for root, _, files in os.walk(path_to_root):
|
|
678
|
+
for file in files:
|
|
679
|
+
file_path = os.path.join(root, file)
|
|
680
|
+
file_size = os.path.getsize(file_path)
|
|
681
|
+
relative_path = os.path.relpath(file_path, path_to_root)
|
|
682
|
+
path_parts = relative_path.split(os.sep)
|
|
683
|
+
|
|
684
|
+
# Create a dictionary with the path parts and file details
|
|
685
|
+
entry = {
|
|
686
|
+
"level {}".format(i): part
|
|
687
|
+
for i, part in enumerate(path_parts[:-1], start=1)
|
|
688
|
+
}
|
|
689
|
+
entry.update({"filename": path_parts[-1], "size": file_size})
|
|
690
|
+
data.append(entry)
|
|
691
|
+
|
|
692
|
+
# Create DataFrame from list of dictionaries
|
|
693
|
+
self._df = pd.DataFrame(data)
|
|
694
|
+
|
|
695
|
+
# Determine the maximum number of levels
|
|
696
|
+
max_levels = max((len(entry) - 2 for entry in data), default=0)
|
|
697
|
+
|
|
698
|
+
# Ensure all entries have the same number of levels
|
|
699
|
+
for entry in data:
|
|
700
|
+
for i in range(1, max_levels + 1):
|
|
701
|
+
entry.setdefault("level {}".format(i), "")
|
|
702
|
+
|
|
703
|
+
# Convert to DataFrame again to make sure all columns are consistent
|
|
704
|
+
self._df = pd.DataFrame(data)
|
|
705
|
+
|
|
706
|
+
        except NotADirectoryError as nde:
            logger.error("Error -> %s", str(nde))
        except FileNotFoundError as fnfe:
            logger.error("Error -> %s", str(fnfe))
        except PermissionError as pe:
            logger.error("Error -> %s", str(pe))
|
|
712
|
+
|
|
713
|
+
return True
|
|
714
|
+
|
|
715
|
+
# end method definition
|
|
716
|
+
|
|
717
|
+
def load_xml_directory(self, path_to_root: str, xpath: str | None = None) -> bool:
|
|
718
|
+
"""Load directory structure into Pandas Data Frame
|
|
719
|
+
|
|
720
|
+
Args:
|
|
721
|
+
path_to_root (str): Path to the root element of the
|
|
722
|
+
directory structure
|
|
723
|
+
xpath (str, optional): XPath to the elements we want to select
|
|
724
|
+
|
|
725
|
+
Returns:
|
|
726
|
+
bool: True = Success, False = Failure
|
|
727
|
+
"""
|
|
728
|
+
|
|
729
|
+
try:
|
|
730
|
+
# Check if the provided path is a directory
|
|
731
|
+
if not os.path.isdir(path_to_root):
|
|
732
|
+
logger.error(
|
|
733
|
+
"The provided path -> '%s' is not a valid directory.", path_to_root
|
|
734
|
+
)
|
|
735
|
+
return False
|
|
736
|
+
|
|
737
|
+
# Walk through the directory
|
|
738
|
+
for root, _, files in os.walk(path_to_root):
|
|
739
|
+
for file in files:
|
|
740
|
+
file_path = os.path.join(root, file)
|
|
741
|
+
file_size = os.path.getsize(file_path)
|
|
742
|
+
file_name = os.path.basename(file_path)
|
|
743
|
+
|
|
744
|
+
if file_name == "docovw.xml":
|
|
745
|
+
logger.info(
|
|
746
|
+
"Load XML file -> '%s' of size -> %s", file_path, file_size
|
|
747
|
+
)
|
|
748
|
+
success = self.load_xml_data(file_path, xpath=xpath)
|
|
749
|
+
if success:
|
|
750
|
+
logger.info(
|
|
751
|
+
"Successfully loaded XML file -> '%s'", file_path
|
|
752
|
+
)
|
|
753
|
+
|
|
754
|
+
except NotADirectoryError as nde:
|
|
755
|
+
logger.error("Error -> %s", str(nde))
|
|
756
|
+
except FileNotFoundError as fnfe:
|
|
757
|
+
logger.error("Error -> %s", str(fnfe))
|
|
758
|
+
except PermissionError as pe:
|
|
759
|
+
logger.error("Error -> %s", str(pe))
|
|
760
|
+
|
|
761
|
+
return True
|
|
762
|
+
|
|
763
|
+
# end method definition
|
|
764
|
+
|
|
765
|
+
def partitionate(self, number: int) -> list:
|
|
766
|
+
"""Partition a data frame into equally sized
|
|
767
|
+
partions
|
|
768
|
+
|
|
769
|
+
Args:
|
|
770
|
+
number (int): Number of partitions
|
|
771
|
+
|
|
772
|
+
Returns:
|
|
773
|
+
list: List of partitions
|
|
774
|
+
"""
|
|
775
|
+
|
|
776
|
+
# Calculate the approximate size of each partition
|
|
777
|
+
size = len(self._df)
|
|
778
|
+
|
|
779
|
+
if size >= number:
|
|
780
|
+
partition_size = size // number
|
|
781
|
+
remainder = size % number
|
|
782
|
+
else:
|
|
783
|
+
partition_size = size
|
|
784
|
+
number = 1
|
|
785
|
+
remainder = 0
|
|
786
|
+
|
|
787
|
+
logger.info(
|
|
788
|
+
"Data set has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
|
|
789
|
+
str(size),
|
|
790
|
+
str(number),
|
|
791
|
+
str(partition_size),
|
|
792
|
+
str(remainder),
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
# Initialize a list to store partitions
|
|
796
|
+
partitions = []
|
|
797
|
+
start_index = 0
|
|
798
|
+
|
|
799
|
+
# Slice the DataFrame into equally sized partitions
|
|
800
|
+
for i in range(number):
|
|
801
|
+
# start_index = i * partition_size
|
|
802
|
+
# end_index = (i + 1) * partition_size if i < number - 1 else None
|
|
803
|
+
# partition = self._df.iloc[start_index:end_index]
|
|
804
|
+
# partitions.append(partition)
|
|
805
|
+
# Calculate the end index for this partition
|
|
806
|
+
end_index = start_index + partition_size + (1 if i < remainder else 0)
|
|
807
|
+
partition = self._df.iloc[start_index:end_index]
|
|
808
|
+
partitions.append(partition)
|
|
809
|
+
start_index = end_index
|
|
810
|
+
|
|
811
|
+
return partitions
|
|
812
|
+
|
|
813
|
+
# end method definition
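A quick sketch of partitionate(), e.g. to prepare chunks for parallel bulk processing; the rows are made up:

    data = Data([{"id": i} for i in range(10)])
    partitions = data.partitionate(3)
    print([len(p) for p in partitions])  # [4, 3, 3] - a list of pandas DataFrames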
|
|
814
|
+
|
|
815
|
+
def partitionate_by_column(self, column_name: str) -> list | None:
|
|
816
|
+
"""Partition a data frame based on equal values in a specified column.
|
|
817
|
+
|
|
818
|
+
Args:
|
|
819
|
+
column_name (str): The column name to partition by
|
|
820
|
+
|
|
821
|
+
Returns:
|
|
822
|
+
list: List of partitions
|
|
823
|
+
"""
|
|
824
|
+
|
|
825
|
+
if column_name not in self._df.columns:
|
|
826
|
+
logger.error(
|
|
827
|
+
"Column -> '%s' does not exist in the Data Frame. Data Frame has these columns -> %s",
|
|
828
|
+
column_name,
|
|
829
|
+
str(self._df.columns),
|
|
830
|
+
)
|
|
831
|
+
return None
|
|
832
|
+
|
|
833
|
+
# Separate rows with NaN or None values in the specified column
|
|
834
|
+
nan_partitions = self._df[self._df[column_name].isna()]
|
|
835
|
+
non_nan_df = self._df.dropna(subset=[column_name])
|
|
836
|
+
|
|
837
|
+
# Group by the specified column and create a list of DataFrames for each group
|
|
838
|
+
grouped = non_nan_df.groupby(column_name)
|
|
839
|
+
partitions = [group for _, group in grouped]
|
|
840
|
+
|
|
841
|
+
# Add each row with NaN or None values as its own partition
|
|
842
|
+
for i in range(len(nan_partitions)):
|
|
843
|
+
partitions.append(nan_partitions.iloc[[i]])
|
|
844
|
+
|
|
845
|
+
logger.info(
|
|
846
|
+
"Data Frame has been partitioned into -> %s partitions based on the values in column '%s'...",
|
|
847
|
+
str(len(partitions)),
|
|
848
|
+
column_name,
|
|
849
|
+
)
|
|
850
|
+
|
|
851
|
+
return partitions
|
|
852
|
+
|
|
853
|
+
# end method definition
|
|
854
|
+
|
|
855
|
+
def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
|
|
856
|
+
"""Remove dupclicate rows that have all fields in
|
|
857
|
+
unique_fields in common.
|
|
858
|
+
|
|
859
|
+
Args:
|
|
860
|
+
unique_fields (list): Defines the fields for which we want a unique
|
|
861
|
+
combination.
|
|
862
|
+
inplace (bool, optional): True if the deduplication happens in-place.
|
|
863
|
+
Defaults to True.
|
|
864
|
+
Returns:
|
|
865
|
+
            pd.DataFrame | None: If inplace is False then a new deduplicated DataFrame
                                 is returned. Otherwise the object is modified in place
                                 and self._df is returned.
|
|
868
|
+
"""
|
|
869
|
+
|
|
870
|
+
if inplace:
|
|
871
|
+
self._df.drop_duplicates(subset=unique_fields, inplace=True)
|
|
872
|
+
self._df.reset_index(drop=True, inplace=True)
|
|
873
|
+
return self._df
|
|
874
|
+
else:
|
|
875
|
+
df = self._df.drop_duplicates(subset=unique_fields, inplace=False)
|
|
876
|
+
df = df.reset_index(drop=True, inplace=False)
|
|
877
|
+
return df
|
|
878
|
+
|
|
879
|
+
# end method definition
|
|
880
|
+
|
|
881
|
+
def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
|
|
882
|
+
"""Sort the data frame based on one or multiple fields -
|
|
883
|
+
either in place or return it as a new data frame (e.g. not modifying self._df)
|
|
884
|
+
|
|
885
|
+
Args:
|
|
886
|
+
sort_fields (list): Columns / fields to be used for sorting
|
|
887
|
+
inplace (bool, optional): If the sorting should be inplace, i.e. modifying self._df.
|
|
888
|
+
Defaults to True.
|
|
889
|
+
Returns:
|
|
890
|
+
pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
|
|
891
|
+
"""
|
|
892
|
+
|
|
893
|
+
if self._df is None:
|
|
894
|
+
return None
|
|
895
|
+
|
|
896
|
+
if not all(sort_field in self._df.columns for sort_field in sort_fields):
|
|
897
|
+
logger.warning(
|
|
898
|
+
"Not all of the given sort fields -> %s do exist in the Data Frame.",
|
|
899
|
+
str(sort_fields),
|
|
900
|
+
)
|
|
901
|
+
# Reduce the sort fields to those that really exist in the DataFrame:
|
|
902
|
+
sort_fields = [
|
|
903
|
+
sort_field
|
|
904
|
+
for sort_field in sort_fields
|
|
905
|
+
if sort_field in self._df.columns
|
|
906
|
+
]
|
|
907
|
+
logger.warning(
|
|
908
|
+
"Only these given sort fields -> %s do exist as columns in the Data Frame.",
|
|
909
|
+
str(sort_fields),
|
|
910
|
+
)
|
|
911
|
+
|
|
912
|
+
if inplace:
|
|
913
|
+
self._df.sort_values(by=sort_fields, inplace=True)
|
|
914
|
+
self._df.reset_index(drop=True, inplace=True)
|
|
915
|
+
return self._df
|
|
916
|
+
else:
|
|
917
|
+
df = self._df.sort_values(by=sort_fields, inplace=False)
|
|
918
|
+
df = df.reset_index(drop=True, inplace=False)
|
|
919
|
+
return df
|
|
920
|
+
|
|
921
|
+
# end method definition
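A combined sketch of deduplicate() and sort() on invented rows:

    data = Data(
        [
            {"name": "foo", "id": 2},
            {"name": "foo", "id": 2},
            {"name": "bar", "id": 1},
        ]
    )
    data.deduplicate(unique_fields=["name", "id"])  # drops the duplicated "foo" row
    data.sort(sort_fields=["id"])                   # "bar" (id 1) now comes first
    print(len(data))  # 2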
|
|
922
|
+
|
|
923
|
+
def flatten(
|
|
924
|
+
self,
|
|
925
|
+
parent_field: str,
|
|
926
|
+
flatten_fields: list,
|
|
927
|
+
):
|
|
928
|
+
"""Flatten a sub-dictionary by copying selected fields to the
|
|
929
|
+
parent dictionary. This is e.g. useful for then de-duplicate
|
|
930
|
+
a data set.
|
|
931
|
+
|
|
932
|
+
Args:
|
|
933
|
+
parent_field (str): name of the field in the parent dictionary
|
|
934
|
+
flatten_fields (list): fields in the sub-dictionary to copy
|
|
935
|
+
into the parent dictionary.
|
|
936
|
+
"""
|
|
937
|
+
|
|
938
|
+
for flatten_field in flatten_fields:
|
|
939
|
+
flat_field = parent_field + "_" + flatten_field
|
|
940
|
+
# The following expression generates a new column in the
|
|
941
|
+
# data frame with the name of 'flat_field'.
|
|
942
|
+
            # In the lambda function, x is a dictionary that includes the subvalues
|
|
943
|
+
# and it returns the value of the given flatten field
|
|
944
|
+
# (if it exists, otherwise None). So x is self._df[parent_field], i.e.
|
|
945
|
+
# what the lambda function gets 'applied' on.
|
|
946
|
+
self._df[flat_field] = self._df[parent_field].apply(
|
|
947
|
+
lambda x, sub_field=flatten_field: (
|
|
948
|
+
x.get(sub_field, None) if isinstance(x, dict) else None
|
|
949
|
+
)
|
|
950
|
+
)
|
|
951
|
+
|
|
952
|
+
# end method definition
|
|
953
|
+
|
|
954
|
+
def explode_and_flatten(
|
|
955
|
+
self,
|
|
956
|
+
explode_field: str | list,
|
|
957
|
+
flatten_fields: list | None = None,
|
|
958
|
+
make_unique: bool = False,
|
|
959
|
+
reset_index: bool = False,
|
|
960
|
+
split_string_to_list: bool = False,
|
|
961
|
+
) -> pd.DataFrame:
|
|
962
|
+
"""Explode a substructure in the Data Frame
|
|
963
|
+
|
|
964
|
+
Args:
|
|
965
|
+
explode_field (str | list): Field(s) to explode which each has/have a list structure.
|
|
966
|
+
Exploding multiple columns at once is possible. This delivers
|
|
967
|
+
a very different result compared to exploding one column after
|
|
968
|
+
the other!
|
|
969
|
+
            flatten_fields (list): Fields in the exploded substructure to include
                                   in the main dictionaries for easier processing.
            make_unique (bool, optional): if True deduplicate the exploded data frame.
            reset_index (bool, optional): if True reset the index of the resulting data frame.
            split_string_to_list (bool, optional): if True split delimiter-separated string values
                                                   into lists before exploding.
|
|
973
|
+
Returns:
|
|
974
|
+
pd.DataFrame: Pointer to the Pandas DataFrame
|
|
975
|
+
"""
|
|
976
|
+
|
|
977
|
+
def update_column(row):
|
|
978
|
+
try:
|
|
979
|
+
if sub in row:
|
|
980
|
+
return row[sub]
|
|
981
|
+
except (IndexError, KeyError, ValueError):
|
|
982
|
+
return ""
|
|
983
|
+
|
|
984
|
+
# Define a function to split a string into a list
|
|
985
|
+
def string_to_list(string: str | None) -> list:
|
|
986
|
+
if not string or pd.isna(string):
|
|
987
|
+
return []
|
|
988
|
+
# Use regular expression to split by comma, semicolon, or comma followed by space
|
|
989
|
+
return re.split(r"[;,]\s*", str(string))
|
|
990
|
+
|
|
991
|
+
if isinstance(explode_field, list):
|
|
992
|
+
logger.info("Explode multiple columns -> %s", str(explode_field))
|
|
993
|
+
elif isinstance(explode_field, str):
|
|
994
|
+
logger.info("Explode single column -> '%s'", explode_field)
|
|
995
|
+
else:
|
|
996
|
+
logger.error(
|
|
997
|
+
"Illegal explode field(s) data type provided -> %s", type(explode_field)
|
|
998
|
+
)
|
|
999
|
+
return self._df
|
|
1000
|
+
|
|
1001
|
+
if split_string_to_list:
|
|
1002
|
+
# Apply the function to convert the 'string_column' values to lists
|
|
1003
|
+
self._df[explode_field] = self._df[explode_field].apply(string_to_list)
|
|
1004
|
+
|
|
1005
|
+
try:
|
|
1006
|
+
# remove the sub dictionary that sometimes is introduced by
|
|
1007
|
+
# XML loading
|
|
1008
|
+
if "." in explode_field:
|
|
1009
|
+
main = explode_field.split(".")[0]
|
|
1010
|
+
sub = explode_field.split(".")[1]
|
|
1011
|
+
self._df[main] = self._df[main].apply(update_column)
|
|
1012
|
+
explode_field = main
|
|
1013
|
+
# Explode the field that has list values
|
|
1014
|
+
self._df = self._df.explode(column=explode_field)
|
|
1015
|
+
except KeyError:
|
|
1016
|
+
logger.error("Column -> '%s' not found in Data Frame!", str(explode_field))
|
|
1017
|
+
except ValueError:
|
|
1018
|
+
logger.error(
|
|
1019
|
+
"Unable to explode the specified column -> '%s'!", str(explode_field)
|
|
1020
|
+
)
|
|
1021
|
+
|
|
1022
|
+
if flatten_fields:
|
|
1023
|
+
self.flatten(parent_field=explode_field, flatten_fields=flatten_fields)
|
|
1024
|
+
|
|
1025
|
+
if make_unique:
|
|
1026
|
+
self._df.drop_duplicates(subset=flatten_fields, inplace=True)
|
|
1027
|
+
|
|
1028
|
+
if reset_index:
|
|
1029
|
+
self._df.reset_index(inplace=True)
|
|
1030
|
+
|
|
1031
|
+
return self._df
|
|
1032
|
+
|
|
1033
|
+
# end method definition
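An illustrative explode_and_flatten() call; the 'members' substructure and its fields are assumptions for the sketch:

    data = Data(
        [
            {
                "workspace": "W1",
                "members": [{"name": "foo", "id": 1}, {"name": "bar", "id": 2}],
            }
        ]
    )
    # one row per member; members_name / members_id are copied up for easier processing
    data.explode_and_flatten(explode_field="members", flatten_fields=["name", "id"])
    print(len(data))  # 2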
|
|
1034
|
+
|
|
1035
|
+
def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
|
|
1036
|
+
"""Drop selected columns from the Data Frame
|
|
1037
|
+
|
|
1038
|
+
Args:
|
|
1039
|
+
column_names (list): list of column names to drop.
|
|
1040
|
+
inplace (bool, optional): If the dropping should be inplace, i.e. modifying self._df.
|
|
1041
|
+
Defaults to True.
|
|
1042
|
+
Returns:
|
|
1043
|
+
pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
|
|
1044
|
+
"""
|
|
1045
|
+
|
|
1046
|
+
if not all(column_name in self._df.columns for column_name in column_names):
|
|
1047
|
+
# Reduce the column names to those that really exist in the DataFrame:
|
|
1048
|
+
column_names = [
|
|
1049
|
+
column_name
|
|
1050
|
+
for column_name in column_names
|
|
1051
|
+
if column_name in self._df.columns
|
|
1052
|
+
]
|
|
1053
|
+
logger.warning(
|
|
1054
|
+
"Reduce to these columns -> %s that do exist in the Data Frame.",
|
|
1055
|
+
str(column_names),
|
|
1056
|
+
)
|
|
1057
|
+
|
|
1058
|
+
if inplace:
|
|
1059
|
+
self._df.drop(column_names, axis=1, inplace=True)
|
|
1060
|
+
return self._df
|
|
1061
|
+
else:
|
|
1062
|
+
df = self._df.drop(column_names, axis=1, inplace=False)
|
|
1063
|
+
return df
|
|
1064
|
+
|
|
1065
|
+
# end method definition
|
|
1066
|
+
|
|
1067
|
+
def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
|
|
1068
|
+
"""Keep only selected columns from the Data Frame. Drop the rest.
|
|
1069
|
+
|
|
1070
|
+
Args:
|
|
1071
|
+
column_names (list): list of column names to keep.
|
|
1072
|
+
inplace (bool, optional): If the keeping should be inplace, i.e. modifying self._df.
|
|
1073
|
+
Defaults to True.
|
|
1074
|
+
Returns:
|
|
1075
|
+
pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
|
|
1076
|
+
"""
|
|
1077
|
+
|
|
1078
|
+
if not all(column_name in self._df.columns for column_name in column_names):
|
|
1079
|
+
# Reduce the column names to those that really exist in the DataFrame:
|
|
1080
|
+
column_names = [
|
|
1081
|
+
column_name
|
|
1082
|
+
for column_name in column_names
|
|
1083
|
+
if column_name in self._df.columns
|
|
1084
|
+
]
|
|
1085
|
+
logger.warning(
|
|
1086
|
+
"Reduce to these columns -> %s that do exist in the Data Frame.",
|
|
1087
|
+
column_names,
|
|
1088
|
+
)
|
|
1089
|
+
|
|
1090
|
+
if inplace:
|
|
1091
|
+
# keep only those columns which are in column_names:
|
|
1092
|
+
if column_names != []:
|
|
1093
|
+
self._df = self._df[column_names]
|
|
1094
|
+
return self._df
|
|
1095
|
+
else:
|
|
1096
|
+
# keep only those columns which are in column_names:
|
|
1097
|
+
if column_names != []:
|
|
1098
|
+
df = self._df[column_names]
|
|
1099
|
+
return df
|
|
1100
|
+
return None
|
|
1101
|
+
|
|
1102
|
+
# end method definition
|
|
1103
|
+
|
|
1104
|
+
def cleanse(self, cleansings: dict):
|
|
1105
|
+
"""Cleanse data with regular expressions and upper/lower case conversion.
|
|
1106
|
+
|
|
1107
|
+
Args:
|
|
1108
|
+
cleansings (dict): Dictionary with keys that equal the column names.
|
|
1109
|
+
The dictionary values are dictionaries itself with
|
|
1110
|
+
these fields:
|
|
1111
|
+
                               * replacements (dict): dictionary of regular expression patterns (keys) and their replacement strings (values)
|
|
1112
|
+
* upper (bool): change the value to uppercase
|
|
1113
|
+
* lower (bool): change the value to lowercase
|
|
1114
|
+
Example:
|
|
1115
|
+
cleansings = {
|
|
1116
|
+
"airportName": {
|
|
1117
|
+
"upper": true
|
|
1118
|
+
"replacements" : {
|
|
1119
|
+
"-": " ", # replace hypen with space
|
|
1120
|
+
",\s*": " ", # remove commas followed by on or more spaces with a single space
|
|
1121
|
+
"\s+$": "", # remove trailing spaces at the end of the name
|
|
1122
|
+
"^\s+": "", # remove spaces at the beginning of the name
|
|
1123
|
+
}
|
|
1124
|
+
"length": 10
|
|
1125
|
+
}
|
|
1126
|
+
"airportId": {
|
|
1127
|
+
"upper": true
|
|
1128
|
+
"replacements" : {
|
|
1129
|
+
"K(.{3})": "\1", # if the airport has 4 charters and starts with a 'K' we remove the 'K'
|
|
1130
|
+
"\/": "", # remove forward slashes - this helps to have consistency with N/A, NA, n/a, na
|
|
1131
|
+
}
|
|
1132
|
+
}
|
|
1133
|
+
}
|
|
1134
|
+
"""
|
|
1135
|
+
|
|
1136
|
+
# Iterate over each column in regex_dict
|
|
1137
|
+
for column, cleansing in cleansings.items():
|
|
1138
|
+
# "colum" is the name of the field we want to cleanse.
|
|
1139
|
+
# "cleansing" is a dict with
|
|
1140
|
+
if "." in column:
|
|
1141
|
+
# Handle columns with subfields
|
|
1142
|
+
main_field, sub_field = column.split(".")
|
|
1143
|
+
if not main_field in self._df.columns:
|
|
1144
|
+
continue
|
|
1145
|
+
# we use the additional parameters for lambda (beside x)
|
|
1146
|
+
# to avoid linter warning W0640
|
|
1147
|
+
self._df[main_field] = self._df[main_field].apply(
|
|
1148
|
+
lambda x, sub_field=sub_field, cleansing=cleansing: self._cleanse_subfield(
|
|
1149
|
+
data=x,
|
|
1150
|
+
sub_field=sub_field,
|
|
1151
|
+
replacements=cleansing.get("replacements", {}),
|
|
1152
|
+
upper=cleansing.get("upper", False),
|
|
1153
|
+
lower=cleansing.get("lower", False),
|
|
1154
|
+
length=cleansing.get("length", 0),
|
|
1155
|
+
)
|
|
1156
|
+
)
|
|
1157
|
+
else:
|
|
1158
|
+
if not column in self._df.columns:
|
|
1159
|
+
continue
|
|
1160
|
+
|
|
1161
|
+
logger.debug("\nBEFORE:\n%s\n", self._df[column])
|
|
1162
|
+
|
|
1163
|
+
if cleansing.get("upper", False) and self._df[column].dtype == "object":
|
|
1164
|
+
self._df[column] = self._df[column].str.upper()
|
|
1165
|
+
if cleansing.get("lower", False) and self._df[column].dtype == "object":
|
|
1166
|
+
self._df[column] = self._df[column].str.lower()
|
|
1167
|
+
|
|
1168
|
+
# Handle regular columns
|
|
1169
|
+
for regex_pattern, replacement in cleansing.get(
|
|
1170
|
+
"replacements", {}
|
|
1171
|
+
).items():
|
|
1172
|
+
# if replacement:
|
|
1173
|
+
# \b is a word boundary anchor in regular expressions.
|
|
1174
|
+
# It matches a position where one side is a word character
|
|
1175
|
+
# (like a letter or digit) and the other side is a non-word character
|
|
1176
|
+
# (like whitespace or punctuation). It's often used to match whole words.
|
|
1177
|
+
# regex_pattern = rf"\b{regex_pattern}\b"
|
|
1178
|
+
# self._df[column] = self._df[column].replace(
|
|
1179
|
+
# regex=regex_pattern, value=replacement
|
|
1180
|
+
# )
|
|
1181
|
+
self._df[column] = self._df[column].str.replace(
|
|
1182
|
+
pat=regex_pattern, repl=replacement, regex=True
|
|
1183
|
+
)
|
|
1184
|
+
|
|
1185
|
+
if (
|
|
1186
|
+
cleansing.get("length", 0) > 0
|
|
1187
|
+
and self._df[column].dtype == "object"
|
|
1188
|
+
):
|
|
1189
|
+
self._df[column] = self._df[column].str.slice(
|
|
1190
|
+
0, cleansing["length"]
|
|
1191
|
+
)
|
|
1192
|
+
|
|
1193
|
+
logger.debug("\nAFTER:\n%s\n", self._df[column])
|
|
1194
|
+
|
|
1195
|
+
# end method definition
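A small cleanse() sketch along the lines of the docstring example; column names and sample values are invented:

    data = Data([{"airportName": " Munich Airport ", "airportId": "kmuc"}])
    data.cleanse(
        cleansings={
            "airportName": {
                "upper": True,
                "replacements": {r"^\s+": "", r"\s+$": ""},
            },
            "airportId": {"upper": True},
        }
    )
    # airportName -> "MUNICH AIRPORT", airportId -> "KMUC"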
|
|
1196
|
+
|
|
1197
|
+
def _cleanse_subfield(
|
|
1198
|
+
self,
|
|
1199
|
+
data: list | dict,
|
|
1200
|
+
sub_field: str,
|
|
1201
|
+
replacements: dict,
|
|
1202
|
+
upper: bool,
|
|
1203
|
+
lower: bool,
|
|
1204
|
+
length: int = 0,
|
|
1205
|
+
) -> list | dict:
|
|
1206
|
+
"""Helper function to cleanse subfield data
|
|
1207
|
+
|
|
1208
|
+
Args:
|
|
1209
|
+
data (list | dict): sub data - either a list of dictionaries or a dictionary
|
|
1210
|
+
sub_field (str): defines which field in the sub data should be updated
|
|
1211
|
+
            replacements (dict): Dictionary of regular expression patterns and replacement strings
|
|
1212
|
+
upper (bool): if True transform value in subfield to upper-case
|
|
1213
|
+
lower (bool): if True, transform value in subfield to lower-case
|
|
1214
|
+
length (int, optional): maximum length of the strings
|
|
1215
|
+
Returns:
|
|
1216
|
+
list | dict: Updated data
|
|
1217
|
+
"""
|
|
1218
|
+
|
|
1219
|
+
if isinstance(data, list):
|
|
1220
|
+
# If data is a list, apply cleansing to each dictionary in the list
|
|
1221
|
+
for i, item in enumerate(data):
|
|
1222
|
+
if (
|
|
1223
|
+
item is not None
|
|
1224
|
+
and sub_field in item
|
|
1225
|
+
and not pd.isnull(item[sub_field])
|
|
1226
|
+
):
|
|
1227
|
+
if upper:
|
|
1228
|
+
item[sub_field] = item[sub_field].upper()
|
|
1229
|
+
elif lower:
|
|
1230
|
+
item[sub_field] = item[sub_field].lower()
|
|
1231
|
+
for regex_pattern, replacement in replacements.items():
|
|
1232
|
+
if replacement:
|
|
1233
|
+
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1234
|
+
item[sub_field] = re.sub(
|
|
1235
|
+
regex_pattern, replacement, item[sub_field]
|
|
1236
|
+
)
|
|
1237
|
+
if length > 0:
|
|
1238
|
+
item[sub_field] = item[sub_field][:length]
|
|
1239
|
+
data[i] = item
|
|
1240
|
+
elif isinstance(data, dict):
|
|
1241
|
+
# If data is a dictionary, apply cleansing directly to the subfield
|
|
1242
|
+
if sub_field in data and not pd.isnull(data[sub_field]):
|
|
1243
|
+
if upper:
|
|
1244
|
+
data[sub_field] = data[sub_field].upper()
|
|
1245
|
+
elif lower:
|
|
1246
|
+
data[sub_field] = data[sub_field].lower()
|
|
1247
|
+
for regex_pattern, replacement in replacements.items():
|
|
1248
|
+
if replacement:
|
|
1249
|
+
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1250
|
+
data[sub_field] = re.sub(
|
|
1251
|
+
regex_pattern, replacement, data[sub_field]
|
|
1252
|
+
)
|
|
1253
|
+
if length > 0:
|
|
1254
|
+
data[sub_field] = data[sub_field][:length]
|
|
1255
|
+
return data
|
|
1256
|
+
|
|
1257
|
+
# end method definition
|
|
1258
|
+
|
|
1259
|
+
def filter(self, conditions: list, inplace: bool = True) -> pd.DataFrame:
|
|
1260
|
+
"""Filter the DataFrame based on (multiple) conditions.
|
|
1261
|
+
|
|
1262
|
+
Args:
|
|
1263
|
+
conditions (list): Conditions are a list of dictionaries with 3 items:
|
|
1264
|
+
* field (str): name of a column in the data frame
|
|
1265
|
+
                               * value (str or list): expected value (filter criterion).
|
|
1266
|
+
If it is a list then one of
|
|
1267
|
+
the list elements must match the field value (OR)
|
|
1268
|
+
* regex (bool): this flag controls if the value is interpreted as a
|
|
1269
|
+
regular expression. If there is no regex item in the
|
|
1270
|
+
dictionary then the default is False (= values is NOT regex).
|
|
1271
|
+
If there are multiple conditions in the list each has to evaluate to True (AND)
|
|
1272
|
+
inplace (bool, optional): Defines if the self._df is modified (inplace) or just
|
|
1273
|
+
a new DataFrame is returned. Defaults to True.
|
|
1274
|
+
Returns:
|
|
1275
|
+
pd.DataFrame: new data frame or pointer to self._df (depending on the value of 'inplace')
|
|
1276
|
+
"""
|
|
1277
|
+
|
|
1278
|
+
if self._df is None:
|
|
1279
|
+
logger.error("DataFrame is not initialized.")
|
|
1280
|
+
return None
|
|
1281
|
+
|
|
1282
|
+
if self._df.empty:
|
|
1283
|
+
logger.error("DataFrame is empty.")
|
|
1284
|
+
return None
|
|
1285
|
+
|
|
1286
|
+
        # first filtered_df is the full DataFrame.
        # then it is successively reduced by each condition,
        # at the end it is just those rows that match all conditions.
|
|
1289
|
+
filtered_df = self._df
|
|
1290
|
+
|
|
1291
|
+
# We traverse a list of conditions. Each condition must evaluate to true
|
|
1292
|
+
# otherwise the current workspace or document (i.e. the data set for these objects)
|
|
1293
|
+
        # will be skipped. The variable filtered_df holds the rows that still match.
|
|
1294
|
+
for condition in conditions:
|
|
1295
|
+
field = condition.get("field", None)
|
|
1296
|
+
if not field:
|
|
1297
|
+
logger.error("Missing value for filter condition field in payload!")
|
|
1298
|
+
continue
|
|
1299
|
+
if field not in self._df.columns:
|
|
1300
|
+
logger.warning(
|
|
1301
|
+
"Filter condition field -> %s does not exist as column in data frame! Data frame has these columns -> %s",
|
|
1302
|
+
field,
|
|
1303
|
+
str(self._df.columns),
|
|
1304
|
+
)
|
|
1305
|
+
continue # Skip filtering for columns not present in DataFrame
|
|
1306
|
+
value = condition.get("value", None)
|
|
1307
|
+
if not value:
|
|
1308
|
+
logger.error(
|
|
1309
|
+
"Missing filter value of for filter condition field -> '%s'!", field
|
|
1310
|
+
)
|
|
1311
|
+
continue
|
|
1312
|
+
regex = condition.get("regex", False)
|
|
1313
|
+
|
|
1314
|
+
logger.info(
|
|
1315
|
+
"Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
|
|
1316
|
+
filtered_df.shape[0],
|
|
1317
|
+
filtered_df.shape[1],
|
|
1318
|
+
str(condition),
|
|
1319
|
+
)
|
|
1320
|
+
|
|
1321
|
+
filtered_dfs = []
|
|
1322
|
+
|
|
1323
|
+
# if a single string is passed as value we put
|
|
1324
|
+
# it into an 1-item list to simplify the following code:
|
|
1325
|
+
if not isinstance(value, list):
|
|
1326
|
+
value = [value]
|
|
1327
|
+
|
|
1328
|
+
# multiple values are treated like a logical "or" condition
|
|
1329
|
+
for value_item in value:
|
|
1330
|
+
if regex:
|
|
1331
|
+
filtered_dfs.append(
|
|
1332
|
+
filtered_df[
|
|
1333
|
+
~filtered_df[field].isna()
|
|
1334
|
+
& filtered_df[field].str.contains(value_item, regex=True)
|
|
1335
|
+
]
|
|
1336
|
+
)
|
|
1337
|
+
else:
|
|
1338
|
+
result_df = filtered_df[
|
|
1339
|
+
~filtered_df[field].isna() & filtered_df[field].eq(value_item)
|
|
1340
|
+
]
|
|
1341
|
+
if not result_df.empty:
|
|
1342
|
+
filtered_dfs.append(result_df)
|
|
1343
|
+
# end for values
|
|
1344
|
+
|
|
1345
|
+
if not filtered_dfs:
|
|
1346
|
+
logger.warning(
|
|
1347
|
+
"Filter with field -> '%s' and value -> '%s' delivered an empty Data Frame",
|
|
1348
|
+
field,
|
|
1349
|
+
str(value),
|
|
1350
|
+
)
|
|
1351
|
+
filtered_df.drop(filtered_df.index, inplace=True)
|
|
1352
|
+
else:
|
|
1353
|
+
# Concatenate the filtered DataFrames for each value in the list
|
|
1354
|
+
filtered_df = pd.concat(filtered_dfs, ignore_index=True)
|
|
1355
|
+
|
|
1356
|
+
logger.info(
|
|
1357
|
+
"Data Frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
|
|
1358
|
+
filtered_df.shape[0],
|
|
1359
|
+
filtered_df.shape[1],
|
|
1360
|
+
str(condition),
|
|
1361
|
+
)
|
|
1362
|
+
# end for condition
|
|
1363
|
+
|
|
1364
|
+
if inplace:
|
|
1365
|
+
self._df = filtered_df
|
|
1366
|
+
|
|
1367
|
+
return filtered_df
|
|
1368
|
+
|
|
1369
|
+
# end method definition
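An illustrative filter() call with one exact-match and one regular-expression condition (AND-combined); the rows are made up:

    data = Data(
        [
            {"type": "document", "name": "Invoice 4711"},
            {"type": "workspace", "name": "Customer A"},
        ]
    )
    data.filter(
        conditions=[
            {"field": "type", "value": "document"},
            {"field": "name", "value": r"^Invoice", "regex": True},
        ]
    )
    print(len(data))  # 1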
|
|
1370
|
+
|
|
1371
|
+
def fill_na_in_column(self, column_name: str, default_value: str | int):
|
|
1372
|
+
"""Replace NA values in a column with a defined new default value
|
|
1373
|
+
|
|
1374
|
+
Args:
|
|
1375
|
+
column_name (str): name of the column in the DataFrame
|
|
1376
|
+
default_value (str | int): value to replace NA with
|
|
1377
|
+
"""
|
|
1378
|
+
|
|
1379
|
+
if column_name in self._df.columns:
|
|
1380
|
+
self._df[column_name] = self._df[column_name].fillna(value=default_value)
|
|
1381
|
+
else:
|
|
1382
|
+
logger.error(
|
|
1383
|
+
"Cannot replace NA values as column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
|
|
1384
|
+
column_name,
|
|
1385
|
+
str(self._df.columns),
|
|
1386
|
+
)
|
|
1387
|
+
|
|
1388
|
+
# end method definition
|
|
1389
|
+
|
|
1390
|
+
def fill_forward(self, inplace: bool) -> pd.DataFrame:
|
|
1391
|
+
"""Fill the missing cells appropriately by carrying forward
|
|
1392
|
+
the values from the previous rows where necessary.
|
|
1393
|
+
This has applications if a hierarchy is represented by
|
|
1394
|
+
nested cells e.g. in an Excel sheet.
|
|
1395
|
+
|
|
1396
|
+
Args:
|
|
1397
|
+
inplace (bool): Should the modification happen inplace or not.
|
|
1398
|
+
|
|
1399
|
+
Returns:
|
|
1400
|
+
pd.DataFrame: Resulting dataframe
|
|
1401
|
+
"""
|
|
1402
|
+
|
|
1403
|
+
# To convert an Excel representation of a folder structure with nested
|
|
1404
|
+
# columns into a format appropriate for Pandas,
|
|
1405
|
+
# where all cells should be filled
|
|
1406
|
+
        df_filled = self._df.ffill(inplace=inplace)

        # ffill() returns None when inplace=True, so return the modified frame itself in that case
        return self._df if inplace else df_filled
|
|
1409
|
+
|
|
1410
|
+
# end method definition
|
|
1411
|
+
|
|
1412
|
+
def lookup_value(
|
|
1413
|
+
self, lookup_column: str, lookup_value: str, separator: str = "|"
|
|
1414
|
+
) -> pd.Series | None:
|
|
1415
|
+
"""Lookup a row that includes a lookup value in the value of a given column.
|
|
1416
|
+
|
|
1417
|
+
Args:
|
|
1418
|
+
lookup_column (str): name of the column to search in
|
|
1419
|
+
lookup_value (str): value to search for
|
|
1420
|
+
separator (str): string list delimiter / separator
|
|
1421
|
+
|
|
1422
|
+
Returns:
|
|
1423
|
+
pd.Series | None: data frame row that matches or None if no match was found.
|
|
1424
|
+
"""
|
|
1425
|
+
|
|
1426
|
+
# Use the `apply` function to filter rows where the lookup value matches a whole item in the comma-separated list
|
|
1427
|
+
def match_lookup_value(string_list: str) -> bool:
|
|
1428
|
+
"""Spilt delimiter-separated list into a python list
|
|
1429
|
+
|
|
1430
|
+
Args:
|
|
1431
|
+
string_list (str): delimiter-separated string list like "a, b, c" or "a | b | c"
|
|
1432
|
+
|
|
1433
|
+
Returns:
|
|
1434
|
+
bool: True if lookup_value is equal to one of the delimiter-separated terms
|
|
1435
|
+
"""
|
|
1436
|
+
return lookup_value in [
|
|
1437
|
+
item.strip() for item in string_list.split(separator)
|
|
1438
|
+
]
|
|
1439
|
+
|
|
1440
|
+
df = self._df
|
|
1441
|
+
|
|
1442
|
+
if self._df is None:
|
|
1443
|
+
return None
|
|
1444
|
+
|
|
1445
|
+
if lookup_column not in self._df.columns:
|
|
1446
|
+
logger.error(
|
|
1447
|
+
"Column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
|
|
1448
|
+
lookup_column,
|
|
1449
|
+
str(self._df.columns),
|
|
1450
|
+
)
|
|
1451
|
+
return None
|
|
1452
|
+
|
|
1453
|
+
# Fill NaN or None values in the lookup column with empty strings
|
|
1454
|
+
df[lookup_column] = df[lookup_column].fillna("")
|
|
1455
|
+
|
|
1456
|
+
# Use the `apply` function to filter rows where the lookup value is in the Synonyms list
|
|
1457
|
+
matched_row = df[df[lookup_column].apply(match_lookup_value)]
|
|
1458
|
+
|
|
1459
|
+
# Return the first matched row, if any
|
|
1460
|
+
if not matched_row.empty:
|
|
1461
|
+
return matched_row.iloc[0]
|
|
1462
|
+
|
|
1463
|
+
return None
|
|
1464
|
+
|
|
1465
|
+
# end method definition
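A lookup_value() sketch with a '|'-separated synonym column (sample data is invented):

    data = Data(
        [
            {"airportId": "MUC", "synonyms": "Munich | Muenchen | Franz Josef Strauss"},
            {"airportId": "FRA", "synonyms": "Frankfurt | Rhein-Main"},
        ]
    )
    row = data.lookup_value(
        lookup_column="synonyms", lookup_value="Muenchen", separator="|"
    )
    if row is not None:
        print(row["airportId"])  # MUC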
|
|
1466
|
+
|
|
1467
|
+
def add_column(
|
|
1468
|
+
self,
|
|
1469
|
+
source_column: str,
|
|
1470
|
+
reg_exp: str,
|
|
1471
|
+
new_column: str,
|
|
1472
|
+
prefix="",
|
|
1473
|
+
suffix="",
|
|
1474
|
+
length: int | None = None,
|
|
1475
|
+
group_chars: int | None = None,
|
|
1476
|
+
group_separator: str = ".",
|
|
1477
|
+
group_remove_leading_zero: bool = True,
|
|
1478
|
+
) -> bool:
|
|
1479
|
+
"""Add additional column to the data frame.
|
|
1480
|
+
|
|
1481
|
+
Args:
|
|
1482
|
+
source_column (str): name of the source column
|
|
1483
|
+
reg_exp (str): regular expression to apply on the content of the source column
|
|
1484
|
+
new_column (str): name of the column to add
|
|
1485
|
+
prefix (str, optional): Prefix to add in front of the value. Defaults to "".
|
|
1486
|
+
suffix (str, optional): Suffix to add at the end of the value. Defaults to "".
|
|
1487
|
+
length (int | None, optional): Length to reduce to. Defaults to None.
|
|
1488
|
+
group_chars (int | None, optional): group the resulting string in characters of group_chars. Defaults to None.
|
|
1489
|
+
group_separator (str, optional): Separator string for the grouping. Defaults to ".".
|
|
1490
|
+
group_remove_leading_zero (bool, optional): Remove leading zeros from the groups. Defaults to True.
|
|
1491
|
+
|
|
1492
|
+
Returns:
|
|
1493
|
+
bool: True = Success, False = Failure
|
|
1494
|
+
"""
|
|
1495
|
+
|
|
1496
|
+
if self._df is None:
|
|
1497
|
+
return False
|
|
1498
|
+
|
|
1499
|
+
# Use str.extract to apply the regular expression to the source column
|
|
1500
|
+
extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)
|
|
1501
|
+
|
|
1502
|
+
# Limit the result to the specified length
|
|
1503
|
+
if length is not None:
|
|
1504
|
+
extracted = extracted.str[:length]
|
|
1505
|
+
|
|
1506
|
+
if group_chars is not None:
|
|
1507
|
+
|
|
1508
|
+
def process_grouping(x):
|
|
1509
|
+
if pd.isna(x):
|
|
1510
|
+
return x
|
|
1511
|
+
# Split into groups
|
|
1512
|
+
groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
|
|
1513
|
+
if group_remove_leading_zero:
|
|
1514
|
+
# Remove leading zeros from each group
|
|
1515
|
+
groups = [group.lstrip("0") or "0" for group in groups]
|
|
1516
|
+
# Join groups with separator
|
|
1517
|
+
return group_separator.join(groups)
|
|
1518
|
+
|
|
1519
|
+
extracted = extracted.apply(process_grouping)
|
|
1520
|
+
|
|
1521
|
+
# Add prefix and suffix
|
|
1522
|
+
if prefix or suffix:
|
|
1523
|
+
extracted = prefix + extracted.astype(str) + suffix
|
|
1524
|
+
|
|
1525
|
+
self._df[new_column] = extracted
|
|
1526
|
+
|
|
1527
|
+
return True
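Finally, an illustrative add_column() call; the column names and the regular expression are assumptions for the sketch:

    data = Data([{"material": "000000000001234567"}])
    # extract the trailing digits, group them in blocks of 3 and strip leading zeros per group
    data.add_column(
        source_column="material",
        reg_exp=r"(\d+)$",
        new_column="material_grouped",
        group_chars=3,
        group_separator=".",
    )
    # material_grouped -> "0.0.0.1.234.567"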
|