pyxecm 1.5-py3-none-any.whl → 2.0.0-py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyxecm might be problematic.

Files changed (56)
  1. pyxecm/__init__.py +6 -2
  2. pyxecm/avts.py +1492 -0
  3. pyxecm/coreshare.py +1075 -960
  4. pyxecm/customizer/__init__.py +16 -4
  5. pyxecm/customizer/__main__.py +58 -0
  6. pyxecm/customizer/api/__init__.py +5 -0
  7. pyxecm/customizer/api/__main__.py +6 -0
  8. pyxecm/customizer/api/app.py +914 -0
  9. pyxecm/customizer/api/auth.py +154 -0
  10. pyxecm/customizer/api/metrics.py +92 -0
  11. pyxecm/customizer/api/models.py +13 -0
  12. pyxecm/customizer/api/payload_list.py +865 -0
  13. pyxecm/customizer/api/settings.py +103 -0
  14. pyxecm/customizer/browser_automation.py +332 -139
  15. pyxecm/customizer/customizer.py +1075 -1057
  16. pyxecm/customizer/exceptions.py +35 -0
  17. pyxecm/customizer/guidewire.py +322 -0
  18. pyxecm/customizer/k8s.py +787 -338
  19. pyxecm/customizer/log.py +107 -0
  20. pyxecm/customizer/m365.py +3424 -2270
  21. pyxecm/customizer/nhc.py +1169 -0
  22. pyxecm/customizer/openapi.py +258 -0
  23. pyxecm/customizer/payload.py +18201 -7030
  24. pyxecm/customizer/pht.py +1047 -210
  25. pyxecm/customizer/salesforce.py +836 -727
  26. pyxecm/customizer/sap.py +58 -41
  27. pyxecm/customizer/servicenow.py +851 -383
  28. pyxecm/customizer/settings.py +442 -0
  29. pyxecm/customizer/successfactors.py +408 -346
  30. pyxecm/customizer/translate.py +83 -48
  31. pyxecm/helper/__init__.py +5 -2
  32. pyxecm/helper/assoc.py +98 -38
  33. pyxecm/helper/data.py +2482 -742
  34. pyxecm/helper/logadapter.py +27 -0
  35. pyxecm/helper/web.py +229 -101
  36. pyxecm/helper/xml.py +528 -172
  37. pyxecm/maintenance_page/__init__.py +5 -0
  38. pyxecm/maintenance_page/__main__.py +6 -0
  39. pyxecm/maintenance_page/app.py +51 -0
  40. pyxecm/maintenance_page/settings.py +28 -0
  41. pyxecm/maintenance_page/static/favicon.avif +0 -0
  42. pyxecm/maintenance_page/templates/maintenance.html +165 -0
  43. pyxecm/otac.py +234 -140
  44. pyxecm/otawp.py +2689 -0
  45. pyxecm/otcs.py +12344 -7547
  46. pyxecm/otds.py +3166 -2219
  47. pyxecm/otiv.py +36 -21
  48. pyxecm/otmm.py +1363 -296
  49. pyxecm/otpd.py +231 -127
  50. pyxecm-2.0.0.dist-info/METADATA +145 -0
  51. pyxecm-2.0.0.dist-info/RECORD +54 -0
  52. {pyxecm-1.5.dist-info → pyxecm-2.0.0.dist-info}/WHEEL +1 -1
  53. pyxecm-1.5.dist-info/METADATA +0 -51
  54. pyxecm-1.5.dist-info/RECORD +0 -30
  55. {pyxecm-1.5.dist-info → pyxecm-2.0.0.dist-info/licenses}/LICENSE +0 -0
  56. {pyxecm-1.5.dist-info → pyxecm-2.0.0.dist-info}/top_level.txt +0 -0
pyxecm/helper/data.py CHANGED
@@ -1,74 +1,61 @@
1
- """
2
- Data Module to implement functions to leverage Pandas to
3
- manipulte data structures read for bulk generation of Extended ECM items.
4
-
5
- This code implements a class called data which is referring
6
- to Pandas DataFrame.
7
-
8
- Class: Payload
9
- Methods:
10
-
11
- __init__ : class initializer
12
- __len__: Lenght of the embedded DataFrame object.
13
- __str__: Print the DataFrame of the class
14
- get_data_frame: Get the Pandas DataFrame object
15
- set_data_frame: Set the Pandas DataFrame object
16
- append: Append additional data to the data frame.
17
-
18
- load_json_data: Load JSON data into DataFrame
19
- save_json_data: Save JSON data from DataFrame to file
20
- load_excel_data: Load Excel file into DataFrame
21
- load_csv_data: Load CSV data into DataFrame
22
- load_directory: Load directory structure into Pandas Data Frame
23
-
24
- partitionate: Partition a data frame into equally sized partions
25
- deduplicate: Remove dupclicate rows that have all fields in unique_fields in common
26
- sort: Sort the data frame based on one or multiple fields.
27
- flatten: Flatten a sub-dictionary by copying selected fields to the
28
- parent dictionary.
29
- explode_and_flatten: Explode a substructure in the Data Frame
30
- drop_columns: Drop selected columns from the Data Frame
31
- keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
32
- cleanse: Cleanse data with regular expressions and upper/lower case conversion.
33
- filter: Filter the DataFrame based on conditions
34
-
35
- fill_forward: Fill the missing cells appropriately by carrying forward
36
- the values from the previous rows where necessary.
37
- fill_na_in_column: Replace NA values in a column with a defined new default value
1
+ """Data Module leveraging Pandas to manipulte data sets read for bulk generation of Content Server items.
2
+
3
+ See: https://pandas.pydata.org
4
+
5
+ This code implements a class called "Data" which is a wrapper
6
+ to Pandas data frame.
38
7
  """
39
8
 
40
9
  __author__ = "Dr. Marc Diefenbruch"
41
- __copyright__ = "Copyright 2024, OpenText"
10
+ __copyright__ = "Copyright (C) 2024-2025, OpenText"
42
11
  __credits__ = ["Kai-Philip Gatzweiler"]
43
12
  __maintainer__ = "Dr. Marc Diefenbruch"
44
13
  __email__ = "mdiefenb@opentext.com"
45
14
 
46
- import logging
47
15
  import json
16
+ import logging
48
17
  import os
49
18
  import re
50
19
  import threading
20
+ from io import StringIO
51
21
 
52
22
  import pandas as pd
23
+ import requests
53
24
 
54
- logger = logging.getLogger("pyxecm.helper.data")
25
+ default_logger = logging.getLogger("pyxecm.helper.data")
55
26
 
56
27
 
57
28
  class Data:
58
29
  """Used to automate data loading for the customizer."""
59
30
 
31
+ logger: logging.Logger = default_logger
32
+
60
33
  _df: pd.DataFrame
61
- _lock = threading.Lock()
34
+ _lock: threading.Lock = threading.Lock()
62
35
 
63
- def __init__(self, init_data: pd.DataFrame | list = None):
36
+ def __init__(
37
+ self,
38
+ init_data: pd.DataFrame | list = None,
39
+ logger: logging.Logger = default_logger,
40
+ ) -> None:
64
41
  """Initialize the Data object.
65
42
 
66
43
  Args:
67
- init_data (pd.DataFrame | list, optional): Data to initialize the data frame. Can either be
68
- another data frame (that gets copied) or a list of dictionaries.
69
- Defaults to None.
44
+ init_data (pd.DataFrame | list, optional):
45
+ Data to initialize the data frame. Can either be
46
+ another data frame (that gets copied) or a list of dictionaries.
47
+ Defaults to None.
48
+ logger (logging.Logger, optional):
49
+ Pass a special logging object. This is optional. If not provided,
50
+ the default logger is used.
51
+
70
52
  """
71
53
 
54
+ if logger != default_logger:
55
+ self.logger = logger.getChild("data")
56
+ for logfilter in logger.filters:
57
+ self.logger.addFilter(logfilter)
58
+
72
59
  if init_data is not None:
73
60
  # if a data frame is passed to the constructor we
74
61
  # copy its content to the new Data object
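
Usage sketch (illustrative, not part of the diff): based on the constructor shown above, a Data object can be seeded with a list of dictionaries (or a DataFrame that gets copied) and, new in 2.0.0, given a caller-supplied logger, which becomes a child logger named "data" and inherits the caller's filters. Names and values below are placeholders.

    import logging

    from pyxecm.helper.data import Data

    log = logging.getLogger("customizer")  # hypothetical parent logger
    d = Data(
        init_data=[{"name": "Alice", "age": 30}, {"name": "Bob", "age": 41}],
        logger=log,  # messages are then emitted via the "customizer.data" child logger
    )
    print(len(d))  # 2 -- __len__ reports the number of rows in the wrapped data frame
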
@@ -84,7 +71,7 @@ class Data:
84
71
  # it is important to wrap the dict in a list to avoid that more than 1 row is created
85
72
  self._df: pd.DataFrame = pd.DataFrame([init_data])
86
73
  else:
87
- logger.error("Illegal initialization data for 'Data' class!")
74
+ self.logger.error("Illegal initialization data for 'Data' class!")
88
75
  self._df = None
89
76
  else:
90
77
  self._df = None
@@ -92,11 +79,14 @@ class Data:
92
79
  # end method definition
93
80
 
94
81
  def __len__(self) -> int:
95
- """Lenght of the embedded DataFrame object.
96
- This is basically a convenience method.
82
+ """Return lenght of the embedded Pandas data frame object.
83
+
84
+ This is basically a convenience method.
97
85
 
98
86
  Returns:
99
- int: Lenght of the DataFrame
87
+ int:
88
+ Lenght of the data frame.
89
+
100
90
  """
101
91
 
102
92
  if self._df is not None:
@@ -106,10 +96,12 @@ class Data:
106
96
  # end method definition
107
97
 
108
98
  def __str__(self) -> str:
109
- """Print the DataFrame of the class.
99
+ """Print the Pandas data frame object.
110
100
 
111
101
  Returns:
112
- str: String representation.
102
+ str:
103
+ String representation.
104
+
113
105
  """
114
106
 
115
107
  # if data frame is initialized we return
@@ -121,38 +113,73 @@ class Data:
121
113
 
122
114
  # end method definition
123
115
 
124
- def lock(self):
116
+ def __getitem__(self, column: str) -> pd.Series:
117
+ """Return the column corresponding to the key from the data frame.
118
+
119
+ Args:
120
+ column (str): The name of the data frame column.
121
+
122
+ Returns:
123
+ pd.Series: The column of the data frame with the given name.
124
+
125
+ """
126
+
127
+ return self._df[column]
128
+
129
+ # end method definition
130
+
131
+ def lock(self) -> threading.Lock:
125
132
  """Return the threading lock object.
126
133
 
127
134
  Returns:
128
- _type_: threading lock object
135
+ threading.Lock: The threading lock object.
136
+
129
137
  """
138
+
130
139
  return self._lock
131
140
 
132
141
  # end method definition
133
142
 
134
143
  def get_data_frame(self) -> pd.DataFrame:
135
- """Get the Pandas DataFrame object
144
+ """Get the Pandas data frame object.
136
145
 
137
146
  Returns:
138
- pd.DataFrame: Pandas DataFrame object
147
+ pd.DataFrame: The Pandas data frame object.
148
+
139
149
  """
140
150
 
141
151
  return self._df
142
152
 
143
153
  # end method definition
144
154
 
145
- def set_data_frame(self, df: pd.DataFrame):
146
- """Set the Pandas DataFrame object
155
+ def set_data_frame(self, df: pd.DataFrame) -> None:
156
+ """Set the Pandas data frame object.
147
157
 
148
158
  Args:
149
- df (pd.DataFrame): Pandas DataFrame object
159
+ df (pd.DataFrame): The new Pandas data frame object.
160
+
150
161
  """
151
162
 
152
163
  self._df = df
153
164
 
154
165
  # end method definition
155
166
 
167
+ def get_columns(self) -> list | None:
168
+ """Get the list of column names of the data frame.
169
+
170
+ Returns:
171
+ list | None:
172
+ The list of column names in the data frame.
173
+
174
+ """
175
+
176
+ if self._df is None:
177
+ return None
178
+
179
+ return self._df.columns
180
+
181
+ # end method definition
182
+
156
183
  def print_info(
157
184
  self,
158
185
  show_size: bool = True,
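
The hunk above also introduces __getitem__(), a typed lock() accessor, and get_columns(). A short sketch against these signatures (illustrative, not part of the diff):

    from pyxecm.helper.data import Data

    d = Data([{"name": "Alice", "age": 30}, {"name": "Bob", "age": 41}])
    print(d.get_columns().tolist())  # ['name', 'age'] -- column names of the data frame
    print(d["age"].mean())           # 35.5 -- __getitem__ returns a pd.Series
    with d.lock():                   # lock() exposes the shared threading.Lock
        d.set_data_frame(d.get_data_frame().copy())
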
@@ -163,26 +190,40 @@ class Data:
163
190
  show_sample: bool = False,
164
191
  show_statistics: bool = False,
165
192
  row_num: int = 10,
166
- ):
167
- """Log information about the data frame
193
+ ) -> None:
194
+ """Log information about the data frame.
168
195
 
169
196
  Args:
170
- show_size (bool, optional): Show size of data frame. Defaults to True.
171
- show_info (bool, optional): Show information for data frame. Defaults to False.
172
- show_columns (bool, optional): Show columns of data frame. Defaults to False.
173
- show_first (bool, optional): Show first 10 items. Defaults to False.
174
- show_last (bool, optional): Show last 10 items. Defaults to False.
175
- show_sample (bool, optional): Show 10 sample items. Defaults to False.
176
- show_statistics (bool, optional): Show data frame statistics. Defaults to False.
197
+ show_size (bool, optional):
198
+ Show size of data frame. Defaults to True.
199
+ show_info (bool, optional):
200
+ Show information for data frame. Defaults to False.
201
+ show_columns (bool, optional):
202
+ Show columns of data frame. Defaults to False.
203
+ show_first (bool, optional):
204
+ Show first N items. Defaults to False. N is defined
205
+ by the row_num parameter.
206
+ show_last (bool, optional):
207
+ Show last N items. Defaults to False. N is defined
208
+ by the row_num parameter.
209
+ show_sample (bool, optional):
210
+ Show N sample items. Defaults to False. N is defined
211
+ by the row_num parameter.
212
+ show_statistics (bool, optional):
213
+ Show data frame statistics. Defaults to False.
214
+ row_num (int, optional):
215
+ Used as the number of rows printed using show_first,
216
+ show_last, show_sample. Default is 10.
217
+
177
218
  """
178
219
 
179
220
  if self._df is None:
180
- logger.warning("Data Frame is not initialized!")
221
+ self.logger.warning("Data frame is not initialized!")
181
222
  return
182
223
 
183
224
  if show_size:
184
- logger.info(
185
- "Data Frame has %s row(s) and %s column(s)",
225
+ self.logger.info(
226
+ "Data frame has %s row(s) and %s column(s)",
186
227
  self._df.shape[0],
187
228
  self._df.shape[1],
188
229
  )
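
print_info() writes its output through the instance logger, so logging has to be configured for the output to be visible. A possible call using the parameters documented above (illustrative only; "d" is a populated Data object as in the earlier sketches):

    import logging

    logging.basicConfig(level=logging.INFO)

    d.print_info(show_columns=True, show_first=True, show_statistics=True, row_num=5)
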
@@ -192,39 +233,42 @@ class Data:
192
233
  self._df.info()
193
234
 
194
235
  if show_columns:
195
- logger.info("Columns:\n%s", self._df.columns)
196
- logger.info(
197
- "Columns with number of null values:\n%s", self._df.isnull().sum()
236
+ self.logger.info("Columns:\n%s", self._df.columns)
237
+ self.logger.info(
238
+ "Columns with number of NaN values:\n%s",
239
+ self._df.isna().sum(),
198
240
  )
199
- logger.info(
200
- "Columns with number of non-null values:\n%s", self._df.notnull().sum()
201
- )
202
- logger.info("Columns with number of NaN values:\n%s", self._df.isna().sum())
203
- logger.info(
204
- "Columns with number of non-NaN values:\n%s", self._df.notna().sum()
241
+ self.logger.info(
242
+ "Columns with number of non-NaN values:\n%s",
243
+ self._df.notna().sum(),
205
244
  )
206
245
 
207
246
  if show_first:
208
247
  # the default for head is n = 5:
209
- logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
248
+ self.logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
210
249
 
211
250
  if show_last:
212
251
  # the default for tail is n = 5:
213
- logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
252
+ self.logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
214
253
 
215
254
  if show_sample:
216
255
  # the default for sample is n = 1:
217
- logger.info("%s Sample rows:\n%s", str(row_num), self._df.sample(n=row_num))
256
+ self.logger.info(
257
+ "%s Sample rows:\n%s",
258
+ str(row_num),
259
+ self._df.sample(n=row_num),
260
+ )
218
261
 
219
262
  if show_statistics:
220
- logger.info(
221
- "Description of statistics for data frame:\n%s", self._df.describe()
263
+ self.logger.info(
264
+ "Description of statistics for data frame:\n%s",
265
+ self._df.describe(),
222
266
  )
223
- logger.info(
224
- "Description of statistics for data frame (Transformed):\n%s",
267
+ self.logger.info(
268
+ "Description of statistics for data frame (transformed):\n%s",
225
269
  self._df.describe().T,
226
270
  )
227
- logger.info(
271
+ self.logger.info(
228
272
  "Description of statistics for data frame (objects):\n%s",
229
273
  self._df.describe(include="object"),
230
274
  )
@@ -235,10 +279,13 @@ class Data:
235
279
  """Append additional data to the data frame.
236
280
 
237
281
  Args:
238
- add_data (pd.DataFrame | list | dict): Additional data. Can be pd.DataFrame or list of dicts (or Data)
282
+ add_data (pd.DataFrame | list | dict):
283
+ Additional data. Can be pd.DataFrame or list of dicts (or Data).
239
284
 
240
285
  Returns:
241
- bool: True = Success, False = Error
286
+ bool:
287
+ True = Success, False = Error
288
+
242
289
  """
243
290
 
244
291
  # Does the data frame has already content?
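
append() accepts a DataFrame, another Data object, a list of dictionaries, or a single dictionary, as documented above. A usage sketch (illustrative, not part of the diff):

    import pandas as pd

    from pyxecm.helper.data import Data

    d = Data([{"id": 1}])
    d.append({"id": 2})                     # a single dict adds exactly one row
    d.append([{"id": 3}, {"id": 4}])        # a list of dicts adds one row each
    d.append(pd.DataFrame({"id": [5, 6]}))  # another data frame is concatenated
    print(len(d))                           # 6
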
@@ -250,164 +297,395 @@ class Data:
250
297
  return True
251
298
  elif isinstance(add_data, Data):
252
299
  df = add_data.get_data_frame()
253
- if df:
300
+ if df is not None and not df.empty:
254
301
  self._df = pd.concat([self._df, df], ignore_index=True)
255
302
  return True
256
303
  elif isinstance(add_data, list):
257
304
  if add_data:
258
- df = Data(add_data)
305
+ df = Data(add_data, logger=self.logger)
259
306
  self._df = pd.concat(
260
- [self._df, df.get_data_frame()], ignore_index=True
307
+ [self._df, df.get_data_frame()],
308
+ ignore_index=True,
261
309
  )
262
310
  return True
263
311
  elif isinstance(add_data, dict):
264
312
  if add_data:
265
313
  # it is important to wrap the dict in a list to avoid that more than 1 row is created
266
- df = Data([add_data])
314
+ df = Data([add_data], logger=self.logger)
267
315
  self._df = pd.concat(
268
- [self._df, df.get_data_frame()], ignore_index=True
316
+ [self._df, df.get_data_frame()],
317
+ ignore_index=True,
269
318
  )
270
319
  return True
271
320
  else:
272
- logger.error("Illegal data type -> '%s'", type(add_data))
273
- return False
274
- else: # self._df is None (initial state)
275
- if isinstance(add_data, pd.DataFrame):
276
- self._df = add_data
277
- return True
278
- elif isinstance(add_data, Data):
279
- self._df = add_data.get_data_frame()
280
- return True
281
- elif isinstance(add_data, list):
282
- self._df = pd.DataFrame(add_data)
283
- return True
284
- elif isinstance(add_data, dict):
285
- # it is important to wrap the dict in a list to avoid that more than 1 row is created
286
- self._df = pd.DataFrame([add_data])
287
- return True
288
- else:
289
- logger.error("Illegal data type -> '%s'", type(add_data))
321
+ self.logger.error("Illegal data type -> '%s'", type(add_data))
290
322
  return False
323
+ elif isinstance(add_data, pd.DataFrame):
324
+ self._df = add_data
325
+ return True
326
+ elif isinstance(add_data, Data):
327
+ self._df = add_data.get_data_frame()
328
+ return True
329
+ elif isinstance(add_data, list):
330
+ self._df = pd.DataFrame(add_data)
331
+ return True
332
+ elif isinstance(add_data, dict):
333
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
334
+ self._df = pd.DataFrame([add_data])
335
+ return True
336
+ else:
337
+ self.logger.error("Illegal data type -> '%s'", type(add_data))
338
+ return False
339
+
340
+ # end method definition
341
+
342
+ def merge(
343
+ self,
344
+ merge_data: pd.DataFrame,
345
+ on: str | list[str] | None = None,
346
+ how: str = "inner",
347
+ left_on: str | list[str] | None = None,
348
+ right_on: str | list[str] | None = None,
349
+ left_index: bool = False,
350
+ right_index: bool = False,
351
+ suffixes: tuple[str, str] = ("_x", "_y"),
352
+ indicator: bool = False,
353
+ validate: str | None = None,
354
+ ) -> pd.DataFrame | None:
355
+ """Merge the current DataFrame (_df) with another DataFrame.
356
+
357
+ Args:
358
+ merge_data (pd.DataFrame | Data):
359
+ The DataFrame to merge with.
360
+ on (str | list[str]):
361
+ Column(s) to merge on. Defaults to None.
362
+ how (str, optional):
363
+ Type of merge ('inner', 'outer', 'left', 'right', 'cross'). Defaults to 'inner'.
364
+ left_on (str | list[str] | None, optional):
365
+ Column(s) from self._df to merge on. Defaults to None.
366
+ right_on (str | list[str] | None, optional):
367
+ Column(s) from other DataFrame to merge on. Defaults to None.
368
+ left_index (str | list[str], optional):
369
+ Whether to merge on the index of self._df. Defaults to False.
370
+ right_index (bool, optional):
371
+ Whether to merge on the index of other. Defaults to False.
372
+ suffixes (tuple[str, str]):
373
+ Suffixes for overlapping column names. Defaults to ('_x', '_y').
374
+ indicator (bool, optional):
375
+ If True, adds a column showing the merge source. Defaults to False.
376
+ validate ():
377
+ If provided, checks merge integrity
378
+ ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many'). Defaults to None.
379
+
380
+ Returns:
381
+ The merged DataFrame or None in case of an error.
382
+
383
+ Exceptions:
384
+ ValueError: If `other` is not a DataFrame.
385
+ KeyError: If required columns for merging are missing.
386
+ ValueError: If `validate` check fails.
387
+
388
+ """
389
+
390
+ if self._df is None or self._df.empty:
391
+ self._df = merge_data
392
+
393
+ if isinstance(merge_data, Data):
394
+ merge_data = merge_data.get_data_frame() # Extract DataFrame from Data instance
395
+
396
+ try:
397
+ return self._df.merge(
398
+ merge_data,
399
+ how=how,
400
+ on=on,
401
+ left_on=left_on,
402
+ right_on=right_on,
403
+ left_index=left_index,
404
+ right_index=right_index,
405
+ suffixes=suffixes,
406
+ indicator=indicator,
407
+ validate=validate,
408
+ )
409
+ except KeyError:
410
+ self.logger.error("Column(s) not found for merging!")
411
+ except ValueError:
412
+ self.logger.error("Invalid merge operation!")
413
+
414
+ return None
415
+
416
+ # end method definition
417
+
418
+ def strip(self, columns: list | None = None, inplace: bool = True) -> pd.DataFrame:
419
+ """Strip leading and trailing spaces from specified columns in a data frame.
420
+
421
+ Args:
422
+ columns (list | None):
423
+ The list of column names to strip. If None, it strips
424
+ leading and trailing spaces from _all_ string columns.
425
+ inplace (bool, optional):
426
+ If True, the data modification is done in place, i.e.
427
+ modifying the existing data frame of the object.
428
+ If False, the data frame is copied and the copy is modified
429
+ and returned.
430
+
431
+ Returns:
432
+ pd.DataFrame:
433
+ The modified data frame with stripped columns.
434
+
435
+ """
436
+
437
+ df = self._df.copy() if not inplace else self._df
438
+
439
+ if columns is None:
440
+ # Strip spaces from all string columns
441
+ df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
442
+ else:
443
+ # Strip spaces from specified columns
444
+ for col in columns:
445
+ if col in df.columns and df[col].dtype == "object": # Check if the column exists and is of string type
446
+ df[col] = df[col].str.strip()
447
+
448
+ if inplace:
449
+ self._df = df
450
+
451
+ return df
291
452
 
292
453
  # end method definition
293
454
 
294
- def load_json_data(self, json_path: str, convert_dates: bool = False) -> bool:
295
- """Load JSON data into DataFrame
455
+ def load_json_data(
456
+ self,
457
+ json_path: str,
458
+ convert_dates: bool = False,
459
+ index_column: str | None = None,
460
+ compression: str | None = None,
461
+ ) -> bool:
462
+ """Load JSON data into a Pandas data frame.
296
463
 
297
464
  Args:
298
- json_path (str): Path to the JSON file.
299
- convert_dates (bool, optional): whether or not dates should be converted
465
+ json_path (str):
466
+ The path to the JSON file.
467
+ convert_dates (bool, optional):
468
+ Defines whether or not dates should be converted.
469
+ The default is False = dates are NOT converted.
470
+ index_column (str | None, optional):
471
+ The Name of the column (i.e. JSON data field) that should
472
+ become the index in the loaded data frame.
473
+ compression (str | None):
474
+ Remove a compression:
475
+ * gzip (.gz)
476
+ * bz2 (.bz2)
477
+ * zip (.zip)
478
+ * xz (.xz)
479
+ The value for compression should not include the dot.
480
+ Default is None = no compression.
481
+
300
482
  Returns:
301
483
  bool: False in case an error occured, True otherwise.
484
+
302
485
  """
303
486
 
304
- if json_path is not None and os.path.exists(json_path):
305
- # Load data from JSON file
306
- try:
307
- df = pd.read_json(path_or_buf=json_path, convert_dates=convert_dates)
308
- if self._df is None:
309
- self._df = df
310
- else:
311
- self._df = pd.concat([self._df, df])
312
- logger.info(
313
- "After loading -> '%s' the Data Frame has %s row(s) and %s column(s)",
314
- json_path,
315
- self._df.shape[0],
316
- self._df.shape[1],
317
- )
318
- except FileNotFoundError:
319
- logger.error(
320
- "File -> %s not found. Please check the file path.", json_path
487
+ if not json_path:
488
+ self.logger.error(
489
+ "You have not specified a JSON path!",
490
+ )
491
+ return False
492
+
493
+ # If compression is enabled the file path should have
494
+ # the matching file name extension:
495
+ if compression:
496
+ compression = compression.lstrip(".") # remove a dot prefix if present
497
+ suffix = "." + compression if compression != "gzip" else "gz"
498
+ if not json_path.endswith(suffix):
499
+ json_path += suffix
500
+
501
+ if not os.path.exists(json_path):
502
+ self.logger.error(
503
+ "Missing JSON file - you have not specified a valid path -> '%s'.",
504
+ json_path,
505
+ )
506
+ return False
507
+
508
+ # Load data from JSON file
509
+ try:
510
+ df = pd.read_json(
511
+ path_or_buf=json_path,
512
+ convert_dates=convert_dates,
513
+ compression=compression,
514
+ )
515
+
516
+ if index_column and index_column not in df.columns:
517
+ self.logger.error(
518
+ "Specified index column -> '%s' not found in the JSON data.",
519
+ index_column,
321
520
  )
322
521
  return False
323
- except PermissionError:
324
- logger.error("Permission denied to access the file -> %s.", json_path)
325
- return False
326
- except IOError as e:
327
- logger.error("An I/O error occurred -> %s", str(e))
328
- return False
329
- except json.JSONDecodeError as e:
330
- logger.error("Error: Unable to decode JSON -> %s", str(e))
331
- return False
332
- except ValueError as e:
333
- logger.error("Invalid JSON input -> %s", str(e))
334
- return False
335
- except AttributeError as e:
336
- logger.error("Unexpected data structure -> %s", str(e))
337
- return False
338
- except TypeError as e:
339
- logger.error("Unexpected data type -> %s", str(e))
340
- return False
341
- except KeyError as e:
342
- logger.error("Missing key in JSON data -> %s", str(e))
343
- return False
344
522
 
345
- else:
346
- logger.error(
347
- "Missing JSON file - you have not specified a valid path -> %s.",
523
+ if index_column:
524
+ df = df.set_index(keys=index_column)
525
+ if self._df is None:
526
+ self._df = df
527
+ else:
528
+ self._df = pd.concat([self._df, df])
529
+ self.logger.info(
530
+ "After loading JSON file -> '%s', the data frame has %s row(s) and %s column(s)",
531
+ json_path,
532
+ self._df.shape[0],
533
+ self._df.shape[1],
534
+ )
535
+ except FileNotFoundError:
536
+ self.logger.error(
537
+ "JSON file -> '%s' not found. Please check the file path.",
538
+ json_path,
539
+ )
540
+ return False
541
+ except PermissionError:
542
+ self.logger.error(
543
+ "Missing permission to access the JSON file -> '%s'.",
348
544
  json_path,
349
545
  )
350
546
  return False
547
+ except OSError:
548
+ self.logger.error("An I/O error occurred!")
549
+ return False
550
+ except json.JSONDecodeError:
551
+ self.logger.error(
552
+ "Unable to decode JSON file -> '%s'",
553
+ json_path,
554
+ )
555
+ return False
556
+ except ValueError:
557
+ self.logger.error("Invalid JSON input -> %s", json_path)
558
+ return False
559
+ except AttributeError:
560
+ self.logger.error("Unexpected JSON data structure in file -> %s", json_path)
561
+ return False
562
+ except TypeError:
563
+ self.logger.error("Unexpected JSON data type in file -> %s", json_path)
564
+ return False
565
+ except KeyError:
566
+ self.logger.error("Missing key in JSON data in file -> %s", json_path)
567
+ return False
568
+
351
569
  return True
352
570
 
353
571
  # end method definition
354
572
 
355
573
  def save_json_data(
356
- self, json_path: str, orient: str = "records", preserve_index: bool = False
574
+ self,
575
+ json_path: str,
576
+ orient: str = "records",
577
+ preserve_index: bool = False,
578
+ index_column: str = "index",
579
+ compression: str | None = None,
357
580
  ) -> bool:
358
- """Save JSON data from DataFrame to file
581
+ """Save JSON data from data frame to file.
359
582
 
360
583
  Args:
361
- json_path (str): Path to the JSON file.
362
- orient (str, optional): Structure of the JSON
363
- preserve_index (bool, optional)
584
+ json_path (str): The path to where the JSON file should be safed.
585
+ orient (str, optional):
586
+ The structure of the JSON. Possible values:
587
+ * "records" (this is the default)
588
+ * "columns"
589
+ * "index"
590
+ * "table"
591
+ * "split"
592
+ preserve_index (bool, optional):
593
+ Defines if the index column of the data frame should be exported as well.
594
+ The default is False (index is not exported).
595
+ index_column (str, optional):
596
+ The Name of the column (i.e. JSON data field) that should
597
+ become the index in the loaded data frame. The default is "index".
598
+ compression (str | None):
599
+ Apply a compression:
600
+ * gzip (.gz)
601
+ * bz2 (.bz2)
602
+ * zip (.zip)
603
+ * xz (.xz)
604
+
364
605
  Returns:
365
- bool: False in case an error occured, True otherwise.
606
+ bool:
607
+ False in case an error occured, True otherwise.
608
+
366
609
  """
367
610
 
368
- if json_path is not None and os.path.exists(os.path.dirname(json_path)):
369
- # Load data from JSON file
370
- try:
371
- if self._df is not None:
372
- # index parameter is only allowed if orient has one of the following values:
373
- if (
374
- orient == "columns"
375
- or orient == "index"
376
- or orient == "table"
377
- or orient == "split"
378
- ):
379
- self._df.to_json(
380
- path_or_buf=json_path,
381
- index=preserve_index,
382
- orient=orient,
383
- indent=2,
384
- )
385
- else:
386
- self._df.to_json(path_or_buf=json_path, orient=orient, indent=2)
611
+ if not json_path:
612
+ self.logger.error(
613
+ "You have not specified a JSON path!",
614
+ )
615
+ return False
616
+
617
+ # If compression is enabled the file path should have
618
+ # the matching file name extension:
619
+ if compression:
620
+ suffix = "." + compression if compression != "gzip" else ".gz"
621
+ if not json_path.endswith(suffix):
622
+ json_path += suffix
623
+
624
+ # Save data to JSON file
625
+ try:
626
+ if self._df is not None:
627
+ if not os.path.exists(os.path.dirname(json_path)):
628
+ os.makedirs(os.path.dirname(json_path), exist_ok=True)
629
+
630
+ # index parameter is only allowed if orient has one of the following values:
631
+ if orient in ("columns", "index", "table", "split"):
632
+ self._df.to_json(
633
+ path_or_buf=json_path,
634
+ index=preserve_index,
635
+ orient=orient,
636
+ indent=2,
637
+ compression=compression,
638
+ date_format="iso",
639
+ )
640
+ # In this case we cannot use the index parameter as this would give this error:
641
+ # Value Error -> 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'
642
+ # So we create a new column that preserves the original row IDs from the index. The nasme
643
+
644
+ elif preserve_index:
645
+ df_with_index = self._df.reset_index(
646
+ names=index_column,
647
+ inplace=False,
648
+ )
649
+ df_with_index.to_json(
650
+ path_or_buf=json_path,
651
+ orient=orient,
652
+ indent=2,
653
+ compression=compression,
654
+ date_format="iso",
655
+ )
387
656
  else:
388
- logger.warning("Data Frame is empty. Cannot write it to JSON")
389
- return False
390
- except FileNotFoundError:
391
- logger.error(
392
- "File -> '%s' not found. Please check the file path.", json_path
657
+ self._df.to_json(
658
+ path_or_buf=json_path,
659
+ orient=orient,
660
+ indent=2,
661
+ compression=compression,
662
+ date_format="iso",
663
+ )
664
+ else:
665
+ self.logger.warning(
666
+ "Data frame is empty. Cannot write it to JSON file -> '%s'.",
667
+ json_path,
393
668
  )
394
669
  return False
395
- except PermissionError:
396
- logger.error("Permission denied to access the file -> '%s'.", json_path)
397
- return False
398
- except IOError as e:
399
- logger.error("An I/O error occurred -> %s", str(e))
400
- return False
401
- except ValueError as e:
402
- logger.error("Value Error -> %s", str(e))
403
- return False
404
-
405
- else:
406
- logger.error(
407
- "Missing JSON file -> '%s' you have not specified a valid path!",
670
+ except FileNotFoundError:
671
+ self.logger.error(
672
+ "File -> '%s' not found. Please check the file path.",
673
+ json_path,
674
+ )
675
+ return False
676
+ except PermissionError:
677
+ self.logger.error(
678
+ "Permission denied to access the file -> '%s'.",
408
679
  json_path,
409
680
  )
410
681
  return False
682
+ except OSError:
683
+ self.logger.error("An I/O error occurred accessing file -> %s", json_path)
684
+ return False
685
+ except ValueError:
686
+ self.logger.error("Value error!")
687
+ return False
688
+
411
689
  return True
412
690
 
413
691
  # end method definition
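
The hunk above adds merge() and strip() and extends the JSON load/save pair with index handling and optional compression; on save, the matching file suffix (e.g. ".gz" for gzip) is appended automatically and a missing target directory is created. A round-trip sketch under these signatures (illustrative; paths are placeholders):

    from pyxecm.helper.data import Data

    d = Data([{"name": " Alice ", "dept": "R&D"}, {"name": "Bob ", "dept": "Sales"}])
    d.strip(columns=["name"])  # trim leading/trailing spaces in place

    # Written as data/employees.json.gz; the "data" directory is created if missing:
    d.save_json_data("data/employees.json", compression="gzip")

    d2 = Data()
    d2.load_json_data("data/employees.json.gz", compression="gzip")
    print(len(d2))  # 2
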
@@ -422,27 +700,40 @@ class Data:
422
700
  names: list | None = None,
423
701
  na_values: list | None = None,
424
702
  ) -> bool:
425
- """Load Excel (xlsx) data into DataFrame. Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
426
- read from a local filesystem or URL. Supports an option to read a single sheet or a list of sheets.
703
+ """Load Excel (xlsx) data into Pandas data frame.
704
+
705
+ Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
706
+ read from a local filesystem or URL. Supports an option to read a
707
+ single sheet or a list of sheets.
427
708
 
428
709
  Args:
429
- xlsx_path (str): Path to the Excel file.
430
- sheet_names (list | str | int, optional): Name or Index of the sheet in the Excel workbook to load.
431
- If 'None' then all sheets will be loaded.
432
- If 0 then first sheet in workbook will be loaded (this is the Default)
433
- If string then this is interpreted as the name of the sheet to load.
434
- If a list is passed, this can be a list of index values (int) or
435
- a list of strings with the sheet names to load.
436
- usecols (list | str, optional): List of columns to load, specified by general column names in Excel,
437
- e.g. usecols='B:D', usecols=['A', 'C', 'F']
438
- skip_rows (int, optional): List of rows to skip on top of the sheet (e.g. to not read headlines)
439
- header (int | None, optional): Excel Row (0-indexed) to use for the column labels of the parsed DataFrame.
440
- If file contains no header row, then you should explicitly pass header=None.
441
- Default is 0.
442
- names (list): List of column names to use. Default is None
443
- na_values (list, optional): List of values in the Excel that should become the Pandas NA value.
710
+ xlsx_path (str):
711
+ The path to the Excel file to load.
712
+ sheet_names (list | str | int, optional):
713
+ Name or Index of the sheet in the Excel workbook to load.
714
+ If 'None' then all sheets will be loaded.
715
+ If 0 then first sheet in workbook will be loaded (this is the Default).
716
+ If string then this is interpreted as the name of the sheet to load.
717
+ If a list is passed, this can be a list of index values (int) or
718
+ a list of strings with the sheet names to load.
719
+ usecols (list | str, optional):
720
+ A list of columns to load, specified by general column names in Excel,
721
+ e.g. usecols='B:D', usecols=['A', 'C', 'F']
722
+ skip_rows (int, optional):
723
+ List of rows to skip on top of the sheet (e.g. to not read headlines)
724
+ header (int | None, optional):
725
+ Excel Row (0-indexed) to use for the column labels of the parsed data frame.
726
+ If file contains no header row, then you should explicitly pass header=None.
727
+ Default is 0.
728
+ names (list, optional):
729
+ A list of column names to use. Default is None.
730
+ na_values (list, optional):
731
+ A list of values in the Excel that should become the Pandas NA value.
732
+
444
733
  Returns:
445
- bool: False in case an error occured, True otherwise.
734
+ bool:
735
+ False in case an error occured, True otherwise.
736
+
446
737
  """
447
738
 
448
739
  if xlsx_path is not None and os.path.exists(xlsx_path):
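
Using the parameter names documented above, loading selected columns from specific sheets of a workbook could look like this (illustrative; the workbook path and sheet names are placeholders):

    from pyxecm.helper.data import Data

    d = Data()
    ok = d.load_excel_data(
        "reports/q1.xlsx",
        sheet_names=["Plan", "Actuals"],  # multiple sheets are concatenated into one frame
        usecols="B:D",
        skip_rows=1,
        na_values=["n/a", "-"],
    )
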
@@ -457,16 +748,21 @@ class Data:
457
748
  names=names,
458
749
  na_values=na_values,
459
750
  )
460
- # if multiple sheets from an Excel workbook are loaded,
751
+ # If multiple sheets from an Excel workbook are loaded,
461
752
  # then read_excel() returns a dictionary. The keys are
462
- # the names of the sheets and the values are the Data Frames.
463
- # we handle this case as follows:
753
+ # the names of the sheets and the values are the data frames.
754
+ # As this class can only handle one data frame per object,
755
+ # We handle this case by concatenating the different sheets.
756
+ # If you don't want this make sure your Excel workbook has only
757
+ # one sheet or use the "sheet_name" parameter to select the one(s)
758
+ # you want to load.
464
759
  if isinstance(df, dict):
465
- logger.info("Loading multiple Excel sheets from the workbook!")
760
+ self.logger.info("Loading multiple Excel sheets from the workbook!")
466
761
  multi_sheet_df = pd.DataFrame()
467
- for sheet in df.keys():
762
+ for sheet in df:
468
763
  multi_sheet_df = pd.concat(
469
- [multi_sheet_df, df[sheet]], ignore_index=True
764
+ [multi_sheet_df, df[sheet]],
765
+ ignore_index=True,
470
766
  )
471
767
  df = multi_sheet_df
472
768
  if self._df is None:
@@ -474,199 +770,390 @@ class Data:
474
770
  else:
475
771
  self._df = pd.concat([self._df, df], ignore_index=True)
476
772
  except FileNotFoundError:
477
- logger.error(
478
- "File -> '%s' not found. Please check the file path.", xlsx_path
773
+ self.logger.error(
774
+ "Excel file -> '%s' not found. Please check the file path.",
775
+ xlsx_path,
479
776
  )
480
777
  return False
481
778
  except PermissionError:
482
- logger.error("Permission denied to access the file -> '%s'.", xlsx_path)
779
+ self.logger.error(
780
+ "Missing permission to access the Excel file -> '%s'.",
781
+ xlsx_path,
782
+ )
483
783
  return False
484
- except IOError as e:
485
- logger.error("An I/O error occurred -> %s", str(e))
784
+ except OSError:
785
+ self.logger.error(
786
+ "An I/O error occurred while reading the Excel file -> '%s'",
787
+ xlsx_path,
788
+ )
486
789
  return False
487
- except ValueError as e:
488
- logger.error("Invalid Excel input -> %s", str(e))
790
+ except ValueError:
791
+ self.logger.error(
792
+ "Invalid Excel input in file -> '%s'",
793
+ xlsx_path,
794
+ )
489
795
  return False
490
- except AttributeError as e:
491
- logger.error("Unexpected data structure -> %s", str(e))
796
+ except AttributeError:
797
+ self.logger.error("Unexpected data structure in file -> %s", xlsx_path)
492
798
  return False
493
- except TypeError as e:
494
- logger.error("Unexpected data type -> %s", str(e))
799
+ except TypeError:
800
+ self.logger.error("Unexpected data type in file -> %s", xlsx_path)
495
801
  return False
496
- except KeyError as e:
497
- logger.error("Missing key in Excel data -> %s", str(e))
802
+ except KeyError:
803
+ self.logger.error("Missing key in Excel data in file -> %s", xlsx_path)
498
804
  return False
499
805
 
500
806
  else:
501
- logger.error(
502
- "Missing Excel file -> '%s' you have not specified a valid path!",
807
+ self.logger.error(
808
+ "Missing Excel file -> '%s'. You have not specified a valid path!",
503
809
  xlsx_path,
504
810
  )
505
811
  return False
812
+
506
813
  return True
507
814
 
508
815
  # end method definition
509
816
 
510
817
  def save_excel_data(
511
- self, excel_path: str, sheet_name: str = "Pandas Export", index: bool = False
818
+ self,
819
+ excel_path: str,
820
+ sheet_name: str = "Pandas Export",
821
+ index: bool = False,
822
+ columns: list | None = None,
512
823
  ) -> bool:
513
- """
514
- Save the DataFrame to an Excel file, with robust error handling and logging.
824
+ """Save the data frame to an Excel file, with robust error handling and logging.
515
825
 
516
826
  Args:
517
- excel_path (str): The file path to save the Excel file.
518
- sheet_name (str): The sheet name where data will be saved. Default is 'Sheet1'.
519
- index: Whether to write the row names (index). Default is False.
827
+ excel_path (str):
828
+ The file path to save the Excel file.
829
+ sheet_name (str):
830
+ The sheet name where data will be saved. Default is 'Sheet1'.
831
+ index (bool, optional):
832
+ Whether to write the row names (index). Default is False.
833
+ columns (list | None, optional):
834
+ A list of column names to write into the excel file.
835
+
836
+ Returns:
837
+ bool:
838
+ True = success, False = error.
839
+
520
840
  """
841
+
521
842
  try:
522
843
  # Check if the directory exists
523
844
  directory = os.path.dirname(excel_path)
524
845
  if directory and not os.path.exists(directory):
525
- raise FileNotFoundError(
526
- "The directory -> '%s' does not exist." % directory
527
- )
846
+ os.makedirs(directory)
847
+
848
+ # Validate columns if provided
849
+ if columns:
850
+ existing_columns = [col for col in columns if col in self._df.columns]
851
+ missing_columns = set(columns) - set(existing_columns)
852
+ if missing_columns:
853
+ self.logger.warning(
854
+ "The following columns do not exist in the data frame and cannot be saved to Excel -> %s",
855
+ ", ".join(missing_columns),
856
+ )
857
+ columns = existing_columns
528
858
 
529
- # Attempt to save the DataFrame to Excel
530
- self._df.to_excel(excel_path, sheet_name=sheet_name, index=index)
531
- logger.info("Data saved successfully to -> %s", excel_path)
859
+ # Attempt to save the data frame to Excel:
860
+ self._df.to_excel(
861
+ excel_path,
862
+ sheet_name=sheet_name,
863
+ index=index,
864
+ columns=columns or None, # Pass None if no columns provided
865
+ )
866
+ self.logger.info(
867
+ "Data frame saved successfully to Excel file -> '%s'.",
868
+ excel_path,
869
+ )
532
870
 
533
- except FileNotFoundError as e:
534
- logger.error("Error: %s", e)
871
+ except FileNotFoundError:
872
+ self.logger.error(
873
+ "Cannot write data frame to Excel file -> '%s'",
874
+ excel_path,
875
+ )
535
876
  return False
536
877
  except PermissionError:
537
- logger.error(
538
- "Error: Permission denied. You do not have permission to write to '%s'.",
878
+ self.logger.error(
879
+ "Cannot write data frame to Excel file -> '%s'",
539
880
  excel_path,
540
881
  )
541
882
  return False
542
- except ValueError as ve:
543
- logger.error("Error: Invalid data for Excel format -> %s", ve)
544
- return False
545
- except OSError as oe:
546
- logger.error("Error: OS error occurred while saving file -> %s", oe)
883
+ except ValueError:
884
+ self.logger.error(
885
+ "Cannot write data frame to Excel file -> '%s'",
886
+ excel_path,
887
+ )
547
888
  return False
548
- except Exception as e:
549
- # Catch-all for any other unexpected errors
550
- logger.error("An unexpected error occurred -> %s", e)
889
+ except OSError:
890
+ self.logger.error(
891
+ "Cannot write data frame to Excel file -> '%s'",
892
+ excel_path,
893
+ )
551
894
  return False
552
895
 
553
896
  return True
554
897
 
555
898
  # end method definition
556
899
 
557
- def load_csv_data(self, csv_path: str) -> bool:
558
- """Load CSV (Comma separated values) data into DataFrame
900
+ def load_csv_data(
901
+ self,
902
+ csv_path: str,
903
+ delimiter: str = ",",
904
+ names: list | None = None,
905
+ header: int | None = 0,
906
+ usecols: list | None = None,
907
+ encoding: str = "utf-8",
908
+ ) -> bool:
909
+ """Load CSV (Comma separated values) data into data frame.
559
910
 
560
911
  Args:
561
- csv_path (str): Path to the CSV file.
912
+ csv_path (str):
913
+ The path to the CSV file.
914
+ delimiter (str, optional, length = 1):
915
+ The character used to delimit values. Default is "," (comma).
916
+ names (list | None, optional):
917
+ The list of column names. This is useful if file does not have a header line
918
+ but just the data.
919
+ header (int | None, optional):
920
+ The index of the header line. Default is 0 (first line). None indicates
921
+ that the file does not have a header line
922
+ usecols (list | None, optional):
923
+ There are three possible list values types:
924
+ 1. int:
925
+ These values are treated as column indices for columns to keep
926
+ (first column has index 0).
927
+ 2. str:
928
+ The names of the columns to keep. For this to work the file needs
929
+ either a header line (i.e. 'header != None') or the 'names'
930
+ parameter must be specified.
931
+ 3. bool:
932
+ The length of the list must match the number of columns. Only
933
+ columns that have a value of True are kept.
934
+ encoding (str, optional):
935
+ The encoding of the file. Default = "utf-8".
936
+
562
937
  Returns:
563
- bool: False in case an error occured, True otherwise.
938
+ bool:
939
+ False in case an error occured, True otherwise.
940
+
564
941
  """
565
942
 
566
- if csv_path is not None and os.path.exists(csv_path):
567
- # Load data from CSV file
943
+ if csv_path.startswith("http"):
944
+ # Download file from remote location specified by the packageUrl
945
+ # this must be a public place without authentication:
946
+ self.logger.debug("Download CSV file from URL -> '%s'.", csv_path)
947
+
568
948
  try:
569
- df = pd.read_csv(csv_path)
570
- if self._df is None:
571
- self._df = df
572
- else:
573
- self._df = pd.concat([self._df, df])
574
- except FileNotFoundError:
575
- logger.error(
576
- "File -> '%s' not found. Please check the file path.", csv_path
577
- )
578
- return False
579
- except PermissionError:
580
- logger.error("Permission denied to access the file -> %s.", csv_path)
949
+ response = requests.get(url=csv_path, timeout=1200)
950
+ response.raise_for_status()
951
+ except requests.exceptions.HTTPError:
952
+ self.logger.error("HTTP error with -> %s", csv_path)
581
953
  return False
582
- except IOError as e:
583
- logger.error("An I/O error occurred -> %s", str(e))
954
+ except requests.exceptions.ConnectionError:
955
+ self.logger.error("Connection error with -> %s", csv_path)
584
956
  return False
585
- except ValueError as e:
586
- logger.error("Invalid CSV input -> %s", str(e))
957
+ except requests.exceptions.Timeout:
958
+ self.logger.error("Timeout error with -> %s", csv_path)
587
959
  return False
588
- except AttributeError as e:
589
- logger.error("Unexpected data structure -> %s", str(e))
590
- return False
591
- except TypeError as e:
592
- logger.error("Unexpected data type -> %s", str(e))
593
- return False
594
- except KeyError as e:
595
- logger.error("Missing key in CSV data -> %s", str(e))
960
+ except requests.exceptions.RequestException:
961
+ self.logger.error("Request error with -> %s", csv_path)
596
962
  return False
597
963
 
598
- else:
599
- logger.error(
600
- "Missing CSV file -> '%s' you have not specified a valid path!",
964
+ self.logger.debug(
965
+ "Successfully downloaded CSV file -> %s; status code -> %s",
601
966
  csv_path,
967
+ response.status_code,
602
968
  )
603
- return False
604
- return True
605
969
 
606
- # end method definition
970
+ # Convert bytes to a string using utf-8 and create a file-like object
971
+ csv_file = StringIO(response.content.decode(encoding))
607
972
 
608
- def load_xml_data(
609
- self, xml_path: str, xpath: str | None = None, xslt_path: str | None = None
610
- ) -> bool:
611
- """Load XML data into DataFrame
973
+ elif os.path.exists(csv_path):
974
+ self.logger.debug("Using local CSV file -> '%s'.", csv_path)
975
+ csv_file = csv_path
612
976
 
613
- Args:
614
- xml_path (str): Path to the XML file.
615
- xpath (str, optional): XPath to the elements we want to select
616
- xslt_path (str, optional): XSLT transformation file
617
- Returns:
618
- bool: False in cause an error occured, True otherwise.
619
- """
977
+ else:
978
+ self.logger.error(
979
+ "Missing CSV file -> '%s' you have not specified a valid path!",
980
+ csv_path,
981
+ )
982
+ return False
620
983
 
984
+ # Load data from CSV file or buffer
621
985
  try:
622
- df = pd.read_xml(path_or_buffer=xml_path, xpath=xpath, stylesheet=xslt_path)
623
- # Process the loaded data as needed
986
+ df = pd.read_csv(
987
+ filepath_or_buffer=csv_file,
988
+ delimiter=delimiter,
989
+ names=names,
990
+ header=header,
991
+ usecols=usecols,
992
+ encoding=encoding,
993
+ skipinitialspace=True,
994
+ )
624
995
  if self._df is None:
625
996
  self._df = df
626
997
  else:
627
998
  self._df = pd.concat([self._df, df])
628
- logger.info("XML file loaded successfully!")
629
- return True
630
999
  except FileNotFoundError:
631
- print("File not found.")
1000
+ self.logger.error(
1001
+ "CSV file -> '%s' not found. Please check the file path.",
1002
+ csv_path,
1003
+ )
632
1004
  return False
633
1005
  except PermissionError:
634
- logger.error("Permission denied to access the file -> %s.", xml_path)
1006
+ self.logger.error(
1007
+ "Permission denied to access the CSV file -> '%s'.",
1008
+ csv_path,
1009
+ )
635
1010
  return False
636
- except IOError as e:
637
- logger.error("An I/O error occurred -> %s", str(e))
1011
+ except OSError:
1012
+ self.logger.error("An I/O error occurred!")
638
1013
  return False
639
- except ValueError as e:
640
- logger.error("Invalid CSV input -> %s", str(e))
1014
+ except ValueError:
1015
+ self.logger.error("Invalid CSV input in file -> %s", csv_path)
641
1016
  return False
642
- except AttributeError as e:
643
- logger.error("Unexpected data structure -> %s", str(e))
1017
+ except AttributeError:
1018
+ self.logger.error("Unexpected data structure in file -> %s", csv_path)
644
1019
  return False
645
- except TypeError as e:
646
- logger.error("Unexpected data type -> %s", str(e))
1020
+ except TypeError:
1021
+ self.logger.error("Unexpected data type in file -> %s", csv_path)
647
1022
  return False
648
- except KeyError as e:
649
- logger.error("Missing key in CSV data -> %s", str(e))
1023
+ except KeyError:
1024
+ self.logger.error("Missing key in CSV data -> %s", csv_path)
650
1025
  return False
651
1026
 
1027
+ return True
1028
+
652
1029
  # end method definition
653
1030
 
654
- def load_directory(self, path_to_root: str) -> bool:
655
- """Load directory structure into Pandas Data Frame
1031
+ def load_xml_data(
1032
+ self,
1033
+ xml_path: str,
1034
+ xpath: str | None = None,
1035
+ xslt_path: str | None = None,
1036
+ encoding: str = "utf-8",
1037
+ ) -> bool:
1038
+ """Load XML data into a Pandas data frame.
656
1039
 
657
1040
  Args:
658
- path_to_root (str): Path to the root element of the
659
- directory structure
1041
+ xml_path (str):
1042
+ The path to the XML file to load.
1043
+ xpath (str, optional):
1044
+ An XPath to the elements we want to select.
1045
+ xslt_path (str, optional):
1046
+ An XSLT transformation file to convert the XML data.
1047
+ encoding (str, optional):
1048
+ The encoding of the file. Default is UTF-8.
1049
+
1050
+ Returns:
1051
+ bool:
1052
+ False in case an error occured, True otherwise.
1053
+
1054
+ """
1055
+
1056
+ if xml_path.startswith("http"):
1057
+ # Download file from remote location specified by the packageUrl
1058
+ # this must be a public place without authentication:
1059
+ self.logger.debug("Download XML file from URL -> '%s'.", xml_path)
1060
+
1061
+ try:
1062
+ response = requests.get(url=xml_path, timeout=1200)
1063
+ response.raise_for_status()
1064
+ except requests.exceptions.HTTPError:
1065
+ self.logger.error("HTTP error with -> %s", xml_path)
1066
+ return False
1067
+ except requests.exceptions.ConnectionError:
1068
+ self.logger.error("Connection error with -> %s", xml_path)
1069
+ return False
1070
+ except requests.exceptions.Timeout:
1071
+ self.logger.error("Timeout error with -> %s", xml_path)
1072
+ return False
1073
+ except requests.exceptions.RequestException:
1074
+ self.logger.error("Request error with -> %s", xml_path)
1075
+ return False
1076
+
1077
+ self.logger.debug(
1078
+ "Successfully downloaded XML file -> '%s'; status code -> %s",
1079
+ xml_path,
1080
+ response.status_code,
1081
+ )
1082
+ # Convert bytes to a string using utf-8 and create a file-like object
1083
+ xml_file = StringIO(response.content.decode(encoding))
1084
+
1085
+ elif os.path.exists(xml_path):
1086
+ self.logger.debug("Using local XML file -> '%s'.", xml_path)
1087
+ xml_file = xml_path
1088
+
1089
+ else:
1090
+ self.logger.error(
1091
+ "Missing XML file -> '%s'. You have not specified a valid path or URL!",
1092
+ xml_path,
1093
+ )
1094
+ return False
1095
+
1096
+ # Load data from XML file or buffer
1097
+ try:
1098
+ df = pd.read_xml(
1099
+ path_or_buffer=xml_file,
1100
+ xpath=xpath,
1101
+ stylesheet=xslt_path,
1102
+ encoding=encoding,
1103
+ )
1104
+ # Process the loaded data as needed
1105
+ if self._df is None:
1106
+ self._df = df
1107
+ else:
1108
+ self._df = pd.concat([self._df, df])
1109
+ self.logger.info("XML file -> '%s' loaded successfully!", xml_path)
1110
+ except FileNotFoundError:
1111
+ self.logger.error("XML file -> '%s' not found.", xml_path)
1112
+ return False
1113
+ except PermissionError:
1114
+ self.logger.error(
1115
+ "Missing permission to access the XML file -> '%s'.",
1116
+ xml_path,
1117
+ )
1118
+ return False
1119
+ except OSError:
1120
+ self.logger.error("An I/O error occurred loading from -> %s", xml_path)
1121
+ return False
1122
+ except ValueError:
1123
+ self.logger.error("Invalid XML data in file -> %s", xml_path)
1124
+ return False
1125
+ except AttributeError:
1126
+ self.logger.error("Unexpected data structure in XML file -> %s", xml_path)
1127
+ return False
1128
+ except TypeError:
1129
+ self.logger.error("Unexpected data type in XML file -> %s", xml_path)
1130
+ return False
1131
+ except KeyError:
1132
+ self.logger.error("Missing key in XML file -> %s", xml_path)
1133
+ return False
1134
+
1135
+ return True
1136
+
1137
+ # end method definition
1138
+
1139
+ def load_directory(self, path_to_root: str) -> bool:
1140
+ """Load directory structure into Pandas data frame.
1141
+
1142
+ Args:
1143
+ path_to_root (str):
1144
+ Path to the root element of the directory structure.
660
1145
 
661
1146
  Returns:
662
1147
  bool: True = Success, False = Failure
1148
+
663
1149
  """
664
1150
 
665
1151
  try:
666
1152
  # Check if the provided path is a directory
667
1153
  if not os.path.isdir(path_to_root):
668
- logger.error(
669
- "The provided path -> '%s' is not a valid directory.", path_to_root
1154
+ self.logger.error(
1155
+ "The provided path -> '%s' is not a valid directory.",
1156
+ path_to_root,
670
1157
  )
671
1158
  return False
672
1159
 
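
load_csv_data() and load_xml_data() now accept either a local path or a public HTTP(S) URL, which is downloaded with requests and parsed from an in-memory buffer. A sketch based on the signatures above (illustrative; the URL and column names are placeholders):

    from pyxecm.helper.data import Data

    d = Data()
    ok = d.load_csv_data(
        "https://example.com/exports/users.csv",  # must be reachable without authentication
        delimiter=";",
        usecols=["id", "name", "email"],
    )
    if not ok:
        print("CSV data could not be loaded")
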
@@ -682,55 +1169,88 @@ class Data:
682
1169
  path_parts = relative_path.split(os.sep)
683
1170
 
684
1171
  # Create a dictionary with the path parts and file details
685
- entry = {
686
- "level {}".format(i): part
687
- for i, part in enumerate(path_parts[:-1], start=1)
688
- }
689
- entry.update({"filename": path_parts[-1], "size": file_size})
1172
+ entry = {"level {}".format(i): part for i, part in enumerate(path_parts[:-1], start=1)}
1173
+
1174
+ entry.update(
1175
+ {
1176
+ "filename": path_parts[-1],
1177
+ "size": file_size,
1178
+ "path": path_parts[1:-1],
1179
+ "relative_path": relative_path,
1180
+ "download_dir": root,
1181
+ },
1182
+ )
690
1183
  data.append(entry)
691
1184
 
692
- # Create DataFrame from list of dictionaries
1185
+ # Create data frame from list of dictionaries:
693
1186
  self._df = pd.DataFrame(data)
694
1187
 
695
1188
  # Determine the maximum number of levels
696
1189
  max_levels = max((len(entry) - 2 for entry in data), default=0)
697
1190
 
698
- # Ensure all entries have the same number of levels
1191
+ # Ensure all entries have the same number of levels:
699
1192
  for entry in data:
700
1193
  for i in range(1, max_levels + 1):
701
1194
  entry.setdefault("level {}".format(i), "")
702
1195
 
703
- # Convert to DataFrame again to make sure all columns are consistent
1196
+ # Convert to data frame again to make sure all columns are consistent:
704
1197
  self._df = pd.DataFrame(data)
705
1198
 
706
- except NotADirectoryError as nde:
707
- print(f"Error: {nde}")
708
- except FileNotFoundError as fnfe:
709
- print(f"Error: {fnfe}")
710
- except PermissionError as pe:
711
- print(f"Error: {pe}")
1199
+ except NotADirectoryError:
1200
+ self.logger.error(
1201
+ "Provided path -> '%s' is not a directory!",
1202
+ path_to_root,
1203
+ )
1204
+ return False
1205
+ except FileNotFoundError:
1206
+ self.logger.error(
1207
+ "Provided path -> '%s' does not exist in file system!",
1208
+ path_to_root,
1209
+ )
1210
+ return False
1211
+ except PermissionError:
1212
+ self.logger.error(
1213
+ "Permission error accessing path -> '%s'!",
1214
+ path_to_root,
1215
+ )
1216
+ return False
712
1217
 
713
1218
  return True
714
1219
 
715
1220
  # end method definition
716
1221
 
717
- def load_xml_directory(self, path_to_root: str, xpath: str | None = None) -> bool:
718
- """Load directory structure into Pandas Data Frame
1222
+ def load_xml_directory(
1223
+ self,
1224
+ path_to_root: str,
1225
+ xpath: str | None = None,
1226
+ xml_files: list | None = None,
1227
+ ) -> bool:
1228
+ """Load XML files from a directory structure into Pandas data frame.
719
1229
 
720
1230
  Args:
721
- path_to_root (str): Path to the root element of the
722
- directory structure
723
- xpath (str, optional): XPath to the elements we want to select
1231
+ path_to_root (str):
1232
+ Path to the root element of the directory structure.
1233
+ xpath (str, optional):
1234
+ XPath to the XML elements we want to select.
1235
+ xml_files (list | None, optional):
1236
+ Names of the XML files to load from the directory.
724
1237
 
725
1238
  Returns:
726
- bool: True = Success, False = Failure
1239
+ bool:
1240
+ True = Success, False = Failure
1241
+
727
1242
  """
728
1243
 
1244
+ # Establish a default if None is passed via the parameter:
1245
+ if not xml_files:
1246
+ xml_files = ["docovw.xml"]
1247
+
729
1248
  try:
730
1249
  # Check if the provided path is a directory
731
1250
  if not os.path.isdir(path_to_root):
732
- logger.error(
733
- "The provided path -> '%s' is not a valid directory.", path_to_root
1251
+ self.logger.error(
1252
+ "The provided path -> '%s' is not a valid directory.",
1253
+ path_to_root,
734
1254
  )
735
1255
  return False
736
1256
 
@@ -741,36 +1261,223 @@ class Data:
741
1261
  file_size = os.path.getsize(file_path)
742
1262
  file_name = os.path.basename(file_path)
743
1263
 
744
- if file_name == "docovw.xml":
745
- logger.info(
746
- "Load XML file -> '%s' of size -> %s", file_path, file_size
1264
+ if file_name in xml_files:
1265
+ self.logger.info(
1266
+ "Load XML file -> '%s' of size -> %s from -> '%s'...",
1267
+ file_name,
1268
+ file_size,
1269
+ file_path,
747
1270
  )
748
1271
  success = self.load_xml_data(file_path, xpath=xpath)
749
1272
  if success:
750
- logger.info(
751
- "Successfully loaded XML file -> '%s'", file_path
1273
+ self.logger.info(
1274
+ "Successfully loaded XML file -> '%s'.",
1275
+ file_path,
752
1276
  )
753
1277
 
754
- except NotADirectoryError as nde:
755
- logger.error("Error -> %s", str(nde))
756
- except FileNotFoundError as fnfe:
757
- logger.error("Error -> %s", str(fnfe))
758
- except PermissionError as pe:
759
- logger.error("Error -> %s", str(pe))
1278
+ except NotADirectoryError:
1279
+ self.logger.error(
1280
+ "Provided path -> '%s' is not a directory",
1281
+ path_to_root,
1282
+ )
1283
+ return False
1284
+ except FileNotFoundError:
1285
+ self.logger.error(
1286
+ "Provided path -> '%s' does not exist in file system!",
1287
+ path_to_root,
1288
+ )
1289
+ return False
1290
+ except PermissionError:
1291
+ self.logger.error(
1292
+ "Missing permission to access path -> '%s'",
1293
+ path_to_root,
1294
+ )
1295
+ return False
1296
+
1297
+ return True
1298
+
1299
+ # end method definition
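A hedged usage sketch for the new xml_files parameter. The import path, the no-argument constructor, and all directory, file, and XPath values below are assumptions for illustration only:

```python
from pyxecm.helper.data import Data  # assumed import path

data = Data()  # assuming the class can be constructed without arguments

success = data.load_xml_directory(
    path_to_root="/tmp/extracted_archive",      # made-up directory
    xpath=".//document",                        # made-up XPath selecting the XML elements to load
    xml_files=["docovw.xml", "metadata.xml"],   # only these file names are picked up while walking the tree
)
if not success:
    print("Loading XML files from the directory failed")
```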
1300
+
1301
+ def load_web_links(
1302
+ self,
1303
+ url: str,
1304
+ common_data: dict | None = None,
1305
+ pattern: str = r"",
1306
+ ) -> list | None:
1307
+ """Get all linked file URLs on a given web page (url) that are following a given pattern.
1308
+
1309
+ Construct a list of dictionaries based on this. This method is a helper method for load_web() below.
1310
+
1311
+ Args:
1312
+ url (str):
1313
+ The web page URL.
1314
+ common_data (dict | None, optional):
1315
+ Fields that should be added to each dictionary item. Defaults to None.
1316
+ pattern (str, optional):
1317
+ Regular Expression. Defaults to r"".
1318
+
1319
+ Returns:
1320
+ list | None:
1321
+ List of links on the web page that are complying with the given regular expression.
1322
+
1323
+ """
1324
+
1325
+ try:
1326
+ response = requests.get(url, timeout=300)
1327
+ response.raise_for_status()
1328
+ except requests.RequestException:
1329
+ self.logger.error("Failed to retrieve page at %s", url)
1330
+ return []
1331
+
1332
+ # Find all file links (hyperlinks) on the page (no file extension assumed)
1333
+ # Example filename pattern: "al022023.public.005"
1334
+ file_links = re.findall(r'href="([^"]+)"', response.text)
1335
+ if not file_links:
1336
+ self.logger.warning("No file links found on the web page -> %s", url)
1337
+ return []
1338
+
1339
+ result_list = []
1340
+ base_url = url if url.endswith("/") else url + "/"
1341
+
1342
+ for link in file_links:
1343
+ data = common_data.copy() if common_data else {}
1344
+
1345
+ # Construct the full URL
1346
+ full_url = base_url + link.lstrip("/")
1347
+
1348
+ if pattern:
1349
+ # Filter by expected naming pattern for links
1350
+ match = re.search(pattern, link)
1351
+ if not match:
1352
+ continue
1353
+
1354
+ # Extract and assign groups if they exist
1355
+ # TODO(mdiefenb): these names are currently hard-coded
1356
+ # for the National Hurricane Center Dataset (NHC)
1357
+ if len(match.groups()) >= 1:
1358
+ data["Code"] = match.group(1).upper()
1359
+ if len(match.groups()) >= 2:
1360
+ data["Type"] = match.group(2)
1361
+ if len(match.groups()) >= 3:
1362
+ data["Message ID"] = match.group(3)
1363
+
1364
+ data["URL"] = full_url
1365
+ data["Filename"] = link
1366
+
1367
+ result_list.append(data)
1368
+
1369
+ return result_list
1370
+
1371
+ # end method definition
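The pattern-matching part of load_web_links() maps up to three regex groups to the "Code", "Type", and "Message ID" fields. A small standalone sketch of that mapping; the link names and the pattern are illustrative only:

```python
import re

# Hypothetical link names as they might appear on an NHC-style index page:
links = ["al022023.public.005", "al022023.fstadv.012", "readme.txt"]

# Pattern with three groups: storm code, bulletin type, message id:
pattern = r"(al\d{6})\.(\w+)\.(\d+)"

for link in links:
    match = re.search(pattern, link)
    if not match:
        continue  # links that do not match the pattern are skipped
    print({"Code": match.group(1).upper(), "Type": match.group(2), "Message ID": match.group(3)})
```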
1372
+
1373
+ def load_web(
1374
+ self,
1375
+ values: list,
1376
+ value_name: str,
1377
+ url_templates: list,
1378
+ special_values: list | None = None,
1379
+ special_url_templates: dict | None = None,
1380
+ pattern: str = r"",
1381
+ ) -> bool:
1382
+ """Traverse years and bulletin types to collect all bulletin URLs.
1383
+
1384
+ Args:
1385
+ values (list):
1386
+ List of values to travers over
1387
+ value_name (str):
1388
+ Dictionary key to construct an item in combination with a value from values
1389
+ url_templates (list):
1390
+ URLs to travers per value. The URLs should contain one {} that is
1391
+ replace by the current value.
1392
+ special_values (list | None, optional):
1393
+ List of vales (a subset of the other values list)
1394
+ that we want to handle in a special way. Defaults to None.
1395
+ special_url_templates (dict | None, optional):
1396
+ URLs for the special values. Defaults to None.
1397
+ The dictionary keys are the special values. The
1398
+ dictionary values are lists of special URLs with placeholders.
1399
+ pattern (str, optional):
1400
+ Regular expression to find the proper links on the page. Defaults to r"".
1401
+
1402
+ Returns:
1403
+ bool:
1404
+ True for success, False in case of an error.
1405
+
1406
+ """
1407
+
1408
+ result_list = []
1409
+
1410
+ # We have two nested for loops below. The outer loop traverses over all placeholder values.
1411
+ # These could be the calendar years, e.g. [2003,...,2024]
1412
+ # The inner for loop traverses over the list of specified URLs. We can have multiple for
1413
+ # each value.
1414
+
1415
+ # Do we have a list of placeholder values we want to iterate over?
1416
+ if values:
1417
+ # Traverse all values in the values list:
1418
+ for value in values:
1419
+ # Do we want a special treatment for this value (e.g. the current year)
1420
+ if special_values and value in special_values:
1421
+ self.logger.info("Processing special value -> '%s'...", value)
1422
+ if value not in special_url_templates and str(value) not in special_url_templates:
1423
+ self.logger.error(
1424
+ "Cannot find key -> '%s' in special URL templates dictionary -> %s! Skipping...",
1425
+ value,
1426
+ str(special_url_templates),
1427
+ )
1428
+ continue
1429
+ # If the dictionary uses string keys then we need to convert the value
1430
+ # to a string as well to avoid key errors:
1431
+ if str(value) in special_url_templates:
1432
+ value = str(value)
1433
+ special_url_template_list = special_url_templates[value]
1434
+ for special_url_template in special_url_template_list:
1435
+ # Now the value is inserted into the placeholder in the URL:
1436
+ special_url = special_url_template.format(value)
1437
+ common_data = {value_name: value} if value_name else None
1438
+ result_list += self.load_web_links(
1439
+ url=special_url,
1440
+ common_data=common_data,
1441
+ pattern=pattern,
1442
+ )
1443
+ else: # normal URLs
1444
+ self.logger.info("Processing value -> '%s'...", value)
1445
+ for url_template in url_templates:
1446
+ # Now the value is inserted into the placeholder in the URL:
1447
+ url = url_template.format(value)
1448
+ common_data = {value_name: value} if value_name else None
1449
+ result_list += self.load_web_links(
1450
+ url=url,
1451
+ common_data=common_data,
1452
+ pattern=pattern,
1453
+ )
1454
+ else:
1455
+ for url_template in url_templates:
1456
+ url = url_template  # no placeholder value is available in this branch
1457
+ result_list += self.load_web_links(
1458
+ url=url,
1459
+ common_data=None,
1460
+ pattern=pattern,
1461
+ )
1462
+
1463
+ # Add the data list to the data frame:
1464
+ self.append(result_list)
760
1465
 
761
1466
  return True
762
1467
 
763
1468
  # end method definition
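A hedged end-to-end call of load_web(). The import path, the no-argument constructor, and all URLs below are placeholders, not the real data source:

```python
from pyxecm.helper.data import Data  # assumed import path

data = Data()  # assuming a no-argument constructor

success = data.load_web(
    values=list(range(2003, 2025)),                          # e.g. calendar years
    value_name="Year",
    url_templates=["https://example.com/archive/{}/"],       # {} is replaced by the current value
    special_values=[2024],                                    # handled via the special templates
    special_url_templates={2024: ["https://example.com/current/{}/"]},
    pattern=r"(al\d{6})\.(\w+)\.(\d+)",                       # illustrative link pattern
)
if success:
    print("Collected links were appended to the embedded data frame")
```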
764
1469
 
765
1470
  def partitionate(self, number: int) -> list:
766
- """Partition a data frame into equally sized
767
- partions
1471
+ """Partition a data frame into equally sized partitions.
768
1472
 
769
1473
  Args:
770
- number (int): Number of partitions
1474
+ number (int):
1475
+ The number of desired partitions.
771
1476
 
772
1477
  Returns:
773
- list: List of partitions
1478
+ list:
1479
+ A list of created partitions.
1480
+
774
1481
  """
775
1482
 
776
1483
  # Calculate the approximate size of each partition
@@ -784,24 +1491,20 @@ class Data:
784
1491
  number = 1
785
1492
  remainder = 0
786
1493
 
787
- logger.info(
788
- "Data set has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
1494
+ self.logger.info(
1495
+ "Data frame has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
789
1496
  str(size),
790
1497
  str(number),
791
1498
  str(partition_size),
792
1499
  str(remainder),
793
1500
  )
794
1501
 
795
- # Initialize a list to store partitions
1502
+ # Initialize a list to store partitions:
796
1503
  partitions = []
797
1504
  start_index = 0
798
1505
 
799
- # Slice the DataFrame into equally sized partitions
1506
+ # Slice the data frame into equally sized partitions:
800
1507
  for i in range(number):
801
- # start_index = i * partition_size
802
- # end_index = (i + 1) * partition_size if i < number - 1 else None
803
- # partition = self._df.iloc[start_index:end_index]
804
- # partitions.append(partition)
805
1508
  # Calculate the end index for this partition
806
1509
  end_index = start_index + partition_size + (1 if i < remainder else 0)
807
1510
  partition = self._df.iloc[start_index:end_index]
@@ -816,34 +1519,44 @@ class Data:
816
1519
  """Partition a data frame based on equal values in a specified column.
817
1520
 
818
1521
  Args:
819
- column_name (str): The column name to partition by
1522
+ column_name (str):
1523
+ The column name to partition by.
820
1524
 
821
1525
  Returns:
822
- list: List of partitions
1526
+ list | None:
1527
+ List of partitions or None in case of an error (e.g. column name does not exist).
1528
+
823
1529
  """
824
1530
 
825
1531
  if column_name not in self._df.columns:
826
- logger.error(
827
- "Column -> '%s' does not exist in the Data Frame. Data Frame has these columns -> %s",
1532
+ self.logger.error(
1533
+ "Cannot partitionate by column -> '%s'. Column does not exist in the data frame. Data frame has these columns -> %s",
828
1534
  column_name,
829
1535
  str(self._df.columns),
830
1536
  )
831
1537
  return None
832
1538
 
833
- # Separate rows with NaN or None values in the specified column
1539
+ # Separate rows with NaN or None values in the specified column:
834
1540
  nan_partitions = self._df[self._df[column_name].isna()]
1541
+
1542
+ # Keep only rows where the specified column has valid (non-NaN) values:
835
1543
  non_nan_df = self._df.dropna(subset=[column_name])
836
1544
 
837
- # Group by the specified column and create a list of DataFrames for each group
1545
+ # Group the non-NaN DataFrame by the specified column's values:
838
1546
  grouped = non_nan_df.groupby(column_name)
1547
+
1548
+ # Create a list of partitions (DataFrames) for each unique value in the column:
839
1549
  partitions = [group for _, group in grouped]
840
1550
 
841
- # Add each row with NaN or None values as its own partition
842
- for i in range(len(nan_partitions)):
843
- partitions.append(nan_partitions.iloc[[i]])
1551
+ # Add each row with NaN/None as its own partition
1552
+ # iterrows() returns each row as a Series. To convert it back to a DataFrame:
1553
+ # 1. .to_frame() turns the Series into a DataFrame, but with the original column names as rows.
1554
+ # 2. .T (transpose) flips it back, turning the original row into a proper DataFrame row.
1555
+ # This ensures that even rows with NaN values are treated as DataFrame partitions.
1556
+ partitions.extend([row.to_frame().T for _, row in nan_partitions.iterrows()])
844
1557
 
845
- logger.info(
846
- "Data Frame has been partitioned into -> %s partitions based on the values in column '%s'...",
1558
+ self.logger.info(
1559
+ "Data frame has been partitioned into -> %s partitions based on the values in column -> '%s'...",
847
1560
  str(len(partitions)),
848
1561
  column_name,
849
1562
  )
@@ -853,18 +1566,19 @@ class Data:
853
1566
  # end method definition
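A plain-pandas illustration of the two partitioning strategies above, using synthetic data:

```python
import pandas as pd

df = pd.DataFrame({"name": list("abcdefgh"),
                   "dept": ["x", "x", "y", "y", "y", "z", None, None]})

# Equally sized partitions (same slicing idea as partitionate(number=3)):
number = 3
partition_size, remainder = divmod(len(df), number)
partitions, start = [], 0
for i in range(number):
    end = start + partition_size + (1 if i < remainder else 0)
    partitions.append(df.iloc[start:end])
    start = end
print([len(p) for p in partitions])  # -> [3, 3, 2]

# Partitions by equal column values (same grouping idea as partitionate_by_column("dept")):
non_nan = df.dropna(subset=["dept"])
by_value = [group for _, group in non_nan.groupby("dept")]
# Each row with a missing value becomes its own partition:
by_value.extend(row.to_frame().T for _, row in df[df["dept"].isna()].iterrows())
print(len(by_value))  # -> 3 value groups + 2 NaN rows = 5
```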
854
1567
 
855
1568
  def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
856
- """Remove dupclicate rows that have all fields in
857
- unique_fields in common.
1569
+ """Remove dupclicate rows that have all fields in unique_fields in common.
858
1570
 
859
1571
  Args:
860
- unique_fields (list): Defines the fields for which we want a unique
861
- combination.
862
- inplace (bool, optional): True if the deduplication happens in-place.
863
- Defaults to True.
1572
+ unique_fields (list):
1573
+ Defines the fields for which we want a unique combination.
1574
+ inplace (bool, optional):
1575
+ True if the deduplication happens in-place. Defaults to True.
1576
+
864
1577
  Returns:
865
- pd.DataFrame | None: If inplace is False than a new deduplicatd DataFrame
866
- is returned. Otherwise the object is modified in place
867
- and self._df is returned.
1578
+ pd.DataFrame:
1579
+ If inplace is False then a new deduplicated data frame is returned.
1580
+ Otherwise the object is modified in place and self._df is returned.
1581
+
868
1582
  """
869
1583
 
870
1584
  if inplace:
@@ -878,34 +1592,38 @@ class Data:
878
1592
 
879
1593
  # end method definition
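A minimal plain-pandas sketch of what deduplicate(unique_fields=...) does with the rows; the sample values are made up:

```python
import pandas as pd

df = pd.DataFrame({
    "first": ["Ada", "Ada", "Grace"],
    "last": ["Lovelace", "Lovelace", "Hopper"],
    "note": ["draft", "final", "final"],
})

# Keep only the first row per unique ("first", "last") combination:
deduplicated = df.drop_duplicates(subset=["first", "last"])
print(len(deduplicated))  # -> 2
```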
880
1594
 
881
- def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
882
- """Sort the data frame based on one or multiple fields -
883
- either in place or return it as a new data frame (e.g. not modifying self._df)
1595
+ def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame | None:
1596
+ """Sort the data frame based on one or multiple fields.
1597
+
1598
+ Sorting can either be done in place or return a new data frame
1599
+ (i.e. not modifying self._df).
884
1600
 
885
1601
  Args:
886
- sort_fields (list): Columns / fields to be used for sorting
887
- inplace (bool, optional): If the sorting should be inplace, i.e. modifying self._df.
888
- Defaults to True.
1602
+ sort_fields (list):
1603
+ The columns / fields to be used for sorting.
1604
+ inplace (bool, optional):
1605
+ If the sorting should be inplace, i.e. modifying self._df.
1606
+ Defaults to True.
1607
+
889
1608
  Returns:
890
- pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1609
+ pd.DataFrame | None:
1610
+ New data frame (if inplace = False) or self._df (if inplace = True).
1611
+ None in case of an error.
1612
+
891
1613
  """
892
1614
 
893
1615
  if self._df is None:
894
1616
  return None
895
1617
 
896
1618
  if not all(sort_field in self._df.columns for sort_field in sort_fields):
897
- logger.warning(
898
- "Not all of the given sort fields -> %s do exist in the Data Frame.",
1619
+ self.logger.warning(
1620
+ "Not all of the given sort fields -> %s do exist in the data frame.",
899
1621
  str(sort_fields),
900
1622
  )
901
- # Reduce the sort fields to those that really exist in the DataFrame:
902
- sort_fields = [
903
- sort_field
904
- for sort_field in sort_fields
905
- if sort_field in self._df.columns
906
- ]
907
- logger.warning(
908
- "Only these given sort fields -> %s do exist as columns in the Data Frame.",
1623
+ # Reduce the sort fields to those that really exist in the data frame:
1624
+ sort_fields = [sort_field for sort_field in sort_fields if sort_field in self._df.columns]
1625
+ self.logger.warning(
1626
+ "Only these given sort fields -> %s do exist as columns in the data frame.",
909
1627
  str(sort_fields),
910
1628
  )
911
1629
 
@@ -920,138 +1638,278 @@ class Data:
920
1638
 
921
1639
  # end method definition
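A hedged usage sketch for sort(). The import path and the no-argument constructor are assumptions; append() is the method load_web() above uses to add rows from a list of dictionaries:

```python
from pyxecm.helper.data import Data  # assumed import path

data = Data()  # assuming a no-argument constructor
data.append([{"name": "beta", "size": 2}, {"name": "alpha", "size": 1}])

# Only sort fields that exist as columns are used; unknown fields are dropped with a warning:
sorted_df = data.sort(sort_fields=["name", "does_not_exist"], inplace=False)
print(sorted_df)
```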
922
1640
 
923
- def flatten(
924
- self,
925
- parent_field: str,
926
- flatten_fields: list,
927
- ):
928
- """Flatten a sub-dictionary by copying selected fields to the
929
- parent dictionary. This is e.g. useful for then de-duplicate
930
- a data set.
1641
+ def flatten(self, parent_field: str, flatten_fields: list, concatenator: str = "_") -> None:
1642
+ """Flatten a sub-dictionary by copying selected fields to the parent dictionary.
1643
+
1644
+ This is e.g. useful to then de-duplicate a data frame.
1645
+ Flattening a data frame makes sense in situations where a column used
1646
+ to have a list of dictionaries and got "exploded" (see explode_and_flatten()
1647
+ method below). In this case the column has dictionary values that then can
1648
+ be flattened.
931
1649
 
932
1650
  Args:
933
- parent_field (str): name of the field in the parent dictionary
934
- flatten_fields (list): fields in the sub-dictionary to copy
935
- into the parent dictionary.
1651
+ parent_field (str):
1652
+ Name prefix of the new column in the data frame. The flattened field
1653
+ names are added with a leading underscore.
1654
+ flatten_fields (list):
1655
+ Fields in the dictionary of the source column that are copied
1656
+ as new columns into the data frame.
1657
+ concatenator (str, optional):
1658
+ Character or string used to concatenate the parent field with the flattened field
1659
+ to create a unique name.
1660
+
936
1661
  """
937
1662
 
1663
+ # First do a sanity check if the data frame is not yet initialized.
1664
+ if self._df is None:
1665
+ self.logger.error(
1666
+ "The data frame is not initialized or empty. Cannot flatten field(s) -> '%s' in the data frame.",
1667
+ flatten_fields,
1668
+ )
1669
+ return
1670
+
1671
+ if parent_field not in self._df.columns:
1672
+ self.logger.warning(
1673
+ "The parent field -> '%s' cannot be flattened as it doesn't exist as column in the data frame!",
1674
+ parent_field,
1675
+ )
1676
+ return
1677
+
938
1678
  for flatten_field in flatten_fields:
939
- flat_field = parent_field + "_" + flatten_field
1679
+ flat_field = parent_field + concatenator + flatten_field
940
1680
  # The following expression generates a new column in the
941
1681
  # data frame with the name of 'flat_field'.
942
- # In the lambada function x is a dictionary that includes the subvalues
1682
+ # In the lambda function x is a dictionary that includes the subvalues
943
1683
  # and it returns the value of the given flatten field
944
1684
  # (if it exists, otherwise None). So x is self._df[parent_field], i.e.
945
1685
  # what the lambda function gets 'applied' on.
946
1686
  self._df[flat_field] = self._df[parent_field].apply(
947
- lambda x, sub_field=flatten_field: (
948
- x.get(sub_field, None) if isinstance(x, dict) else None
949
- )
1687
+ lambda x, sub_field=flatten_field: (x.get(sub_field, None) if isinstance(x, dict) else None),
950
1688
  )
951
1689
 
952
1690
  # end method definition
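A standalone pandas sketch of the flattening pattern above: dictionary values in one column become scalar companion columns. The column and key names are made up:

```python
import pandas as pd

df = pd.DataFrame({"workspace": [{"name": "W1", "id": 101}, {"name": "W2", "id": 102}, None]})

# Same pattern as flatten(parent_field="workspace", flatten_fields=["name", "id"]):
for sub_field in ["name", "id"]:
    df["workspace_" + sub_field] = df["workspace"].apply(
        lambda x, sub_field=sub_field: x.get(sub_field, None) if isinstance(x, dict) else None
    )
print(df[["workspace_name", "workspace_id"]])
```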
953
1691
 
954
1692
  def explode_and_flatten(
955
1693
  self,
956
- explode_field: str | list,
1694
+ explode_fields: str | list,
957
1695
  flatten_fields: list | None = None,
958
1696
  make_unique: bool = False,
959
1697
  reset_index: bool = False,
960
1698
  split_string_to_list: bool = False,
961
- ) -> pd.DataFrame:
962
- """Explode a substructure in the Data Frame
1699
+ separator: str = ";,",
1700
+ ) -> pd.DataFrame | None:
1701
+ """Explode a substructure in the Pandas data frame.
963
1702
 
964
1703
  Args:
965
- explode_field (str | list): Field(s) to explode which each has/have a list structure.
966
- Exploding multiple columns at once is possible. This delivers
967
- a very different result compared to exploding one column after
968
- the other!
969
- flatten_fields (list): Fields in the exploded substructure to include
970
- in the main dictionaries for easier processing.
971
- make_unique (bool, optional): if True deduplicate the exploded data frame.
972
- flatten (bool, optional): if True flatten the exploded data frame.
1704
+ explode_fields (str | list):
1705
+ Field(s) to explode. Each field to explode should have a list structure.
1706
+ Exploding multiple columns at once is possible. This delivers
1707
+ a very different result compared to exploding one column after the other!
1708
+ flatten_fields (list):
1709
+ Fields in the exploded substructure to include
1710
+ in the main dictionaries for easier processing.
1711
+ make_unique (bool, optional):
1712
+ If True, deduplicate the exploded data frame.
1713
+ reset_index (bool, False):
1714
+ If True, then the index is reset, False = Index is not reset.
1715
+ split_string_to_list (bool, optional):
1716
+ If True flatten the exploded data frame.
1717
+ separator (str, optional):
1718
+ Characters used to split the string values in the given column into a list.
1719
+
973
1720
  Returns:
974
- pd.DataFrame: Pointer to the Pandas DataFrame
1721
+ pd.DataFrame | None:
1722
+ Pointer to the Pandas data frame.
1723
+
975
1724
  """
976
1725
 
977
- def update_column(row):
978
- try:
979
- if sub in row:
980
- return row[sub]
981
- except (IndexError, KeyError, ValueError):
982
- return ""
983
-
984
- # Define a function to split a string into a list
985
- def string_to_list(string: str | None) -> list:
986
- if not string or pd.isna(string):
1726
+ def update_column(row: pd.Series, sub: str) -> str:
1727
+ """Extract the value of a sub-column from a nested dictionary within a Pandas Series.
1728
+
1729
+ Args:
1730
+ row (pd.Series):
1731
+ A row from the data frame.
1732
+ sub (str):
1733
+ The sub-column name to extract.
1734
+
1735
+ Returns:
1736
+ str:
1737
+ The value of the sub-column, or an empty string if not found.
1738
+
1739
+ """
1740
+
1741
+ if isinstance(row, dict) and sub in row:
1742
+ return row[sub]
1743
+ return ""
1744
+
1745
+ # end def update_column()
1746
+
1747
+ def string_to_list(value: str) -> list:
1748
+ """Convert a string to a list by splitting it using a specified separator.
1749
+
1750
+ If the input is already a list, it is returned as-is. If the input is `None` or a missing value,
1751
+ an empty list is returned. Otherwise, the string is split into a list of substrings using
1752
+ the given separator. Leading and trailing spaces in the resulting substrings are removed.
1753
+
1754
+ Args:
1755
+ value (str):
1756
+ The input string to be converted into a list. Can also be a list, `None`,
1757
+ or a missing value (e.g., NaN).
1758
+
1759
+ Returns:
1760
+ list:
1761
+ A list of substrings if the input is a string, or an empty list if the input
1762
+ is `None` or a missing value. If the input is already a list, it is returned unchanged.
1763
+
1764
+ """
1765
+
1766
+ # Check if the value is already a list; if so, return it directly
1767
+ if isinstance(value, list):
1768
+ return value
1769
+
1770
+ # If the value is None or a missing value (e.g., NaN), return an empty list
1771
+ if not value or pd.isna(value):
987
1772
  return []
988
- # Use regular expression to split by comma, semicolon, or comma followed by space
989
- return re.split(r"[;,]\s*", str(string))
990
1773
 
991
- if isinstance(explode_field, list):
992
- logger.info("Explode multiple columns -> %s", str(explode_field))
993
- elif isinstance(explode_field, str):
994
- logger.info("Explode single column -> '%s'", explode_field)
1774
+ # Use a regular expression to split the string by the separator
1775
+ # and remove leading/trailing spaces from each resulting substring
1776
+ return_list = re.split(rf"[{separator}]\s*", str(value))
1777
+
1778
+ return return_list
1779
+
1780
+ # end def string_to_list()
1781
+
1782
+ #
1783
+ # Start of main method:
1784
+ #
1785
+
1786
+ # First do a sanity check if the data frame is not yet initialized.
1787
+ if self._df is None:
1788
+ self.logger.error(
1789
+ "The data frame is not initialized or empty. Cannot explode data frame.",
1790
+ )
1791
+ return None
1792
+
1793
+ # Next do a sanity check for the given explode_field. It should
1794
+ # either be a string (single column name) or a list (multiple column names):
1795
+ if isinstance(explode_fields, list):
1796
+ self.logger.info("Exploding list of columns -> %s", str(explode_fields))
1797
+ elif isinstance(explode_fields, str):
1798
+ self.logger.info("Exploding single column -> '%s'", explode_fields)
995
1799
  else:
996
- logger.error(
997
- "Illegal explode field(s) data type provided -> %s", type(explode_field)
1800
+ self.logger.error(
1801
+ "Illegal explode field(s) data type -> %s. Explode field must either be a string or a list of strings.",
1802
+ type(explode_fields),
998
1803
  )
999
1804
  return self._df
1000
1805
 
1001
- if split_string_to_list:
1002
- # Apply the function to convert the 'string_column' values to lists
1003
- self._df[explode_field] = self._df[explode_field].apply(string_to_list)
1806
+ # Ensure explode_fields is a list for uniform processing:
1807
+ if isinstance(explode_fields, str):
1808
+ explode_fields = [explode_fields]
1809
+
1810
+ # Process nested field names with '.'
1811
+ processed_fields = []
1812
+ for field in explode_fields:
1813
+ # The "." indicates that the column has dictionary values:
1814
+ if "." in field:
1815
+ main, sub = field.split(".", 1)
1816
+ if main not in self._df.columns:
1817
+ self.logger.error(
1818
+ "The column -> '%s' does not exist in the data frame! Cannot explode it. Data frame has these columns -> %s",
1819
+ main,
1820
+ str(self._df.columns.tolist()),
1821
+ )
1822
+ continue
1823
+
1824
+ # Use update_column to extract the dictionary key specified by the sub value:
1825
+ self.logger.info(
1826
+ "Extracting dictionary value for key -> '%s' from column -> '%s'.",
1827
+ sub,
1828
+ main,
1829
+ )
1830
+ self._df[main] = self._df[main].apply(update_column, args=(sub,))
1831
+ processed_fields.append(main)
1832
+ else:
1833
+ processed_fields.append(field)
1834
+
1835
+ # Verify all processed fields exist in the data frame:
1836
+ missing_columns = [col for col in processed_fields if col not in self._df.columns]
1837
+ if missing_columns:
1838
+ self.logger.error(
1839
+ "The following columns are missing in the data frame and cannot be exploded -> %s. Data frame has these columns -> %s",
1840
+ missing_columns,
1841
+ str(self._df.columns.tolist()),
1842
+ )
1843
+ return self._df
1004
1844
 
1845
+ # Handle splitting strings into lists if required:
1846
+ if split_string_to_list:
1847
+ for field in processed_fields:
1848
+ self.logger.info(
1849
+ "Splitting strings in column -> '%s' into lists using separator -> '%s'",
1850
+ field,
1851
+ separator,
1852
+ )
1853
+ # Apply the function to convert the string values in the column (give by the name in explode_field) to lists
1854
+ # The string_to_list() sub-method above also considers the separator parameter.
1855
+ self._df[field] = self._df[field].apply(string_to_list)
1856
+
1857
+ # Explode all specified columns at once.
1858
+ # explode() can either take a string field or a list of fields.
1859
+ # # It is VERY important to do the explosion of multiple columns together -
1860
+ # otherwise we get combinatorial explosion. Explosion of multiple columns 1-by-1
1861
+ # is VERY different from doing the explosion together!
1862
+ self.logger.info("Validated column(s) to explode -> %s", processed_fields)
1005
1863
  try:
1006
- # remove the sub dictionary that sometimes is introduced by
1007
- # XML loading
1008
- if "." in explode_field:
1009
- main = explode_field.split(".")[0]
1010
- sub = explode_field.split(".")[1]
1011
- self._df[main] = self._df[main].apply(update_column)
1012
- explode_field = main
1013
- # Explode the field that has list values
1014
- self._df = self._df.explode(column=explode_field)
1015
- except KeyError:
1016
- logger.error("Column -> '%s' not found in Data Frame!", str(explode_field))
1864
+ self._df = self._df.explode(
1865
+ column=processed_fields,
1866
+ ignore_index=reset_index,
1867
+ )
1017
1868
  except ValueError:
1018
- logger.error(
1019
- "Unable to explode the specified column -> '%s'!", str(explode_field)
1869
+ self.logger.error(
1870
+ "Error exploding columns -> %s",
1871
+ processed_fields,
1020
1872
  )
1873
+ return self._df
1021
1874
 
1022
1875
  if flatten_fields:
1023
- self.flatten(parent_field=explode_field, flatten_fields=flatten_fields)
1876
+ # Ensure that flatten() is called for each exploded column
1877
+ for field in processed_fields:
1878
+ self.flatten(parent_field=field, flatten_fields=flatten_fields)
1024
1879
 
1880
+ # Deduplicate rows if required
1025
1881
  if make_unique:
1026
1882
  self._df.drop_duplicates(subset=flatten_fields, inplace=True)
1027
1883
 
1884
+ # Reset index explicitly if not handled during explode
1028
1885
  if reset_index:
1029
- self._df.reset_index(inplace=True)
1886
+ self._df.reset_index(drop=True, inplace=True)
1030
1887
 
1031
1888
  return self._df
1032
1889
 
1033
1890
  # end method definition
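The comment above stresses that multiple columns must be exploded together. A small plain-pandas demonstration of why, with synthetic data (multi-column explode assumes pandas >= 1.3):

```python
import pandas as pd

df = pd.DataFrame({"id": [1], "colors": [["red", "blue"]], "sizes": [["S", "M"]]})

# Exploding both list columns together keeps the value pairs aligned (2 rows)...
together = df.explode(column=["colors", "sizes"], ignore_index=True)

# ...while exploding them one after the other produces the cross product (4 rows):
one_by_one = df.explode("colors").explode("sizes")

print(len(together), len(one_by_one))  # -> 2 4
```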
1034
1891
 
1035
1892
  def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1036
- """Drop selected columns from the Data Frame
1893
+ """Drop selected columns from the Pandas data frame.
1037
1894
 
1038
1895
  Args:
1039
- column_names (list): list of column names to drop.
1040
- inplace (bool, optional): If the dropping should be inplace, i.e. modifying self._df.
1041
- Defaults to True.
1896
+ column_names (list):
1897
+ The list of column names to drop.
1898
+ inplace (bool, optional):
1899
+ Whether or not the dropping should be inplace, i.e. modifying self._df.
1900
+ Defaults to True.
1901
+
1042
1902
  Returns:
1043
- pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1903
+ pd.DataFrame:
1904
+ New data frame (if inplace = False) or self._df (if inplace = True)
1905
+
1044
1906
  """
1045
1907
 
1046
1908
  if not all(column_name in self._df.columns for column_name in column_names):
1047
- # Reduce the column names to those that really exist in the DataFrame:
1048
- column_names = [
1049
- column_name
1050
- for column_name in column_names
1051
- if column_name in self._df.columns
1052
- ]
1053
- logger.warning(
1054
- "Reduce to these columns -> %s that do exist in the Data Frame.",
1909
+ # Reduce the column names to those that really exist in the data frame:
1910
+ column_names = [column_name for column_name in column_names if column_name in self._df.columns]
1911
+ self.logger.info(
1912
+ "Drop columns -> %s from the data frame.",
1055
1913
  str(column_names),
1056
1914
  )
1057
1915
 
@@ -1065,25 +1923,26 @@ class Data:
1065
1923
  # end method definition
1066
1924
 
1067
1925
  def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1068
- """Keep only selected columns from the Data Frame. Drop the rest.
1926
+ """Keep only selected columns in the data frame. Drop the rest.
1069
1927
 
1070
1928
  Args:
1071
- column_names (list): list of column names to keep.
1072
- inplace (bool, optional): If the keeping should be inplace, i.e. modifying self._df.
1073
- Defaults to True.
1929
+ column_names (list):
1930
+ A list of column names to keep.
1931
+ inplace (bool, optional):
1932
+ If the keeping should be inplace, i.e. modifying self._df.
1933
+ Defaults to True.
1934
+
1074
1935
  Returns:
1075
- pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1936
+ pd.DataFrame:
1937
+ New data frame (if inplace = False) or self._df (if inplace = True).
1938
+
1076
1939
  """
1077
1940
 
1078
1941
  if not all(column_name in self._df.columns for column_name in column_names):
1079
- # Reduce the column names to those that really exist in the DataFrame:
1080
- column_names = [
1081
- column_name
1082
- for column_name in column_names
1083
- if column_name in self._df.columns
1084
- ]
1085
- logger.warning(
1086
- "Reduce to these columns -> %s that do exist in the Data Frame.",
1942
+ # Reduce the column names to those that really exist in the data frame:
1943
+ column_names = [column_name for column_name in column_names if column_name in self._df.columns]
1944
+ self.logger.info(
1945
+ "Reduce columns to keep to these columns -> %s that do exist in the data frame.",
1087
1946
  column_names,
1088
1947
  )
1089
1948
 
@@ -1101,262 +1960,797 @@ class Data:
1101
1960
 
1102
1961
  # end method definition
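A tiny plain-pandas contrast of the two column operations above:

```python
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})

# drop_columns(["b"]) removes only the listed columns...
print(df.drop(columns=["b"]).columns.tolist())  # -> ['a', 'c']

# ...while keep_columns(["a"]) removes everything except the listed columns:
print(df[["a"]].columns.tolist())               # -> ['a']
```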
1103
1962
 
1104
- def cleanse(self, cleansings: dict):
1105
- """Cleanse data with regular expressions and upper/lower case conversion.
1963
+ def rename_column(self, old_column_name: str, new_column_name: str) -> bool:
1964
+ """Rename a data frame column.
1106
1965
 
1107
1966
  Args:
1108
- cleansings (dict): Dictionary with keys that equal the column names.
1109
- The dictionary values are dictionaries itself with
1110
- these fields:
1111
- * replacements (dict): name of a column in the data frame
1112
- * upper (bool): change the value to uppercase
1113
- * lower (bool): change the value to lowercase
1114
- Example:
1115
- cleansings = {
1116
- "airportName": {
1117
- "upper": true
1118
- "replacements" : {
1119
- "-": " ", # replace hypen with space
1120
- ",\s*": " ", # remove commas followed by on or more spaces with a single space
1121
- "\s+$": "", # remove trailing spaces at the end of the name
1122
- "^\s+": "", # remove spaces at the beginning of the name
1123
- }
1124
- "length": 10
1125
- }
1126
- "airportId": {
1127
- "upper": true
1128
- "replacements" : {
1129
- "K(.{3})": "\1", # if the airport has 4 charters and starts with a 'K' we remove the 'K'
1130
- "\/": "", # remove forward slashes - this helps to have consistency with N/A, NA, n/a, na
1131
- }
1132
- }
1133
- }
1967
+ old_column_name (str):
1968
+ The old name of the column.
1969
+ new_column_name (str):
1970
+ The new name of the column.
1971
+
1972
+ Returns:
1973
+ bool:
1974
+ True = Success, False = Error
1975
+
1976
+ """
1977
+
1978
+ if self._df is None:
1979
+ return False
1980
+
1981
+ if old_column_name not in self._df.columns:
1982
+ self.logger.error(
1983
+ "Cannot rename column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
1984
+ old_column_name,
1985
+ str(self._df.columns),
1986
+ )
1987
+ return False
1988
+
1989
+ if new_column_name in self._df.columns:
1990
+ self.logger.error(
1991
+ "Cannot rename column -> '%s' to -> '%s'. New name does already exist as column in the data frame! Data frame has these columns -> %s",
1992
+ old_column_name,
1993
+ new_column_name,
1994
+ str(self._df.columns),
1995
+ )
1996
+ return False
1997
+
1998
+ self._df.rename(columns={old_column_name: new_column_name}, inplace=True)
1999
+
2000
+ return True
2001
+
2002
+ # end method definition
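A hedged usage sketch for the new rename_column() method; the import path, the no-argument constructor, and the column names are assumptions:

```python
from pyxecm.helper.data import Data  # assumed import path

data = Data()  # assuming a no-argument constructor
data.append([{"Name": "Widget", "Qty": 3}])

# Returns False (and logs an error) if the old name is missing or the new name already exists:
if not data.rename_column(old_column_name="Qty", new_column_name="Quantity"):
    print("Rename failed")
```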
2003
+
2004
+ def is_dict_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
2005
+ """Safely checks if a column predominantly contains dictionary-like objects.
2006
+
2007
+ Args:
2008
+ column (pd.Series):
2009
+ The pandas Series (column) to check.
2010
+ threshold (float, optional):
2011
+ 0.0 < threshold <= 1.0. Float representation of the percentage.
2012
+ Default = 0.5 (50%).
2013
+
2014
+ Returns:
2015
+ bool:
2016
+ True if the column contains mostly dictionary-like objects, False otherwise.
2017
+
1134
2018
  """
1135
2019
 
1136
- # Iterate over each column in regex_dict
2020
+ if not isinstance(column, pd.Series):
2021
+ self.logger.error(
2022
+ "Expected Pandas series, but got -> %s",
2023
+ str(type(column)),
2024
+ )
2025
+ return False
2026
+ if not 0.0 < threshold <= 1.0:
2027
+ self.logger.error(
2028
+ "Threshold must be between 0.0 and 1.0, but got -> %s",
2029
+ str(threshold),
2030
+ )
2031
+ return False
2032
+
2033
+ # Drop null values (NaN or None) and check types of remaining values
2034
+ non_null_values = column.dropna()
2035
+ dict_count = non_null_values.apply(lambda x: isinstance(x, dict)).sum()
2036
+
2037
+ # If more than threshold % of non-null values are dictionaries, return True.
2038
+ # Else return False.
2039
+ return dict_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
2040
+
2041
+ # end method definition
2042
+
2043
+ def is_list_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
2044
+ """Safely checks if a column predominantly contains list-like objects.
2045
+
2046
+ Args:
2047
+ column (pd.Series):
2048
+ The pandas Series (column) to check.
2049
+ threshold (float, optional):
2050
+ 0.0 < threshold <= 1.0. Float representation of the percentage. Default = 0.5 (50%).
2051
+
2052
+ Returns:
2053
+ bool:
2054
+ True if the column contains list-like objects, False otherwise.
2055
+
2056
+ """
2057
+
2058
+ if not isinstance(column, pd.Series):
2059
+ self.logger.error(
2060
+ "Expected pandas series, but got -> %s",
2061
+ str(type(column)),
2062
+ )
2063
+ return False
2064
+ if not 0.0 < threshold <= 1.0:
2065
+ self.logger.error(
2066
+ "Threshold must be between 0.0 and 1.0, but got -> %s",
2067
+ str(threshold),
2068
+ )
2069
+ return False
2070
+
2071
+ # Drop null values (NaN or None) and check types of remaining values
2072
+ non_null_values = column.dropna()
2073
+ list_count = non_null_values.apply(lambda x: isinstance(x, list)).sum()
2074
+
2075
+ # If more than threshold % of non-null values are lists, return True.
2076
+ # Else return False.
2077
+ return list_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
2078
+
2079
+ # end method definition
2080
+
2081
+ def is_string_column(self, column: pd.Series) -> bool:
2082
+ """Determine if a Pandas series predominantly contains string values, ignoring NaN values.
2083
+
2084
+ Args:
2085
+ column (pd.Series):
2086
+ The Pandas Series to check.
2087
+
2088
+ Returns:
2089
+ bool:
2090
+ True if all non-NaN values in the column are strings, False otherwise.
2091
+
2092
+ """
2093
+
2094
+ # Drop NaN values and check if remaining values are strings
2095
+ return column.dropna().map(lambda x: isinstance(x, str)).all()
2096
+
2097
+ # end method definition
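The three column-type checks above all follow the same "share of non-null values" idea. A standalone sketch of that check for dictionary values; the threshold and sample data are illustrative:

```python
import pandas as pd

col = pd.Series([{"a": 1}, {"b": 2}, None, "not a dict"])

non_null = col.dropna()
dict_share = non_null.apply(lambda x: isinstance(x, dict)).sum() / len(non_null)
print(dict_share > 0.5)  # -> True: two of the three non-null values are dictionaries
```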
2098
+
2099
+ def cleanse(self, cleansings: dict) -> None:
2100
+ """Cleanse data with regular expressions and upper/lower case conversions.
2101
+
2102
+ Args:
2103
+ cleansings (dict):
2104
+ Dictionary with keys that equal the column names.
2105
+ The dictionary values are dictionaries themselves with
2106
+ these fields:
2107
+ * replacements (dict): regex patterns (keys) and their replacement strings (values) applied to the column values
2108
+ * upper (bool, optional, default = False): change the value to uppercase
2109
+ * lower (bool, optional, default = False): change the value to lowercase
2110
+ * capitalize (bool, optional, default = False) - first character upper case, rest lower-case
2111
+ * title (bool, optional, default = False) - first character of each word upper case
2112
+ * length (int, optional, default = 0): truncate to max length
2113
+
2114
+ """
2115
+
2116
+ # Iterate over each column in the cleansing dictionary
1137
2117
  for column, cleansing in cleansings.items():
1138
- # "colum" is the name of the field we want to cleanse.
1139
- # "cleansing" is a dict with
2118
+ # Read the cleansing parameters:
2119
+ replacements = cleansing.get("replacements", {})
2120
+ upper = cleansing.get("upper", False)
2121
+ lower = cleansing.get("lower", False)
2122
+ capitalize = cleansing.get("capitalize", False)
2123
+ title = cleansing.get("title", False)
2124
+ length = cleansing.get("length", 0)
2125
+
2126
+ # Handle dict columns - we expect the column name to separate
2127
+ # main field from sub field using a dot syntax (e.g., "column.subfield")
1140
2128
  if "." in column:
1141
- # Handle columns with subfields
1142
- main_field, sub_field = column.split(".")
1143
- if not main_field in self._df.columns:
2129
+ column, dict_key = column.split(".")
2130
+ if column not in self._df.columns:
2131
+ self.logger.error(
2132
+ "Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
2133
+ column,
2134
+ str(self._df.columns),
2135
+ )
1144
2136
  continue
1145
- # we use the additional parameters for lambda (beside x)
1146
- # to avoid linter warning W0640
1147
- self._df[main_field] = self._df[main_field].apply(
1148
- lambda x, sub_field=sub_field, cleansing=cleansing: self._cleanse_subfield(
2137
+ # Apply cleansing to dictionary values in the main column
2138
+ self.logger.info(
2139
+ "Cleansing for column -> '%s' has a subfield -> '%s' configured. Do cleansing for dictionary items with key -> '%s'...",
2140
+ column,
2141
+ dict_key,
2142
+ dict_key,
2143
+ )
2144
+ self._df[column] = self._df[column].apply(
2145
+ lambda x,
2146
+ dict_key=dict_key,
2147
+ replacements=replacements,
2148
+ upper=upper,
2149
+ lower=lower,
2150
+ capitalize=capitalize,
2151
+ title=title,
2152
+ length=length: self._cleanse_subfield(
1149
2153
  data=x,
1150
- sub_field=sub_field,
1151
- replacements=cleansing.get("replacements", {}),
1152
- upper=cleansing.get("upper", False),
1153
- lower=cleansing.get("lower", False),
1154
- length=cleansing.get("length", 0),
1155
- )
2154
+ dict_key=dict_key,
2155
+ replacements=replacements,
2156
+ upper=upper,
2157
+ lower=lower,
2158
+ capitalize=capitalize,
2159
+ title=title,
2160
+ length=length,
2161
+ ),
1156
2162
  )
1157
- else:
1158
- if not column in self._df.columns:
2163
+ # end if "." in column
2164
+ else: # the else case handles strings and list columns
2165
+ if column not in self._df.columns:
2166
+ self.logger.error(
2167
+ "Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
2168
+ column,
2169
+ str(self._df.columns),
2170
+ )
1159
2171
  continue
1160
2172
 
1161
- logger.debug("\nBEFORE:\n%s\n", self._df[column])
2173
+ # Handle string columns:
2174
+ if self.is_string_column(self._df[column]):
2175
+ # Apply cleansing operations on string column
2176
+ self.logger.info(
2177
+ "Column -> '%s' has string values. Do cleansing for string values...",
2178
+ column,
2179
+ )
2180
+ self._df[column] = self._df[column].apply(
2181
+ lambda x,
2182
+ replacements=replacements,
2183
+ upper=upper,
2184
+ lower=lower,
2185
+ capitalize=capitalize,
2186
+ title=title,
2187
+ length=length: (
2188
+ self._apply_string_cleansing(
2189
+ value=x,
2190
+ replacements=replacements,
2191
+ upper=upper,
2192
+ lower=lower,
2193
+ capitalize=capitalize,
2194
+ title=title,
2195
+ length=length,
2196
+ )
2197
+ if isinstance(x, str)
2198
+ else x
2199
+ ),
2200
+ )
1162
2201
 
1163
- if cleansing.get("upper", False) and self._df[column].dtype == "object":
1164
- self._df[column] = self._df[column].str.upper()
1165
- if cleansing.get("lower", False) and self._df[column].dtype == "object":
1166
- self._df[column] = self._df[column].str.lower()
2202
+ # Handle list columns:
2203
+ elif self.is_list_column(self._df[column]):
2204
+ # Handle list-like columns. For this we iterate over each list item
2205
+ # and apply the cleansing by calling _apply_string_cleansing() for each item:
2206
+ self.logger.info(
2207
+ "Column -> '%s' has list values. Do cleansing for each list item...",
2208
+ column,
2209
+ )
2210
+ self._df[column] = self._df[column].apply(
2211
+ lambda x,
2212
+ replacements=replacements,
2213
+ upper=upper,
2214
+ lower=lower,
2215
+ capitalize=capitalize,
2216
+ title=title,
2217
+ length=length: (
2218
+ [
2219
+ (
2220
+ self._apply_string_cleansing(
2221
+ value=item,
2222
+ replacements=replacements,
2223
+ upper=upper,
2224
+ lower=lower,
2225
+ capitalize=capitalize,
2226
+ title=title,
2227
+ length=length,
2228
+ )
2229
+ if isinstance(
2230
+ item,
2231
+ str,
2232
+ ) # we just change string list items
2233
+ else item
2234
+ )
2235
+ for item in x
2236
+ ]
2237
+ if isinstance(x, list)
2238
+ else x
2239
+ ),
2240
+ )
1167
2241
 
1168
- # Handle regular columns
1169
- for regex_pattern, replacement in cleansing.get(
1170
- "replacements", {}
1171
- ).items():
1172
- # if replacement:
1173
- # \b is a word boundary anchor in regular expressions.
1174
- # It matches a position where one side is a word character
1175
- # (like a letter or digit) and the other side is a non-word character
1176
- # (like whitespace or punctuation). It's often used to match whole words.
1177
- # regex_pattern = rf"\b{regex_pattern}\b"
1178
- # self._df[column] = self._df[column].replace(
1179
- # regex=regex_pattern, value=replacement
1180
- # )
1181
- self._df[column] = self._df[column].str.replace(
1182
- pat=regex_pattern, repl=replacement, regex=True
2242
+ else:
2243
+ self.logger.error(
2244
+ "Column -> '%s' is not a string, list, or dict-like column. Skipping cleansing...",
2245
+ column,
1183
2246
  )
2247
+ # end else handling strings and lists
2248
+ # for column, cleansing in cleansings.items()
2249
+
2250
+ # end method definition
2251
+
2252
+ def _cleanse_dictionary(
2253
+ self,
2254
+ data: dict,
2255
+ dict_key: str,
2256
+ replacements: dict[str, str],
2257
+ upper: bool,
2258
+ lower: bool,
2259
+ capitalize: bool = False,
2260
+ title: bool = False,
2261
+ length: int = 0,
2262
+ ) -> dict:
2263
+ """Cleanse dictionary data within a single column value that has a given key.
2264
+
2265
+ Args:
2266
+ data (dict):
2267
+ The column dictionary value.
2268
+ dict_key (str):
2269
+ The dictionary key whose value should be cleansed in the row to cleanse.
2270
+ replacements (dict):
2271
+ Dictionary of regex replacements to apply to the subfield value.
2272
+ upper (bool):
2273
+ If True, convert value in subfield to upper-case.
2274
+ lower (bool):
2275
+ If True, convert value in subfield to lower-case.
2276
+ capitalize (bool, optional):
2277
+ If True, capitalize the first letter of the subfield value.
2278
+ title (bool, optional):
2279
+ If True, title-case the subfield value.
2280
+ length (int, optional):
2281
+ The maximum length for the subfield value.
2282
+
2283
+ Returns:
2284
+ dict:
2285
+ The updated data with the cleansing applied to the dictionary item with the given key.
2286
+
2287
+ """
2288
+
2289
+ if pd.isna(data):
2290
+ return data
1184
2291
 
1185
- if (
1186
- cleansing.get("length", 0) > 0
1187
- and self._df[column].dtype == "object"
1188
- ):
1189
- self._df[column] = self._df[column].str.slice(
1190
- 0, cleansing["length"]
2292
+ if dict_key not in data:
2293
+ self.logger.warning(
2294
+ "The dictionary key -> '%s' (field) is not in the data frame row! Cleansing skipped!",
2295
+ dict_key,
2296
+ )
2297
+ return data
2298
+
2299
+ # 1. Read the value to be cleansed from the data dict:
2300
+ value = data[dict_key]
2301
+
2302
+ # 2. Apply string operations based on the type of the value (str, list, or dict)
2303
+
2304
+ if isinstance(value, str):
2305
+ # If the value is a string, apply the string operations directly
2306
+ value: str = self._apply_string_cleansing(
2307
+ value=value,
2308
+ replacements=replacements,
2309
+ upper=upper,
2310
+ lower=lower,
2311
+ capitalize=capitalize,
2312
+ title=title,
2313
+ length=length,
2314
+ )
2315
+ elif isinstance(value, list):
2316
+ # If the value is a list, apply string operations to each element
2317
+ value: list = [
2318
+ (
2319
+ self._apply_string_cleansing(
2320
+ value=item,
2321
+ replacements=replacements,
2322
+ upper=upper,
2323
+ lower=lower,
2324
+ capitalize=capitalize,
2325
+ title=title,
2326
+ length=length,
1191
2327
  )
2328
+ if isinstance(item, str)
2329
+ else item
2330
+ )
2331
+ for item in value
2332
+ ]
2333
+ elif isinstance(value, dict):
2334
+ # If the value is a dictionary, apply string operations to each value
2335
+ value: dict = {
2336
+ k: (
2337
+ self._apply_string_cleansing(
2338
+ value=v,
2339
+ replacements=replacements,
2340
+ upper=upper,
2341
+ lower=lower,
2342
+ capitalize=capitalize,
2343
+ title=title,
2344
+ length=length,
2345
+ )
2346
+ if isinstance(v, str)
2347
+ else v
2348
+ )
2349
+ for k, v in value.items()
2350
+ }
1192
2351
 
1193
- logger.debug("\nAFTER:\n%s\n", self._df[column])
2352
+ # 3. Write back the cleansed value to the data dict:
2353
+ data[dict_key] = value
2354
+
2355
+ return data
1194
2356
 
1195
2357
  # end method definition
1196
2358
 
1197
2359
  def _cleanse_subfield(
1198
2360
  self,
1199
- data: list | dict,
1200
- sub_field: str,
1201
- replacements: dict,
2361
+ data: dict | list,
2362
+ dict_key: str,
2363
+ replacements: dict[str, str],
1202
2364
  upper: bool,
1203
2365
  lower: bool,
2366
+ capitalize: bool = False,
2367
+ title: bool = False,
1204
2368
  length: int = 0,
1205
- ) -> list | dict:
1206
- """Helper function to cleanse subfield data
2369
+ ) -> dict | list:
2370
+ """Cleanse subfield data within a single column value.
2371
+
2372
+ This is NOT a pd.Series but either a dictionary or a list of dictionaries.
1207
2373
 
1208
2374
  Args:
1209
- data (list | dict): sub data - either a list of dictionaries or a dictionary
1210
- sub_field (str): defines which field in the sub data should be updated
1211
- regex_replacements (dict): Dictionary of regular expressions
1212
- upper (bool): if True transform value in subfield to upper-case
1213
- lower (bool): if True, transform value in subfield to lower-case
1214
- length (int, optional): maximum length of the strings
2375
+ data (dict | list):
2376
+ The column value. Can be a dictionary or a list of dictionaries
2377
+ dict_key (str):
2378
+ The dictionary key whose value should be cleansed in the data to cleanse.
2379
+ replacements (dict):
2380
+ Dictionary of regex replacements to apply to the subfield value.
2381
+ upper (bool):
2382
+ If True, convert value in subfield to upper-case.
2383
+ lower (bool):
2384
+ If True, convert value in subfield to lower-case.
2385
+ capitalize (bool, optional):
2386
+ If True, capitalize the first letter of the subfield value.
2387
+ title (bool, optional):
2388
+ If True, title-case the subfield value.
2389
+ length (int, optional):
2390
+ The maximum length for the subfield value.
2391
+
1215
2392
  Returns:
1216
- list | dict: Updated data
2393
+ dict | list:
2394
+ The updated data with the cleansing applied to the subfield.
2395
+
1217
2396
  """
1218
2397
 
1219
2398
  if isinstance(data, list):
1220
- # If data is a list, apply cleansing to each dictionary in the list
1221
- for i, item in enumerate(data):
1222
- if (
1223
- item is not None
1224
- and sub_field in item
1225
- and not pd.isnull(item[sub_field])
1226
- ):
1227
- if upper:
1228
- item[sub_field] = item[sub_field].upper()
1229
- elif lower:
1230
- item[sub_field] = item[sub_field].lower()
1231
- for regex_pattern, replacement in replacements.items():
1232
- if replacement:
1233
- regex_pattern = rf"\b{regex_pattern}\b"
1234
- item[sub_field] = re.sub(
1235
- regex_pattern, replacement, item[sub_field]
1236
- )
1237
- if length > 0:
1238
- item[sub_field] = item[sub_field][:length]
1239
- data[i] = item
1240
- elif isinstance(data, dict):
1241
- # If data is a dictionary, apply cleansing directly to the subfield
1242
- if sub_field in data and not pd.isnull(data[sub_field]):
1243
- if upper:
1244
- data[sub_field] = data[sub_field].upper()
1245
- elif lower:
1246
- data[sub_field] = data[sub_field].lower()
1247
- for regex_pattern, replacement in replacements.items():
1248
- if replacement:
1249
- regex_pattern = rf"\b{regex_pattern}\b"
1250
- data[sub_field] = re.sub(
1251
- regex_pattern, replacement, data[sub_field]
2399
+ data = [
2400
+ (
2401
+ self._cleanse_dictionary(
2402
+ data=item,
2403
+ dict_key=dict_key,
2404
+ replacements=replacements,
2405
+ upper=upper,
2406
+ lower=lower,
2407
+ capitalize=capitalize,
2408
+ title=title,
2409
+ length=length,
1252
2410
  )
1253
- if length > 0:
1254
- data[sub_field] = data[sub_field][:length]
2411
+ if item is not None and dict_key in item and not pd.isna(item[dict_key])
2412
+ else item
2413
+ )
2414
+ for item in data
2415
+ ]
2416
+ elif isinstance(data, dict):
2417
+ data = self._cleanse_dictionary(
2418
+ data=data,
2419
+ dict_key=dict_key,
2420
+ replacements=replacements,
2421
+ upper=upper,
2422
+ lower=lower,
2423
+ capitalize=capitalize,
2424
+ title=title,
2425
+ length=length,
2426
+ )
2427
+
1255
2428
  return data
1256
2429
 
1257
2430
  # end method definition
1258
2431
 
1259
- def filter(self, conditions: list, inplace: bool = True) -> pd.DataFrame:
1260
- """Filter the DataFrame based on (multiple) conditions.
2432
+ def _apply_string_cleansing(
2433
+ self,
2434
+ value: str,
2435
+ replacements: dict[str, str],
2436
+ upper: bool,
2437
+ lower: bool,
2438
+ capitalize: bool,
2439
+ title: bool,
2440
+ length: int,
2441
+ ) -> str | None:
2442
+ """Apply string operations (upper, lower, capitalize, title-case, replacements) to a string.
1261
2443
 
1262
2444
  Args:
1263
- conditions (list): Conditions are a list of dictionaries with 3 items:
1264
- * field (str): name of a column in the data frame
1265
- * value (str or list): expected value (filter criterium).
1266
- If it is a list then one of
1267
- the list elements must match the field value (OR)
1268
- * regex (bool): this flag controls if the value is interpreted as a
1269
- regular expression. If there is no regex item in the
1270
- dictionary then the default is False (= values is NOT regex).
1271
- If there are multiple conditions in the list each has to evaluate to True (AND)
1272
- inplace (bool, optional): Defines if the self._df is modified (inplace) or just
1273
- a new DataFrame is returned. Defaults to True.
2445
+ value (str):
2446
+ The string value to which the operations will be applied.
2447
+ replacements (dict[str, str]):
2448
+ A dictionary of regular expression patterns (keys) and replacement strings (values) to apply to the string.
2449
+ upper (bool):
2450
+ If True, convert the string to uppercase.
2451
+ lower (bool):
2452
+ If True, convert the string to lowercase.
2453
+ capitalize (bool):
2454
+ If True, capitalize the first letter of the string and lowercase the rest. Default is False.
2455
+ title (bool):
2456
+ If True, convert the string to title-case (first letter of each word is capitalized). Default is False.
2457
+ length (int):
2458
+ If greater than 0, truncate the string to this length. Default is 0 (no truncation).
2459
+
1274
2460
  Returns:
1275
- pd.DataFrame: new data frame or pointer to self._df (depending on the value of 'inplace')
2461
+ str | None:
2462
+ The updated string with all the applied operations. None in case an error occurred.
2463
+
2464
+ Example:
2465
+ value = "hello world"
2466
+ replacements = {r"world": "there"}
2467
+ upper = True
2468
+ length = 5
2469
+
2470
+ result = _apply_string_cleansing(value, replacements, upper, length=length)
2471
+ # result would be "HELLO"
2472
+
2473
+ """
2474
+
2475
+ if not isinstance(
2476
+ value,
2477
+ str,
2478
+ ): # Only apply string operations if the value is a string
2479
+ return None
2480
+
2481
+ if upper:
2482
+ value = value.upper()
2483
+ if lower:
2484
+ value = value.lower()
2485
+ if capitalize:
2486
+ value = value.capitalize()
2487
+ if title:
2488
+ value = value.title()
2489
+
2490
+ # Handle regex replacements
2491
+ for regex_pattern, replacement in replacements.items():
2492
+ if regex_pattern:
2493
+ # Check if the pattern does NOT contain any regex special characters
2494
+ # (excluding dot and ampersand) and ONLY then use \b ... \b
2495
+ # Special regexp characters include: ^ $ * + ? ( ) | [ ] { } \
2496
+ if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
2497
+ # Wrap with word boundaries for whole-word matching
2498
+ # \b is a word boundary anchor in regular expressions.
2499
+ # It matches a position where one side is a word character
2500
+ # (like a letter or digit) and the other side is a non-word character
2501
+ # (like whitespace or punctuation). It's used to match whole words.
2502
+ # We want to have this to e.g. not replace "INT" with "INTERNATIONAL"
2503
+ # if the word is already "INTERNATIONAL". It is important
2504
+ # that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
2505
+ # a regular expression but just a normal string.
2506
+ # TODO: we may reconsider if re.escape() is required or not:
2507
+ regex_pattern = re.escape(regex_pattern)
2508
+ regex_pattern = rf"\b{regex_pattern}\b"
2509
+ try:
2510
+ value = re.sub(regex_pattern, replacement, value)
2511
+ except re.error:
2512
+ self.logger.error(
2513
+ "Invalid regex pattern -> '%s' in replacement processing!",
2514
+ regex_pattern,
2515
+ )
2516
+ continue
2517
+
2518
+ # Truncate to the specified length, starting from index 0
2519
+ if 0 < length < len(value):
2520
+ value = value[:length]
2521
+
2522
+ return value
2523
+
2524
+ # end method definition
2525
+
2526
+ def filter(
2527
+ self,
2528
+ conditions: list,
2529
+ inplace: bool = True,
2530
+ reset_index: bool = True,
2531
+ ) -> pd.DataFrame | None:
2532
+ """Filter the data frame based on (multiple) conditions.
2533
+
2534
+ Args:
2535
+ conditions (list):
2536
+ Conditions are a list of dictionaries with 3 items:
2537
+ * field (str): The name of a column in the data frame
2538
+ * value (str or list):
2539
+ Expected value (filter criterion).
2540
+ If it is a list then one of the list elements must match the field value (OR)
2541
+ * equal (bool):
2542
+ Whether to test for equality or non-equality. If not specified, 'equal' is treated as True.
2543
+ * regex (bool):
2544
+ This flag controls if the value is interpreted as a
2545
+ regular expression. If there is no regex item in the
2546
+ dictionary then the default is False (= the value is NOT a regex).
2547
+ * enabled (bool):
2548
+ True or False. The filter is only applied if 'enabled' is True.
2549
+ If there are multiple conditions in the list, each has to evaluate to True (AND). See the usage sketch after this method.
2550
+ inplace (bool, optional):
2551
+ Defines if the self._df is modified (inplace) or just
2552
+ a new data frame is returned. Defaults to True.
2553
+ reset_index (bool, optional):
2554
+ The filter removes rows. If reset_index = True then the numbering
2555
+ of the index is recalculated.
2556
+
2557
+ Returns:
2558
+ pd.DataFrame | None:
2559
+ A new data frame or pointer to self._df (depending on the value of 'inplace').
2560
+ None in case of an error.
2561
+
1276
2562
  """
1277
2563
 
1278
2564
  if self._df is None:
1279
- logger.error("DataFrame is not initialized.")
2565
+ self.logger.error("Data frame is not initialized.")
1280
2566
  return None
1281
2567
 
1282
2568
  if self._df.empty:
1283
- logger.error("DataFrame is empty.")
2569
+ self.logger.error("Data frame is empty.")
1284
2570
  return None
1285
2571
 
1286
- # first filtered_df is the full DataFreame.
1287
- # then it is subsequentially reduced by each condition
2572
+ # First filtered_df is the full data frame.
2573
+ # Then it is subsequently reduced by each condition
1288
2574
  # at the end it is just those rows that match all conditions.
1289
- filtered_df = self._df
2575
+ filtered_df = self._df if inplace else self._df.copy()
2576
+
2577
+ def list_matches(row: list, values: list) -> bool:
2578
+ """Check if any item in the 'values' list is present in the given 'row' list.
2579
+
2580
+ Args:
2581
+ row (list):
2582
+ A list of items from the data frame column.
2583
+ values (list):
2584
+ A list of values to check for in the 'row'.
2585
+
2586
+ Returns:
2587
+ bool:
2588
+ True if any item in 'values' is found in 'row', otherwise False.
2589
+
2590
+ """
2591
+
2592
+ return any(item in values for item in row)
2593
+
2594
+ def dict_matches(row: dict, key: str, values: list) -> bool:
2595
+ """Check if the value for the dictionary 'key' is in 'values'.
2596
+
2597
+ Args:
2598
+ row (dict):
2599
+ A dictionary from the data frame column.
2600
+ key (str):
2601
+ The key to lookup in the dictionary.
2602
+ values (list):
2603
+ A list of values to check for in the 'row'.
1290
2604
 
1291
- # We traverse a list of conditions. Each condition must evaluate to true
2605
+ Returns:
2606
+ bool:
2607
+ True, if the value for the dictionary key is in 'values', otherwise False.
2608
+
2609
+ """
2610
+
2611
+ if not row or key not in row:
2612
+ return False
2613
+
2614
+ return row[key] in values
2615
+
2616
+ # We traverse a list of conditions. Each condition must evaluate to True
1292
2617
  # otherwise the current workspace or document (i.e. the data set for these objects)
1293
- # will be skipped. The variable filtered_df is
2618
+ # will be skipped.
1294
2619
  for condition in conditions:
2620
+ # Check if the condition is enabled. If 'enabled' is not
2621
+ # in the condition dict then we assume it is enabled.
2622
+ if not condition.get("enabled", True):
2623
+ continue
1295
2624
  field = condition.get("field", None)
1296
2625
  if not field:
1297
- logger.error("Missing value for filter condition field in payload!")
2626
+ self.logger.error(
2627
+ "Missing value for filter condition 'field' in payload!",
2628
+ )
1298
2629
  continue
2630
+ if "." in field:
2631
+ field, sub = field.split(".", 1)
2632
+ else:
2633
+ sub = None
2634
+
1299
2635
  if field not in self._df.columns:
1300
- logger.warning(
1301
- "Filter condition field -> %s does not exist as column in data frame! Data frame has these columns -> %s",
2636
+ self.logger.warning(
2637
+ "Filter condition field -> '%s' does not exist as column in the data frame! Data frame has these columns -> %s",
1302
2638
  field,
1303
2639
  str(self._df.columns),
1304
2640
  )
1305
- continue # Skip filtering for columns not present in DataFrame
2641
+ continue # Skip filtering for columns not present in data frame
2642
+
2643
+ regex = condition.get("regex", False)
2644
+ # We need the column to be of type string if we want to use regular expressions
2645
+ # so if the column is not yet a string we convert the column to string:
2646
+ if regex and filtered_df[field].dtype != "object":
2647
+ # Change type of column to string:
2648
+ filtered_df[field] = filtered_df[field].astype(str)
2649
+ filtered_df[field] = filtered_df[field].fillna("")
2650
+
1306
2651
  value = condition.get("value", None)
1307
- if not value:
1308
- logger.error(
1309
- "Missing filter value of for filter condition field -> '%s'!", field
2652
+ if value is None:
2653
+ # Support alternative syntax using plural.
2654
+ value = condition.get("values", None)
2655
+ if value is None:
2656
+ self.logger.error(
2657
+ "Missing filter value(s) for filter condition field -> '%s'!",
2658
+ field,
1310
2659
  )
1311
2660
  continue
1312
- regex = condition.get("regex", False)
1313
-
1314
- logger.info(
1315
- "Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
1316
- filtered_df.shape[0],
1317
- filtered_df.shape[1],
1318
- str(condition),
1319
- )
1320
-
1321
- filtered_dfs = []
1322
2661
 
1323
2662
  # if a single string is passed as value we put
1324
2663
  # it into an 1-item list to simplify the following code:
1325
2664
  if not isinstance(value, list):
1326
2665
  value = [value]
1327
2666
 
1328
- # multiple values are treated like a logical "or" condition
1329
- for value_item in value:
1330
- if regex:
1331
- filtered_dfs.append(
1332
- filtered_df[
1333
- ~filtered_df[field].isna()
1334
- & filtered_df[field].str.contains(value_item, regex=True)
1335
- ]
2667
+ # If all values in the condition are strings then we
2668
+ # want the column also to be of type string:
2669
+ if all(isinstance(v, str) for v in value):
2670
+ # Change type of column to string:
2671
+ # filtered_df[field] = filtered_df[field].astype(str)
2672
+ # filtered_df[field] = filtered_df[field].fillna("").astype(str)
2673
+ # filtered_df[field] = filtered_df[field].fillna("")
2674
+
2675
+ # When inplace == True, filtered_df is just a reference to self._df.
2676
+ # Using .loc[:, field] ensures that Pandas updates the column correctly in self._df.
2677
+ # When inplace == False, filtered_df is a full copy (self._df.copy() above),
2678
+ # so modifications remain in filtered_df.
2679
+ # .loc[:, field] ensures no SettingWithCopyWarning, since filtered_df is now a separate DataFrame.
2680
+ filtered_df.loc[:, field] = filtered_df[field].fillna("").astype(str)
2681
+
2682
+ self.logger.info(
2683
+ "Data frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
2684
+ str(filtered_df.shape[0]),
2685
+ str(filtered_df.shape[1]),
2686
+ str(condition),
2687
+ )
2688
+
2689
+ # Check if the column is boolean
2690
+ if pd.api.types.is_bool_dtype(filtered_df[field]):
2691
+ # Convert string representations of booleans to actual booleans
2692
+ value = [v.lower() in ["true", "1"] if isinstance(v, str) else bool(v) for v in value]
2693
+
2694
+ # Do we want to test for equality or non-equality?
2695
+ # For lists equality means: value is in the list
2696
+ # For lists non-equality means: value is NOT in the list
2697
+ test_for_equal = condition.get("equal", True)
2698
+
2699
+ # Check if the column contains only lists (every non-empty element in the column is a list).
2700
+ # `filtered_df[field]`: Access the column with the name specified in 'field'.
2701
+ # `.dropna()`: Drop None or NaN rows for the test.
2702
+ # `.apply(lambda x: isinstance(x, list))`: For each element in the column, check if it is a list.
2703
+ # `.all()`: Ensure that all elements in the column satisfy the condition of being a list.
2704
+ if filtered_df[field].dropna().apply(lambda x: isinstance(x, list)).all():
2705
+ if not test_for_equal:
2706
+ filtered_df = filtered_df[~filtered_df[field].apply(list_matches, values=value)]
2707
+ else:
2708
+ filtered_df = filtered_df[filtered_df[field].apply(list_matches, values=value)]
2709
+ # Check if the column contains only dictionaries (every non-empty element in the column is a dict).
2710
+ # `filtered_df[field]`: Access the column with the name specified in 'field'.
2711
+ # `.dropna()`: Drop None or NaN rows for the test.
2712
+ # `.apply(lambda x: isinstance(x, dict))`: For each element in the column, check if it is a dict.
2713
+ # `.all()`: Ensure that all elements in the column satisfy the condition of being a dictionary.
2714
+ elif filtered_df[field].dropna().apply(lambda x: isinstance(x, dict)).all():
2715
+ if not sub:
2716
+ self.logger.error(
2717
+ "Filtering on dictionary values need a key. This needs to be provided with 'field.key' syntax!",
1336
2718
  )
2719
+ continue
2720
+ if not test_for_equal:
2721
+ filtered_df = filtered_df[~filtered_df[field].apply(dict_matches, key=sub, values=value)]
1337
2722
  else:
1338
- result_df = filtered_df[
1339
- ~filtered_df[field].isna() & filtered_df[field].eq(value_item)
1340
- ]
1341
- if not result_df.empty:
1342
- filtered_dfs.append(result_df)
1343
- # end for values
1344
-
1345
- if not filtered_dfs:
1346
- logger.warning(
1347
- "Filter with field -> '%s' and value -> '%s' delivered an empty Data Frame",
1348
- field,
1349
- str(value),
1350
- )
1351
- filtered_df.drop(filtered_df.index, inplace=True)
2723
+ filtered_df = filtered_df[filtered_df[field].apply(dict_matches, key=sub, values=value)]
2724
+ # Check if the column has boolean values:
2725
+ elif pd.api.types.is_bool_dtype(filtered_df[field]):
2726
+ # For a boolean filter we can drop NA values:
2727
+ filtered_df = filtered_df.dropna(subset=[field])
2728
+ if not test_for_equal:
2729
+ filtered_df = filtered_df[~filtered_df[field].isin(value)]
2730
+ else:
2731
+ filtered_df = filtered_df[filtered_df[field].isin(value)]
2732
+ elif not regex:
2733
+ if pd.api.types.is_string_dtype(filtered_df[field]):
2734
+ filtered_df[field] = filtered_df[field].str.strip()
2735
+ if not test_for_equal:
2736
+ filtered_df = filtered_df[~filtered_df[field].isin(value)]
2737
+ else:
2738
+ filtered_df = filtered_df[filtered_df[field].isin(value)]
1352
2739
  else:
1353
- # Concatenate the filtered DataFrames for each value in the list
1354
- filtered_df = pd.concat(filtered_dfs, ignore_index=True)
1355
-
1356
- logger.info(
1357
- "Data Frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
1358
- filtered_df.shape[0],
1359
- filtered_df.shape[1],
2740
+ # Create a pure boolean pd.Series as a filter criterium:
2741
+ regex_condition = filtered_df[field].str.contains(
2742
+ "|".join(value),
2743
+ regex=True,
2744
+ na=False,
2745
+ )
2746
+ # Apply the boolean pd.Series named 'regex_condition' as
2747
+ # a filter - either non-negated or negated (using ~):
2748
+ filtered_df = filtered_df[~regex_condition] if not test_for_equal else filtered_df[regex_condition]
2749
+
2750
+ self.logger.info(
2751
+ "Data frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
2752
+ str(filtered_df.shape[0]),
2753
+ str(filtered_df.shape[1]),
1360
2754
  str(condition),
1361
2755
  )
1362
2756
  # end for condition
@@ -1364,23 +2758,29 @@ class Data:
1364
2758
  if inplace:
1365
2759
  self._df = filtered_df
1366
2760
 
2761
+ if reset_index:
2762
+ self._df.reset_index(inplace=True, drop=True)
2763
+
1367
2764
  return filtered_df
1368
2765
 
1369
2766
  # end method definition
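A minimal usage sketch for the condition syntax documented above. The column names and values are made up for illustration, and 'data' is assumed to be a Data instance that already wraps a pandas DataFrame:

    conditions = [
        # Keep rows whose "status" is one of the listed values (OR within the list) ...
        {"field": "status", "value": ["active", "pending"]},
        # ... AND whose "name" does NOT match the regular expression ...
        {"field": "name", "value": r"^test_", "regex": True, "equal": False},
        # ... this condition is disabled and therefore ignored:
        {"field": "country", "value": "DE", "enabled": False},
    ]

    # Return the matching rows without modifying the wrapped data frame:
    filtered = data.filter(conditions=conditions, inplace=False)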
1370
2767
 
1371
- def fill_na_in_column(self, column_name: str, default_value: str | int):
1372
- """Replace NA values in a column with a defined new default value
2768
+ def fill_na_in_column(self, column_name: str, default_value: str | int) -> None:
2769
+ """Replace NA values in a column with a defined new default value.
1373
2770
 
1374
2771
  Args:
1375
- column_name (str): name of the column in the DataFrame
1376
- default_value (str | int): value to replace NA with
2772
+ column_name (str):
2773
+ The name of the column in the data frame.
2774
+ default_value (str | int):
2775
+ The value to replace NA with.
2776
+
1377
2777
  """
1378
2778
 
1379
2779
  if column_name in self._df.columns:
1380
2780
  self._df[column_name] = self._df[column_name].fillna(value=default_value)
1381
2781
  else:
1382
- logger.error(
1383
- "Cannot replace NA values as column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
2782
+ self.logger.error(
2783
+ "Cannot replace NA values as column -> '%s' does not exist in the data frame! Available columns -> %s",
1384
2784
  column_name,
1385
2785
  str(self._df.columns),
1386
2786
  )
@@ -1388,16 +2788,19 @@ class Data:
1388
2788
  # end method definition
1389
2789
 
1390
2790
  def fill_forward(self, inplace: bool) -> pd.DataFrame:
1391
- """Fill the missing cells appropriately by carrying forward
1392
- the values from the previous rows where necessary.
1393
- This has applications if a hierarchy is represented by
1394
- nested cells e.g. in an Excel sheet.
2791
+ """Fill the missing cells appropriately by carrying forward the values from the previous rows where necessary.
2792
+
2793
+ This has applications if a hierarchy is represented by
2794
+ nested cells e.g. in an Excel sheet.
1395
2795
 
1396
2796
  Args:
1397
- inplace (bool): Should the modification happen inplace or not.
2797
+ inplace (bool):
2798
+ Should the modification happen inplace or not.
1398
2799
 
1399
2800
  Returns:
1400
- pd.DataFrame: Resulting dataframe
2801
+ pd.DataFrame:
2802
+ The resulting data frame.
2803
+
1401
2804
  """
1402
2805
 
1403
2806
  # To convert an Excel representation of a folder structure with nested
@@ -1410,67 +2813,137 @@ class Data:
1410
2813
  # end method definition
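The body of fill_forward() is elided in this hunk; conceptually it is a pandas forward fill. A minimal sketch of the effect on an Excel-like nested hierarchy, using plain pandas (not the class itself):

    import pandas as pd

    df = pd.DataFrame({
        "Level 1": ["Folder A", None, None],
        "Level 2": ["Sub 1", None, "Sub 2"],
    })

    # Carry values forward from previous rows to fill the empty (merged) cells:
    print(df.ffill())
    #     Level 1 Level 2
    # 0  Folder A   Sub 1
    # 1  Folder A   Sub 1
    # 2  Folder A   Sub 2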
1411
2814
 
1412
2815
  def lookup_value(
1413
- self, lookup_column: str, lookup_value: str, separator: str = "|"
1414
- ) -> pd.Series | None:
1415
- """Lookup a row that includes a lookup value in the value of a given column.
2816
+ self,
2817
+ lookup_column: str,
2818
+ lookup_value: str,
2819
+ separator: str = "|",
2820
+ single_row: bool = True,
2821
+ ) -> pd.Series | pd.DataFrame | None:
2822
+ """Lookup row(s) that includes a lookup value in the value of a given column.
1416
2823
 
1417
2824
  Args:
1418
- lookup_column (str): name of the column to search in
1419
- lookup_value (str): value to search for
1420
- separator (str): string list delimiter / separator
2825
+ lookup_column (str):
2826
+ The name of the column to search in.
2827
+ lookup_value (str):
2828
+ The value to search for.
2829
+ separator (str):
2830
+ The string list delimiter / separator. The pipe symbol | is the default
2831
+ as it is unlikely to appear in a normal string (unlike a plain comma).
2832
+ The separator is NOT looked for in the lookup_value but in the column that
2833
+ is given by lookup_column!
2834
+ single_row (bool, optional):
2835
+ This defines if we just return the first matching row if multiple matching rows
2836
+ are found. Default is True (= single row).
1421
2837
 
1422
2838
  Returns:
1423
- pd.Series | None: data frame row that matches or None if no match was found.
2839
+ pd.Series | pd.DataFrame | None:
2840
+ Data frame (multiple rows) or Series (row) that matches the lookup value.
2841
+ None if no match was found.
2842
+
1424
2843
  """
1425
2844
 
1426
- # Use the `apply` function to filter rows where the lookup value matches a whole item in the comma-separated list
1427
- def match_lookup_value(string_list: str) -> bool:
1428
- """Spilt delimiter-separated list into a python list
2845
+ # Use the `apply` function to filter rows where the lookup value matches a
2846
+ # whole item in the separator-divided list:
2847
+ def match_lookup_value(string_list: str | None) -> bool:
2848
+ """Check if the lookup value is in a string list.
2849
+
2850
+ For this the string list is converted to a python
2851
+ list. A separator is used for the splitting.
1429
2852
 
1430
2853
  Args:
1431
- string_list (str): delimiter-separated string list like "a, b, c" or "a | b | c"
2854
+ string_list (str):
2855
+ Delimiter-separated string list like "a, b, c" or "a | b | c"
1432
2856
 
1433
2857
  Returns:
1434
- bool: True if lookup_value is equal to one of the delimiter-separated terms
2858
+ bool:
2859
+ True if lookup_value is equal to one of the delimiter-separated terms.
2860
+
1435
2861
  """
1436
- return lookup_value in [
1437
- item.strip() for item in string_list.split(separator)
1438
- ]
1439
2862
 
1440
- df = self._df
2863
+ if pd.isna(string_list): # Handle None/NaN safely
2864
+ return False
2865
+
2866
+ # Ensure that the string is a string
2867
+ string_list = str(string_list)
2868
+
2869
+ return lookup_value in [item.strip() for item in string_list.split(separator)]
2870
+
2871
+ # end method definition
1441
2872
 
1442
2873
  if self._df is None:
1443
2874
  return None
1444
2875
 
2876
+ df = self._df
2877
+
1445
2878
  if lookup_column not in self._df.columns:
1446
- logger.error(
1447
- "Column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
2879
+ self.logger.error(
2880
+ "Cannot lookup value in column -> '%s'. Column does not exist in the data frame! Data frame has these columns -> %s",
1448
2881
  lookup_column,
1449
2882
  str(self._df.columns),
1450
2883
  )
1451
2884
  return None
1452
2885
 
1453
2886
  # Fill NaN or None values in the lookup column with empty strings
1454
- df[lookup_column] = df[lookup_column].fillna("")
2887
+ # df[lookup_column] = df[lookup_column].fillna("")
2888
+
2889
+ # Use the `apply` function to filter rows where the lookup value is in row cell
2890
+ # of column given by lookup_column. match_lookup_value() is called with
2891
+ # the content of the individual cell contents:
2892
+ matched_rows = df[df[lookup_column].apply(match_lookup_value)]
1455
2893
 
1456
- # Use the `apply` function to filter rows where the lookup value is in the Synonyms list
1457
- matched_row = df[df[lookup_column].apply(match_lookup_value)]
2894
+ # If nothing was found we return None:
2895
+ if matched_rows.empty:
2896
+ return None
2897
+
2898
+ # If it is OK to have multiple matches (= multiple rows = pd.DataFrame).
2899
+ # We can just return the matched_rows now which should be a pd.DataFrame:
2900
+ if not single_row:
2901
+ return matched_rows
2902
+
2903
+ # Check if more than one row matches, and log a warning if so
2904
+ if len(matched_rows) > 1:
2905
+ self.logger.warning(
2906
+ "More than one match found for lookup value -> '%s' in column -> '%s'. Returning the first match.",
2907
+ lookup_value,
2908
+ lookup_column,
2909
+ )
1458
2910
 
1459
2911
  # Return the first matched row, if any
1460
- if not matched_row.empty:
1461
- return matched_row.iloc[0]
2912
+ return matched_rows.iloc[0]
1462
2913
 
1463
- return None
2914
+ # end method definition
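A short usage sketch, assuming 'data' wraps a DataFrame with a 'synonyms' column holding pipe-separated terms (the column names are illustrative):

    # Find the (first) row whose "synonyms" cell contains the whole term "OpenText",
    # e.g. a cell with the value "OT | Open Text | OpenText":
    row = data.lookup_value(lookup_column="synonyms", lookup_value="OpenText", separator="|")
    if row is not None:
        print(row["name"])

    # With single_row=False, all matching rows are returned as a DataFrame:
    rows = data.lookup_value("synonyms", "OpenText", separator="|", single_row=False)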
2915
+
2916
+ def set_value(self, column: str, value, condition: pd.Series | None = None) -> None: # noqa: ANN001
2917
+ """Set the value in the data frame based on a condition.
2918
+
2919
+ Args:
2920
+ column (str):
2921
+ The name of the column.
2922
+ value (Any):
2923
+ The value to set for those rows that fulfill the condition.
2924
+ condition (pd.Series, optional):
2925
+ This should be a boolean Series where each element is True or False,
2926
+ representing rows in the data frame that meet a certain condition.
2927
+ If None is provided then ALL rows get the 'value' in the given
2928
+ column.
2929
+
2930
+ """
2931
+
2932
+ if condition is None:
2933
+ self._df[column] = value # Set value unconditionally
2934
+ else:
2935
+ self._df.loc[condition, column] = value # Set value based on condition
1464
2936
 
1465
2937
  # end method definition
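Usage sketch with illustrative column names, assuming the wrapped DataFrame is accessible (e.g. via get_data_frame()) so that a boolean condition Series can be built from it:

    df = data.get_data_frame()

    # Set "priority" to "high" only for rows whose "status" equals "escalated":
    data.set_value(column="priority", value="high", condition=(df["status"] == "escalated"))

    # Without a condition, every row gets the value (the column is created if needed):
    data.set_value(column="reviewed", value=False)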
1466
2938
 
1467
2939
  def add_column(
1468
2940
  self,
1469
- source_column: str,
1470
- reg_exp: str,
1471
2941
  new_column: str,
1472
- prefix="",
1473
- suffix="",
2942
+ data_type: str = "string",
2943
+ source_column: str = "",
2944
+ reg_exp: str = "",
2945
+ prefix: str = "",
2946
+ suffix: str = "",
1474
2947
  length: int | None = None,
1475
2948
  group_chars: int | None = None,
1476
2949
  group_separator: str = ".",
@@ -1479,24 +2952,78 @@ class Data:
1479
2952
  """Add additional column to the data frame.
1480
2953
 
1481
2954
  Args:
1482
- source_column (str): name of the source column
1483
- reg_exp (str): regular expression to apply on the content of the source column
1484
- new_column (str): name of the column to add
1485
- prefix (str, optional): Prefix to add in front of the value. Defaults to "".
1486
- suffix (str, optional): Suffix to add at the end of the value. Defaults to "".
1487
- length (int | None, optional): Length to reduce to. Defaults to None.
1488
- group_chars (int | None, optional): group the resulting string in characters of group_chars. Defaults to None.
1489
- group_separator (str, optional): Separator string for the grouping. Defaults to ".".
1490
- group_remove_leading_zero (bool, optional): Remove leading zeros from the groups. Defaults to True.
2955
+ new_column (str):
2956
+ The name of the column to add.
2957
+ data_type (str, optional):
2958
+ The data type of the new column.
2959
+ source_column (str, optional):
2960
+ The name of the source column.
2961
+ reg_exp (str, optional):
2962
+ A regular expression to apply on the content of the source column.
2963
+ prefix (str, optional):
2964
+ Prefix to add in front of the value. Defaults to "".
2965
+ suffix (str, optional):
2966
+ Suffix to add at the end of the value. Defaults to "".
2967
+ length (int | None, optional):
2968
+ Length to reduce to. Defaults to None (= unlimited).
2969
+ group_chars (int | None, optional):
2970
+ Group the resulting string in characters of group_chars. Defaults to None.
2971
+ Usable e.g. for a thousands separator "."
2972
+ group_separator (str, optional):
2973
+ Separator string for the grouping. Defaults to ".".
2974
+ group_remove_leading_zero (bool, optional):
2975
+ Remove leading zeros from the groups. Defaults to True.
1491
2976
 
1492
2977
  Returns:
1493
- bool: True = Success, False = Failure
2978
+ bool:
2979
+ True = Success, False = Failure
2980
+
1494
2981
  """
1495
2982
 
1496
2983
  if self._df is None:
1497
2984
  return False
1498
2985
 
2986
+ # Check that the new column does not yet exist
2987
+ if new_column in self._df.columns:
2988
+ self.logger.error(
2989
+ "New column -> '%s' does already exist in data frame! Cannot add it. Data frame has these columns -> %s",
2990
+ new_column,
2991
+ str(self._df.columns),
2992
+ )
2993
+ return False
2994
+
2995
+ # first we handle the very simple case to not have
2996
+ # a source column but just add an empty new column:
2997
+ if not source_column:
2998
+ self._df[new_column] = pd.Series(dtype=data_type)
2999
+ return True
3000
+
3001
+ # Check if the source column exists
3002
+ if source_column not in self._df.columns:
3003
+ self.logger.error(
3004
+ "Source column -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
3005
+ source_column,
3006
+ str(self._df.columns),
3007
+ )
3008
+ return False
3009
+
3010
+ # Validate the regex pattern
3011
+ try:
3012
+ re.compile(reg_exp) # Check if the pattern is a valid regex
3013
+ except re.error:
3014
+ self.logger.error(
3015
+ "Invalid regular expression -> %s. Cannot extract data for new column -> '%s'!",
3016
+ reg_exp,
3017
+ new_column,
3018
+ )
3019
+ return False
3020
+
3021
+ # Ensure the source column is of type string (convert it, if necessary)
3022
+ if self._df[source_column].dtype != "object":
3023
+ self._df[source_column] = self._df[source_column].astype(str)
3024
+
1499
3025
  # Use str.extract to apply the regular expression to the source column
3026
+ # and then assign this modified column to the variable "extracted":
1500
3027
  extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)
1501
3028
 
1502
3029
  # Limit the result to the specified length
@@ -1505,9 +3032,9 @@ class Data:
1505
3032
 
1506
3033
  if group_chars is not None:
1507
3034
 
1508
- def process_grouping(x):
3035
+ def process_grouping(x) -> str | None: # noqa: ANN001
1509
3036
  if pd.isna(x):
1510
- return x
3037
+ return None
1511
3038
  # Split into groups
1512
3039
  groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
1513
3040
  if group_remove_leading_zero:
@@ -1525,3 +3052,216 @@ class Data:
1525
3052
  self._df[new_column] = extracted
1526
3053
 
1527
3054
  return True
3055
+
3056
+ # end method definition
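Usage sketch for deriving a new column from an existing one via a regular expression (the column names and the pattern are illustrative; 'data' is assumed to be a Data instance wrapping a DataFrame):

    # Take the leading digits of "material_id", group them in blocks of 3
    # characters separated by "." and store the result in "material_group":
    data.add_column(
        new_column="material_group",
        source_column="material_id",
        reg_exp=r"(\d+)",
        group_chars=3,
        group_separator=".",
    )

    # Or simply add a new, empty column of a given type:
    data.add_column(new_column="remarks", data_type="string")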
3057
+
3058
+ def convert_to_lists(self, columns: list, delimiter: str = ",") -> None:
3059
+ """Intelligently convert string values to list values, in defined data frame columns.
3060
+
3061
+ The delimiter to separate values in the string value can be configured.
3062
+ The method ignores delimiters that are inside quotes.
3063
+
3064
+ Args:
3065
+ columns (list):
3066
+ The name of the columns whose values should be converted to lists.
3067
+ delimiter (str, optional):
3068
+ Character that delimits list items. Defaults to ",".
3069
+
3070
+ Returns:
3071
+ None. self._df is modified in place.
3072
+
3073
+ """
3074
+
3075
+ # Regex to split by the delimiter, ignoring those inside quotes or double quotes
3076
+ def split_string_ignoring_quotes(s: str, delimiter: str) -> list:
3077
+ """Split a string into a list at positions that have a delimiter character.
3078
+
3079
+ Args:
3080
+ s (str): The string to split.
3081
+ delimiter (str): The single character that is used for splitting.
3082
+
3083
+ Returns:
3084
+ A list of the split values.
3085
+
3086
+ """
3087
+
3088
+ # Escaping the delimiter in case it's a special regex character
3089
+ delimiter = re.escape(delimiter)
3090
+ # Match quoted strings and unquoted delimiters separately
3091
+ pattern = rf'(?:"[^"]*"|\'[^\']*\'|[^{delimiter}]+)'
3092
+ return re.findall(pattern, s)
3093
+
3094
+ for col in columns:
3095
+ self._df[col] = self._df[col].apply(
3096
+ lambda x: (split_string_ignoring_quotes(x, delimiter) if isinstance(x, str) and delimiter in x else x),
3097
+ )
3098
+
3099
+ # end method definition
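A short sketch of the quote-aware splitting, with an illustrative column name and cell value:

    # Assume the "colors" column holds strings like: 'red,"green, dark",blue'
    data.convert_to_lists(columns=["colors"], delimiter=",")
    # The cell now holds the list ['red', '"green, dark"', 'blue'] -
    # the comma inside the double quotes did not split the item.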
3100
+
3101
+ def add_column_concat(
3102
+ self,
3103
+ source_columns: list,
3104
+ new_column: str,
3105
+ concat_char: str = "",
3106
+ upper: bool = False,
3107
+ lower: bool = False,
3108
+ capitalize: bool = False,
3109
+ title: bool = False,
3110
+ ) -> None:
3111
+ """Add a column as a concatenation of the values of multiple source columns.
3112
+
3113
+ Args:
3114
+ source_columns (list):
3115
+ The column names the list values are taken from.
3116
+ new_column (str):
3117
+ The name of the new column.
3118
+ concat_char (str, optional):
3119
+ Character to insert between the concatenated values. Default is "".
3120
+ upper (bool, optional):
3121
+ Convert result to uppercase if True.
3122
+ lower (bool, optional):
3123
+ Convert result to lowercase if True.
3124
+ capitalize (bool, optional):
3125
+ Capitalize the result if True.
3126
+ title (bool, optional):
3127
+ Convert result to title case if True.
3128
+
3129
+ Returns:
3130
+ None. self._df is modified in place.
3131
+
3132
+ """
3133
+
3134
+ def concatenate(row: pd.Series) -> str:
3135
+ # Comprehension to create a list from all source column values:
3136
+ concatenated = concat_char.join(
3137
+ [str(row[col]) for col in source_columns if pd.notna(row[col])],
3138
+ )
3139
+
3140
+ # Apply case transformations based on parameters
3141
+ if upper:
3142
+ concatenated = concatenated.upper()
3143
+ elif lower:
3144
+ concatenated = concatenated.lower()
3145
+ elif capitalize:
3146
+ concatenated = concatenated.capitalize()
3147
+ elif title:
3148
+ concatenated = concatenated.title()
+
+ # Return the concatenated (and case-transformed) value for this row:
+ return concatenated
3149
+
3150
+ # end method definition
3151
+
3152
+ self._df[new_column] = self._df.apply(concatenate, axis=1)
3153
+
3154
+ # end method definition
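Usage sketch with illustrative column names:

    # Build a "key" column from two source columns, joined with "-" and uppercased:
    data.add_column_concat(
        source_columns=["country", "city"],
        new_column="key",
        concat_char="-",
        upper=True,
    )
    # e.g. country="de", city="berlin"  ->  key="DE-BERLIN"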
3155
+
3156
+ def add_column_list(self, source_columns: list, new_column: str) -> None:
3157
+ """Add a column with list objects.
3158
+
3159
+ The list items are taken from a list of source columns (row by row).
3160
+
3161
+ Args:
3162
+ source_columns (list):
3163
+ The column names the list values are taken from.
3164
+ new_column (str):
3165
+ The name of the new column.
3166
+
3167
+ Returns:
3168
+ None. self._df is modified in place.
3169
+
3170
+ """
3171
+
3172
+ def create_list(row: pd.Series) -> list:
3173
+ # Comprehension to create a list from all source column values:
3174
+ return [row[col] for col in source_columns]
3175
+
3176
+ self._df[new_column] = self._df.apply(create_list, axis=1)
3177
+
3178
+ # end method definition
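Usage sketch with illustrative column names:

    # Combine the values of two columns into one list-valued column:
    data.add_column_list(source_columns=["first_name", "last_name"], new_column="names")
    # e.g. first_name="Ada", last_name="Lovelace"  ->  names=["Ada", "Lovelace"]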
3179
+
3180
+ def add_column_table(
3181
+ self,
3182
+ source_columns: list,
3183
+ new_column: str,
3184
+ delimiter: str = ",",
3185
+ ) -> None:
3186
+ """Add a column with tabular objects (list of dictionaries).
3187
+
3188
+ The source columns should include lists. The resulting dictionary
3189
+ keys are the column names for the source columns.
3190
+
3191
+ Example (["X", "Y"] are the source_columns, "Table" is the new_column):
3192
+ X[1] = [1, 2, 3] # row 1
3193
+ Y[1] = ["A", "B", "C"] # row 1
3194
+ X[2] = [4, 5, 6] # row 2
3195
+ Y[2] = ["D", "E", "F"] # row 2
3196
+
3197
+ Table[1] = [
3198
+ {
3199
+ "X": "1",
3200
+ "Y": "A"
3201
+ },
3202
+ {
3203
+ "X": "2",
3204
+ "Y": "B"
3205
+ },
3206
+ {
3207
+ "X": "3",
3208
+ "Y": "C"
3209
+ }
3210
+ ]
3211
+ Table[2] = [
3212
+ {
3213
+ "X": "4",
3214
+ "Y": "D"
3215
+ },
3216
+ {
3217
+ "X": "5",
3218
+ "Y": "E"
3219
+ },
3220
+ {
3221
+ "X": "6",
3222
+ "Y": "F"
3223
+ }
3224
+ ]
3225
+
3226
+ Args:
3227
+ source_columns (list):
3228
+ The column names the list values are taken from.
3229
+ new_column (str):
3230
+ The name of the new column.
3231
+ delimiter (str, optional):
3232
+ Character that delimits list items. Defaults to ",".
3233
+
3234
+ Returns:
3235
+ None. self._df is modified in place.
3236
+
3237
+ """
3238
+
3239
+ # Call the convert_to_lists method to ensure the columns are converted
3240
+ self.convert_to_lists(columns=source_columns, delimiter=delimiter)
3241
+
3242
+ # Sub-method to pad lists to the same length
3243
+ def pad_list(lst: list, max_len: int) -> list:
3244
+ return lst + [None] * (max_len - len(lst))
3245
+
3246
+ def create_table(row: pd.Series) -> list:
3247
+ max_len = max(len(row[col]) if isinstance(row[col], list) else 1 for col in source_columns)
3248
+
3249
+ # Pad lists to the maximum length, leave scalar values as they are
3250
+ for col in source_columns:
3251
+ if isinstance(row[col], list):
3252
+ row[col] = pad_list(row[col], max_len)
3253
+ elif not pd.isna(row[col]):
3254
+ row[col] = [
3255
+ row[col],
3256
+ ] * max_len # Repeat scalar value to match the max length
3257
+ else:
3258
+ row[col] = [None] * max_len
3259
+ # Create a list of dictionaries for each row:
3260
+ table = [{col: row[col][i] for col in source_columns} for i in range(max_len)]
3261
+
3262
+ return table
3263
+
3264
+ # Apply the function to create a new column with table values:
3265
+ self._df[new_column] = self._df.apply(create_table, axis=1)
3266
+
3267
+ # end method definition
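Usage sketch matching the docstring example above (column names are illustrative; the source cells hold comma-separated strings that are first converted to lists):

    # "X" holds "1,2,3" and "Y" holds "A,B,C" in a given row:
    data.add_column_table(source_columns=["X", "Y"], new_column="Table", delimiter=",")
    # The "Table" cell of that row now holds:
    # [{"X": "1", "Y": "A"}, {"X": "2", "Y": "B"}, {"X": "3", "Y": "C"}]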