pyxecm 1.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyxecm might be problematic.

Files changed (56)
  1. pyxecm/__init__.py +6 -4
  2. pyxecm/avts.py +673 -246
  3. pyxecm/coreshare.py +686 -467
  4. pyxecm/customizer/__init__.py +16 -4
  5. pyxecm/customizer/__main__.py +58 -0
  6. pyxecm/customizer/api/__init__.py +5 -0
  7. pyxecm/customizer/api/__main__.py +6 -0
  8. pyxecm/customizer/api/app.py +914 -0
  9. pyxecm/customizer/api/auth.py +154 -0
  10. pyxecm/customizer/api/metrics.py +92 -0
  11. pyxecm/customizer/api/models.py +13 -0
  12. pyxecm/customizer/api/payload_list.py +865 -0
  13. pyxecm/customizer/api/settings.py +103 -0
  14. pyxecm/customizer/browser_automation.py +332 -139
  15. pyxecm/customizer/customizer.py +1007 -1130
  16. pyxecm/customizer/exceptions.py +35 -0
  17. pyxecm/customizer/guidewire.py +322 -0
  18. pyxecm/customizer/k8s.py +713 -378
  19. pyxecm/customizer/log.py +107 -0
  20. pyxecm/customizer/m365.py +2867 -909
  21. pyxecm/customizer/nhc.py +1169 -0
  22. pyxecm/customizer/openapi.py +258 -0
  23. pyxecm/customizer/payload.py +16817 -7467
  24. pyxecm/customizer/pht.py +699 -285
  25. pyxecm/customizer/salesforce.py +516 -342
  26. pyxecm/customizer/sap.py +58 -41
  27. pyxecm/customizer/servicenow.py +593 -371
  28. pyxecm/customizer/settings.py +442 -0
  29. pyxecm/customizer/successfactors.py +408 -346
  30. pyxecm/customizer/translate.py +83 -48
  31. pyxecm/helper/__init__.py +5 -2
  32. pyxecm/helper/assoc.py +83 -43
  33. pyxecm/helper/data.py +2406 -870
  34. pyxecm/helper/logadapter.py +27 -0
  35. pyxecm/helper/web.py +229 -101
  36. pyxecm/helper/xml.py +527 -171
  37. pyxecm/maintenance_page/__init__.py +5 -0
  38. pyxecm/maintenance_page/__main__.py +6 -0
  39. pyxecm/maintenance_page/app.py +51 -0
  40. pyxecm/maintenance_page/settings.py +28 -0
  41. pyxecm/maintenance_page/static/favicon.avif +0 -0
  42. pyxecm/maintenance_page/templates/maintenance.html +165 -0
  43. pyxecm/otac.py +234 -140
  44. pyxecm/otawp.py +1436 -557
  45. pyxecm/otcs.py +7716 -3161
  46. pyxecm/otds.py +2150 -919
  47. pyxecm/otiv.py +36 -21
  48. pyxecm/otmm.py +1272 -325
  49. pyxecm/otpd.py +231 -127
  50. pyxecm-2.0.0.dist-info/METADATA +145 -0
  51. pyxecm-2.0.0.dist-info/RECORD +54 -0
  52. {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info}/WHEEL +1 -1
  53. pyxecm-1.6.dist-info/METADATA +0 -53
  54. pyxecm-1.6.dist-info/RECORD +0 -32
  55. {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info/licenses}/LICENSE +0 -0
  56. {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info}/top_level.txt +0 -0
pyxecm/helper/data.py CHANGED
@@ -1,74 +1,61 @@
1
- """
2
- Data Module to implement functions to leverage Pandas to
3
- manipulte data structures read for bulk generation of Extended ECM items.
4
-
5
- This code implements a class called data which is referring
6
- to Pandas DataFrame.
7
-
8
- Class: Payload
9
- Methods:
10
-
11
- __init__ : class initializer
12
- __len__: Lenght of the embedded DataFrame object.
13
- __str__: Print the DataFrame of the class
14
- get_data_frame: Get the Pandas DataFrame object
15
- set_data_frame: Set the Pandas DataFrame object
16
- append: Append additional data to the data frame.
17
-
18
- load_json_data: Load JSON data into DataFrame
19
- save_json_data: Save JSON data from DataFrame to file
20
- load_excel_data: Load Excel file into DataFrame
21
- load_csv_data: Load CSV data into DataFrame
22
- load_directory: Load directory structure into Pandas Data Frame
23
-
24
- partitionate: Partition a data frame into equally sized partions
25
- deduplicate: Remove dupclicate rows that have all fields in unique_fields in common
26
- sort: Sort the data frame based on one or multiple fields.
27
- flatten: Flatten a sub-dictionary by copying selected fields to the
28
- parent dictionary.
29
- explode_and_flatten: Explode a substructure in the Data Frame
30
- drop_columns: Drop selected columns from the Data Frame
31
- keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
32
- cleanse: Cleanse data with regular expressions and upper/lower case conversion.
33
- filter: Filter the DataFrame based on conditions
34
-
35
- fill_forward: Fill the missing cells appropriately by carrying forward
36
- the values from the previous rows where necessary.
37
- fill_na_in_column: Replace NA values in a column with a defined new default value
1
+ """Data Module leveraging Pandas to manipulte data sets read for bulk generation of Content Server items.
2
+
3
+ See: https://pandas.pydata.org
4
+
5
+ This code implements a class called "Data" which is a wrapper
6
+ to Pandas data frame.
38
7
  """
39
8
 
40
9
  __author__ = "Dr. Marc Diefenbruch"
41
- __copyright__ = "Copyright 2024, OpenText"
10
+ __copyright__ = "Copyright (C) 2024-2025, OpenText"
42
11
  __credits__ = ["Kai-Philip Gatzweiler"]
43
12
  __maintainer__ = "Dr. Marc Diefenbruch"
44
13
  __email__ = "mdiefenb@opentext.com"
45
14
 
46
- import logging
47
15
  import json
16
+ import logging
48
17
  import os
49
18
  import re
50
19
  import threading
20
+ from io import StringIO
51
21
 
52
22
  import pandas as pd
23
+ import requests
53
24
 
54
- logger = logging.getLogger("pyxecm.helper.data")
25
+ default_logger = logging.getLogger("pyxecm.helper.data")
55
26
 
56
27
 
57
28
  class Data:
58
29
  """Used to automate data loading for the customizer."""
59
30
 
31
+ logger: logging.Logger = default_logger
32
+
60
33
  _df: pd.DataFrame
61
- _lock = threading.Lock()
34
+ _lock: threading.Lock = threading.Lock()
62
35
 
63
- def __init__(self, init_data: pd.DataFrame | list = None):
36
+ def __init__(
37
+ self,
38
+ init_data: pd.DataFrame | list = None,
39
+ logger: logging.Logger = default_logger,
40
+ ) -> None:
64
41
  """Initialize the Data object.
65
42
 
66
43
  Args:
67
- init_data (pd.DataFrame | list, optional): Data to initialize the data frame. Can either be
68
- another data frame (that gets copied) or a list of dictionaries.
69
- Defaults to None.
44
+ init_data (pd.DataFrame | list, optional):
45
+ Data to initialize the data frame. Can either be
46
+ another data frame (that gets copied) or a list of dictionaries.
47
+ Defaults to None.
48
+ logger (logging.Logger, optional):
49
+ Pass a special logging object. This is optional. If not provided,
50
+ the default logger is used.
51
+
70
52
  """
71
53
 
54
+ if logger != default_logger:
55
+ self.logger = logger.getChild("data")
56
+ for logfilter in logger.filters:
57
+ self.logger.addFilter(logfilter)
58
+
72
59
  if init_data is not None:
73
60
  # if a data frame is passed to the constructor we
74
61
  # copy its content to the new Data object
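
A minimal usage sketch of the logger handoff added to the constructor above; the application logger name and the sample rows are illustrative assumptions, not part of the package:

    import logging

    from pyxecm.helper.data import Data

    logging.basicConfig(level=logging.INFO)
    app_logger = logging.getLogger("my_app")

    # The constructor derives a "data" child logger from the caller's logger
    # and copies its filters, as shown in the hunk above.
    data = Data(
        init_data=[{"name": "alpha", "size": 10}, {"name": "beta", "size": 20}],
        logger=app_logger,
    )
    print(len(data))                     # __len__ -> number of rows (2)
    print(data.get_data_frame().head())  # access the underlying Pandas data frame
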
@@ -84,7 +71,7 @@ class Data:
84
71
  # it is important to wrap the dict in a list to avoid that more than 1 row is created
85
72
  self._df: pd.DataFrame = pd.DataFrame([init_data])
86
73
  else:
87
- logger.error("Illegal initialization data for 'Data' class!")
74
+ self.logger.error("Illegal initialization data for 'Data' class!")
88
75
  self._df = None
89
76
  else:
90
77
  self._df = None
@@ -92,11 +79,14 @@ class Data:
92
79
  # end method definition
93
80
 
94
81
  def __len__(self) -> int:
95
- """Lenght of the embedded DataFrame object.
96
- This is basically a convenience method.
82
+ """Return lenght of the embedded Pandas data frame object.
83
+
84
+ This is basically a convenience method.
97
85
 
98
86
  Returns:
99
- int: Lenght of the DataFrame
87
+ int:
88
+ Lenght of the data frame.
89
+
100
90
  """
101
91
 
102
92
  if self._df is not None:
@@ -106,10 +96,12 @@ class Data:
106
96
  # end method definition
107
97
 
108
98
  def __str__(self) -> str:
109
- """Print the DataFrame of the class.
99
+ """Print the Pandas data frame object.
110
100
 
111
101
  Returns:
112
- str: String representation.
102
+ str:
103
+ String representation.
104
+
113
105
  """
114
106
 
115
107
  # if data frame is initialized we return
@@ -122,51 +114,72 @@ class Data:
122
114
  # end method definition
123
115
 
124
116
  def __getitem__(self, column: str) -> pd.Series:
125
- """Return the column corresponding to the key from the DataFrame
117
+ """Return the column corresponding to the key from the data frame.
126
118
 
127
119
  Args:
128
- column (str): name of the Data Frame column
120
+ column (str): The name of the data frame column.
129
121
 
130
122
  Returns:
131
- pd.Series: column of the Data Frame with the given name
123
+ pd.Series: The column of the data frame with the given name.
124
+
132
125
  """
133
126
 
134
127
  return self._df[column]
135
128
 
136
129
  # end method definition
137
130
 
138
- def lock(self):
131
+ def lock(self) -> threading.Lock:
139
132
  """Return the threading lock object.
140
133
 
141
134
  Returns:
142
- _type_: threading lock object
135
+ threading.Lock: The threading lock object.
136
+
143
137
  """
138
+
144
139
  return self._lock
145
140
 
146
141
  # end method definition
147
142
 
148
143
  def get_data_frame(self) -> pd.DataFrame:
149
- """Get the Pandas DataFrame object
144
+ """Get the Pandas data frame object.
150
145
 
151
146
  Returns:
152
- pd.DataFrame: Pandas DataFrame object
147
+ pd.DataFrame: The Pandas data frame object.
148
+
153
149
  """
154
150
 
155
151
  return self._df
156
152
 
157
153
  # end method definition
158
154
 
159
- def set_data_frame(self, df: pd.DataFrame):
160
- """Set the Pandas DataFrame object
155
+ def set_data_frame(self, df: pd.DataFrame) -> None:
156
+ """Set the Pandas data frame object.
161
157
 
162
158
  Args:
163
- df (pd.DataFrame): Pandas DataFrame object
159
+ df (pd.DataFrame): The new Pandas data frame object.
160
+
164
161
  """
165
162
 
166
163
  self._df = df
167
164
 
168
165
  # end method definition
169
166
 
167
+ def get_columns(self) -> list | None:
168
+ """Get the list of column names of the data frame.
169
+
170
+ Returns:
171
+ list | None:
172
+ The list of column names in the data frame.
173
+
174
+ """
175
+
176
+ if self._df is None:
177
+ return None
178
+
179
+ return self._df.columns
180
+
181
+ # end method definition
182
+
170
183
  def print_info(
171
184
  self,
172
185
  show_size: bool = True,
@@ -177,26 +190,40 @@ class Data:
177
190
  show_sample: bool = False,
178
191
  show_statistics: bool = False,
179
192
  row_num: int = 10,
180
- ):
181
- """Log information about the data frame
193
+ ) -> None:
194
+ """Log information about the data frame.
182
195
 
183
196
  Args:
184
- show_size (bool, optional): Show size of data frame. Defaults to True.
185
- show_info (bool, optional): Show information for data frame. Defaults to False.
186
- show_columns (bool, optional): Show columns of data frame. Defaults to False.
187
- show_first (bool, optional): Show first 10 items. Defaults to False.
188
- show_last (bool, optional): Show last 10 items. Defaults to False.
189
- show_sample (bool, optional): Show 10 sample items. Defaults to False.
190
- show_statistics (bool, optional): Show data frame statistics. Defaults to False.
197
+ show_size (bool, optional):
198
+ Show size of data frame. Defaults to True.
199
+ show_info (bool, optional):
200
+ Show information for data frame. Defaults to False.
201
+ show_columns (bool, optional):
202
+ Show columns of data frame. Defaults to False.
203
+ show_first (bool, optional):
204
+ Show first N items. Defaults to False. N is defined
205
+ by the row_num parameter.
206
+ show_last (bool, optional):
207
+ Show last N items. Defaults to False. N is defined
208
+ by the row_num parameter.
209
+ show_sample (bool, optional):
210
+ Show N sample items. Defaults to False. N is defined
211
+ by the row_num parameter.
212
+ show_statistics (bool, optional):
213
+ Show data frame statistics. Defaults to False.
214
+ row_num (int, optional):
215
+ Used as the number of rows printed using show_first,
216
+ show_last, show_sample. Default is 10.
217
+
191
218
  """
192
219
 
193
220
  if self._df is None:
194
- logger.warning("Data Frame is not initialized!")
221
+ self.logger.warning("Data frame is not initialized!")
195
222
  return
196
223
 
197
224
  if show_size:
198
- logger.info(
199
- "Data Frame has %s row(s) and %s column(s)",
225
+ self.logger.info(
226
+ "Data frame has %s row(s) and %s column(s)",
200
227
  self._df.shape[0],
201
228
  self._df.shape[1],
202
229
  )
@@ -206,39 +233,42 @@ class Data:
206
233
  self._df.info()
207
234
 
208
235
  if show_columns:
209
- logger.info("Columns:\n%s", self._df.columns)
210
- logger.info(
211
- "Columns with number of null values:\n%s", self._df.isnull().sum()
212
- )
213
- logger.info(
214
- "Columns with number of non-null values:\n%s", self._df.notnull().sum()
236
+ self.logger.info("Columns:\n%s", self._df.columns)
237
+ self.logger.info(
238
+ "Columns with number of NaN values:\n%s",
239
+ self._df.isna().sum(),
215
240
  )
216
- logger.info("Columns with number of NaN values:\n%s", self._df.isna().sum())
217
- logger.info(
218
- "Columns with number of non-NaN values:\n%s", self._df.notna().sum()
241
+ self.logger.info(
242
+ "Columns with number of non-NaN values:\n%s",
243
+ self._df.notna().sum(),
219
244
  )
220
245
 
221
246
  if show_first:
222
247
  # the default for head is n = 5:
223
- logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
248
+ self.logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
224
249
 
225
250
  if show_last:
226
251
  # the default for tail is n = 5:
227
- logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
252
+ self.logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
228
253
 
229
254
  if show_sample:
230
255
  # the default for sample is n = 1:
231
- logger.info("%s Sample rows:\n%s", str(row_num), self._df.sample(n=row_num))
256
+ self.logger.info(
257
+ "%s Sample rows:\n%s",
258
+ str(row_num),
259
+ self._df.sample(n=row_num),
260
+ )
232
261
 
233
262
  if show_statistics:
234
- logger.info(
235
- "Description of statistics for data frame:\n%s", self._df.describe()
263
+ self.logger.info(
264
+ "Description of statistics for data frame:\n%s",
265
+ self._df.describe(),
236
266
  )
237
- logger.info(
238
- "Description of statistics for data frame (Transformed):\n%s",
267
+ self.logger.info(
268
+ "Description of statistics for data frame (transformed):\n%s",
239
269
  self._df.describe().T,
240
270
  )
241
- logger.info(
271
+ self.logger.info(
242
272
  "Description of statistics for data frame (objects):\n%s",
243
273
  self._df.describe(include="object"),
244
274
  )
@@ -249,10 +279,13 @@ class Data:
249
279
  """Append additional data to the data frame.
250
280
 
251
281
  Args:
252
- add_data (pd.DataFrame | list | dict): Additional data. Can be pd.DataFrame or list of dicts (or Data)
282
+ add_data (pd.DataFrame | list | dict):
283
+ Additional data. Can be pd.DataFrame or list of dicts (or Data).
253
284
 
254
285
  Returns:
255
- bool: True = Success, False = Error
286
+ bool:
287
+ True = Success, False = Error
288
+
256
289
  """
257
290
 
258
291
  # Does the data frame has already content?
@@ -264,166 +297,395 @@ class Data:
264
297
  return True
265
298
  elif isinstance(add_data, Data):
266
299
  df = add_data.get_data_frame()
267
- if df:
300
+ if df is not None and not df.empty:
268
301
  self._df = pd.concat([self._df, df], ignore_index=True)
269
302
  return True
270
303
  elif isinstance(add_data, list):
271
304
  if add_data:
272
- df = Data(add_data)
305
+ df = Data(add_data, logger=self.logger)
273
306
  self._df = pd.concat(
274
- [self._df, df.get_data_frame()], ignore_index=True
307
+ [self._df, df.get_data_frame()],
308
+ ignore_index=True,
275
309
  )
276
310
  return True
277
311
  elif isinstance(add_data, dict):
278
312
  if add_data:
279
313
  # it is important to wrap the dict in a list to avoid that more than 1 row is created
280
- df = Data([add_data])
314
+ df = Data([add_data], logger=self.logger)
281
315
  self._df = pd.concat(
282
- [self._df, df.get_data_frame()], ignore_index=True
316
+ [self._df, df.get_data_frame()],
317
+ ignore_index=True,
283
318
  )
284
319
  return True
285
320
  else:
286
- logger.error("Illegal data type -> '%s'", type(add_data))
287
- return False
288
- else: # self._df is None (initial state)
289
- if isinstance(add_data, pd.DataFrame):
290
- self._df = add_data
291
- return True
292
- elif isinstance(add_data, Data):
293
- self._df = add_data.get_data_frame()
294
- return True
295
- elif isinstance(add_data, list):
296
- self._df = pd.DataFrame(add_data)
297
- return True
298
- elif isinstance(add_data, dict):
299
- # it is important to wrap the dict in a list to avoid that more than 1 row is created
300
- self._df = pd.DataFrame([add_data])
301
- return True
302
- else:
303
- logger.error("Illegal data type -> '%s'", type(add_data))
321
+ self.logger.error("Illegal data type -> '%s'", type(add_data))
304
322
  return False
323
+ elif isinstance(add_data, pd.DataFrame):
324
+ self._df = add_data
325
+ return True
326
+ elif isinstance(add_data, Data):
327
+ self._df = add_data.get_data_frame()
328
+ return True
329
+ elif isinstance(add_data, list):
330
+ self._df = pd.DataFrame(add_data)
331
+ return True
332
+ elif isinstance(add_data, dict):
333
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
334
+ self._df = pd.DataFrame([add_data])
335
+ return True
336
+ else:
337
+ self.logger.error("Illegal data type -> '%s'", type(add_data))
338
+ return False
339
+
340
+ # end method definition
341
+
342
+ def merge(
343
+ self,
344
+ merge_data: pd.DataFrame,
345
+ on: str | list[str] | None = None,
346
+ how: str = "inner",
347
+ left_on: str | list[str] | None = None,
348
+ right_on: str | list[str] | None = None,
349
+ left_index: bool = False,
350
+ right_index: bool = False,
351
+ suffixes: tuple[str, str] = ("_x", "_y"),
352
+ indicator: bool = False,
353
+ validate: str | None = None,
354
+ ) -> pd.DataFrame | None:
355
+ """Merge the current DataFrame (_df) with another DataFrame.
356
+
357
+ Args:
358
+ merge_data (pd.DataFrame | Data):
359
+ The DataFrame to merge with.
360
+ on (str | list[str]):
361
+ Column(s) to merge on. Defaults to None.
362
+ how (str, optional):
363
+ Type of merge ('inner', 'outer', 'left', 'right', 'cross'). Defaults to 'inner'.
364
+ left_on (str | list[str] | None, optional):
365
+ Column(s) from self._df to merge on. Defaults to None.
366
+ right_on (str | list[str] | None, optional):
367
+ Column(s) from other DataFrame to merge on. Defaults to None.
368
+ left_index (str | list[str], optional):
369
+ Whether to merge on the index of self._df. Defaults to False.
370
+ right_index (bool, optional):
371
+ Whether to merge on the index of other. Defaults to False.
372
+ suffixes (tuple[str, str]):
373
+ Suffixes for overlapping column names. Defaults to ('_x', '_y').
374
+ indicator (bool, optional):
375
+ If True, adds a column showing the merge source. Defaults to False.
376
+ validate ():
377
+ If provided, checks merge integrity
378
+ ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many'). Defaults to None.
379
+
380
+ Returns:
381
+ The merged DataFrame or None in case of an error.
382
+
383
+ Exceptions:
384
+ ValueError: If `other` is not a DataFrame.
385
+ KeyError: If required columns for merging are missing.
386
+ ValueError: If `validate` check fails.
387
+
388
+ """
389
+
390
+ if self._df is None or self._df.empty:
391
+ self._df = merge_data
392
+
393
+ if isinstance(merge_data, Data):
394
+ merge_data = merge_data.get_data_frame() # Extract DataFrame from Data instance
395
+
396
+ try:
397
+ return self._df.merge(
398
+ merge_data,
399
+ how=how,
400
+ on=on,
401
+ left_on=left_on,
402
+ right_on=right_on,
403
+ left_index=left_index,
404
+ right_index=right_index,
405
+ suffixes=suffixes,
406
+ indicator=indicator,
407
+ validate=validate,
408
+ )
409
+ except KeyError:
410
+ self.logger.error("Column(s) not found for merging!")
411
+ except ValueError:
412
+ self.logger.error("Invalid merge operation!")
413
+
414
+ return None
415
+
416
+ # end method definition
417
+
418
+ def strip(self, columns: list | None = None, inplace: bool = True) -> pd.DataFrame:
419
+ """Strip leading and trailing spaces from specified columns in a data frame.
420
+
421
+ Args:
422
+ columns (list | None):
423
+ The list of column names to strip. If None, it strips
424
+ leading and trailing spaces from _all_ string columns.
425
+ inplace (bool, optional):
426
+ If True, the data modification is done in place, i.e.
427
+ modifying the existing data frame of the object.
428
+ If False, the data frame is copied and the copy is modified
429
+ and returned.
430
+
431
+ Returns:
432
+ pd.DataFrame:
433
+ The modified data frame with stripped columns.
434
+
435
+ """
436
+
437
+ df = self._df.copy() if not inplace else self._df
438
+
439
+ if columns is None:
440
+ # Strip spaces from all string columns
441
+ df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
442
+ else:
443
+ # Strip spaces from specified columns
444
+ for col in columns:
445
+ if col in df.columns and df[col].dtype == "object": # Check if the column exists and is of string type
446
+ df[col] = df[col].str.strip()
447
+
448
+ if inplace:
449
+ self._df = df
450
+
451
+ return df
305
452
 
306
453
  # end method definition
307
454
 
308
- def load_json_data(self, json_path: str, convert_dates: bool = False) -> bool:
309
- """Load JSON data into DataFrame
455
+ def load_json_data(
456
+ self,
457
+ json_path: str,
458
+ convert_dates: bool = False,
459
+ index_column: str | None = None,
460
+ compression: str | None = None,
461
+ ) -> bool:
462
+ """Load JSON data into a Pandas data frame.
310
463
 
311
464
  Args:
312
- json_path (str): Path to the JSON file.
313
- convert_dates (bool, optional): whether or not dates should be converted
465
+ json_path (str):
466
+ The path to the JSON file.
467
+ convert_dates (bool, optional):
468
+ Defines whether or not dates should be converted.
469
+ The default is False = dates are NOT converted.
470
+ index_column (str | None, optional):
471
+ The Name of the column (i.e. JSON data field) that should
472
+ become the index in the loaded data frame.
473
+ compression (str | None):
474
+ Remove a compression:
475
+ * gzip (.gz)
476
+ * bz2 (.bz2)
477
+ * zip (.zip)
478
+ * xz (.xz)
479
+ The value for compression should not include the dot.
480
+ Default is None = no compression.
481
+
314
482
  Returns:
315
483
  bool: False in case an error occured, True otherwise.
484
+
316
485
  """
317
486
 
318
- if json_path is not None and os.path.exists(json_path):
319
- # Load data from JSON file
320
- try:
321
- df = pd.read_json(path_or_buf=json_path, convert_dates=convert_dates)
322
- if self._df is None:
323
- self._df = df
324
- else:
325
- self._df = pd.concat([self._df, df])
326
- logger.info(
327
- "After loading -> '%s' the Data Frame has %s row(s) and %s column(s)",
328
- json_path,
329
- self._df.shape[0],
330
- self._df.shape[1],
331
- )
332
- except FileNotFoundError:
333
- logger.error(
334
- "JSON file -> %s not found. Please check the file path.", json_path
335
- )
336
- return False
337
- except PermissionError:
338
- logger.error(
339
- "Permission denied to access the JSON file -> %s.", json_path
487
+ if not json_path:
488
+ self.logger.error(
489
+ "You have not specified a JSON path!",
490
+ )
491
+ return False
492
+
493
+ # If compression is enabled the file path should have
494
+ # the matching file name extension:
495
+ if compression:
496
+ compression = compression.lstrip(".") # remove a dot prefix if present
497
+ suffix = "." + compression if compression != "gzip" else "gz"
498
+ if not json_path.endswith(suffix):
499
+ json_path += suffix
500
+
501
+ if not os.path.exists(json_path):
502
+ self.logger.error(
503
+ "Missing JSON file - you have not specified a valid path -> '%s'.",
504
+ json_path,
505
+ )
506
+ return False
507
+
508
+ # Load data from JSON file
509
+ try:
510
+ df = pd.read_json(
511
+ path_or_buf=json_path,
512
+ convert_dates=convert_dates,
513
+ compression=compression,
514
+ )
515
+
516
+ if index_column and index_column not in df.columns:
517
+ self.logger.error(
518
+ "Specified index column -> '%s' not found in the JSON data.",
519
+ index_column,
340
520
  )
341
521
  return False
342
- except IOError as e:
343
- logger.error("An I/O error occurred -> %s", str(e))
344
- return False
345
- except json.JSONDecodeError as e:
346
- logger.error("Error: Unable to decode JSON -> %s", str(e))
347
- return False
348
- except ValueError as e:
349
- logger.error("Invalid JSON input -> %s", str(e))
350
- return False
351
- except AttributeError as e:
352
- logger.error("Unexpected JSON data structure -> %s", str(e))
353
- return False
354
- except TypeError as e:
355
- logger.error("Unexpected JSON data type -> %s", str(e))
356
- return False
357
- except KeyError as e:
358
- logger.error("Missing key in JSON data -> %s", str(e))
359
- return False
360
522
 
361
- else:
362
- logger.error(
363
- "Missing JSON file - you have not specified a valid path -> %s.",
523
+ if index_column:
524
+ df = df.set_index(keys=index_column)
525
+ if self._df is None:
526
+ self._df = df
527
+ else:
528
+ self._df = pd.concat([self._df, df])
529
+ self.logger.info(
530
+ "After loading JSON file -> '%s', the data frame has %s row(s) and %s column(s)",
531
+ json_path,
532
+ self._df.shape[0],
533
+ self._df.shape[1],
534
+ )
535
+ except FileNotFoundError:
536
+ self.logger.error(
537
+ "JSON file -> '%s' not found. Please check the file path.",
538
+ json_path,
539
+ )
540
+ return False
541
+ except PermissionError:
542
+ self.logger.error(
543
+ "Missing permission to access the JSON file -> '%s'.",
364
544
  json_path,
365
545
  )
366
546
  return False
547
+ except OSError:
548
+ self.logger.error("An I/O error occurred!")
549
+ return False
550
+ except json.JSONDecodeError:
551
+ self.logger.error(
552
+ "Unable to decode JSON file -> '%s'",
553
+ json_path,
554
+ )
555
+ return False
556
+ except ValueError:
557
+ self.logger.error("Invalid JSON input -> %s", json_path)
558
+ return False
559
+ except AttributeError:
560
+ self.logger.error("Unexpected JSON data structure in file -> %s", json_path)
561
+ return False
562
+ except TypeError:
563
+ self.logger.error("Unexpected JSON data type in file -> %s", json_path)
564
+ return False
565
+ except KeyError:
566
+ self.logger.error("Missing key in JSON data in file -> %s", json_path)
567
+ return False
568
+
367
569
  return True
368
570
 
369
571
  # end method definition
370
572
 
371
573
  def save_json_data(
372
- self, json_path: str, orient: str = "records", preserve_index: bool = False
574
+ self,
575
+ json_path: str,
576
+ orient: str = "records",
577
+ preserve_index: bool = False,
578
+ index_column: str = "index",
579
+ compression: str | None = None,
373
580
  ) -> bool:
374
- """Save JSON data from DataFrame to file
581
+ """Save JSON data from data frame to file.
375
582
 
376
583
  Args:
377
- json_path (str): Path to the JSON file.
378
- orient (str, optional): Structure of the JSON
379
- preserve_index (bool, optional)
584
+ json_path (str): The path to where the JSON file should be safed.
585
+ orient (str, optional):
586
+ The structure of the JSON. Possible values:
587
+ * "records" (this is the default)
588
+ * "columns"
589
+ * "index"
590
+ * "table"
591
+ * "split"
592
+ preserve_index (bool, optional):
593
+ Defines if the index column of the data frame should be exported as well.
594
+ The default is False (index is not exported).
595
+ index_column (str, optional):
596
+ The Name of the column (i.e. JSON data field) that should
597
+ become the index in the loaded data frame. The default is "index".
598
+ compression (str | None):
599
+ Apply a compression:
600
+ * gzip (.gz)
601
+ * bz2 (.bz2)
602
+ * zip (.zip)
603
+ * xz (.xz)
604
+
380
605
  Returns:
381
- bool: False in case an error occured, True otherwise.
606
+ bool:
607
+ False in case an error occured, True otherwise.
608
+
382
609
  """
383
610
 
384
- if json_path is not None and os.path.exists(os.path.dirname(json_path)):
385
- # Load data from JSON file
386
- try:
387
- if self._df is not None:
388
- # index parameter is only allowed if orient has one of the following values:
389
- if (
390
- orient == "columns"
391
- or orient == "index"
392
- or orient == "table"
393
- or orient == "split"
394
- ):
395
- self._df.to_json(
396
- path_or_buf=json_path,
397
- index=preserve_index,
398
- orient=orient,
399
- indent=2,
400
- )
401
- else:
402
- self._df.to_json(path_or_buf=json_path, orient=orient, indent=2)
611
+ if not json_path:
612
+ self.logger.error(
613
+ "You have not specified a JSON path!",
614
+ )
615
+ return False
616
+
617
+ # If compression is enabled the file path should have
618
+ # the matching file name extension:
619
+ if compression:
620
+ suffix = "." + compression if compression != "gzip" else ".gz"
621
+ if not json_path.endswith(suffix):
622
+ json_path += suffix
623
+
624
+ # Save data to JSON file
625
+ try:
626
+ if self._df is not None:
627
+ if not os.path.exists(os.path.dirname(json_path)):
628
+ os.makedirs(os.path.dirname(json_path), exist_ok=True)
629
+
630
+ # index parameter is only allowed if orient has one of the following values:
631
+ if orient in ("columns", "index", "table", "split"):
632
+ self._df.to_json(
633
+ path_or_buf=json_path,
634
+ index=preserve_index,
635
+ orient=orient,
636
+ indent=2,
637
+ compression=compression,
638
+ date_format="iso",
639
+ )
640
+ # In this case we cannot use the index parameter as this would give this error:
641
+ # Value Error -> 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'
642
+ # So we create a new column that preserves the original row IDs from the index. The name of that column is given by the 'index_column' parameter.
643
+
644
+ elif preserve_index:
645
+ df_with_index = self._df.reset_index(
646
+ names=index_column,
647
+ inplace=False,
648
+ )
649
+ df_with_index.to_json(
650
+ path_or_buf=json_path,
651
+ orient=orient,
652
+ indent=2,
653
+ compression=compression,
654
+ date_format="iso",
655
+ )
403
656
  else:
404
- logger.warning("Data Frame is empty. Cannot write it to JSON")
405
- return False
406
- except FileNotFoundError:
407
- logger.error(
408
- "File -> '%s' not found. Please check the file path.", json_path
657
+ self._df.to_json(
658
+ path_or_buf=json_path,
659
+ orient=orient,
660
+ indent=2,
661
+ compression=compression,
662
+ date_format="iso",
663
+ )
664
+ else:
665
+ self.logger.warning(
666
+ "Data frame is empty. Cannot write it to JSON file -> '%s'.",
667
+ json_path,
409
668
  )
410
669
  return False
411
- except PermissionError:
412
- logger.error("Permission denied to access the file -> '%s'.", json_path)
413
- return False
414
- except IOError as e:
415
- logger.error("An I/O error occurred -> %s", str(e))
416
- return False
417
- except ValueError as e:
418
- logger.error("Value Error -> %s", str(e))
419
- return False
420
-
421
- else:
422
- logger.error(
423
- "Missing JSON file -> '%s' you have not specified a valid path!",
670
+ except FileNotFoundError:
671
+ self.logger.error(
672
+ "File -> '%s' not found. Please check the file path.",
673
+ json_path,
674
+ )
675
+ return False
676
+ except PermissionError:
677
+ self.logger.error(
678
+ "Permission denied to access the file -> '%s'.",
424
679
  json_path,
425
680
  )
426
681
  return False
682
+ except OSError:
683
+ self.logger.error("An I/O error occurred accessing file -> %s", json_path)
684
+ return False
685
+ except ValueError:
686
+ self.logger.error("Value error!")
687
+ return False
688
+
427
689
  return True
428
690
 
429
691
  # end method definition
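
A sketch combining the methods introduced in this hunk (strip, merge, and the compressed JSON round trip); the file path and column names are illustrative assumptions:

    import pandas as pd

    from pyxecm.helper.data import Data

    users = Data([{"id": 1, "name": " Alice "}, {"id": 2, "name": "Bob"}])
    roles = pd.DataFrame([{"id": 1, "role": "admin"}, {"id": 2, "role": "viewer"}])

    users.strip(columns=["name"])  # trims leading/trailing spaces in place
    merged = users.merge(merge_data=roles, on="id", how="left")  # returns the merged frame or None

    # save_json_data() appends the ".gz" suffix when gzip compression is requested;
    # load_json_data() is therefore given the full compressed file name.
    users.save_json_data(json_path="/tmp/users.json", compression="gzip")
    reloaded = Data()
    reloaded.load_json_data(json_path="/tmp/users.json.gz", compression="gzip")
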
@@ -438,27 +700,40 @@ class Data:
438
700
  names: list | None = None,
439
701
  na_values: list | None = None,
440
702
  ) -> bool:
441
- """Load Excel (xlsx) data into DataFrame. Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
442
- read from a local filesystem or URL. Supports an option to read a single sheet or a list of sheets.
703
+ """Load Excel (xlsx) data into Pandas data frame.
704
+
705
+ Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
706
+ read from a local filesystem or URL. Supports an option to read a
707
+ single sheet or a list of sheets.
443
708
 
444
709
  Args:
445
- xlsx_path (str): Path to the Excel file.
446
- sheet_names (list | str | int, optional): Name or Index of the sheet in the Excel workbook to load.
447
- If 'None' then all sheets will be loaded.
448
- If 0 then first sheet in workbook will be loaded (this is the Default)
449
- If string then this is interpreted as the name of the sheet to load.
450
- If a list is passed, this can be a list of index values (int) or
451
- a list of strings with the sheet names to load.
452
- usecols (list | str, optional): List of columns to load, specified by general column names in Excel,
453
- e.g. usecols='B:D', usecols=['A', 'C', 'F']
454
- skip_rows (int, optional): List of rows to skip on top of the sheet (e.g. to not read headlines)
455
- header (int | None, optional): Excel Row (0-indexed) to use for the column labels of the parsed DataFrame.
456
- If file contains no header row, then you should explicitly pass header=None.
457
- Default is 0.
458
- names (list): List of column names to use. Default is None
459
- na_values (list, optional): List of values in the Excel that should become the Pandas NA value.
710
+ xlsx_path (str):
711
+ The path to the Excel file to load.
712
+ sheet_names (list | str | int, optional):
713
+ Name or Index of the sheet in the Excel workbook to load.
714
+ If 'None' then all sheets will be loaded.
715
+ If 0 then first sheet in workbook will be loaded (this is the Default).
716
+ If string then this is interpreted as the name of the sheet to load.
717
+ If a list is passed, this can be a list of index values (int) or
718
+ a list of strings with the sheet names to load.
719
+ usecols (list | str, optional):
720
+ A list of columns to load, specified by general column names in Excel,
721
+ e.g. usecols='B:D', usecols=['A', 'C', 'F']
722
+ skip_rows (int, optional):
723
+ List of rows to skip on top of the sheet (e.g. to not read headlines)
724
+ header (int | None, optional):
725
+ Excel Row (0-indexed) to use for the column labels of the parsed data frame.
726
+ If file contains no header row, then you should explicitly pass header=None.
727
+ Default is 0.
728
+ names (list, optional):
729
+ A list of column names to use. Default is None.
730
+ na_values (list, optional):
731
+ A list of values in the Excel that should become the Pandas NA value.
732
+
460
733
  Returns:
461
- bool: False in case an error occured, True otherwise.
734
+ bool:
735
+ False in case an error occured, True otherwise.
736
+
462
737
  """
463
738
 
464
739
  if xlsx_path is not None and os.path.exists(xlsx_path):
@@ -473,16 +748,21 @@ class Data:
473
748
  names=names,
474
749
  na_values=na_values,
475
750
  )
476
- # if multiple sheets from an Excel workbook are loaded,
751
+ # If multiple sheets from an Excel workbook are loaded,
477
752
  # then read_excel() returns a dictionary. The keys are
478
- # the names of the sheets and the values are the Data Frames.
479
- # we handle this case as follows:
753
+ # the names of the sheets and the values are the data frames.
754
+ # As this class can only handle one data frame per object,
755
+ # We handle this case by concatenating the different sheets.
756
+ # If you don't want this make sure your Excel workbook has only
757
+ # one sheet or use the "sheet_name" parameter to select the one(s)
758
+ # you want to load.
480
759
  if isinstance(df, dict):
481
- logger.info("Loading multiple Excel sheets from the workbook!")
760
+ self.logger.info("Loading multiple Excel sheets from the workbook!")
482
761
  multi_sheet_df = pd.DataFrame()
483
- for sheet in df.keys():
762
+ for sheet in df:
484
763
  multi_sheet_df = pd.concat(
485
- [multi_sheet_df, df[sheet]], ignore_index=True
764
+ [multi_sheet_df, df[sheet]],
765
+ ignore_index=True,
486
766
  )
487
767
  df = multi_sheet_df
488
768
  if self._df is None:
@@ -490,89 +770,127 @@ class Data:
490
770
  else:
491
771
  self._df = pd.concat([self._df, df], ignore_index=True)
492
772
  except FileNotFoundError:
493
- logger.error(
773
+ self.logger.error(
494
774
  "Excel file -> '%s' not found. Please check the file path.",
495
775
  xlsx_path,
496
776
  )
497
777
  return False
498
778
  except PermissionError:
499
- logger.error(
500
- "Permission denied to access the Excel file -> '%s'.", xlsx_path
779
+ self.logger.error(
780
+ "Missing permission to access the Excel file -> '%s'.",
781
+ xlsx_path,
501
782
  )
502
783
  return False
503
- except IOError as e:
504
- logger.error(
505
- "An I/O error occurred -> %s while reading the Excel file -> %s",
506
- str(e),
784
+ except OSError:
785
+ self.logger.error(
786
+ "An I/O error occurred while reading the Excel file -> '%s'",
507
787
  xlsx_path,
508
788
  )
509
789
  return False
510
- except ValueError as e:
511
- logger.error(
512
- "Invalid Excel input -> %s in Excel file -> %s", str(e), xlsx_path
790
+ except ValueError:
791
+ self.logger.error(
792
+ "Invalid Excel input in file -> '%s'",
793
+ xlsx_path,
513
794
  )
514
795
  return False
515
- except AttributeError as e:
516
- logger.error("Unexpected data structure -> %s", str(e))
796
+ except AttributeError:
797
+ self.logger.error("Unexpected data structure in file -> %s", xlsx_path)
517
798
  return False
518
- except TypeError as e:
519
- logger.error("Unexpected data type -> %s", str(e))
799
+ except TypeError:
800
+ self.logger.error("Unexpected data type in file -> %s", xlsx_path)
520
801
  return False
521
- except KeyError as e:
522
- logger.error("Missing key in Excel data -> %s", str(e))
802
+ except KeyError:
803
+ self.logger.error("Missing key in Excel data in file -> %s", xlsx_path)
523
804
  return False
524
805
 
525
806
  else:
526
- logger.error(
527
- "Missing Excel file -> '%s' you have not specified a valid path!",
807
+ self.logger.error(
808
+ "Missing Excel file -> '%s'. You have not specified a valid path!",
528
809
  xlsx_path,
529
810
  )
530
811
  return False
812
+
531
813
  return True
532
814
 
533
815
  # end method definition
534
816
 
535
817
  def save_excel_data(
536
- self, excel_path: str, sheet_name: str = "Pandas Export", index: bool = False
818
+ self,
819
+ excel_path: str,
820
+ sheet_name: str = "Pandas Export",
821
+ index: bool = False,
822
+ columns: list | None = None,
537
823
  ) -> bool:
538
- """
539
- Save the DataFrame to an Excel file, with robust error handling and logging.
824
+ """Save the data frame to an Excel file, with robust error handling and logging.
540
825
 
541
826
  Args:
542
- excel_path (str): The file path to save the Excel file.
543
- sheet_name (str): The sheet name where data will be saved. Default is 'Sheet1'.
544
- index: Whether to write the row names (index). Default is False.
827
+ excel_path (str):
828
+ The file path to save the Excel file.
829
+ sheet_name (str):
830
+ The sheet name where data will be saved. Default is 'Sheet1'.
831
+ index (bool, optional):
832
+ Whether to write the row names (index). Default is False.
833
+ columns (list | None, optional):
834
+ A list of column names to write into the excel file.
835
+
836
+ Returns:
837
+ bool:
838
+ True = success, False = error.
839
+
545
840
  """
841
+
546
842
  try:
547
843
  # Check if the directory exists
548
844
  directory = os.path.dirname(excel_path)
549
845
  if directory and not os.path.exists(directory):
550
- raise FileNotFoundError(
551
- "The directory -> '%s' does not exist." % directory
552
- )
846
+ os.makedirs(directory)
847
+
848
+ # Validate columns if provided
849
+ if columns:
850
+ existing_columns = [col for col in columns if col in self._df.columns]
851
+ missing_columns = set(columns) - set(existing_columns)
852
+ if missing_columns:
853
+ self.logger.warning(
854
+ "The following columns do not exist in the data frame and cannot be saved to Excel -> %s",
855
+ ", ".join(missing_columns),
856
+ )
857
+ columns = existing_columns
553
858
 
554
- # Attempt to save the DataFrame to Excel
555
- self._df.to_excel(excel_path, sheet_name=sheet_name, index=index)
556
- logger.info("Data saved successfully to -> %s", excel_path)
859
+ # Attempt to save the data frame to Excel:
860
+ self._df.to_excel(
861
+ excel_path,
862
+ sheet_name=sheet_name,
863
+ index=index,
864
+ columns=columns or None, # Pass None if no columns provided
865
+ )
866
+ self.logger.info(
867
+ "Data frame saved successfully to Excel file -> '%s'.",
868
+ excel_path,
869
+ )
557
870
 
558
- except FileNotFoundError as e:
559
- logger.error("Error: %s", e)
871
+ except FileNotFoundError:
872
+ self.logger.error(
873
+ "Cannot write data frame to Excel file -> '%s'",
874
+ excel_path,
875
+ )
560
876
  return False
561
877
  except PermissionError:
562
- logger.error(
563
- "Error: Permission denied. You do not have permission to write to '%s'.",
878
+ self.logger.error(
879
+ "Cannot write data frame to Excel file -> '%s'",
564
880
  excel_path,
565
881
  )
566
882
  return False
567
- except ValueError as ve:
568
- logger.error("Error: Invalid data for Excel format -> %s", ve)
569
- return False
570
- except OSError as oe:
571
- logger.error("Error: OS error occurred while saving file -> %s", oe)
883
+ except ValueError:
884
+ self.logger.error(
885
+ "Cannot write data frame to Excel file -> '%s'",
886
+ excel_path,
887
+ )
572
888
  return False
573
- except Exception as e:
574
- # Catch-all for any other unexpected errors
575
- logger.error("An unexpected error occurred -> %s", e)
889
+ except OSError:
890
+ self.logger.error(
891
+ "Cannot write data frame to Excel file -> '%s'",
892
+ excel_path,
893
+ )
576
894
  return False
577
895
 
578
896
  return True
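
A short sketch of the Excel handling above, including the new optional column filter on export; the paths, sheet name, and column names are illustrative assumptions:

    from pyxecm.helper.data import Data

    data = Data()
    if data.load_excel_data(xlsx_path="/tmp/input.xlsx"):
        data.print_info(show_size=True, show_columns=True)
        # Only the listed columns are written; missing ones are logged and skipped.
        data.save_excel_data(
            excel_path="/tmp/export.xlsx",
            sheet_name="Export",
            columns=["name", "size"],
        )
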
@@ -580,130 +898,266 @@ class Data:
580
898
  # end method definition
581
899
 
582
900
  def load_csv_data(
583
- self, csv_path: str, delimiter: str = ",", encoding: str = "utf-8"
901
+ self,
902
+ csv_path: str,
903
+ delimiter: str = ",",
904
+ names: list | None = None,
905
+ header: int | None = 0,
906
+ usecols: list | None = None,
907
+ encoding: str = "utf-8",
584
908
  ) -> bool:
585
- """Load CSV (Comma separated values) data into DataFrame
909
+ """Load CSV (Comma separated values) data into data frame.
586
910
 
587
911
  Args:
588
- csv_path (str): Path to the CSV file.
589
- delimiter (str, optional, length = 1): chracter to delimit values. Default ="," (comma)
590
- encoding (str, optional): encoding of the file. Default = "utf-8".
912
+ csv_path (str):
913
+ The path to the CSV file.
914
+ delimiter (str, optional, length = 1):
915
+ The character used to delimit values. Default is "," (comma).
916
+ names (list | None, optional):
917
+ The list of column names. This is useful if file does not have a header line
918
+ but just the data.
919
+ header (int | None, optional):
920
+ The index of the header line. Default is 0 (first line). None indicates
921
+ that the file does not have a header line
922
+ usecols (list | None, optional):
923
+ There are three possible list values types:
924
+ 1. int:
925
+ These values are treated as column indices for columns to keep
926
+ (first column has index 0).
927
+ 2. str:
928
+ The names of the columns to keep. For this to work the file needs
929
+ either a header line (i.e. 'header != None') or the 'names'
930
+ parameter must be specified.
931
+ 3. bool:
932
+ The length of the list must match the number of columns. Only
933
+ columns that have a value of True are kept.
934
+ encoding (str, optional):
935
+ The encoding of the file. Default = "utf-8".
936
+
591
937
  Returns:
592
- bool: False in case an error occured, True otherwise.
938
+ bool:
939
+ False in case an error occured, True otherwise.
940
+
593
941
  """
594
942
 
595
- if csv_path is not None and os.path.exists(csv_path):
596
- # Load data from CSV file
943
+ if csv_path.startswith("http"):
944
+ # Download file from remote location specified by the packageUrl
945
+ # this must be a public place without authentication:
946
+ self.logger.debug("Download CSV file from URL -> '%s'.", csv_path)
947
+
597
948
  try:
598
- df = pd.read_csv(
599
- filepath_or_buffer=csv_path, delimiter=delimiter, encoding=encoding
600
- )
601
- if self._df is None:
602
- self._df = df
603
- else:
604
- self._df = pd.concat([self._df, df])
605
- except FileNotFoundError:
606
- logger.error(
607
- "CSV file -> '%s' not found. Please check the file path.", csv_path
608
- )
609
- return False
610
- except PermissionError:
611
- logger.error(
612
- "Permission denied to access the CSV file -> %s.", csv_path
613
- )
949
+ response = requests.get(url=csv_path, timeout=1200)
950
+ response.raise_for_status()
951
+ except requests.exceptions.HTTPError:
952
+ self.logger.error("HTTP error with -> %s", csv_path)
614
953
  return False
615
- except IOError as e:
616
- logger.error("An I/O error occurred -> %s", str(e))
954
+ except requests.exceptions.ConnectionError:
955
+ self.logger.error("Connection error with -> %s", csv_path)
617
956
  return False
618
- except ValueError as e:
619
- logger.error("Invalid CSV input -> %s", str(e))
957
+ except requests.exceptions.Timeout:
958
+ self.logger.error("Timeout error with -> %s", csv_path)
620
959
  return False
621
- except AttributeError as e:
622
- logger.error("Unexpected data structure -> %s", str(e))
623
- return False
624
- except TypeError as e:
625
- logger.error("Unexpected data type -> %s", str(e))
626
- return False
627
- except KeyError as e:
628
- logger.error("Missing key in CSV data -> %s", str(e))
960
+ except requests.exceptions.RequestException:
961
+ self.logger.error("Request error with -> %s", csv_path)
629
962
  return False
630
963
 
631
- else:
632
- logger.error(
633
- "Missing CSV file -> '%s' you have not specified a valid path!",
964
+ self.logger.debug(
965
+ "Successfully downloaded CSV file -> %s; status code -> %s",
634
966
  csv_path,
967
+ response.status_code,
635
968
  )
636
- return False
637
- return True
638
969
 
639
- # end method definition
970
+ # Convert bytes to a string using utf-8 and create a file-like object
971
+ csv_file = StringIO(response.content.decode(encoding))
640
972
 
641
- def load_xml_data(
642
- self, xml_path: str, xpath: str | None = None, xslt_path: str | None = None
643
- ) -> bool:
644
- """Load XML data into DataFrame
973
+ elif os.path.exists(csv_path):
974
+ self.logger.debug("Using local CSV file -> '%s'.", csv_path)
975
+ csv_file = csv_path
645
976
 
646
- Args:
647
- xml_path (str): Path to the XML file.
648
- xpath (str, optional): XPath to the elements we want to select
649
- xslt_path (str, optional): XSLT transformation file
650
- Returns:
651
- bool: False in cause an error occured, True otherwise.
652
- """
977
+ else:
978
+ self.logger.error(
979
+ "Missing CSV file -> '%s' you have not specified a valid path!",
980
+ csv_path,
981
+ )
982
+ return False
653
983
 
984
+ # Load data from CSV file or buffer
654
985
  try:
655
- df = pd.read_xml(path_or_buffer=xml_path, xpath=xpath, stylesheet=xslt_path)
656
- # Process the loaded data as needed
986
+ df = pd.read_csv(
987
+ filepath_or_buffer=csv_file,
988
+ delimiter=delimiter,
989
+ names=names,
990
+ header=header,
991
+ usecols=usecols,
992
+ encoding=encoding,
993
+ skipinitialspace=True,
994
+ )
657
995
  if self._df is None:
658
996
  self._df = df
659
997
  else:
660
998
  self._df = pd.concat([self._df, df])
661
- logger.info("XML file loaded successfully!")
662
- return True
663
999
  except FileNotFoundError:
664
- print("File not found.")
1000
+ self.logger.error(
1001
+ "CSV file -> '%s' not found. Please check the file path.",
1002
+ csv_path,
1003
+ )
665
1004
  return False
666
1005
  except PermissionError:
667
- logger.error("Permission denied to access the file -> %s.", xml_path)
1006
+ self.logger.error(
1007
+ "Permission denied to access the CSV file -> '%s'.",
1008
+ csv_path,
1009
+ )
668
1010
  return False
669
- except IOError as e:
670
- logger.error("An I/O error occurred -> %s", str(e))
1011
+ except OSError:
1012
+ self.logger.error("An I/O error occurred!")
671
1013
  return False
672
- except ValueError as e:
673
- logger.error("Invalid CSV input -> %s", str(e))
1014
+ except ValueError:
1015
+ self.logger.error("Invalid CSV input in file -> %s", csv_path)
674
1016
  return False
675
- except AttributeError as e:
676
- logger.error("Unexpected data structure -> %s", str(e))
1017
+ except AttributeError:
1018
+ self.logger.error("Unexpected data structure in file -> %s", csv_path)
677
1019
  return False
678
- except TypeError as e:
679
- logger.error("Unexpected data type -> %s", str(e))
1020
+ except TypeError:
1021
+ self.logger.error("Unexpected data type in file -> %s", csv_path)
680
1022
  return False
681
- except KeyError as e:
682
- logger.error("Missing key in CSV data -> %s", str(e))
1023
+ except KeyError:
1024
+ self.logger.error("Missing key in CSV data -> %s", csv_path)
683
1025
  return False
684
1026
 
1027
+ return True
1028
+
685
1029
  # end method definition
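
The CSV loader above now also accepts a public HTTP(S) URL; a hedged sketch with a placeholder URL:

    from pyxecm.helper.data import Data

    data = Data()
    # Remote CSV files are downloaded without authentication and parsed from memory;
    # local file paths continue to work as before.
    if data.load_csv_data(
        csv_path="https://example.com/export/users.csv",
        delimiter=",",
        encoding="utf-8",
    ):
        data.print_info(show_first=True, row_num=5)
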
686
1030
 
687
- def load_directory(self, path_to_root: str) -> bool:
688
- """Load directory structure into Pandas Data Frame
1031
+ def load_xml_data(
1032
+ self,
1033
+ xml_path: str,
1034
+ xpath: str | None = None,
1035
+ xslt_path: str | None = None,
1036
+ encoding: str = "utf-8",
1037
+ ) -> bool:
1038
+ """Load XML data into a Pandas data frame.
689
1039
 
690
1040
  Args:
691
- path_to_root (str): Path to the root element of the
692
- directory structure
1041
+ xml_path (str):
1042
+ The path to the XML file to load.
1043
+ xpath (str, optional):
1044
+ An XPath to the elements we want to select.
1045
+ xslt_path (str, optional):
1046
+ An XSLT transformation file to convert the XML data.
1047
+ encoding (str, optional):
1048
+ The encoding of the file. Default is UTF-8.
693
1049
 
694
1050
  Returns:
695
- bool: True = Success, False = Failure
1051
+ bool:
1052
+ False in case an error occured, True otherwise.
1053
+
696
1054
  """
697
1055
 
698
- try:
699
- # Check if the provided path is a directory
700
- if not os.path.isdir(path_to_root):
701
- logger.error(
702
- "The provided path -> '%s' is not a valid directory.", path_to_root
703
- )
704
- return False
1056
+ if xml_path.startswith("http"):
1057
+ # Download file from remote location specified by the packageUrl
1058
+ # this must be a public place without authentication:
1059
+ self.logger.debug("Download XML file from URL -> '%s'.", xml_path)
705
1060
 
706
- # Initialize a list to hold file information
1061
+ try:
1062
+ response = requests.get(url=xml_path, timeout=1200)
1063
+ response.raise_for_status()
1064
+ except requests.exceptions.HTTPError:
1065
+ self.logger.error("HTTP error with -> %s", xml_path)
1066
+ return False
1067
+ except requests.exceptions.ConnectionError:
1068
+ self.logger.error("Connection error with -> %s", xml_path)
1069
+ return False
1070
+ except requests.exceptions.Timeout:
1071
+ self.logger.error("Timeout error with -> %s", xml_path)
1072
+ return False
1073
+ except requests.exceptions.RequestException:
1074
+ self.logger.error("Request error with -> %s", xml_path)
1075
+ return False
1076
+
1077
+ self.logger.debug(
1078
+ "Successfully downloaded XML file -> '%s'; status code -> %s",
1079
+ xml_path,
1080
+ response.status_code,
1081
+ )
1082
+ # Convert bytes to a string using utf-8 and create a file-like object
1083
+ xml_file = StringIO(response.content.decode(encoding))
1084
+
1085
+ elif os.path.exists(xml_path):
1086
+ self.logger.debug("Using local XML file -> '%s'.", xml_path)
1087
+ xml_file = xml_path
1088
+
1089
+ else:
1090
+ self.logger.error(
1091
+ "Missing XML file -> '%s'. You have not specified a valid path or URL!",
1092
+ xml_path,
1093
+ )
1094
+ return False
1095
+
1096
+ # Load data from XML file or buffer
1097
+ try:
1098
+ df = pd.read_xml(
1099
+ path_or_buffer=xml_file,
1100
+ xpath=xpath,
1101
+ stylesheet=xslt_path,
1102
+ encoding=encoding,
1103
+ )
1104
+ # Process the loaded data as needed
1105
+ if self._df is None:
1106
+ self._df = df
1107
+ else:
1108
+ self._df = pd.concat([self._df, df])
1109
+ self.logger.info("XML file -> '%s' loaded successfully!", xml_path)
1110
+ except FileNotFoundError:
1111
+ self.logger.error("XML file -> '%s' not found.", xml_path)
1112
+ return False
1113
+ except PermissionError:
1114
+ self.logger.error(
1115
+ "Missing permission to access the XML file -> '%s'.",
1116
+ xml_path,
1117
+ )
1118
+ return False
1119
+ except OSError:
1120
+ self.logger.error("An I/O error occurred loading from -> %s", xml_path)
1121
+ return False
1122
+ except ValueError:
1123
+ self.logger.error("Invalid XML data in file -> %s", xml_path)
1124
+ return False
1125
+ except AttributeError:
1126
+ self.logger.error("Unexpected data structure in XML file -> %s", xml_path)
1127
+ return False
1128
+ except TypeError:
1129
+ self.logger.error("Unexpected data type in XML file -> %s", xml_path)
1130
+ return False
1131
+ except KeyError:
1132
+ self.logger.error("Missing key in XML file -> %s", xml_path)
1133
+ return False
1134
+
1135
+ return True
1136
+
1137
+ # end method definition
1138
+
1139
+ def load_directory(self, path_to_root: str) -> bool:
1140
+ """Load directory structure into Pandas data frame.
1141
+
1142
+ Args:
1143
+ path_to_root (str):
1144
+ Path to the root element of the directory structure.
1145
+
1146
+ Returns:
1147
+ bool: True = Success, False = Failure
1148
+
1149
+ """
1150
+
1151
+ try:
1152
+ # Check if the provided path is a directory
1153
+ if not os.path.isdir(path_to_root):
1154
+ self.logger.error(
1155
+ "The provided path -> '%s' is not a valid directory.",
1156
+ path_to_root,
1157
+ )
1158
+ return False
1159
+
1160
+ # Initialize a list to hold file information
707
1161
  data = []
708
1162
 
709
1163
  # Walk through the directory
@@ -715,55 +1169,88 @@ class Data:
715
1169
  path_parts = relative_path.split(os.sep)
716
1170
 
717
1171
  # Create a dictionary with the path parts and file details
718
- entry = {
719
- "level {}".format(i): part
720
- for i, part in enumerate(path_parts[:-1], start=1)
721
- }
722
- entry.update({"filename": path_parts[-1], "size": file_size})
1172
+ entry = {"level {}".format(i): part for i, part in enumerate(path_parts[:-1], start=1)}
1173
+
1174
+ entry.update(
1175
+ {
1176
+ "filename": path_parts[-1],
1177
+ "size": file_size,
1178
+ "path": path_parts[1:-1],
1179
+ "relative_path": relative_path,
1180
+ "download_dir": root,
1181
+ },
1182
+ )
723
1183
  data.append(entry)
724
1184
 
725
- # Create DataFrame from list of dictionaries
1185
+ # Create data frame from list of dictionaries:
726
1186
  self._df = pd.DataFrame(data)
727
1187
 
728
1188
  # Determine the maximum number of levels
729
1189
  max_levels = max((len(entry) - 2 for entry in data), default=0)
730
1190
 
731
- # Ensure all entries have the same number of levels
1191
+ # Ensure all entries have the same number of levels:
732
1192
  for entry in data:
733
1193
  for i in range(1, max_levels + 1):
734
1194
  entry.setdefault("level {}".format(i), "")
735
1195
 
736
- # Convert to DataFrame again to make sure all columns are consistent
1196
+ # Convert to data frame again to make sure all columns are consistent:
737
1197
  self._df = pd.DataFrame(data)
738
1198
 
739
- except NotADirectoryError as nde:
740
- print(f"Error: {nde}")
741
- except FileNotFoundError as fnfe:
742
- print(f"Error: {fnfe}")
743
- except PermissionError as pe:
744
- print(f"Error: {pe}")
1199
+ except NotADirectoryError:
1200
+ self.logger.error(
1201
+ "Provided path -> '%s' is not a directory!",
1202
+ path_to_root,
1203
+ )
1204
+ return False
1205
+ except FileNotFoundError:
1206
+ self.logger.error(
1207
+ "Provided path -> '%s' does not exist in file system!",
1208
+ path_to_root,
1209
+ )
1210
+ return False
1211
+ except PermissionError:
1212
+ self.logger.error(
1213
+ "Permission error accessing path -> '%s'!",
1214
+ path_to_root,
1215
+ )
1216
+ return False
745
1217
 
746
1218
  return True
747
1219
 
748
1220
  # end method definition
749
1221
 
750
- def load_xml_directory(self, path_to_root: str, xpath: str | None = None) -> bool:
751
- """Load directory structure into Pandas Data Frame
1222
+ def load_xml_directory(
1223
+ self,
1224
+ path_to_root: str,
1225
+ xpath: str | None = None,
1226
+ xml_files: list | None = None,
1227
+ ) -> bool:
1228
+ """Load XML files from a directory structure into Pandas data frame.
752
1229
 
753
1230
  Args:
754
- path_to_root (str): Path to the root element of the
755
- directory structure
756
- xpath (str, optional): XPath to the elements we want to select
1231
+ path_to_root (str):
1232
+ Path to the root element of the directory structure.
1233
+ xpath (str, optional):
1234
+ XPath to the XML elements we want to select.
1235
+ xml_files (list | None, optional):
1236
+ Names of the XML files to load from the directory.
757
1237
 
758
1238
  Returns:
759
- bool: True = Success, False = Failure
1239
+ bool:
1240
+ True = Success, False = Failure
1241
+
760
1242
  """
761
1243
 
1244
+ # Establish a default if None is passed via the parameter:
1245
+ if not xml_files:
1246
+ xml_files = ["docovw.xml"]
1247
+
762
1248
  try:
763
1249
  # Check if the provided path is a directory
764
1250
  if not os.path.isdir(path_to_root):
765
- logger.error(
766
- "The provided path -> '%s' is not a valid directory.", path_to_root
1251
+ self.logger.error(
1252
+ "The provided path -> '%s' is not a valid directory.",
1253
+ path_to_root,
767
1254
  )
768
1255
  return False
769
1256
 
@@ -774,36 +1261,223 @@ class Data:
774
1261
  file_size = os.path.getsize(file_path)
775
1262
  file_name = os.path.basename(file_path)
776
1263
 
777
- if file_name == "docovw.xml":
778
- logger.info(
779
- "Load XML file -> '%s' of size -> %s", file_path, file_size
1264
+ if file_name in xml_files:
1265
+ self.logger.info(
1266
+ "Load XML file -> '%s' of size -> %s from -> '%s'...",
1267
+ file_name,
1268
+ file_size,
1269
+ file_path,
780
1270
  )
781
1271
  success = self.load_xml_data(file_path, xpath=xpath)
782
1272
  if success:
783
- logger.info(
784
- "Successfully loaded XML file -> '%s'", file_path
1273
+ self.logger.info(
1274
+ "Successfully loaded XML file -> '%s'.",
1275
+ file_path,
785
1276
  )
786
1277
 
787
- except NotADirectoryError as nde:
788
- logger.error("Error -> %s", str(nde))
789
- except FileNotFoundError as fnfe:
790
- logger.error("Error -> %s", str(fnfe))
791
- except PermissionError as pe:
792
- logger.error("Error -> %s", str(pe))
1278
+ except NotADirectoryError:
1279
+ self.logger.error(
1280
+ "Provided path -> '%s' is not a directory",
1281
+ path_to_root,
1282
+ )
1283
+ return False
1284
+ except FileNotFoundError:
1285
+ self.logger.error(
1286
+ "Provided path -> '%s' does not exist in file system!",
1287
+ path_to_root,
1288
+ )
1289
+ return False
1290
+ except PermissionError:
1291
+ self.logger.error(
1292
+ "Missing permission to access path -> '%s'",
1293
+ path_to_root,
1294
+ )
1295
+ return False
1296
+
1297
+ return True
1298
+
1299
+ # end method definition
1300
+
1301
+ def load_web_links(
1302
+ self,
1303
+ url: str,
1304
+ common_data: dict | None = None,
1305
+ pattern: str = r"",
1306
+ ) -> list | None:
1307
+ """Get all linked file URLs on a given web page (url) that are following a given pattern.
1308
+
1309
+ Construct a list of dictionaries based on this. This method is a helper method for load_web() below.
1310
+
1311
+ Args:
1312
+ url (str):
1313
+ The web page URL.
1314
+ common_data (dict | None, optional):
1315
+ Fields that should be added to each dictionary item. Defaults to None.
1316
+ pattern (str, optional):
1317
+ Regular Expression. Defaults to r"".
1318
+
1319
+ Returns:
1320
+ list | None:
1321
+ List of links on the web page that are complying with the given regular expression.
1322
+
1323
+ """
1324
+
1325
+ try:
1326
+ response = requests.get(url, timeout=300)
1327
+ response.raise_for_status()
1328
+ except requests.RequestException:
1329
+ self.logger.error("Failed to retrieve page at %s", url)
1330
+ return []
1331
+
1332
+ # Find all file links (hyperlinks) on the page (no file extension assumed)
1333
+ # Example filename pattern: "al022023.public.005"
1334
+ file_links = re.findall(r'href="([^"]+)"', response.text)
1335
+ if not file_links:
1336
+ self.logger.warning("No file links found on the web page -> %s", url)
1337
+ return []
1338
+
1339
+ result_list = []
1340
+ base_url = url if url.endswith("/") else url + "/"
1341
+
1342
+ for link in file_links:
1343
+ data = common_data.copy() if common_data else {}
1344
+
1345
+ # Construct the full URL
1346
+ full_url = base_url + link.lstrip("/")
1347
+
1348
+ if pattern:
1349
+ # Filter by expected naming pattern for links
1350
+ match = re.search(pattern, link)
1351
+ if not match:
1352
+ continue
1353
+
1354
+ # Extract and assign groups if they exist
1355
+ # TODO(mdiefenb): these names are currently hard-coded
1356
+ # for the National Hurricane Center Dataset (NHC)
1357
+ if len(match.groups()) >= 1:
1358
+ data["Code"] = match.group(1).upper()
1359
+ if len(match.groups()) >= 2:
1360
+ data["Type"] = match.group(2)
1361
+ if len(match.groups()) >= 3:
1362
+ data["Message ID"] = match.group(3)
1363
+
1364
+ data["URL"] = full_url
1365
+ data["Filename"] = link
1366
+
1367
+ result_list.append(data)
1368
+
1369
+ return result_list
1370
+
1371
+ # end method definition
1372
+
1373
+ def load_web(
1374
+ self,
1375
+ values: list,
1376
+ value_name: str,
1377
+ url_templates: list,
1378
+ special_values: list | None = None,
1379
+ special_url_templates: dict | None = None,
1380
+ pattern: str = r"",
1381
+ ) -> bool:
1382
+ """Traverse years and bulletin types to collect all bulletin URLs.
1383
+
1384
+ Args:
1385
+ values (list):
1386
+ List of values to travers over
1387
+ value_name (str):
1388
+ Dictionary key to construct an item in combination with a value from values
1389
+ url_templates (list):
1390
+ URLs to travers per value. The URLs should contain one {} that is
1391
+ replace by the current value.
1392
+ special_values (list | None, optional):
1393
+ List of vales (a subset of the other values list)
1394
+ that we want to handle in a special way. Defaults to None.
1395
+ special_url_templates (dict | None, optional):
1396
+ URLs for the special values. Defaults to None.
1397
+ The dictionary keys are the special values. The
1398
+ dictionary values are lists of special URLs with placeholders.
1399
+ pattern (str, optional):
1400
+ Regular expression to find the proper links on the page. Defaults to r"".
1401
+
1402
+ Returns:
1403
+ bool:
1404
+ True for success, False in case of an error.
1405
+
1406
+ """
1407
+
1408
+ result_list = []
1409
+
1410
+ # We have two nested for loops below. The out traverses over all placeholder values.
1411
+ # These could be the calendar years, e.g. [2003,...,2024]
1412
+ # The inner for loop traverses over the list of specified URLs. We can have multiple for
1413
+ # each value.
1414
+
1415
+ # Do we have a list of placeholder values we want to iterate over?
1416
+ if values:
1417
+ # Traverse all values in the values list:
1418
+ for value in values:
1419
+ # Do we want a special treatment for this value (e.g. the current year)
1420
+ if value in special_values:
1421
+ self.logger.info("Processing special value -> '%s'...", value)
1422
+ if value not in special_url_templates and str(value) not in special_url_templates:
1423
+ self.logger.error(
1424
+ "Cannot find key -> '%s' in special URL templates dictionary -> %s! Skipping...",
1425
+ value,
1426
+ str(special_url_templates),
1427
+ )
1428
+ continue
1429
+ # If the dictionary uses string keys then we need to convert the value
1430
+ # to a string as well to avoid key errors:
1431
+ if str(value) in special_url_templates:
1432
+ value = str(value)
1433
+ special_url_template_list = special_url_templates[value]
1434
+ for special_url_template in special_url_template_list:
1435
+ # Now the value is inserted into the placeholder in the URL:
1436
+ special_url = special_url_template.format(value)
1437
+ common_data = {value_name: value} if value_name else None
1438
+ result_list += self.load_web_links(
1439
+ url=special_url,
1440
+ common_data=common_data,
1441
+ pattern=pattern,
1442
+ )
1443
+ else: # normal URLs
1444
+ self.logger.info("Processing value -> '%s'...", value)
1445
+ for url_template in url_templates:
1446
+ # Now the value is inserted into the placeholder in the URL:
1447
+ url = url_template.format(value)
1448
+ common_data = {value_name: value} if value_name else None
1449
+ result_list += self.load_web_links(
1450
+ url=url,
1451
+ common_data=common_data,
1452
+ pattern=pattern,
1453
+ )
1454
+ else:
1455
+ for url_template in url_templates:
1456
+ url = url_template.format(value)
1457
+ result_list += self.load_web_links(
1458
+ url=url,
1459
+ common_data=None,
1460
+ pattern=pattern,
1461
+ )
1462
+
1463
+ # Add the data list to the data frame:
1464
+ self.append(result_list)
793
1465
 
794
1466
  return True
795
1467
 
796
1468
  # end method definition
797
1469
 
798
1470
  def partitionate(self, number: int) -> list:
799
- """Partition a data frame into equally sized
800
- partions
1471
+ """Partition a data frame into equally sized partitions.
801
1472
 
802
1473
  Args:
803
- number (int): Number of partitions
1474
+ number (int):
1475
+ The number of desired partitions.
804
1476
 
805
1477
  Returns:
806
- list: List of partitions
1478
+ list:
1479
+ A list of created partitions.
1480
+
807
1481
  """
808
1482
 
809
1483
  # Calculate the approximate size of each partition
@@ -817,24 +1491,20 @@ class Data:
817
1491
  number = 1
818
1492
  remainder = 0
819
1493
 
820
- logger.info(
821
- "Data set has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
1494
+ self.logger.info(
1495
+ "Data frame has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
822
1496
  str(size),
823
1497
  str(number),
824
1498
  str(partition_size),
825
1499
  str(remainder),
826
1500
  )
827
1501
 
828
- # Initialize a list to store partitions
1502
+ # Initialize a list to store partitions:
829
1503
  partitions = []
830
1504
  start_index = 0
831
1505
 
832
- # Slice the DataFrame into equally sized partitions
1506
+ # Slice the data frame into equally sized partitions:
833
1507
  for i in range(number):
834
- # start_index = i * partition_size
835
- # end_index = (i + 1) * partition_size if i < number - 1 else None
836
- # partition = self._df.iloc[start_index:end_index]
837
- # partitions.append(partition)
838
1508
  # Calculate the end index for this partition
839
1509
  end_index = start_index + partition_size + (1 if i < remainder else 0)
840
1510
  partition = self._df.iloc[start_index:end_index]
@@ -849,34 +1519,44 @@ class Data:
849
1519
  """Partition a data frame based on equal values in a specified column.
850
1520
 
851
1521
  Args:
852
- column_name (str): The column name to partition by
1522
+ column_name (str):
1523
+ The column name to partition by.
853
1524
 
854
1525
  Returns:
855
- list | None: List of partitions or None in case of an error (e.g. column name does not exist).
1526
+ list | None:
1527
+ List of partitions or None in case of an error (e.g. column name does not exist).
1528
+
856
1529
  """
857
1530
 
858
1531
  if column_name not in self._df.columns:
859
- logger.error(
860
- "Column -> '%s' does not exist in the Data Frame. Data Frame has these columns -> %s",
1532
+ self.logger.error(
1533
+ "Cannot partitionate by column -> '%s'. Column does not exist in the data frame. Data frame has these columns -> %s",
861
1534
  column_name,
862
1535
  str(self._df.columns),
863
1536
  )
864
1537
  return None
865
1538
 
866
- # Separate rows with NaN or None values in the specified column
1539
+ # Separate rows with NaN or None values in the specified column:
867
1540
  nan_partitions = self._df[self._df[column_name].isna()]
1541
+
1542
+ # Keep only rows where the specified column has valid (non-NaN) values:
868
1543
  non_nan_df = self._df.dropna(subset=[column_name])
869
1544
 
870
- # Group by the specified column and create a list of DataFrames for each group
1545
+ # Group the non-NaN DataFrame by the specified column's values:
871
1546
  grouped = non_nan_df.groupby(column_name)
1547
+
1548
+ # Create a list of partitions (DataFrames) for each unique value in the column:
872
1549
  partitions = [group for _, group in grouped]
873
1550
 
874
- # Add each row with NaN or None values as its own partition
875
- for i in range(len(nan_partitions)):
876
- partitions.append(nan_partitions.iloc[[i]])
1551
+ # Add each row with NaN/None as its own partition
1552
+ # iterrows() returns each row as a Series. To convert it back to a DataFrame:
1553
+ # 1. .to_frame() turns the Series into a DataFrame, but with the original column names as rows.
1554
+ # 2. .T (transpose) flips it back, turning the original row into a proper DataFrame row.
1555
+ # This ensures that even rows with NaN values are treated as DataFrame partitions.
1556
+ partitions.extend([row.to_frame().T for _, row in nan_partitions.iterrows()])
877
1557
 
878
- logger.info(
879
- "Data Frame has been partitioned into -> %s partitions based on the values in column '%s'...",
1558
+ self.logger.info(
1559
+ "Data frame has been partitioned into -> %s partitions based on the values in column -> '%s'...",
880
1560
  str(len(partitions)),
881
1561
  column_name,
882
1562
  )
@@ -886,18 +1566,19 @@ class Data:
886
1566
  # end method definition
887
1567
 
888
1568
  def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
889
- """Remove dupclicate rows that have all fields in
890
- unique_fields in common.
1569
+ """Remove dupclicate rows that have all fields in unique_fields in common.
891
1570
 
892
1571
  Args:
893
- unique_fields (list): Defines the fields for which we want a unique
894
- combination.
895
- inplace (bool, optional): True if the deduplication happens in-place.
896
- Defaults to True.
1572
+ unique_fields (list):
1573
+ Defines the fields for which we want a unique combination for.
1574
+ inplace (bool, optional):
1575
+ True if the deduplication happens in-place. Defaults to True.
1576
+
897
1577
  Returns:
898
- pd.DataFrame | None: If inplace is False than a new deduplicatd DataFrame
899
- is returned. Otherwise the object is modified in place
900
- and self._df is returned.
1578
+ pd.DataFrame:
1579
+ If inplace is False than a new deduplicatd data frame is returned.
1580
+ Otherwise the object is modified in place and self._df is returned.
1581
+
901
1582
  """
902
1583
 
903
1584
  if inplace:
@@ -911,34 +1592,38 @@ class Data:
911
1592
 
912
1593
  # end method definition
913
1594
 
914
- def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
915
- """Sort the data frame based on one or multiple fields -
916
- either in place or return it as a new data frame (e.g. not modifying self._df)
1595
+ def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame | None:
1596
+ """Sort the data frame based on one or multiple fields.
1597
+
1598
+ Sorting can be either in place or return it as a new data frame
1599
+ (e.g. not modifying self._df).
917
1600
 
918
1601
  Args:
919
- sort_fields (list): Columns / fields to be used for sorting
920
- inplace (bool, optional): If the sorting should be inplace, i.e. modifying self._df.
921
- Defaults to True.
1602
+ sort_fields (list):
1603
+ The columns / fields to be used for sorting.
1604
+ inplace (bool, optional):
1605
+ If the sorting should be inplace, i.e. modifying self._df.
1606
+ Defaults to True.
1607
+
922
1608
  Returns:
923
- pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1609
+ pd.DataFrame | None:
1610
+ New data frame (if inplace = False) or self._df (if inplace = True).
1611
+ None in case of an error.
1612
+
924
1613
  """
925
1614
 
926
1615
  if self._df is None:
927
1616
  return None
928
1617
 
929
1618
  if not all(sort_field in self._df.columns for sort_field in sort_fields):
930
- logger.warning(
931
- "Not all of the given sort fields -> %s do exist in the Data Frame.",
1619
+ self.logger.warning(
1620
+ "Not all of the given sort fields -> %s do exist in the data frame.",
932
1621
  str(sort_fields),
933
1622
  )
934
- # Reduce the sort fields to those that really exist in the DataFrame:
935
- sort_fields = [
936
- sort_field
937
- for sort_field in sort_fields
938
- if sort_field in self._df.columns
939
- ]
940
- logger.warning(
941
- "Only these given sort fields -> %s do exist as columns in the Data Frame.",
1623
+ # Reduce the sort fields to those that really exist in the data frame:
1624
+ sort_fields = [sort_field for sort_field in sort_fields if sort_field in self._df.columns]
1625
+ self.logger.warning(
1626
+ "Only these given sort fields -> %s do exist as columns in the data frame.",
942
1627
  str(sort_fields),
943
1628
  )
944
1629
 
@@ -953,156 +1638,278 @@ class Data:
953
1638
 
954
1639
  # end method definition
955
1640
 
956
- def flatten(
957
- self,
958
- parent_field: str,
959
- flatten_fields: list,
960
- ):
961
- """Flatten a sub-dictionary by copying selected fields to the
962
- parent dictionary. This is e.g. useful for then de-duplicate
963
- a data set.
1641
+ def flatten(self, parent_field: str, flatten_fields: list, concatenator: str = "_") -> None:
1642
+ """Flatten a sub-dictionary by copying selected fields to the parent dictionary.
1643
+
1644
+ This is e.g. useful for then de-duplicate a data frame.
1645
+ To flatten a data frame makes sense in situation when a column used
1646
+ to have a list of dictionaries and got "exploded" (see explode_and_flatten()
1647
+ method below). In this case the column as dictionary values that then can
1648
+ be flattened.
964
1649
 
965
1650
  Args:
966
- parent_field (str): name of the field in the parent dictionary
967
- flatten_fields (list): fields in the sub-dictionary to copy
968
- into the parent dictionary.
1651
+ parent_field (str):
1652
+ Name prefix of the new column in the data frame. The flattened field
1653
+ names are added with a leading underscore.
1654
+ flatten_fields (list):
1655
+ Fields in the dictionary of the source column that are copied
1656
+ as new columns into the data frame.
1657
+ concatenator (str, optional):
1658
+ Character or string used to concatenate the parent field with the flattened field
1659
+ to create a unique name.
1660
+
969
1661
  """
970
1662
 
1663
+ # First do a sanity check if the data frame is not yet initialized.
1664
+ if self._df is None:
1665
+ self.logger.error(
1666
+ "The data frame is not initialized or empty. Cannot flatten field(s) -> '%s' in the data frame.",
1667
+ flatten_fields,
1668
+ )
1669
+ return
1670
+
1671
+ if parent_field not in self._df.columns:
1672
+ self.logger.warning(
1673
+ "The parent field -> '%s' cannot be flattened as it doesn't exist as column in the data frame!",
1674
+ parent_field,
1675
+ )
1676
+ return
1677
+
971
1678
  for flatten_field in flatten_fields:
972
- flat_field = parent_field + "_" + flatten_field
1679
+ flat_field = parent_field + concatenator + flatten_field
973
1680
  # The following expression generates a new column in the
974
1681
  # data frame with the name of 'flat_field'.
975
- # In the lambada function x is a dictionary that includes the subvalues
1682
+ # In the lambda function x is a dictionary that includes the subvalues
976
1683
  # and it returns the value of the given flatten field
977
1684
  # (if it exists, otherwise None). So x is self._df[parent_field], i.e.
978
1685
  # what the lambda function gets 'applied' on.
979
1686
  self._df[flat_field] = self._df[parent_field].apply(
980
- lambda x, sub_field=flatten_field: (
981
- x.get(sub_field, None) if isinstance(x, dict) else None
982
- )
1687
+ lambda x, sub_field=flatten_field: (x.get(sub_field, None) if isinstance(x, dict) else None),
983
1688
  )
984
1689
 
985
1690
  # end method definition
986
1691
 
987
1692
  def explode_and_flatten(
988
1693
  self,
989
- explode_field: str | list,
1694
+ explode_fields: str | list,
990
1695
  flatten_fields: list | None = None,
991
1696
  make_unique: bool = False,
992
1697
  reset_index: bool = False,
993
1698
  split_string_to_list: bool = False,
994
1699
  separator: str = ";,",
995
- ) -> pd.DataFrame:
996
- """Explode a substructure in the Data Frame
1700
+ ) -> pd.DataFrame | None:
1701
+ """Explode a substructure in the Pandas data frame.
997
1702
 
998
1703
  Args:
999
- explode_field (str | list): Field(s) to explode which each has/have a list structure.
1000
- Exploding multiple columns at once is possible. This delivers
1001
- a very different result compared to exploding one column after
1002
- the other!
1003
- flatten_fields (list): Fields in the exploded substructure to include
1004
- in the main dictionaries for easier processing.
1005
- make_unique (bool, optional): if True deduplicate the exploded data frame.
1006
- reset_index (bool, False): True = index is reset, False = Index is not reset
1007
- split_string_to_list (bool, optional): if True flatten the exploded data frame.
1008
- separator (str, optional): characters used to split the string values in the given column into a list
1704
+ explode_fields (str | list):
1705
+ Field(s) to explode. Each field to explode should have a list structure.
1706
+ Exploding multiple columns at once is possible. This delivers
1707
+ a very different result compared to exploding one column after the other!
1708
+ flatten_fields (list):
1709
+ Fields in the exploded substructure to include
1710
+ in the main dictionaries for easier processing.
1711
+ make_unique (bool, optional):
1712
+ If True, deduplicate the exploded data frame.
1713
+ reset_index (bool, False):
1714
+ If True, then the index is reset, False = Index is not reset.
1715
+ split_string_to_list (bool, optional):
1716
+ If True flatten the exploded data frame.
1717
+ separator (str, optional):
1718
+ Characters used to split the string values in the given column into a list.
1719
+
1009
1720
  Returns:
1010
- pd.DataFrame: Pointer to the Pandas DataFrame
1721
+ pd.DataFrame | None:
1722
+ Pointer to the Pandas data frame.
1723
+
1011
1724
  """
1012
1725
 
1013
- def update_column(row):
1014
- try:
1015
- if sub in row:
1016
- return row[sub]
1017
- except (IndexError, KeyError, ValueError):
1018
- return ""
1019
-
1020
- # Define a function to split a string into a list
1021
- def string_to_list(string: str | None) -> list:
1022
- # Do nothing if the string is already a list
1023
- if isinstance(string, list):
1024
- return_list = string
1025
- elif not string or pd.isna(string):
1026
- return_list = []
1027
- else:
1028
- # Use regular expression to split by comma, semicolon, or comma followed by space
1029
- return_list = re.split(rf"[{separator}]\s*", str(string))
1726
+ def update_column(row: pd.Series, sub: str) -> str:
1727
+ """Extract the value of a sub-column from a nested dictionary within a Pandas Series.
1728
+
1729
+ Args:
1730
+ row (pd.Series):
1731
+ A row from the data frame.
1732
+ sub (str):
1733
+ The sub-column name to extract.
1734
+
1735
+ Returns:
1736
+ str:
1737
+ The value of the sub-column, or an empty string if not found.
1738
+
1739
+ """
1740
+
1741
+ if isinstance(row, dict) and sub in row:
1742
+ return row[sub]
1743
+ return ""
1744
+
1745
+ # end def update_column()
1746
+
1747
+ def string_to_list(value: str) -> list:
1748
+ """Convert a string to a list by splitting it using a specified separator.
1749
+
1750
+ If the input is already a list, it is returned as-is. If the input is `None` or a missing value,
1751
+ an empty list is returned. Otherwise, the string is split into a list of substrings using
1752
+ the given separator. Leading and trailing spaces in the resulting substrings are removed.
1753
+
1754
+ Args:
1755
+ value (str):
1756
+ The input string to be converted into a list. Can also be a list, `None`,
1757
+ or a missing value (e.g., NaN).
1758
+
1759
+ Returns:
1760
+ list:
1761
+ A list of substrings if the input is a string, or an empty list if the input
1762
+ is `None` or a missing value. If the input is already a list, it is returned unchanged.
1763
+
1764
+ """
1765
+
1766
+ # Check if the value is already a list; if so, return it directly
1767
+ if isinstance(value, list):
1768
+ return value
1769
+
1770
+ # If the value is None or a missing value (e.g., NaN), return an empty list
1771
+ if not value or pd.isna(value):
1772
+ return []
1773
+
1774
+ # Use a regular expression to split the string by the separator
1775
+ # and remove leading/trailing spaces from each resulting substring
1776
+ return_list = re.split(rf"[{separator}]\s*", str(value))
1030
1777
 
1031
1778
  return return_list
1032
1779
 
1033
- if isinstance(explode_field, list):
1034
- logger.info("Explode multiple columns -> %s", str(explode_field))
1035
- elif isinstance(explode_field, str):
1036
- logger.info("Explode single column -> '%s'", explode_field)
1780
+ # end def string_to_list()
1781
+
1782
+ #
1783
+ # Start of main method:
1784
+ #
1785
+
1786
+ # First do a sanity check if the data frame is not yet initialized.
1787
+ if self._df is None:
1788
+ self.logger.error(
1789
+ "The data frame is not initialized or empty. Cannot explode data frame.",
1790
+ )
1791
+ return None
1792
+
1793
+ # Next do a sanity check for the given explode_field. It should
1794
+ # either be a string (single column name) or a list (multiple column names):
1795
+ if isinstance(explode_fields, list):
1796
+ self.logger.info("Exploding list of columns -> %s", str(explode_fields))
1797
+ elif isinstance(explode_fields, str):
1798
+ self.logger.info("Exploding single column -> '%s'", explode_fields)
1037
1799
  else:
1038
- logger.error(
1039
- "Illegal explode field(s) data type provided -> %s", type(explode_field)
1800
+ self.logger.error(
1801
+ "Illegal explode field(s) data type -> %s. Explode field must either be a string or a list of strings.",
1802
+ type(explode_fields),
1040
1803
  )
1041
1804
  return self._df
1042
1805
 
1043
- try:
1044
- # remove the sub dictionary that sometimes is introduced by
1045
- # XML loading. We just want the main part.
1046
- if "." in explode_field:
1047
- main = explode_field.split(".")[0]
1048
- sub = explode_field.split(".")[1]
1049
- self._df[main] = self._df[main].apply(update_column)
1050
- explode_field = main
1051
-
1052
- # Now that we have the right explode column
1053
- # we need to convert it to a list if it is inside a string (with delimiters)
1054
- if split_string_to_list:
1055
- logger.info(
1056
- "Split the string values of column -> '%s' into a list using separator -> '%s'",
1057
- explode_field,
1806
+ # Ensure explode_fields is a list for uniform processing:
1807
+ if isinstance(explode_fields, str):
1808
+ explode_fields = [explode_fields]
1809
+
1810
+ # Process nested field names with '.'
1811
+ processed_fields = []
1812
+ for field in explode_fields:
1813
+ # The "." indicates that the column has dictionary values:
1814
+ if "." in field:
1815
+ main, sub = field.split(".", 1)
1816
+ if main not in self._df.columns:
1817
+ self.logger.error(
1818
+ "The column -> '%s' does not exist in the data frame! Cannot explode it. Data frame has these columns -> %s",
1819
+ main,
1820
+ str(self._df.columns.tolist()),
1821
+ )
1822
+ continue
1823
+
1824
+ # Use update_column to extract the dictionary key specified by the sub value:
1825
+ self.logger.info(
1826
+ "Extracting dictionary value for key -> '%s' from column -> '%s'.",
1827
+ sub,
1828
+ main,
1829
+ )
1830
+ self._df[main] = self._df[main].apply(update_column, args=(sub,))
1831
+ processed_fields.append(main)
1832
+ else:
1833
+ processed_fields.append(field)
1834
+
1835
+ # Verify all processed fields exist in the data frame:
1836
+ missing_columns = [col for col in processed_fields if col not in self._df.columns]
1837
+ if missing_columns:
1838
+ self.logger.error(
1839
+ "The following columns are missing in the data frame and cannot be exploded -> %s. Data frame has these columns -> %s",
1840
+ missing_columns,
1841
+ str(self._df.columns.tolist()),
1842
+ )
1843
+ return self._df
1844
+
1845
+ # Handle splitting strings into lists if required:
1846
+ if split_string_to_list:
1847
+ for field in processed_fields:
1848
+ self.logger.info(
1849
+ "Splitting strings in column -> '%s' into lists using separator -> '%s'",
1850
+ field,
1058
1851
  separator,
1059
1852
  )
1060
1853
  # Apply the function to convert the string values in the column (give by the name in explode_field) to lists
1061
1854
  # The string_to_list() sub-method above also considers the separator parameter.
1062
- self._df[explode_field] = self._df[explode_field].apply(string_to_list)
1063
-
1064
- # Explode the field that has list values
1065
- self._df = self._df.explode(column=explode_field)
1066
- except KeyError:
1067
- logger.error("Column -> '%s' not found in Data Frame!", str(explode_field))
1855
+ self._df[field] = self._df[field].apply(string_to_list)
1856
+
1857
+ # Explode all specified columns at once.
1858
+ # explode() can either take a string field or a list of fields.
1859
+ # # It is VERY important to do the explosion of multiple columns together -
1860
+ # otherwise we get combinatorial explosion. Explosion of multiple columns 1-by-1
1861
+ # is VERY different from doing the explosion together!
1862
+ self.logger.info("Validated column(s) to explode -> %s", processed_fields)
1863
+ try:
1864
+ self._df = self._df.explode(
1865
+ column=processed_fields,
1866
+ ignore_index=reset_index,
1867
+ )
1068
1868
  except ValueError:
1069
- logger.error(
1070
- "Unable to explode the specified column -> '%s'!", str(explode_field)
1869
+ self.logger.error(
1870
+ "Error exploding columns -> %s",
1871
+ processed_fields,
1071
1872
  )
1873
+ return self._df
1072
1874
 
1073
1875
  if flatten_fields:
1074
- self.flatten(parent_field=explode_field, flatten_fields=flatten_fields)
1876
+ # Ensure that flatten() is called for each exploded column
1877
+ for field in processed_fields:
1878
+ self.flatten(parent_field=field, flatten_fields=flatten_fields)
1075
1879
 
1880
+ # Deduplicate rows if required
1076
1881
  if make_unique:
1077
1882
  self._df.drop_duplicates(subset=flatten_fields, inplace=True)
1078
1883
 
1884
+ # Reset index explicitly if not handled during explode
1079
1885
  if reset_index:
1080
- self._df.reset_index(inplace=True)
1886
+ self._df.reset_index(drop=True, inplace=True)
1081
1887
 
1082
1888
  return self._df
1083
1889
 
1084
1890
  # end method definition
1085
1891
 
1086
1892
  def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1087
- """Drop selected columns from the Data Frame
1893
+ """Drop selected columns from the Pandas data frame.
1088
1894
 
1089
1895
  Args:
1090
- column_names (list): list of column names to drop.
1091
- inplace (bool, optional): If the dropping should be inplace, i.e. modifying self._df.
1092
- Defaults to True.
1896
+ column_names (list):
1897
+ The list of column names to drop.
1898
+ inplace (bool, optional):
1899
+ Whether or not the dropping should be inplace, i.e. modifying self._df.
1900
+ Defaults to True.
1901
+
1093
1902
  Returns:
1094
- pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1903
+ pd.DataFrame:
1904
+ New data frame (if inplace = False) or self._df (if inplace = True)
1905
+
1095
1906
  """
1096
1907
 
1097
1908
  if not all(column_name in self._df.columns for column_name in column_names):
1098
- # Reduce the column names to those that really exist in the DataFrame:
1099
- column_names = [
1100
- column_name
1101
- for column_name in column_names
1102
- if column_name in self._df.columns
1103
- ]
1104
- logger.warning(
1105
- "Reduce to these columns -> %s that do exist in the Data Frame.",
1909
+ # Reduce the column names to those that really exist in the data frame:
1910
+ column_names = [column_name for column_name in column_names if column_name in self._df.columns]
1911
+ self.logger.info(
1912
+ "Drop columns -> %s from the data frame.",
1106
1913
  str(column_names),
1107
1914
  )
1108
1915
 
@@ -1116,25 +1923,26 @@ class Data:
1116
1923
  # end method definition
1117
1924
 
1118
1925
  def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1119
- """Keep only selected columns from the Data Frame. Drop the rest.
1926
+ """Keep only selected columns in the data frame. Drop the rest.
1120
1927
 
1121
1928
  Args:
1122
- column_names (list): list of column names to keep.
1123
- inplace (bool, optional): If the keeping should be inplace, i.e. modifying self._df.
1124
- Defaults to True.
1929
+ column_names (list):
1930
+ A list of column names to keep.
1931
+ inplace (bool, optional):
1932
+ If the keeping should be inplace, i.e. modifying self._df.
1933
+ Defaults to True.
1934
+
1125
1935
  Returns:
1126
- pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1936
+ pd.DataFrame:
1937
+ New data frame (if inplace = False) or self._df (if inplace = True).
1938
+
1127
1939
  """
1128
1940
 
1129
1941
  if not all(column_name in self._df.columns for column_name in column_names):
1130
- # Reduce the column names to those that really exist in the DataFrame:
1131
- column_names = [
1132
- column_name
1133
- for column_name in column_names
1134
- if column_name in self._df.columns
1135
- ]
1136
- logger.warning(
1137
- "Reduce to these columns -> %s that do exist in the Data Frame.",
1942
+ # Reduce the column names to those that really exist in the data frame:
1943
+ column_names = [column_name for column_name in column_names if column_name in self._df.columns]
1944
+ self.logger.info(
1945
+ "Reduce columns to keep to these columns -> %s that do exist in the data frame.",
1138
1946
  column_names,
1139
1947
  )
1140
1948
 
@@ -1152,272 +1960,797 @@ class Data:
1152
1960
 
1153
1961
  # end method definition
1154
1962
 
1155
- def cleanse(self, cleansings: dict):
1156
- """Cleanse data with regular expressions and upper/lower case conversion.
1963
+ def rename_column(self, old_column_name: str, new_column_name: str) -> bool:
1964
+ """Rename a data frame column.
1965
+
1966
+ Args:
1967
+ old_column_name (str):
1968
+ The old name of the column.
1969
+ new_column_name (str):
1970
+ The new name of the column.
1971
+
1972
+ Returns:
1973
+ bool:
1974
+ True = Success, False = Error
1975
+
1976
+ """
1977
+
1978
+ if self._df is None:
1979
+ return False
1980
+
1981
+ if old_column_name not in self._df.columns:
1982
+ self.logger.error(
1983
+ "Cannot rename column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
1984
+ old_column_name,
1985
+ str(self._df.columns),
1986
+ )
1987
+ return False
1988
+
1989
+ if new_column_name in self._df.columns:
1990
+ self.logger.error(
1991
+ "Cannot rename column -> '%s' to -> '%s'. New name does already exist as column in the data frame! Data frame has these columns -> %s",
1992
+ old_column_name,
1993
+ new_column_name,
1994
+ str(self._df.columns),
1995
+ )
1996
+ return False
1997
+
1998
+ self._df.rename(columns={old_column_name: new_column_name}, inplace=True)
1999
+
2000
+ return True
2001
+
2002
+ # end method definition
2003
+
2004
+ def is_dict_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
2005
+ """Safely checks if a column predominantly contains dictionary-like objects.
2006
+
2007
+ Args:
2008
+ column (pd.Series):
2009
+ The pandas Series (column) to check.
2010
+ threshold (float, optional):
2011
+ 0.0 < threshold <= 1.0. Float representation of the percentage.
2012
+ Default = 0.5 (50%).
2013
+
2014
+ Returns:
2015
+ bool:
2016
+ True if the column contains mostly dictionary-like objects, False otherwise.
2017
+
2018
+ """
2019
+
2020
+ if not isinstance(column, pd.Series):
2021
+ self.logger.error(
2022
+ "Expected Pandas series, but got -> %s",
2023
+ str(type(column)),
2024
+ )
2025
+ return False
2026
+ if not 0.0 < threshold <= 1.0:
2027
+ self.logger.error(
2028
+ "Threshold must be between 0.0 and 1.0, but got -> %s",
2029
+ str(threshold),
2030
+ )
2031
+ return False
2032
+
2033
+ # Drop null values (NaN or None) and check types of remaining values
2034
+ non_null_values = column.dropna()
2035
+ dict_count = non_null_values.apply(lambda x: isinstance(x, dict)).sum()
2036
+
2037
+ # If more than threshold % of non-null values are dictionaries, return True.
2038
+ # Else return False.
2039
+ return dict_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
2040
+
2041
+ # end method definition
2042
+
2043
+ def is_list_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
2044
+ """Safely checks if a column predominantly contains list-like objects.
2045
+
2046
+ Args:
2047
+ column (pd.Series):
2048
+ The pandas Series (column) to check.
2049
+ threshold (float, optional):
2050
+ 0.0 < threshold <= 1.0. Float representation of the percentage. Default = 0.5 (50%).
2051
+
2052
+ Returns:
2053
+ bool:
2054
+ True if the column contains list-like objects, False otherwise.
2055
+
2056
+ """
2057
+
2058
+ if not isinstance(column, pd.Series):
2059
+ self.logger.error(
2060
+ "Expected pandas series, but got -> %s",
2061
+ str(type(column)),
2062
+ )
2063
+ return False
2064
+ if not 0.0 < threshold <= 1.0:
2065
+ self.logger.error(
2066
+ "Threshold must be between 0.0 and 1.0, but got -> %s",
2067
+ str(threshold),
2068
+ )
2069
+ return False
2070
+
2071
+ # Drop null values (NaN or None) and check types of remaining values
2072
+ non_null_values = column.dropna()
2073
+ list_count = non_null_values.apply(lambda x: isinstance(x, list)).sum()
2074
+
2075
+ # If more than threshold % of non-null values are lists, return True.
2076
+ # Else return False.
2077
+ return list_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
2078
+
2079
+ # end method definition
2080
+
2081
+ def is_string_column(self, column: pd.Series) -> bool:
2082
+ """Determine if a Pandas series predominantly contains string values, ignoring NaN values.
1157
2083
 
1158
2084
  Args:
1159
- cleansings (dict): Dictionary with keys that equal the column names.
1160
- The dictionary values are dictionaries itself with
1161
- these fields:
1162
- * replacements (dict): name of a column in the data frame
1163
- * upper (bool): change the value to uppercase
1164
- * lower (bool): change the value to lowercase
1165
- Example:
1166
- cleansings = {
1167
- "airportName": {
1168
- "upper": true
1169
- "replacements" : {
1170
- "-": " ", # replace hypen with space
1171
- ",\s*": " ", # remove commas followed by on or more spaces with a single space
1172
- "\s+$": "", # remove trailing spaces at the end of the name
1173
- "^\s+": "", # remove spaces at the beginning of the name
1174
- }
1175
- "length": 10
1176
- }
1177
- "airportId": {
1178
- "upper": true
1179
- "replacements" : {
1180
- "K(.{3})": "\1", # if the airport has 4 charters and starts with a 'K' we remove the 'K'
1181
- "\/": "", # remove forward slashes - this helps to have consistency with N/A, NA, n/a, na
1182
- }
1183
- }
1184
- }
2085
+ column (pd.Series):
2086
+ The Pandas Series to check.
2087
+
2088
+ Returns:
2089
+ bool:
2090
+ True if all non-NaN values in the column are strings, False otherwise.
2091
+
1185
2092
  """
1186
2093
 
1187
- # Iterate over each column in regex_dict
2094
+ # Drop NaN values and check if remaining values are strings
2095
+ return column.dropna().map(lambda x: isinstance(x, str)).all()
2096
+
2097
+ # end method definition
2098
+
2099
+ def cleanse(self, cleansings: dict) -> None:
2100
+ """Cleanse data with regular expressions and upper/lower case conversions.
2101
+
2102
+ Args:
2103
+ cleansings (dict):
2104
+ Dictionary with keys that equal the column names.
2105
+ The dictionary values are dictionaries themselves with
2106
+ these fields:
2107
+ * replacements (dict): name of a column in the data frame
2108
+ * upper (bool, optional, default = False): change the value to uppercase
2109
+ * lower (bool, optional, default = False): change the value to lowercase
2110
+ * capitalize (bool, optional, default = False) - first character upper case, rest lower-case
2111
+ * title (bool, optional, default = False) - first character of each word upper case
2112
+ * length (int, optional, default = 0): truncate to max length
2113
+
2114
+ """
2115
+
2116
+ # Iterate over each column in the cleansing dictionary
1188
2117
  for column, cleansing in cleansings.items():
1189
- # "colum" is the name of the field we want to cleanse.
1190
- # "cleansing" is a dict with
2118
+ # Read the cleansing parameters:
2119
+ replacements = cleansing.get("replacements", {})
2120
+ upper = cleansing.get("upper", False)
2121
+ lower = cleansing.get("lower", False)
2122
+ capitalize = cleansing.get("capitalize", False)
2123
+ title = cleansing.get("title", False)
2124
+ length = cleansing.get("length", 0)
2125
+
2126
+ # Handle dict columns - we expect the column name to seperate
2127
+ # main field from sub field using a dot syntax (e.g., "column.subfield")
1191
2128
  if "." in column:
1192
- # Handle columns with subfields
1193
- main_field, sub_field = column.split(".")
1194
- if not main_field in self._df.columns:
2129
+ column, dict_key = column.split(".")
2130
+ if column not in self._df.columns:
2131
+ self.logger.error(
2132
+ "Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
2133
+ column,
2134
+ str(self._df.columns),
2135
+ )
1195
2136
  continue
1196
- # we use the additional parameters for lambda (beside x)
1197
- # to avoid linter warning W0640
1198
- self._df[main_field] = self._df[main_field].apply(
1199
- lambda x, sub_field=sub_field, cleansing=cleansing: self._cleanse_subfield(
2137
+ # Apply cleansing to dictionary values in the main column
2138
+ self.logger.info(
2139
+ "Cleansing for column -> '%s' has a subfield -> '%s' configured. Do cleansing for dictionary items with key -> '%s'...",
2140
+ column,
2141
+ dict_key,
2142
+ dict_key,
2143
+ )
2144
+ self._df[column] = self._df[column].apply(
2145
+ lambda x,
2146
+ dict_key=dict_key,
2147
+ replacements=replacements,
2148
+ upper=upper,
2149
+ lower=lower,
2150
+ capitalize=capitalize,
2151
+ title=title,
2152
+ length=length: self._cleanse_subfield(
1200
2153
  data=x,
1201
- sub_field=sub_field,
1202
- replacements=cleansing.get("replacements", {}),
1203
- upper=cleansing.get("upper", False),
1204
- lower=cleansing.get("lower", False),
1205
- length=cleansing.get("length", 0),
1206
- )
2154
+ dict_key=dict_key,
2155
+ replacements=replacements,
2156
+ upper=upper,
2157
+ lower=lower,
2158
+ capitalize=capitalize,
2159
+ title=title,
2160
+ length=length,
2161
+ ),
1207
2162
  )
1208
- else:
1209
- if not column in self._df.columns:
2163
+ # end if "." in column
2164
+ else: # the else case handles strings and list columns
2165
+ if column not in self._df.columns:
2166
+ self.logger.error(
2167
+ "Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
2168
+ column,
2169
+ str(self._df.columns),
2170
+ )
1210
2171
  continue
1211
2172
 
1212
- logger.debug("\nBEFORE:\n%s\n", self._df[column])
1213
-
1214
- if cleansing.get("upper", False) and self._df[column].dtype == "object":
1215
- self._df[column] = self._df[column].str.upper()
1216
- if cleansing.get("lower", False) and self._df[column].dtype == "object":
1217
- self._df[column] = self._df[column].str.lower()
1218
-
1219
- # Handle regular columns. regexp_pattern is on the left side
1220
- # of the colon, and replacement the string on the right side of
1221
- # the colon:
1222
- for regex_pattern, replacement in cleansing.get(
1223
- "replacements", {}
1224
- ).items():
1225
- if not regex_pattern:
1226
- logger.error("Empty search / regexp pattern!")
1227
- continue
1228
- # \b is a word boundary anchor in regular expressions.
1229
- # It matches a position where one side is a word character
1230
- # (like a letter or digit) and the other side is a non-word character
1231
- # (like whitespace or punctuation). It's used to match whole words.
1232
- # We want to have this to e.g. not replace "INT" with "INTERNATIONAL"
1233
- # if the word is already "INTERNATIONAL". It is important
1234
- # that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
1235
- # a regular expression but just a normal string.
1236
- # Check if the pattern does NOT contain any regex special characters
1237
- # (excluding dot and ampersand) and ONLY then use \b ... \b
1238
- # Special regexp characters include: ^ $ * + ? ( ) [ ] { } | \
1239
- if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
1240
- # Wrap with word boundaries for whole-word matching
1241
- regex_pattern = rf"\b{regex_pattern}\b"
1242
- self._df[column] = self._df[column].str.replace(
1243
- pat=regex_pattern, repl=replacement, regex=True
2173
+ # Handle string columns:
2174
+ if self.is_string_column(self._df[column]):
2175
+ # Apply cleansing operations on string column
2176
+ self.logger.info(
2177
+ "Column -> '%s' has string values. Do cleansing for string values...",
2178
+ column,
2179
+ )
2180
+ self._df[column] = self._df[column].apply(
2181
+ lambda x,
2182
+ replacements=replacements,
2183
+ upper=upper,
2184
+ lower=lower,
2185
+ capitalize=capitalize,
2186
+ title=title,
2187
+ length=length: (
2188
+ self._apply_string_cleansing(
2189
+ value=x,
2190
+ replacements=replacements,
2191
+ upper=upper,
2192
+ lower=lower,
2193
+ capitalize=capitalize,
2194
+ title=title,
2195
+ length=length,
2196
+ )
2197
+ if isinstance(x, str)
2198
+ else x
2199
+ ),
1244
2200
  )
1245
2201
 
1246
- if (
1247
- cleansing.get("length", 0) > 0
1248
- and self._df[column].dtype == "object"
1249
- ):
1250
- self._df[column] = self._df[column].str.slice(
1251
- 0, cleansing["length"]
2202
+ # Handle list columns:
2203
+ elif self.is_list_column(self._df[column]):
2204
+ # Handle list-like columns for this we iterate over each list item
2205
+ # and apply the cleansing by calling _apply_string_cleansing() for item:
2206
+ self.logger.info(
2207
+ "Column -> '%s' has list values. Do cleansing for each list item...",
2208
+ column,
2209
+ )
2210
+ self._df[column] = self._df[column].apply(
2211
+ lambda x,
2212
+ replacements=replacements,
2213
+ upper=upper,
2214
+ lower=lower,
2215
+ capitalize=capitalize,
2216
+ title=title,
2217
+ length=length: (
2218
+ [
2219
+ (
2220
+ self._apply_string_cleansing(
2221
+ value=item,
2222
+ replacements=replacements,
2223
+ upper=upper,
2224
+ lower=lower,
2225
+ capitalize=capitalize,
2226
+ title=title,
2227
+ length=length,
2228
+ )
2229
+ if isinstance(
2230
+ item,
2231
+ str,
2232
+ ) # we just change string list items
2233
+ else item
2234
+ )
2235
+ for item in x
2236
+ ]
2237
+ if isinstance(x, list)
2238
+ else x
2239
+ ),
2240
+ )
2241
+
2242
+ else:
2243
+ self.logger.error(
2244
+ "Column -> '%s' is not a string, list, or dict-like column. Skipping cleansing...",
2245
+ column,
2246
+ )
2247
+ # end else handling strings and lists
2248
+ # for column, cleansing in cleansings.items()
2249
+
2250
+ # end method definition
2251
+
2252
+ def _cleanse_dictionary(
2253
+ self,
2254
+ data: dict,
2255
+ dict_key: str,
2256
+ replacements: dict[str, str],
2257
+ upper: bool,
2258
+ lower: bool,
2259
+ capitalize: bool = False,
2260
+ title: bool = False,
2261
+ length: int = 0,
2262
+ ) -> dict:
2263
+ """Cleanse dictionary data within a single column value that has a given key.
2264
+
2265
+ Args:
2266
+ data (dict):
2267
+ The column dictionary value.
2268
+ dict_key (str):
2269
+ The dictionary key whose value should be cleansed in the row to cleanse.
2270
+ replacements (dict):
2271
+ Dictionary of regex replacements to apply to the subfield value.
2272
+ upper (bool):
2273
+ If True, convert value in subfield to upper-case.
2274
+ lower (bool):
2275
+ If True, convert value in subfield to lower-case.
2276
+ capitalize (bool, optional):
2277
+ If True, capitalize the first letter of the subfield value.
2278
+ title (bool, optional):
2279
+ If True, title-case the subfield value.
2280
+ length (int, optional):
2281
+ The maximum length for the subfield value.
2282
+
2283
+ Returns:
2284
+ dict:
2285
+ The updated data with the cleansing applied to the dictionary item with the given key.
2286
+
2287
+ """
2288
+
2289
+ if pd.isna(data):
2290
+ return data
2291
+
2292
+ if dict_key not in data:
2293
+ self.logger.warning(
2294
+ "The dictionary key -> '%s' (field) is not in the data frame row! Cleansing skipped!",
2295
+ dict_key,
2296
+ )
2297
+ return data
2298
+
2299
+ # 1. Read the value to be cleansed from the data dict:
2300
+ value = data[dict_key]
2301
+
2302
+ # 2. Apply string operations based on the type of the value (str, list, or dict)
2303
+
2304
+ if isinstance(value, str):
2305
+ # If the value is a string, apply the string operations directly
2306
+ value: str = self._apply_string_cleansing(
2307
+ value=value,
2308
+ replacements=replacements,
2309
+ upper=upper,
2310
+ lower=lower,
2311
+ capitalize=capitalize,
2312
+ title=title,
2313
+ length=length,
2314
+ )
2315
+ elif isinstance(value, list):
2316
+ # If the value is a list, apply string operations to each element
2317
+ value: list = [
2318
+ (
2319
+ self._apply_string_cleansing(
2320
+ value=item,
2321
+ replacements=replacements,
2322
+ upper=upper,
2323
+ lower=lower,
2324
+ capitalize=capitalize,
2325
+ title=title,
2326
+ length=length,
2327
+ )
2328
+ if isinstance(item, str)
2329
+ else item
2330
+ )
2331
+ for item in value
2332
+ ]
2333
+ elif isinstance(value, dict):
2334
+ # If the value is a dictionary, apply string operations to each value
2335
+ value: dict = {
2336
+ k: (
2337
+ self._apply_string_cleansing(
2338
+ value=v,
2339
+ replacements=replacements,
2340
+ upper=upper,
2341
+ lower=lower,
2342
+ capitalize=capitalize,
2343
+ title=title,
2344
+ length=length,
1252
2345
  )
2346
+ if isinstance(v, str)
2347
+ else v
2348
+ )
2349
+ for k, v in value.items()
2350
+ }
2351
+
2352
+ # 3. Write back the cleansed value to the data dict:
2353
+ data[dict_key] = value
1253
2354
 
1254
- logger.debug("\nAFTER:\n%s\n", self._df[column])
2355
+ return data
1255
2356
 
1256
2357
  # end method definition
1257
2358
 
1258
2359
  def _cleanse_subfield(
1259
2360
  self,
1260
- data: list | dict,
1261
- sub_field: str,
1262
- replacements: dict,
2361
+ data: dict | list,
2362
+ dict_key: str,
2363
+ replacements: dict[str, str],
1263
2364
  upper: bool,
1264
2365
  lower: bool,
2366
+ capitalize: bool = False,
2367
+ title: bool = False,
1265
2368
  length: int = 0,
1266
- ) -> list | dict:
1267
- """Helper function to cleanse subfield data
2369
+ ) -> dict | list:
2370
+ """Cleanse subfield data within a single column value.
2371
+
2372
+ This is NOT a pd.Series but either a dictionary or a list of dictionaries.
1268
2373
 
1269
2374
  Args:
1270
- data (list | dict): sub data - either a list of dictionaries or a dictionary
1271
- sub_field (str): defines which field in the sub data should be updated
1272
- regex_replacements (dict): Dictionary of regular expressions
1273
- upper (bool): if True transform value in subfield to upper-case
1274
- lower (bool): if True, transform value in subfield to lower-case
1275
- length (int, optional): maximum length of the strings
2375
+ data (dict | list):
2376
+ The column value. Can be a dictionary or a list of dictionaries
2377
+ dict_key (str):
2378
+ The dictionary key whose value should be cleansed in the data to cleanse.
2379
+ replacements (dict):
2380
+ Dictionary of regex replacements to apply to the subfield value.
2381
+ upper (bool):
2382
+ If True, convert value in subfield to upper-case.
2383
+ lower (bool):
2384
+ If True, convert value in subfield to lower-case.
2385
+ capitalize (bool, optional):
2386
+ If True, capitalize the first letter of the subfield value.
2387
+ title (bool, optional):
2388
+ If True, title-case the subfield value.
2389
+ length (int, optional):
2390
+ The maximum length for the subfield value.
2391
+
1276
2392
  Returns:
1277
- list | dict: Updated data
2393
+ dict | list:
2394
+ The updated data with the cleansing applied to the subfield.
2395
+
1278
2396
  """
1279
2397
 
1280
2398
  if isinstance(data, list):
1281
- # If data is a list, apply cleansing to each dictionary in the list
1282
- for i, item in enumerate(data):
1283
- if (
1284
- item is not None
1285
- and sub_field in item
1286
- and not pd.isnull(item[sub_field])
1287
- ):
1288
- if upper:
1289
- item[sub_field] = item[sub_field].upper()
1290
- elif lower:
1291
- item[sub_field] = item[sub_field].lower()
1292
- for regex_pattern, replacement in replacements.items():
1293
- if replacement:
1294
- regex_pattern = rf"\b{regex_pattern}\b"
1295
- item[sub_field] = re.sub(
1296
- regex_pattern, replacement, item[sub_field]
1297
- )
1298
- if length > 0:
1299
- item[sub_field] = item[sub_field][:length]
1300
- data[i] = item
1301
- elif isinstance(data, dict):
1302
- # If data is a dictionary, apply cleansing directly to the subfield
1303
- if sub_field in data and not pd.isnull(data[sub_field]):
1304
- if upper:
1305
- data[sub_field] = data[sub_field].upper()
1306
- elif lower:
1307
- data[sub_field] = data[sub_field].lower()
1308
- for regex_pattern, replacement in replacements.items():
1309
- if replacement:
1310
- regex_pattern = rf"\b{regex_pattern}\b"
1311
- data[sub_field] = re.sub(
1312
- regex_pattern, replacement, data[sub_field]
2399
+ data = [
2400
+ (
2401
+ self._cleanse_dictionary(
2402
+ data=item,
2403
+ dict_key=dict_key,
2404
+ replacements=replacements,
2405
+ upper=upper,
2406
+ lower=lower,
2407
+ capitalize=capitalize,
2408
+ title=title,
2409
+ length=length,
1313
2410
  )
1314
- if length > 0:
1315
- data[sub_field] = data[sub_field][:length]
2411
+ if item is not None and dict_key in item and not pd.isna(item[dict_key])
2412
+ else item
2413
+ )
2414
+ for item in data
2415
+ ]
2416
+ elif isinstance(data, dict):
2417
+ data = self._cleanse_dictionary(
2418
+ data=data,
2419
+ dict_key=dict_key,
2420
+ replacements=replacements,
2421
+ upper=upper,
2422
+ lower=lower,
2423
+ capitalize=capitalize,
2424
+ title=title,
2425
+ length=length,
2426
+ )
2427
+
1316
2428
  return data
1317
2429
 
1318
2430
  # end method definition
1319
2431
 
1320
- def filter(self, conditions: list, inplace: bool = True) -> pd.DataFrame:
1321
- """Filter the DataFrame based on (multiple) conditions.
2432
+ def _apply_string_cleansing(
2433
+ self,
2434
+ value: str,
2435
+ replacements: dict[str, str],
2436
+ upper: bool,
2437
+ lower: bool,
2438
+ capitalize: bool,
2439
+ title: bool,
2440
+ length: int,
2441
+ ) -> str | None:
2442
+ """Apply string operations (upper, lower, capitalize, title-case, replacements) to a string.
2443
+
2444
+ Args:
2445
+ value (str):
2446
+ The string value to which the operations will be applied.
2447
+ replacements (dict[str, str]):
2448
+ A dictionary of regular expression patterns (keys) and replacement strings (values) to apply to the string.
2449
+ upper (bool):
2450
+ If True, convert the string to uppercase.
2451
+ lower (bool):
2452
+ If True, convert the string to lowercase.
2453
+ capitalize (bool):
2454
+ If True, capitalize the first letter of the string and lowercase the rest. Default is False.
2455
+ title (bool):
2456
+ If True, convert the string to title-case (first letter of each word is capitalized). Default is False.
2457
+ length (int):
2458
+ If greater than 0, truncate the string to this length. Default is 0 (no truncation).
2459
+
2460
+ Returns:
2461
+ str | None:
2462
+ The updated string with all the applied operations. None in case an error occured.
2463
+
2464
+ Example:
2465
+ value = "hello world"
2466
+ replacements = {r"world": "there"}
2467
+ upper = True
2468
+ length = 5
2469
+
2470
+ result = _apply_string_cleansing(value, replacements, upper, length=length)
2471
+ # result would be "HELLO"
2472
+
2473
+ """
2474
+
2475
+ if not isinstance(
2476
+ value,
2477
+ str,
2478
+ ): # Only apply string operations if the value is a string
2479
+ return None
2480
+
2481
+ if upper:
2482
+ value = value.upper()
2483
+ if lower:
2484
+ value = value.lower()
2485
+ if capitalize:
2486
+ value = value.capitalize()
2487
+ if title:
2488
+ value = value.title()
2489
+
2490
+ # Handle regex replacements
2491
+ for regex_pattern, replacement in replacements.items():
2492
+ if regex_pattern:
2493
+ # Check if the pattern does NOT contain any regex special characters
2494
+ # (excluding dot and ampersand) and ONLY then use \b ... \b
2495
+ # Special regexp characters include: ^ $ * + ? ( ) | [ ] { } \
2496
+ if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
2497
+ # Wrap with word boundaries for whole-word matching
2498
+ # \b is a word boundary anchor in regular expressions.
2499
+ # It matches a position where one side is a word character
2500
+ # (like a letter or digit) and the other side is a non-word character
2501
+ # (like whitespace or punctuation). It's used to match whole words.
2502
+ # We want to have this to e.g. not replace "INT" with "INTERNATIONAL"
2503
+ # if the word is already "INTERNATIONAL". It is important
2504
+ # that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
2505
+ # a regular expression but just a normal string.
2506
+ # TODO: we may reconsider if re.escape() is required or not:
2507
+ regex_pattern = re.escape(regex_pattern)
2508
+ regex_pattern = rf"\b{regex_pattern}\b"
2509
+ try:
2510
+ value = re.sub(regex_pattern, replacement, value)
2511
+ except re.error:
2512
+ self.logger.error(
2513
+ "Invalid regex pattern -> '%s' in replacement processing!",
2514
+ regex_pattern,
2515
+ )
2516
+ continue
2517
+
2518
+ # Truncate to the specified length, starting from index 0
2519
+ if 0 < length < len(value):
2520
+ value = value[:length]
2521
+
2522
+ return value
2523
+
2524
+ # end method definition
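A minimal standalone sketch of the replacement and truncation behavior described above, with made-up values; the word-boundary wrapping keeps "INT" inside "INTERNATIONAL" untouched, as the comments in the method explain:

    import re

    value = "INT Sales INTERNATIONAL"
    replacements = {"INT": "INTERNATIONAL"}

    for pattern, repl in replacements.items():
        # plain patterns (no regex metacharacters) get wrapped in word boundaries
        if not re.search(r"[\\^$*+?()|[\]{}]", pattern):
            pattern = rf"\b{re.escape(pattern)}\b"
        value = re.sub(pattern, repl, value)

    print(value)        # INTERNATIONAL Sales INTERNATIONAL
    print(value[:13])   # with length=13 the result is truncated to "INTERNATIONAL"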
2525
+
2526
+ def filter(
2527
+ self,
2528
+ conditions: list,
2529
+ inplace: bool = True,
2530
+ reset_index: bool = True,
2531
+ ) -> pd.DataFrame | None:
2532
+ """Filter the data frame based on (multiple) conditions.
1322
2533
 
1323
2534
  Args:
1324
- conditions (list): Conditions are a list of dictionaries with 3 items:
1325
- * field (str): name of a column in the data frame
1326
- * value (str or list): expected value (filter criterium).
1327
- If it is a list then one of
1328
- the list elements must match the field value (OR)
1329
- * regex (bool): this flag controls if the value is interpreted as a
1330
- regular expression. If there is no regex item in the
1331
- dictionary then the default is False (= values is NOT regex).
1332
- If there are multiple conditions in the list each has to evaluate to True (AND)
1333
- inplace (bool, optional): Defines if the self._df is modified (inplace) or just
1334
- a new DataFrame is returned. Defaults to True.
2535
+ conditions (list):
2536
+ Conditions are a list of dictionaries with the following items:
2537
+ * field (str): The name of a column in the data frame
2538
+ * value (str or list):
2539
+ Expected value (filter criterium).
2540
+ If it is a list then one of the list elements must match the field value (OR)
2541
+ * equal (bool):
2542
+ Whether to test for equality or non-equality. If not specified, equal is treated as True.
2543
+ * regex (bool):
2544
+ This flag controls if the value is interpreted as a
2545
+ regular expression. If there is no regex item in the
2546
+ dictionary then the default is False (= value is NOT a regex).
2547
+ * enabled (bool):
2548
+ True or False. The filter is only applied if 'enabled = True'
2549
+ If there are multiple conditions in the list each has to evaluate to True (AND)
2550
+ inplace (bool, optional):
2551
+ Defines if the self._df is modified (inplace) or just
2552
+ a new data frame is returned. Defaults to True.
2553
+ reset_index (bool, optional):
2554
+ Filtering removes rows. If reset_index is True, the numbering
2555
+ of the index is recalculated after the filter has been applied.
2556
+
1335
2557
  Returns:
1336
- pd.DataFrame: new data frame or pointer to self._df (depending on the value of 'inplace')
2558
+ pd.DataFrame | None:
2559
+ A new data frame or pointer to self._df (depending on the value of 'inplace').
2560
+ None in case of an error.
2561
+
1337
2562
  """
1338
2563
 
1339
2564
  if self._df is None:
1340
- logger.error("DataFrame is not initialized.")
2565
+ self.logger.error("Data frame is not initialized.")
1341
2566
  return None
1342
2567
 
1343
2568
  if self._df.empty:
1344
- logger.error("DataFrame is empty.")
2569
+ self.logger.error("Data frame is empty.")
1345
2570
  return None
1346
2571
 
1347
- # first filtered_df is the full DataFreame.
1348
- # then it is subsequentially reduced by each condition
2572
+ # First filtered_df is the full data frame.
2573
+ # Then it is successively reduced by each condition
1349
2574
  # at the end it is just those rows that match all conditions.
1350
- filtered_df = self._df
2575
+ filtered_df = self._df if inplace else self._df.copy()
2576
+
2577
+ def list_matches(row: list, values: list) -> bool:
2578
+ """Check if any item in the 'values' list is present in the given 'row' list.
2579
+
2580
+ Args:
2581
+ row (list):
2582
+ A list of items from the data frame column.
2583
+ values (list):
2584
+ A list of values to check for in the 'row'.
2585
+
2586
+ Returns:
2587
+ bool:
2588
+ True if any item in 'values' is found in 'row', otherwise False.
2589
+
2590
+ """
2591
+
2592
+ return any(item in values for item in row)
2593
+
2594
+ def dict_matches(row: dict, key: str, values: list) -> bool:
2595
+ """Check if the value for the dictionary 'key' is in 'values'.
1351
2596
 
1352
- # We traverse a list of conditions. Each condition must evaluate to true
2597
+ Args:
2598
+ row (dict):
2599
+ A dictionary from the data frame column.
2600
+ key (str):
2601
+ The key to lookup in the dictionary.
2602
+ values (list):
2603
+ A list of values to check for in the 'row'.
2604
+
2605
+ Returns:
2606
+ bool:
2607
+ True, if the value for the dictionary key is in 'values', otherwise False.
2608
+
2609
+ """
2610
+
2611
+ if not row or key not in row:
2612
+ return False
2613
+
2614
+ return row[key] in values
2615
+
2616
+ # We traverse a list of conditions. Each condition must evaluate to True
1353
2617
  # otherwise the current workspace or document (i.e. the data set for these objects)
1354
- # will be skipped. The variable filtered_df is
2618
+ # will be skipped.
1355
2619
  for condition in conditions:
2620
+ # Check if the condition is enabled. If 'enabled' is not
2621
+ # in the condition dict then we assume it is enabled.
2622
+ if not condition.get("enabled", True):
2623
+ continue
1356
2624
  field = condition.get("field", None)
1357
2625
  if not field:
1358
- logger.error("Missing value for filter condition 'field' in payload!")
2626
+ self.logger.error(
2627
+ "Missing value for filter condition 'field' in payload!",
2628
+ )
1359
2629
  continue
2630
+ if "." in field:
2631
+ field, sub = field.split(".", 1)
2632
+ else:
2633
+ sub = None
2634
+
1360
2635
  if field not in self._df.columns:
1361
- logger.warning(
1362
- "Filter condition field -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
2636
+ self.logger.warning(
2637
+ "Filter condition field -> '%s' does not exist as column in the data frame! Data frame has these columns -> %s",
1363
2638
  field,
1364
2639
  str(self._df.columns),
1365
2640
  )
1366
- continue # Skip filtering for columns not present in DataFrame
2641
+ continue # Skip filtering for columns not present in data frame
2642
+
2643
+ regex = condition.get("regex", False)
2644
+ # We need the column to be of type string if we want to use regular expressions
2645
+ # so if the column is not yet a string we convert the column to string:
2646
+ if regex and filtered_df[field].dtype != "object":
2647
+ # Change type of column to string:
2648
+ filtered_df[field] = filtered_df[field].astype(str)
2649
+ filtered_df[field] = filtered_df[field].fillna("")
2650
+
1367
2651
  value = condition.get("value", None)
1368
- if not value:
1369
- logger.error(
1370
- "Missing filter value of for filter condition field -> '%s'!", field
2652
+ if value is None:
2653
+ # Support alternative syntax using plural.
2654
+ value = condition.get("values", None)
2655
+ if value is None:
2656
+ self.logger.error(
2657
+ "Missing filter value(s) for filter condition field -> '%s'!",
2658
+ field,
1371
2659
  )
1372
2660
  continue
1373
- regex = condition.get("regex", False)
1374
-
1375
- logger.info(
1376
- "Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
1377
- filtered_df.shape[0],
1378
- filtered_df.shape[1],
1379
- str(condition),
1380
- )
1381
-
1382
- filtered_dfs = []
1383
2661
 
1384
2662
  # if a single string is passed as value we put
1385
2663
  # it into a 1-item list to simplify the following code:
1386
2664
  if not isinstance(value, list):
1387
2665
  value = [value]
1388
2666
 
1389
- # multiple values are treated like a logical "or" condition
1390
- for value_item in value:
1391
- if regex:
1392
- filtered_dfs.append(
1393
- filtered_df[
1394
- ~filtered_df[field].isna()
1395
- & filtered_df[field].str.contains(value_item, regex=True)
1396
- ]
2667
+ # If all values in the condition are strings then we
2668
+ # want the column also to be of type string:
2669
+ if all(isinstance(v, str) for v in value):
2670
+ # Change type of column to string:
2671
+ # filtered_df[field] = filtered_df[field].astype(str)
2672
+ # filtered_df[field] = filtered_df[field].fillna("").astype(str)
2673
+ # filtered_df[field] = filtered_df[field].fillna("")
2674
+
2675
+ # When inplace == True, filtered_df is just a reference to self._df.
2676
+ # Using .loc[:, field] ensures that Pandas updates the column correctly in self._df.
2677
+ # When inplace == False, filtered_df is a full copy (self._df.copy() above),
2678
+ # so modifications remain in filtered_df.
2679
+ # .loc[:, field] ensures no SettingWithCopyWarning, since filtered_df is now a separate DataFrame.
2680
+ filtered_df.loc[:, field] = filtered_df[field].fillna("").astype(str)
2681
+
2682
+ self.logger.info(
2683
+ "Data frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
2684
+ str(filtered_df.shape[0]),
2685
+ str(filtered_df.shape[1]),
2686
+ str(condition),
2687
+ )
2688
+
2689
+ # Check if the column is boolean
2690
+ if pd.api.types.is_bool_dtype(filtered_df[field]):
2691
+ # Convert string representations of booleans to actual booleans
2692
+ value = [v.lower() in ["true", "1"] if isinstance(v, str) else bool(v) for v in value]
2693
+
2694
+ # Do we want to test for equality or non-equality?
2695
+ # For lists equality means: value is in the list
2696
+ # For lists non-equality means: value is NOT in the list
2697
+ test_for_equal = condition.get("equal", True)
2698
+
2699
+ # Check if the column contains only lists (every non-empty element in the column is a list).
2700
+ # `filtered_df[field]`: Access the column with the name specified in 'field'.
2701
+ # `.dropna()`: Drop None or NaN rows for the test.
2702
+ # `.apply(lambda x: isinstance(x, list))`: For each element in the column, check if it is a list.
2703
+ # `.all()`: Ensure that all elements in the column satisfy the condition of being a list.
2704
+ if filtered_df[field].dropna().apply(lambda x: isinstance(x, list)).all():
2705
+ if not test_for_equal:
2706
+ filtered_df = filtered_df[~filtered_df[field].apply(list_matches, values=value)]
2707
+ else:
2708
+ filtered_df = filtered_df[filtered_df[field].apply(list_matches, values=value)]
2709
+ # Check if the column contains only dictionaries (every non-empty element in the column is a dict).
2710
+ # `filtered_df[field]`: Access the column with the name specified in 'field'.
2711
+ # `.dropna()`: Drop None or NaN rows for the test.
2712
+ # `.apply(lambda x: isinstance(x, dict))`: For each element in the column, check if it is a dict.
2713
+ # `.all()`: Ensure that all elements in the column satisfy the condition of being a dictionary.
2714
+ elif filtered_df[field].dropna().apply(lambda x: isinstance(x, dict)).all():
2715
+ if not sub:
2716
+ self.logger.error(
2717
+ "Filtering on dictionary values need a key. This needs to be provided with 'field.key' syntax!",
1397
2718
  )
2719
+ continue
2720
+ if not test_for_equal:
2721
+ filtered_df = filtered_df[~filtered_df[field].apply(dict_matches, key=sub, values=value)]
1398
2722
  else:
1399
- result_df = filtered_df[
1400
- ~filtered_df[field].isna() & filtered_df[field].eq(value_item)
1401
- ]
1402
- if not result_df.empty:
1403
- filtered_dfs.append(result_df)
1404
- # end for values
1405
-
1406
- if not filtered_dfs:
1407
- logger.warning(
1408
- "Filter with field -> '%s' and value -> '%s' delivered an empty Data Frame",
1409
- field,
1410
- str(value),
1411
- )
1412
- filtered_df.drop(filtered_df.index, inplace=True)
2723
+ filtered_df = filtered_df[filtered_df[field].apply(dict_matches, key=sub, values=value)]
2724
+ # Check if the column has boolean values:
2725
+ elif pd.api.types.is_bool_dtype(filtered_df[field]):
2726
+ # For a boolean filter we can drop NA values:
2727
+ filtered_df = filtered_df.dropna(subset=[field])
2728
+ if not test_for_equal:
2729
+ filtered_df = filtered_df[~filtered_df[field].isin(value)]
2730
+ else:
2731
+ filtered_df = filtered_df[filtered_df[field].isin(value)]
2732
+ elif not regex:
2733
+ if pd.api.types.is_string_dtype(filtered_df[field]):
2734
+ filtered_df[field] = filtered_df[field].str.strip()
2735
+ if not test_for_equal:
2736
+ filtered_df = filtered_df[~filtered_df[field].isin(value)]
2737
+ else:
2738
+ filtered_df = filtered_df[filtered_df[field].isin(value)]
1413
2739
  else:
1414
- # Concatenate the filtered DataFrames for each value in the list
1415
- filtered_df = pd.concat(filtered_dfs, ignore_index=True)
1416
-
1417
- logger.info(
1418
- "Data Frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
1419
- filtered_df.shape[0],
1420
- filtered_df.shape[1],
2740
+ # Create a pure boolean pd.Series as a filter criterium:
2741
+ regex_condition = filtered_df[field].str.contains(
2742
+ "|".join(value),
2743
+ regex=True,
2744
+ na=False,
2745
+ )
2746
+ # Apply the boolean pd.Series named 'regex_condition' as
2747
+ # a filter - either non-negated or negated (using ~):
2748
+ filtered_df = filtered_df[~regex_condition] if not test_for_equal else filtered_df[regex_condition]
2749
+
2750
+ self.logger.info(
2751
+ "Data frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
2752
+ str(filtered_df.shape[0]),
2753
+ str(filtered_df.shape[1]),
1421
2754
  str(condition),
1422
2755
  )
1423
2756
  # end for condition
@@ -1425,23 +2758,29 @@ class Data:
1425
2758
  if inplace:
1426
2759
  self._df = filtered_df
1427
2760
 
2761
+ if reset_index:
2762
+ self._df.reset_index(inplace=True, drop=True)
2763
+
1428
2764
  return filtered_df
1429
2765
 
1430
2766
  # end method definition
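A minimal sketch of what a single pass of the condition loop does to a plain string column, using a made-up DataFrame and two conditions in the payload style described in the docstring (the real method additionally handles list, dictionary and boolean columns as shown above):

    import pandas as pd

    df = pd.DataFrame({"type": ["Customer", "Partner", "Supplier"], "country": ["DE", "US", "DE"]})

    # {"field": "type", "value": ["Customer", "Partner"]}  ->  isin() test
    df = df[df["type"].isin(["Customer", "Partner"])]

    # {"field": "country", "value": "^DE$", "regex": True, "equal": False}  ->  negated str.contains()
    regex_condition = df["country"].str.contains("|".join(["^DE$"]), regex=True, na=False)
    df = df[~regex_condition]

    print(df.reset_index(drop=True))  # only the Partner / US row remains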
1431
2767
 
1432
- def fill_na_in_column(self, column_name: str, default_value: str | int):
1433
- """Replace NA values in a column with a defined new default value
2768
+ def fill_na_in_column(self, column_name: str, default_value: str | int) -> None:
2769
+ """Replace NA values in a column with a defined new default value.
1434
2770
 
1435
2771
  Args:
1436
- column_name (str): name of the column in the DataFrame
1437
- default_value (str | int): value to replace NA with
2772
+ column_name (str):
2773
+ The name of the column in the data frame.
2774
+ default_value (str | int):
2775
+ The value to replace NA with.
2776
+
1438
2777
  """
1439
2778
 
1440
2779
  if column_name in self._df.columns:
1441
2780
  self._df[column_name] = self._df[column_name].fillna(value=default_value)
1442
2781
  else:
1443
- logger.error(
1444
- "Cannot replace NA values as column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
2782
+ self.logger.error(
2783
+ "Cannot replace NA values as column -> '%s' does not exist in the data frame! Available columns -> %s",
1445
2784
  column_name,
1446
2785
  str(self._df.columns),
1447
2786
  )
@@ -1449,16 +2788,19 @@ class Data:
1449
2788
  # end method definition
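A short sketch of the underlying pandas call (column name and default value are made up):

    import pandas as pd

    df = pd.DataFrame({"status": ["active", None, "inactive"]})
    df["status"] = df["status"].fillna(value="unknown")
    print(df["status"].tolist())  # ['active', 'unknown', 'inactive']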
1450
2789
 
1451
2790
  def fill_forward(self, inplace: bool) -> pd.DataFrame:
1452
- """Fill the missing cells appropriately by carrying forward
1453
- the values from the previous rows where necessary.
1454
- This has applications if a hierarchy is represented by
1455
- nested cells e.g. in an Excel sheet.
2791
+ """Fill the missing cells appropriately by carrying forward the values from the previous rows where necessary.
2792
+
2793
+ This has applications if a hierarchy is represented by
2794
+ nested cells e.g. in an Excel sheet.
1456
2795
 
1457
2796
  Args:
1458
- inplace (bool): Should the modification happen inplace or not.
2797
+ inplace (bool):
2798
+ Should the modification happen inplace or not.
1459
2799
 
1460
2800
  Returns:
1461
- pd.DataFrame: Resulting dataframe
2801
+ pd.DataFrame:
2802
+ The resulting data frame.
2803
+
1462
2804
  """
1463
2805
 
1464
2806
  # To convert an Excel representation of a folder structure with nested
@@ -1471,70 +2813,137 @@ class Data:
1471
2813
  # end method definition
1472
2814
 
1473
2815
  def lookup_value(
1474
- self, lookup_column: str, lookup_value: str, separator: str = "|"
1475
- ) -> pd.Series | None:
1476
- """Lookup a row that includes a lookup value in the value of a given column.
2816
+ self,
2817
+ lookup_column: str,
2818
+ lookup_value: str,
2819
+ separator: str = "|",
2820
+ single_row: bool = True,
2821
+ ) -> pd.Series | pd.DataFrame | None:
2822
+ """Lookup row(s) that includes a lookup value in the value of a given column.
1477
2823
 
1478
2824
  Args:
1479
- lookup_column (str): name of the column to search in
1480
- lookup_value (str): value to search for
1481
- separator (str): string list delimiter / separator
2825
+ lookup_column (str):
2826
+ The name of the column to search in.
2827
+ lookup_value (str):
2828
+ The value to search for.
2829
+ separator (str):
2830
+ The string list delimiter / separator. The pipe symbol | is the default
2831
+ as it is unlikely to appear in a normal string (unlike a plain comma).
2832
+ The separator is NOT looked for in the lookup_value but in the column that
2833
+ is given by lookup_column!
2834
+ single_row (bool, optional):
2835
+ This defines if we just return the first matching row if multiple matching rows
2836
+ are found. Default is True (= single row).
1482
2837
 
1483
2838
  Returns:
1484
- pd.Series | None: data frame row that matches or None if no match was found.
2839
+ pd.Series | pd.DataFrame | None:
2840
+ Data frame (multiple rows) or Series (row) that matches the lookup value.
2841
+ None if no match was found.
2842
+
1485
2843
  """
1486
2844
 
1487
- # Use the `apply` function to filter rows where the lookup value matches a whole item in the comma-separated list
1488
- def match_lookup_value(string_list: str) -> bool:
1489
- """Spilt delimiter-separated list into a python list
2845
+ # Use the `apply` function to filter rows where the lookup value matches a
2846
+ # whole item in the separator-divided list:
2847
+ def match_lookup_value(string_list: str | None) -> bool:
2848
+ """Check if the lookup value is in a string list.
2849
+
2850
+ For this the string list is converted to a python
2851
+ list. A separator is used for the splitting.
1490
2852
 
1491
2853
  Args:
1492
- string_list (str): delimiter-separated string list like "a, b, c" or "a | b | c"
2854
+ string_list (str):
2855
+ Delimiter-separated string list like "a, b, c" or "a | b | c"
1493
2856
 
1494
2857
  Returns:
1495
- bool: True if lookup_value is equal to one of the delimiter-separated terms
2858
+ bool:
2859
+ True if lookup_value is equal to one of the delimiter-separated terms.
2860
+
1496
2861
  """
2862
+
2863
+ if pd.isna(string_list): # Handle None/NaN safely
2864
+ return False
2865
+
1497
2866
  # Ensure that the string is a string
1498
2867
  string_list = str(string_list)
1499
2868
 
1500
- return lookup_value in [
1501
- item.strip() for item in string_list.split(separator)
1502
- ]
2869
+ return lookup_value in [item.strip() for item in string_list.split(separator)]
1503
2870
 
1504
- df = self._df
2871
+ # end method definition
1505
2872
 
1506
2873
  if self._df is None:
1507
2874
  return None
1508
2875
 
2876
+ df = self._df
2877
+
1509
2878
  if lookup_column not in self._df.columns:
1510
- logger.error(
1511
- "Column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
2879
+ self.logger.error(
2880
+ "Cannot lookup value in column -> '%s'. Column does not exist in the data frame! Data frame has these columns -> %s",
1512
2881
  lookup_column,
1513
2882
  str(self._df.columns),
1514
2883
  )
1515
2884
  return None
1516
2885
 
1517
2886
  # Fill NaN or None values in the lookup column with empty strings
1518
- df[lookup_column] = df[lookup_column].fillna("")
2887
+ # df[lookup_column] = df[lookup_column].fillna("")
2888
+
2889
+ # Use the `apply` function to filter rows where the lookup value is in row cell
2890
+ # of column given by lookup_column. match_lookup_value() is called with
2891
+ # the content of the individual cell contents:
2892
+ matched_rows = df[df[lookup_column].apply(match_lookup_value)]
2893
+
2894
+ # If nothing was found we return None:
2895
+ if matched_rows.empty:
2896
+ return None
1519
2897
 
1520
- # Use the `apply` function to filter rows where the lookup value is in the Synonyms list
1521
- matched_row = df[df[lookup_column].apply(match_lookup_value)]
2898
+ # If it is OK to have multiple matches (= multiple rows = pd.DataFrame),
2899
+ # we can just return matched_rows now, which should be a pd.DataFrame:
2900
+ if not single_row:
2901
+ return matched_rows
2902
+
2903
+ # Check if more than one row matches, and log a warning if so
2904
+ if len(matched_rows) > 1:
2905
+ self.logger.warning(
2906
+ "More than one match found for lookup value -> '%s' in column -> '%s'. Returning the first match.",
2907
+ lookup_value,
2908
+ lookup_column,
2909
+ )
1522
2910
 
1523
2911
  # Return the first matched row, if any
1524
- if not matched_row.empty:
1525
- return matched_row.iloc[0]
2912
+ return matched_rows.iloc[0]
1526
2913
 
1527
- return None
2914
+ # end method definition
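A standalone sketch of the separator-based matching performed by match_lookup_value(), using a made-up synonyms column and the default '|' separator:

    import pandas as pd

    df = pd.DataFrame({
        "name": ["Germany", "United States"],
        "synonyms": ["DE|DEU|Deutschland", "US|USA|United States of America"],
    })

    def match(cell, lookup_value="DEU", separator="|"):
        if pd.isna(cell):
            return False
        return lookup_value in [item.strip() for item in str(cell).split(separator)]

    matched_rows = df[df["synonyms"].apply(match)]
    print(matched_rows.iloc[0]["name"])  # Germany (first match, as with single_row=True)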
2915
+
2916
+ def set_value(self, column: str, value, condition: pd.Series | None = None) -> None: # noqa: ANN001
2917
+ """Set the value in the data frame based on a condition.
2918
+
2919
+ Args:
2920
+ column (str):
2921
+ The name of the column.
2922
+ value (Any):
2923
+ The value to set for those rows that fulfill the condition.
2924
+ condition (pd.Series, optional):
2925
+ This should be a boolean Series where each element is True or False,
2926
+ representing rows in the data frame that meet a certain condition.
2927
+ If None is provided then ALL rows get the 'value' in the given
2928
+ column.
2929
+
2930
+ """
2931
+
2932
+ if condition is None:
2933
+ self._df[column] = value # Set value unconditionally
2934
+ else:
2935
+ self._df.loc[condition, column] = value # Set value based on condition
1528
2936
 
1529
2937
  # end method definition
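A short sketch of both branches, conditional assignment via a boolean Series and unconditional assignment (column names and values are made up):

    import pandas as pd

    df = pd.DataFrame({"country": ["DE", "US", "DE"], "region": [None, None, None]})

    condition = df["country"] == "DE"     # boolean Series, one entry per row
    df.loc[condition, "region"] = "EMEA"  # value only for rows where the condition holds
    df["checked"] = True                  # condition=None case: value for all rows
    print(df)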
1530
2938
 
1531
2939
  def add_column(
1532
2940
  self,
1533
- source_column: str,
1534
- reg_exp: str,
1535
2941
  new_column: str,
1536
- prefix="",
1537
- suffix="",
2942
+ data_type: str = "string",
2943
+ source_column: str = "",
2944
+ reg_exp: str = "",
2945
+ prefix: str = "",
2946
+ suffix: str = "",
1538
2947
  length: int | None = None,
1539
2948
  group_chars: int | None = None,
1540
2949
  group_separator: str = ".",
@@ -1543,26 +2952,78 @@ class Data:
1543
2952
  """Add additional column to the data frame.
1544
2953
 
1545
2954
  Args:
1546
- source_column (str): name of the source column
1547
- reg_exp (str): regular expression to apply on the content of the source column
1548
- new_column (str): name of the column to add
1549
- prefix (str, optional): Prefix to add in front of the value. Defaults to "".
1550
- suffix (str, optional): Suffix to add at the end of the value. Defaults to "".
1551
- length (int | None, optional): Length to reduce to. Defaults to None (= unlimited).
1552
- group_chars (int | None, optional): group the resulting string in characters of group_chars. Defaults to None.
1553
- Usable e.g. for thousand seperator "."
1554
- group_separator (str, optional): Separator string for the grouping. Defaults to ".".
1555
- group_remove_leading_zero (bool, optional): Remove leading zeros from the groups. Defaults to True.
2955
+ new_column (str):
2956
+ The name of the column to add.
2957
+ data_type (str, optional):
2958
+ The data type of the new column.
2959
+ source_column (str, optional):
2960
+ The name of the source column.
2961
+ reg_exp (str, optional):
2962
+ A regular expression to apply on the content of the source column.
2963
+ prefix (str, optional):
2964
+ Prefix to add in front of the value. Defaults to "".
2965
+ suffix (str, optional):
2966
+ Suffix to add at the end of the value. Defaults to "".
2967
+ length (int | None, optional):
2968
+ Length to reduce to. Defaults to None (= unlimited).
2969
+ group_chars (int | None, optional):
2970
+ Group the resulting string into blocks of group_chars characters. Defaults to None.
2971
+ Usable e.g. for a thousands separator ".".
2972
+ group_separator (str, optional):
2973
+ Separator string for the grouping. Defaults to ".".
2974
+ group_remove_leading_zero (bool, optional):
2975
+ Remove leading zeros from the groups. Defaults to True.
1556
2976
 
1557
2977
  Returns:
1558
- bool: True = Success, False = Failure
2978
+ bool:
2979
+ True = Success, False = Failure
2980
+
1559
2981
  """
1560
2982
 
1561
2983
  if self._df is None:
1562
2984
  return False
1563
2985
 
2986
+ # Check that the new column does not yet exist
2987
+ if new_column in self._df.columns:
2988
+ self.logger.error(
2989
+ "New column -> '%s' does already exist in data frame! Cannot add it. Data frame has these columns -> %s",
2990
+ new_column,
2991
+ str(self._df.columns),
2992
+ )
2993
+ return False
2994
+
2995
+ # First we handle the very simple case of having no
2996
+ # source column and just adding an empty new column:
2997
+ if not source_column:
2998
+ self._df[new_column] = pd.Series(dtype=data_type)
2999
+ return True
3000
+
3001
+ # Check if the source column exists
3002
+ if source_column not in self._df.columns:
3003
+ self.logger.error(
3004
+ "Source column -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
3005
+ source_column,
3006
+ str(self._df.columns),
3007
+ )
3008
+ return False
3009
+
3010
+ # Validate the regex pattern
3011
+ try:
3012
+ re.compile(reg_exp) # Check if the pattern is a valid regex
3013
+ except re.error:
3014
+ self.logger.error(
3015
+ "Invalid regular expression -> %s. Cannot extract data for new column -> '%s'!",
3016
+ reg_exp,
3017
+ new_column,
3018
+ )
3019
+ return False
3020
+
3021
+ # Ensure the source column is of type string (convert it, if necessary)
3022
+ if self._df[source_column].dtype != "object":
3023
+ self._df[source_column] = self._df[source_column].astype(str)
3024
+
1564
3025
  # Use str.extract to apply the regular expression to the source column
1565
- # and then assign this modified colum to the variable extracted:
3026
+ # and then assign this modified column to the variable "extracted":
1566
3027
  extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)
1567
3028
 
1568
3029
  # Limit the result to the specified length
@@ -1571,9 +3032,9 @@ class Data:
1571
3032
 
1572
3033
  if group_chars is not None:
1573
3034
 
1574
- def process_grouping(x):
3035
+ def process_grouping(x) -> str | None: # noqa: ANN001
1575
3036
  if pd.isna(x):
1576
- return x
3037
+ return None
1577
3038
  # Split into groups
1578
3039
  groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
1579
3040
  if group_remove_leading_zero:
@@ -1594,21 +3055,36 @@ class Data:
1594
3055
 
1595
3056
  # end method definition
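A standalone sketch of the extraction and grouping steps (the regular expression, column names and the leading-zero handling are illustrative assumptions, since part of the grouping code is elided in this hunk):

    import pandas as pd

    df = pd.DataFrame({"material": ["MAT-00012345-X", "MAT-00067890-Y"]})

    # reg_exp step: extract the digits after "MAT-"
    extracted = df["material"].str.extract(pat=r"MAT-(\d+)", expand=False)

    # group_chars / group_separator step: blocks of 4 characters, leading zeros removed
    def group(x, group_chars=4, sep="."):
        if pd.isna(x):
            return None
        groups = [x[i:i + group_chars] for i in range(0, len(x), group_chars)]
        groups = [g.lstrip("0") or "0" for g in groups]
        return sep.join(groups)

    df["material_number"] = extracted.apply(group)
    print(df["material_number"].tolist())  # ['1.2345', '6.7890']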
1596
3057
 
1597
- def convert_to_lists(self, columns: list, delimiter: str = ","):
1598
- """Method to intelligently convert strings to lists, with a configurable delimiter,
1599
- ignoring delimiters inside quotes
3058
+ def convert_to_lists(self, columns: list, delimiter: str = ",") -> None:
3059
+ """Intelligently convert string values to list values, in defined data frame columns.
3060
+
3061
+ The delimiter to separate values in the string value can be configured.
3062
+ The method is ignoring delimiters that are inside quotes.
1600
3063
 
1601
3064
  Args:
1602
- columns (list): name of the columns whose values should be converted to lists.
1603
- It is expected that
1604
- delimiter (str, optional): Character that delimits list items. Defaults to ",".
3065
+ columns (list):
3066
+ The name of the columns whose values should be converted to lists.
3067
+ delimiter (str, optional):
3068
+ Character that delimits list items. Defaults to ",".
1605
3069
 
1606
3070
  Returns:
1607
3071
  None. self._df is modified in place.
3072
+
1608
3073
  """
1609
3074
 
1610
3075
  # Regex to split by the delimiter, ignoring those inside quotes or double quotes
1611
- def split_string_ignoring_quotes(s, delimiter):
3076
+ def split_string_ignoring_quotes(s: str, delimiter: str) -> list:
3077
+ """Split a string into a list at positions that have a delimiter character.
3078
+
3079
+ Args:
3080
+ s (str): the string to split
3081
+ delimiter (str): The single character that is used for splitting.
3082
+
3083
+ Returns:
3084
+ A list of the split values.
3085
+
3086
+ """
3087
+
1612
3088
  # Escaping the delimiter in case it's a special regex character
1613
3089
  delimiter = re.escape(delimiter)
1614
3090
  # Match quoted strings and unquoted delimiters separately
@@ -1617,27 +3093,84 @@ class Data:
1617
3093
 
1618
3094
  for col in columns:
1619
3095
  self._df[col] = self._df[col].apply(
1620
- lambda x: (
1621
- split_string_ignoring_quotes(x, delimiter)
1622
- if isinstance(x, str) and delimiter in x
1623
- else x
1624
- )
3096
+ lambda x: (split_string_ignoring_quotes(x, delimiter) if isinstance(x, str) and delimiter in x else x),
3097
+ )
3098
+
3099
+ # end method definition
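One possible standalone implementation of the quote-aware split (the exact regular expression used by the method is not part of this hunk, so the pattern below is an assumption that shows the intended behavior):

    import re

    def split_ignoring_quotes(s: str, delimiter: str = ",") -> list:
        # split on the delimiter unless it sits inside single or double quotes
        d = re.escape(delimiter)
        pattern = rf"""((?:[^{d}"']|"[^"]*"|'[^']*')+)"""
        return [part.strip() for part in re.findall(pattern, s)]

    print(split_ignoring_quotes('red, "green, dark", blue'))
    # ['red', '"green, dark"', 'blue']  (quotes are kept, the quoted comma is not split)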
3100
+
3101
+ def add_column_concat(
3102
+ self,
3103
+ source_columns: list,
3104
+ new_column: str,
3105
+ concat_char: str = "",
3106
+ upper: bool = False,
3107
+ lower: bool = False,
3108
+ capitalize: bool = False,
3109
+ title: bool = False,
3110
+ ) -> None:
3111
+ """Add a column as a concatenation of the values of multiple source columns.
3112
+
3113
+ Args:
3114
+ source_columns (list):
3115
+ The column names the list values are taken from.
3116
+ new_column (str):
3117
+ The name of the new column.
3118
+ concat_char (str, optional):
3119
+ Character to insert between the concatenated values. Default is "".
3120
+ upper (bool, optional):
3121
+ Convert result to uppercase if True.
3122
+ lower (bool, optional):
3123
+ Convert result to lowercase if True.
3124
+ capitalize (bool, optional):
3125
+ Capitalize the result if True.
3126
+ title (bool, optional):
3127
+ Convert result to title case if True.
3128
+
3129
+ Returns:
3130
+ None. self._df is modified in place.
3131
+
3132
+ """
3133
+
3134
+ def concatenate(row: pd.Series) -> str:
3135
+ # Comprehension to create a list from all source column values:
3136
+ concatenated = concat_char.join(
3137
+ [str(row[col]) for col in source_columns if pd.notna(row[col])],
1625
3138
  )
1626
3139
 
3140
+ # Apply case transformations based on parameters
3141
+ if upper:
3142
+ concatenated = concatenated.upper()
3143
+ elif lower:
3144
+ concatenated = concatenated.lower()
3145
+ elif capitalize:
3146
+ concatenated = concatenated.capitalize()
3147
+ elif title:
3148
+ concatenated = concatenated.title()
3149
+
+ # Return the row's concatenated value so DataFrame.apply() can fill the new column:
+ return concatenated
3150
+ # end method definition
3151
+
3152
+ self._df[new_column] = self._df.apply(concatenate, axis=1)
3153
+
1627
3154
  # end method definition
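A standalone sketch of the row-wise concatenation with made-up columns (note that the helper has to hand the concatenated value back, as in the return added above):

    import pandas as pd

    df = pd.DataFrame({"first": ["ada", None], "last": ["lovelace", "turing"]})

    def concatenate(row, source_columns=("first", "last"), concat_char=" "):
        value = concat_char.join(str(row[col]) for col in source_columns if pd.notna(row[col]))
        return value.title()  # title=True case

    df["full_name"] = df.apply(concatenate, axis=1)
    print(df["full_name"].tolist())  # ['Ada Lovelace', 'Turing']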
1628
3155
 
1629
- def add_column_list(self, source_columns: list, new_column: str):
1630
- """Add a column with list objects. The list items are taken from a list of
1631
- source columns (row by row).
3156
+ def add_column_list(self, source_columns: list, new_column: str) -> None:
3157
+ """Add a column with list objects.
3158
+
3159
+ The list items are taken from a list of source columns (row by row).
1632
3160
 
1633
3161
  Args:
1634
- source_columns (list): column names the list values are taken from
1635
- new_column (str): name of the new column
3162
+ source_columns (list):
3163
+ The column names the list values are taken from.
3164
+ new_column (str):
3165
+ The name of the new column.
3166
+
1636
3167
  Returns:
1637
3168
  None. self._df is modified in place.
3169
+
1638
3170
  """
1639
3171
 
1640
- def create_list(row):
3172
+ def create_list(row: pd.Series) -> list:
3173
+ # Comprehension to create a list from all source column values:
1641
3174
  return [row[col] for col in source_columns]
1642
3175
 
1643
3176
  self._df[new_column] = self._df.apply(create_list, axis=1)
@@ -1645,87 +3178,90 @@ class Data:
1645
3178
  # end method definition
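A short sketch of the same row-wise list building with plain pandas (made-up columns):

    import pandas as pd

    df = pd.DataFrame({"phone": ["123", "456"], "mobile": ["789", "012"]})
    df["all_numbers"] = df.apply(lambda row: [row[col] for col in ["phone", "mobile"]], axis=1)
    print(df["all_numbers"].tolist())  # [['123', '789'], ['456', '012']]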
1646
3179
 
1647
3180
  def add_column_table(
1648
- self, source_columns: list, new_column: str, delimiter: str = ","
1649
- ):
1650
- """Add a column with tabular objects (list of dictionaris). The
1651
- source columns should include lists. The resulting dictionary
1652
- keys are the column names for the source columns.
1653
-
1654
- Example:
1655
- X[1] = 1, 2, 3
1656
- Y[1] = A, B, C
1657
- X[2] = 4, 5, 6
1658
- Y[2] = D, E, F
1659
-
1660
- Table[1] = [
1661
- {
1662
- "X": "1"
1663
- "Y": "A"
1664
- },
1665
- {
1666
- "X": "2"
1667
- "Y": "B"
1668
- }
1669
- {
1670
- "X": "3"
1671
- "Y": "C"
1672
- }
1673
- ]
1674
- Table[2] = [
1675
- {
1676
- "X": "4"
1677
- "Y": "D"
1678
- },
1679
- {
1680
- "X": "5"
1681
- "Y": "E"
1682
- }
1683
- {
1684
- "X": "6"
1685
- "Y": "F"
1686
- }
1687
- ]
3181
+ self,
3182
+ source_columns: list,
3183
+ new_column: str,
3184
+ delimiter: str = ",",
3185
+ ) -> None:
3186
+ """Add a column with tabular objects (list of dictionaries).
3187
+
3188
+ The source columns should include lists. The resulting dictionary
3189
+ keys are the column names for the source columns.
3190
+
3191
+ Example (["X", "Y"] are the source_columns, "Table" is the new_column):
3192
+ X[1] = [1, 2, 3] # row 1
3193
+ Y[1] = ["A", "B", "C"] # row 1
3194
+ X[2] = [4, 5, 6] # row 2
3195
+ Y[2] = ["D", "E", "F"] # row 2
3196
+
3197
+ Table[1] = [
3198
+ {
3199
+ "X": "1"
3200
+ "Y": "A"
3201
+ },
3202
+ {
3203
+ "X": "2"
3204
+ "Y": "B"
3205
+ }
3206
+ {
3207
+ "X": "3"
3208
+ "Y": "C"
3209
+ }
3210
+ ]
3211
+ Table[2] = [
3212
+ {
3213
+ "X": "4"
3214
+ "Y": "D"
3215
+ },
3216
+ {
3217
+ "X": "5"
3218
+ "Y": "E"
3219
+ }
3220
+ {
3221
+ "X": "6"
3222
+ "Y": "F"
3223
+ }
3224
+ ]
1688
3225
 
1689
3226
  Args:
1690
- source_columns (list): column names the list values are taken from
1691
- new_column (str): name of the new column
1692
- delimiter (str, optional): Character that delimits list items. Defaults to ",".
3227
+ source_columns (list):
3228
+ The column names the list values are taken from.
3229
+ new_column (str):
3230
+ The name of the new column.
3231
+ delimiter (str, optional):
3232
+ Character that delimits list items. Defaults to ",".
1693
3233
 
1694
3234
  Returns:
1695
3235
  None. self._df is modified in place.
3236
+
1696
3237
  """
1697
3238
 
1698
3239
  # Call the convert_to_lists method to ensure the columns are converted
1699
3240
  self.convert_to_lists(columns=source_columns, delimiter=delimiter)
1700
3241
 
1701
3242
  # Sub-method to pad lists to the same length
1702
- def pad_list(lst: list, max_len: int):
3243
+ def pad_list(lst: list, max_len: int) -> list:
1703
3244
  return lst + [None] * (max_len - len(lst))
1704
3245
 
1705
- def create_table(row) -> list:
1706
- max_len = max(
1707
- len(row[col]) if isinstance(row[col], list) else 1
1708
- for col in source_columns
1709
- )
3246
+ def create_table(row: pd.Series) -> list:
3247
+ max_len = max(len(row[col]) if isinstance(row[col], list) else 1 for col in source_columns)
1710
3248
 
1711
- # Pad lists to the maximum length, leave scalars as they are
3249
+ # Pad lists to the maximum length, leave scalar values as they are
1712
3250
  for col in source_columns:
1713
3251
  if isinstance(row[col], list):
1714
3252
  row[col] = pad_list(row[col], max_len)
3253
+ elif not pd.isna(row[col]):
3254
+ row[col] = [
3255
+ row[col],
3256
+ ] * max_len # Repeat scalar value to match the max length
1715
3257
  else:
1716
- if not pd.isna(row[col]):
1717
- row[col] = [
1718
- row[col]
1719
- ] * max_len # Repeat scalar to match the max length
1720
- else:
1721
- row[col] = [None] * max_len
1722
- # Create a list of dictionaries for each row
1723
- table = []
1724
- for i in range(max_len):
1725
- table.append({col: row[col][i] for col in source_columns})
3258
+ row[col] = [None] * max_len
3259
+ # Create a list of dictionaries for each row:
3260
+ table = [{col: row[col][i] for col in source_columns} for i in range(max_len)]
3261
+
1726
3262
  return table
1727
3263
 
1728
- # Apply the function to create a new column with a table
3264
+ # Apply the function to create a new column with table values:
1729
3265
  self._df[new_column] = self._df.apply(create_table, axis=1)
1730
3266
 
1731
3267
  # end method definition
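A standalone sketch of the padding and dictionary building for one row, mirroring the docstring example (made-up DataFrame; the second row shows how a scalar value is repeated to the list length):

    import pandas as pd

    df = pd.DataFrame({
        "X": [[1, 2, 3], [4, 5, 6]],
        "Y": [["A", "B", "C"], "D"],  # second row: scalar value
    })

    def create_table(row, source_columns=("X", "Y")):
        max_len = max(len(row[c]) if isinstance(row[c], list) else 1 for c in source_columns)
        padded = {}
        for c in source_columns:
            v = row[c]
            if isinstance(v, list):
                padded[c] = v + [None] * (max_len - len(v))
            elif not pd.isna(v):
                padded[c] = [v] * max_len
            else:
                padded[c] = [None] * max_len
        return [{c: padded[c][i] for c in source_columns} for i in range(max_len)]

    df["Table"] = df.apply(create_table, axis=1)
    print(df["Table"][1])  # [{'X': 4, 'Y': 'D'}, {'X': 5, 'Y': 'D'}, {'X': 6, 'Y': 'D'}]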