pyxecm 1.4__py3-none-any.whl → 1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyxecm might be problematic.

pyxecm/helper/data.py ADDED
@@ -0,0 +1,1731 @@
1
+ """
2
+ Data Module to implement functions to leverage Pandas to
3
+ manipulate data structures read for bulk generation of Extended ECM items.
4
+
5
+ This code implements a class called Data that wraps
6
+ a Pandas DataFrame.
7
+
8
+ Class: Data
9
+ Methods:
10
+
11
+ __init__ : class initializer
12
+ __len__: Length of the embedded DataFrame object.
13
+ __str__: Print the DataFrame of the class
14
+ get_data_frame: Get the Pandas DataFrame object
15
+ set_data_frame: Set the Pandas DataFrame object
16
+ append: Append additional data to the data frame.
17
+
18
+ load_json_data: Load JSON data into DataFrame
19
+ save_json_data: Save JSON data from DataFrame to file
20
+ load_excel_data: Load Excel file into DataFrame
21
+ load_csv_data: Load CSV data into DataFrame
22
+ load_directory: Load directory structure into Pandas Data Frame
23
+
24
+ partitionate: Partition a data frame into equally sized partitions
25
+ deduplicate: Remove duplicate rows that have all fields in unique_fields in common
26
+ sort: Sort the data frame based on one or multiple fields.
27
+ flatten: Flatten a sub-dictionary by copying selected fields to the
28
+ parent dictionary.
29
+ explode_and_flatten: Explode a substructure in the Data Frame
30
+ drop_columns: Drop selected columns from the Data Frame
31
+ keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
32
+ cleanse: Cleanse data with regular expressions and upper/lower case conversion.
33
+ filter: Filter the DataFrame based on conditions
34
+
35
+ fill_forward: Fill the missing cells appropriately by carrying forward
36
+ the values from the previous rows where necessary.
37
+ fill_na_in_column: Replace NA values in a column with a defined new default value
38
+ """
39
+
40
+ __author__ = "Dr. Marc Diefenbruch"
41
+ __copyright__ = "Copyright 2024, OpenText"
42
+ __credits__ = ["Kai-Philip Gatzweiler"]
43
+ __maintainer__ = "Dr. Marc Diefenbruch"
44
+ __email__ = "mdiefenb@opentext.com"
45
+
46
+ import logging
47
+ import json
48
+ import os
49
+ import re
50
+ import threading
51
+
52
+ import pandas as pd
53
+
54
+ logger = logging.getLogger("pyxecm.helper.data")
55
+
56
+
57
+ class Data:
58
+ """Used to automate data loading for the customizer."""
59
+
60
+ _df: pd.DataFrame
61
+ _lock = threading.Lock()
62
+
63
+ def __init__(self, init_data: pd.DataFrame | list | dict | None = None):
64
+ """Initialize the Data object.
65
+
66
+ Args:
67
+ init_data (pd.DataFrame | Data | list | dict, optional): Data to initialize the data frame. Can either be
68
+ another data frame / Data object (that gets copied), a list of dictionaries, or a single dictionary.
69
+ Defaults to None.
70
+ """
71
+
72
+ if init_data is not None:
73
+ # if a data frame is passed to the constructor we
74
+ # copy its content to the new Data object
75
+
76
+ if isinstance(init_data, pd.DataFrame):
77
+ self._df: pd.DataFrame = init_data.copy()
78
+ elif isinstance(init_data, Data):
79
+ if init_data.get_data_frame() is not None:
80
+ self._df: pd.DataFrame = init_data.get_data_frame().copy()
81
+ elif isinstance(init_data, list):
82
+ self._df: pd.DataFrame = pd.DataFrame(init_data)
83
+ elif isinstance(init_data, dict):
84
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
85
+ self._df: pd.DataFrame = pd.DataFrame([init_data])
86
+ else:
87
+ logger.error("Illegal initialization data for 'Data' class!")
88
+ self._df = None
89
+ else:
90
+ self._df = None
91
+
92
+ # end method definition
93
+
94
+ def __len__(self) -> int:
95
+ """Lenght of the embedded DataFrame object.
96
+ This is basically a convenience method.
97
+
98
+ Returns:
99
+ int: Length of the DataFrame
100
+ """
101
+
102
+ if self._df is not None:
103
+ return len(self._df)
104
+ return 0
105
+
106
+ # end method definition
107
+
108
+ def __str__(self) -> str:
109
+ """Print the DataFrame of the class.
110
+
111
+ Returns:
112
+ str: String representation.
113
+ """
114
+
115
+ # if data frame is initialized we return
116
+ # the string representation of pd.DataFrame
117
+ if self._df is not None:
118
+ return str(self._df)
119
+
120
+ return "<Data object without data frame>"
121
+
122
+ # end method definition
123
+
124
+ def __getitem__(self, column: str) -> pd.Series:
125
+ """Return the column corresponding to the key from the DataFrame
126
+
127
+ Args:
128
+ column (str): name of the Data Frame column
129
+
130
+ Returns:
131
+ pd.Series: column of the Data Frame with the given name
132
+ """
133
+
134
+ return self._df[column]
135
+
136
+ # end method definition
137
+
138
+ def lock(self):
139
+ """Return the threading lock object.
140
+
141
+ Returns:
142
+ _type_: threading lock object
143
+ """
144
+ return self._lock
145
+
146
+ # end method definition
147
+
148
+ def get_data_frame(self) -> pd.DataFrame:
149
+ """Get the Pandas DataFrame object
150
+
151
+ Returns:
152
+ pd.DataFrame: Pandas DataFrame object
153
+ """
154
+
155
+ return self._df
156
+
157
+ # end method definition
158
+
159
+ def set_data_frame(self, df: pd.DataFrame):
160
+ """Set the Pandas DataFrame object
161
+
162
+ Args:
163
+ df (pd.DataFrame): Pandas DataFrame object
164
+ """
165
+
166
+ self._df = df
167
+
168
+ # end method definition
169
+
170
+ def print_info(
171
+ self,
172
+ show_size: bool = True,
173
+ show_info: bool = False,
174
+ show_columns: bool = False,
175
+ show_first: bool = False,
176
+ show_last: bool = False,
177
+ show_sample: bool = False,
178
+ show_statistics: bool = False,
179
+ row_num: int = 10,
180
+ ):
181
+ """Log information about the data frame
182
+
183
+ Args:
184
+ show_size (bool, optional): Show size of data frame. Defaults to True.
185
+ show_info (bool, optional): Show information for data frame. Defaults to False.
186
+ show_columns (bool, optional): Show columns of data frame. Defaults to False.
187
+ show_first (bool, optional): Show first 10 items. Defaults to False.
188
+ show_last (bool, optional): Show last 10 items. Defaults to False.
189
+ show_sample (bool, optional): Show 10 sample items. Defaults to False.
190
+ show_statistics (bool, optional): Show data frame statistics. Defaults to False.
191
+ """
192
+
193
+ if self._df is None:
194
+ logger.warning("Data Frame is not initialized!")
195
+ return
196
+
197
+ if show_size:
198
+ logger.info(
199
+ "Data Frame has %s row(s) and %s column(s)",
200
+ self._df.shape[0],
201
+ self._df.shape[1],
202
+ )
203
+
204
+ if show_info:
205
+ # df.info() can not easily be embedded into a string
206
+ self._df.info()
207
+
208
+ if show_columns:
209
+ logger.info("Columns:\n%s", self._df.columns)
210
+ logger.info(
211
+ "Columns with number of null values:\n%s", self._df.isnull().sum()
212
+ )
213
+ logger.info(
214
+ "Columns with number of non-null values:\n%s", self._df.notnull().sum()
215
+ )
216
+ logger.info("Columns with number of NaN values:\n%s", self._df.isna().sum())
217
+ logger.info(
218
+ "Columns with number of non-NaN values:\n%s", self._df.notna().sum()
219
+ )
220
+
221
+ if show_first:
222
+ # the default for head is n = 5:
223
+ logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
224
+
225
+ if show_last:
226
+ # the default for tail is n = 5:
227
+ logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
228
+
229
+ if show_sample:
230
+ # the default for sample is n = 1:
231
+ logger.info("%s Sample rows:\n%s", str(row_num), self._df.sample(n=row_num))
232
+
233
+ if show_statistics:
234
+ logger.info(
235
+ "Description of statistics for data frame:\n%s", self._df.describe()
236
+ )
237
+ logger.info(
238
+ "Description of statistics for data frame (Transformed):\n%s",
239
+ self._df.describe().T,
240
+ )
241
+ logger.info(
242
+ "Description of statistics for data frame (objects):\n%s",
243
+ self._df.describe(include="object"),
244
+ )
245
+
246
+ # end method definition
247
+
248
+ def append(self, add_data: pd.DataFrame | list | dict) -> bool:
249
+ """Append additional data to the data frame.
250
+
251
+ Args:
252
+ add_data (pd.DataFrame | list | dict): Additional data. Can be pd.DataFrame or list of dicts (or Data)
253
+
254
+ Returns:
255
+ bool: True = Success, False = Error
256
+ """
257
+
258
+ # Does the data frame has already content?
259
+ # Then we need to concat / append. Otherwise
260
+ # we just initialize self._df
261
+ if self._df is not None:
262
+ if isinstance(add_data, pd.DataFrame):
263
+ self._df = pd.concat([self._df, add_data], ignore_index=True)
264
+ return True
265
+ elif isinstance(add_data, Data):
266
+ df = add_data.get_data_frame()
267
+ if df is not None:
268
+ self._df = pd.concat([self._df, df], ignore_index=True)
269
+ return True
270
+ elif isinstance(add_data, list):
271
+ if add_data:
272
+ df = Data(add_data)
273
+ self._df = pd.concat(
274
+ [self._df, df.get_data_frame()], ignore_index=True
275
+ )
276
+ return True
277
+ elif isinstance(add_data, dict):
278
+ if add_data:
279
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
280
+ df = Data([add_data])
281
+ self._df = pd.concat(
282
+ [self._df, df.get_data_frame()], ignore_index=True
283
+ )
284
+ return True
285
+ else:
286
+ logger.error("Illegal data type -> '%s'", type(add_data))
287
+ return False
288
+ else: # self._df is None (initial state)
289
+ if isinstance(add_data, pd.DataFrame):
290
+ self._df = add_data
291
+ return True
292
+ elif isinstance(add_data, Data):
293
+ self._df = add_data.get_data_frame()
294
+ return True
295
+ elif isinstance(add_data, list):
296
+ self._df = pd.DataFrame(add_data)
297
+ return True
298
+ elif isinstance(add_data, dict):
299
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
300
+ self._df = pd.DataFrame([add_data])
301
+ return True
302
+ else:
303
+ logger.error("Illegal data type -> '%s'", type(add_data))
304
+ return False
305
+
306
+ # end method definition
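# A minimal usage sketch for the constructor and append() (illustrative only;
# it assumes the module is importable as "from pyxecm.helper.data import Data"
# and the column names are made up):
#
# >>> data = Data([{"name": "foo", "size": 10}])   # list of dicts -> one row per dict
# >>> data.append({"name": "bar", "size": 20})     # a dict is wrapped into a single new row
# True
# >>> len(data)
# 2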
307
+
308
+ def load_json_data(self, json_path: str, convert_dates: bool = False) -> bool:
309
+ """Load JSON data into DataFrame
310
+
311
+ Args:
312
+ json_path (str): Path to the JSON file.
313
+ convert_dates (bool, optional): whether or not dates should be converted
314
+ Returns:
315
+ bool: False in case an error occurred, True otherwise.
316
+ """
317
+
318
+ if json_path is not None and os.path.exists(json_path):
319
+ # Load data from JSON file
320
+ try:
321
+ df = pd.read_json(path_or_buf=json_path, convert_dates=convert_dates)
322
+ if self._df is None:
323
+ self._df = df
324
+ else:
325
+ self._df = pd.concat([self._df, df])
326
+ logger.info(
327
+ "After loading -> '%s' the Data Frame has %s row(s) and %s column(s)",
328
+ json_path,
329
+ self._df.shape[0],
330
+ self._df.shape[1],
331
+ )
332
+ except FileNotFoundError:
333
+ logger.error(
334
+ "JSON file -> %s not found. Please check the file path.", json_path
335
+ )
336
+ return False
337
+ except PermissionError:
338
+ logger.error(
339
+ "Permission denied to access the JSON file -> %s.", json_path
340
+ )
341
+ return False
342
+ except IOError as e:
343
+ logger.error("An I/O error occurred -> %s", str(e))
344
+ return False
345
+ except json.JSONDecodeError as e:
346
+ logger.error("Error: Unable to decode JSON -> %s", str(e))
347
+ return False
348
+ except ValueError as e:
349
+ logger.error("Invalid JSON input -> %s", str(e))
350
+ return False
351
+ except AttributeError as e:
352
+ logger.error("Unexpected JSON data structure -> %s", str(e))
353
+ return False
354
+ except TypeError as e:
355
+ logger.error("Unexpected JSON data type -> %s", str(e))
356
+ return False
357
+ except KeyError as e:
358
+ logger.error("Missing key in JSON data -> %s", str(e))
359
+ return False
360
+
361
+ else:
362
+ logger.error(
363
+ "Missing JSON file - you have not specified a valid path -> %s.",
364
+ json_path,
365
+ )
366
+ return False
367
+ return True
368
+
369
+ # end method definition
370
+
371
+ def save_json_data(
372
+ self, json_path: str, orient: str = "records", preserve_index: bool = False
373
+ ) -> bool:
374
+ """Save JSON data from DataFrame to file
375
+
376
+ Args:
377
+ json_path (str): Path to the JSON file.
378
+ orient (str, optional): Structure of the JSON output, e.g. "records" (default), "columns", "index", "split" or "table"
379
+ preserve_index (bool, optional): whether to include the DataFrame index in the JSON output (only honored for orient values "columns", "index", "table" and "split")
380
+ Returns:
381
+ bool: False in case an error occurred, True otherwise.
382
+ """
383
+
384
+ if json_path is not None and os.path.exists(os.path.dirname(json_path)):
385
+ # Load data from JSON file
386
+ try:
387
+ if self._df is not None:
388
+ # index parameter is only allowed if orient has one of the following values:
389
+ if (
390
+ orient == "columns"
391
+ or orient == "index"
392
+ or orient == "table"
393
+ or orient == "split"
394
+ ):
395
+ self._df.to_json(
396
+ path_or_buf=json_path,
397
+ index=preserve_index,
398
+ orient=orient,
399
+ indent=2,
400
+ )
401
+ else:
402
+ self._df.to_json(path_or_buf=json_path, orient=orient, indent=2)
403
+ else:
404
+ logger.warning("Data Frame is empty. Cannot write it to JSON")
405
+ return False
406
+ except FileNotFoundError:
407
+ logger.error(
408
+ "File -> '%s' not found. Please check the file path.", json_path
409
+ )
410
+ return False
411
+ except PermissionError:
412
+ logger.error("Permission denied to access the file -> '%s'.", json_path)
413
+ return False
414
+ except IOError as e:
415
+ logger.error("An I/O error occurred -> %s", str(e))
416
+ return False
417
+ except ValueError as e:
418
+ logger.error("Value Error -> %s", str(e))
419
+ return False
420
+
421
+ else:
422
+ logger.error(
423
+ "Missing JSON file -> '%s' you have not specified a valid path!",
424
+ json_path,
425
+ )
426
+ return False
427
+ return True
428
+
429
+ # end method definition
430
+
431
+ def load_excel_data(
432
+ self,
433
+ xlsx_path: str,
434
+ sheet_names: str | list | None = 0,
435
+ usecols: str | list | None = None,
436
+ skip_rows: int | None = None,
437
+ header: int | None = 0,
438
+ names: list | None = None,
439
+ na_values: list | None = None,
440
+ ) -> bool:
441
+ """Load Excel (xlsx) data into DataFrame. Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
442
+ read from a local filesystem or URL. Supports an option to read a single sheet or a list of sheets.
443
+
444
+ Args:
445
+ xlsx_path (str): Path to the Excel file.
446
+ sheet_names (list | str | int, optional): Name or Index of the sheet in the Excel workbook to load.
447
+ If 'None' then all sheets will be loaded.
448
+ If 0 then first sheet in workbook will be loaded (this is the Default)
449
+ If string then this is interpreted as the name of the sheet to load.
450
+ If a list is passed, this can be a list of index values (int) or
451
+ a list of strings with the sheet names to load.
452
+ usecols (list | str, optional): List of columns to load, specified by general column names in Excel,
453
+ e.g. usecols='B:D', usecols=['A', 'C', 'F']
454
+ skip_rows (int, optional): List of rows to skip on top of the sheet (e.g. to not read headlines)
455
+ header (int | None, optional): Excel Row (0-indexed) to use for the column labels of the parsed DataFrame.
456
+ If file contains no header row, then you should explicitly pass header=None.
457
+ Default is 0.
458
+ names (list): List of column names to use. Default is None
459
+ na_values (list, optional): List of values in the Excel that should become the Pandas NA value.
460
+ Returns:
461
+ bool: False in case an error occurred, True otherwise.
462
+ """
463
+
464
+ if xlsx_path is not None and os.path.exists(xlsx_path):
465
+ # Load data from Excel file
466
+ try:
467
+ df = pd.read_excel(
468
+ io=xlsx_path,
469
+ sheet_name=sheet_names,
470
+ usecols=usecols,
471
+ skiprows=skip_rows,
472
+ header=header,
473
+ names=names,
474
+ na_values=na_values,
475
+ )
476
+ # if multiple sheets from an Excel workbook are loaded,
477
+ # then read_excel() returns a dictionary. The keys are
478
+ # the names of the sheets and the values are the Data Frames.
479
+ # we handle this case as follows:
480
+ if isinstance(df, dict):
481
+ logger.info("Loading multiple Excel sheets from the workbook!")
482
+ multi_sheet_df = pd.DataFrame()
483
+ for sheet in df.keys():
484
+ multi_sheet_df = pd.concat(
485
+ [multi_sheet_df, df[sheet]], ignore_index=True
486
+ )
487
+ df = multi_sheet_df
488
+ if self._df is None:
489
+ self._df = df
490
+ else:
491
+ self._df = pd.concat([self._df, df], ignore_index=True)
492
+ except FileNotFoundError:
493
+ logger.error(
494
+ "Excel file -> '%s' not found. Please check the file path.",
495
+ xlsx_path,
496
+ )
497
+ return False
498
+ except PermissionError:
499
+ logger.error(
500
+ "Permission denied to access the Excel file -> '%s'.", xlsx_path
501
+ )
502
+ return False
503
+ except IOError as e:
504
+ logger.error(
505
+ "An I/O error occurred -> %s while reading the Excel file -> %s",
506
+ str(e),
507
+ xlsx_path,
508
+ )
509
+ return False
510
+ except ValueError as e:
511
+ logger.error(
512
+ "Invalid Excel input -> %s in Excel file -> %s", str(e), xlsx_path
513
+ )
514
+ return False
515
+ except AttributeError as e:
516
+ logger.error("Unexpected data structure -> %s", str(e))
517
+ return False
518
+ except TypeError as e:
519
+ logger.error("Unexpected data type -> %s", str(e))
520
+ return False
521
+ except KeyError as e:
522
+ logger.error("Missing key in Excel data -> %s", str(e))
523
+ return False
524
+
525
+ else:
526
+ logger.error(
527
+ "Missing Excel file -> '%s' you have not specified a valid path!",
528
+ xlsx_path,
529
+ )
530
+ return False
531
+ return True
532
+
533
+ # end method definition
534
+
535
+ def save_excel_data(
536
+ self, excel_path: str, sheet_name: str = "Pandas Export", index: bool = False
537
+ ) -> bool:
538
+ """
539
+ Save the DataFrame to an Excel file, with robust error handling and logging.
540
+
541
+ Args:
542
+ excel_path (str): The file path to save the Excel file.
543
+ sheet_name (str): The sheet name where data will be saved. Default is 'Pandas Export'.
544
+ index: Whether to write the row names (index). Default is False.
545
+ """
546
+ try:
547
+ # Check if the directory exists
548
+ directory = os.path.dirname(excel_path)
549
+ if directory and not os.path.exists(directory):
550
+ raise FileNotFoundError(
551
+ "The directory -> '%s' does not exist." % directory
552
+ )
553
+
554
+ # Attempt to save the DataFrame to Excel
555
+ self._df.to_excel(excel_path, sheet_name=sheet_name, index=index)
556
+ logger.info("Data saved successfully to -> %s", excel_path)
557
+
558
+ except FileNotFoundError as e:
559
+ logger.error("Error: %s", e)
560
+ return False
561
+ except PermissionError:
562
+ logger.error(
563
+ "Error: Permission denied. You do not have permission to write to '%s'.",
564
+ excel_path,
565
+ )
566
+ return False
567
+ except ValueError as ve:
568
+ logger.error("Error: Invalid data for Excel format -> %s", ve)
569
+ return False
570
+ except OSError as oe:
571
+ logger.error("Error: OS error occurred while saving file -> %s", oe)
572
+ return False
573
+ except Exception as e:
574
+ # Catch-all for any other unexpected errors
575
+ logger.error("An unexpected error occurred -> %s", e)
576
+ return False
577
+
578
+ return True
579
+
580
+ # end method definition
581
+
582
+ def load_csv_data(
583
+ self, csv_path: str, delimiter: str = ",", encoding: str = "utf-8"
584
+ ) -> bool:
585
+ """Load CSV (Comma separated values) data into DataFrame
586
+
587
+ Args:
588
+ csv_path (str): Path to the CSV file.
589
+ delimiter (str, optional, length = 1): character to delimit values. Default = "," (comma)
590
+ encoding (str, optional): encoding of the file. Default = "utf-8".
591
+ Returns:
592
+ bool: False in case an error occurred, True otherwise.
593
+ """
594
+
595
+ if csv_path is not None and os.path.exists(csv_path):
596
+ # Load data from CSV file
597
+ try:
598
+ df = pd.read_csv(
599
+ filepath_or_buffer=csv_path, delimiter=delimiter, encoding=encoding
600
+ )
601
+ if self._df is None:
602
+ self._df = df
603
+ else:
604
+ self._df = pd.concat([self._df, df])
605
+ except FileNotFoundError:
606
+ logger.error(
607
+ "CSV file -> '%s' not found. Please check the file path.", csv_path
608
+ )
609
+ return False
610
+ except PermissionError:
611
+ logger.error(
612
+ "Permission denied to access the CSV file -> %s.", csv_path
613
+ )
614
+ return False
615
+ except IOError as e:
616
+ logger.error("An I/O error occurred -> %s", str(e))
617
+ return False
618
+ except ValueError as e:
619
+ logger.error("Invalid CSV input -> %s", str(e))
620
+ return False
621
+ except AttributeError as e:
622
+ logger.error("Unexpected data structure -> %s", str(e))
623
+ return False
624
+ except TypeError as e:
625
+ logger.error("Unexpected data type -> %s", str(e))
626
+ return False
627
+ except KeyError as e:
628
+ logger.error("Missing key in CSV data -> %s", str(e))
629
+ return False
630
+
631
+ else:
632
+ logger.error(
633
+ "Missing CSV file -> '%s' you have not specified a valid path!",
634
+ csv_path,
635
+ )
636
+ return False
637
+ return True
638
+
639
+ # end method definition
640
+
641
+ def load_xml_data(
642
+ self, xml_path: str, xpath: str | None = None, xslt_path: str | None = None
643
+ ) -> bool:
644
+ """Load XML data into DataFrame
645
+
646
+ Args:
647
+ xml_path (str): Path to the XML file.
648
+ xpath (str, optional): XPath to the elements we want to select
649
+ xslt_path (str, optional): XSLT transformation file
650
+ Returns:
651
+ bool: False in case an error occurred, True otherwise.
652
+ """
653
+
654
+ try:
655
+ df = pd.read_xml(path_or_buffer=xml_path, xpath=xpath, stylesheet=xslt_path)
656
+ # Process the loaded data as needed
657
+ if self._df is None:
658
+ self._df = df
659
+ else:
660
+ self._df = pd.concat([self._df, df])
661
+ logger.info("XML file loaded successfully!")
662
+ return True
663
+ except FileNotFoundError:
664
+ print("File not found.")
665
+ return False
666
+ except PermissionError:
667
+ logger.error("Permission denied to access the file -> %s.", xml_path)
668
+ return False
669
+ except IOError as e:
670
+ logger.error("An I/O error occurred -> %s", str(e))
671
+ return False
672
+ except ValueError as e:
673
+ logger.error("Invalid CSV input -> %s", str(e))
674
+ return False
675
+ except AttributeError as e:
676
+ logger.error("Unexpected data structure -> %s", str(e))
677
+ return False
678
+ except TypeError as e:
679
+ logger.error("Unexpected data type -> %s", str(e))
680
+ return False
681
+ except KeyError as e:
682
+ logger.error("Missing key in CSV data -> %s", str(e))
683
+ return False
684
+
685
+ # end method definition
686
+
687
+ def load_directory(self, path_to_root: str) -> bool:
688
+ """Load directory structure into Pandas Data Frame
689
+
690
+ Args:
691
+ path_to_root (str): Path to the root element of the
692
+ directory structure
693
+
694
+ Returns:
695
+ bool: True = Success, False = Failure
696
+ """
697
+
698
+ try:
699
+ # Check if the provided path is a directory
700
+ if not os.path.isdir(path_to_root):
701
+ logger.error(
702
+ "The provided path -> '%s' is not a valid directory.", path_to_root
703
+ )
704
+ return False
705
+
706
+ # Initialize a list to hold file information
707
+ data = []
708
+
709
+ # Walk through the directory
710
+ for root, _, files in os.walk(path_to_root):
711
+ for file in files:
712
+ file_path = os.path.join(root, file)
713
+ file_size = os.path.getsize(file_path)
714
+ relative_path = os.path.relpath(file_path, path_to_root)
715
+ path_parts = relative_path.split(os.sep)
716
+
717
+ # Create a dictionary with the path parts and file details
718
+ entry = {
719
+ "level {}".format(i): part
720
+ for i, part in enumerate(path_parts[:-1], start=1)
721
+ }
722
+ entry.update({"filename": path_parts[-1], "size": file_size})
723
+ data.append(entry)
724
+
725
+ # Create DataFrame from list of dictionaries
726
+ self._df = pd.DataFrame(data)
727
+
728
+ # Determine the maximum number of levels
729
+ max_levels = max((len(entry) - 2 for entry in data), default=0)
730
+
731
+ # Ensure all entries have the same number of levels
732
+ for entry in data:
733
+ for i in range(1, max_levels + 1):
734
+ entry.setdefault("level {}".format(i), "")
735
+
736
+ # Convert to DataFrame again to make sure all columns are consistent
737
+ self._df = pd.DataFrame(data)
738
+
739
+ except NotADirectoryError as nde:
740
+ print(f"Error: {nde}")
741
+ except FileNotFoundError as fnfe:
742
+ print(f"Error: {fnfe}")
743
+ except PermissionError as pe:
744
+ print(f"Error: {pe}")
745
+
746
+ return True
747
+
748
+ # end method definition
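# Example (illustrative sketch): for a directory tree such as
# root/reports/2024/q1.pdf, load_directory("root") produces one row per file
# with columns "level 1" = "reports", "level 2" = "2024",
# "filename" = "q1.pdf" and "size" = the file size in bytes.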
749
+
750
+ def load_xml_directory(self, path_to_root: str, xpath: str | None = None) -> bool:
751
+ """Load directory structure into Pandas Data Frame
752
+
753
+ Args:
754
+ path_to_root (str): Path to the root element of the
755
+ directory structure
756
+ xpath (str, optional): XPath to the elements we want to select
757
+
758
+ Returns:
759
+ bool: True = Success, False = Failure
760
+ """
761
+
762
+ try:
763
+ # Check if the provided path is a directory
764
+ if not os.path.isdir(path_to_root):
765
+ logger.error(
766
+ "The provided path -> '%s' is not a valid directory.", path_to_root
767
+ )
768
+ return False
769
+
770
+ # Walk through the directory
771
+ for root, _, files in os.walk(path_to_root):
772
+ for file in files:
773
+ file_path = os.path.join(root, file)
774
+ file_size = os.path.getsize(file_path)
775
+ file_name = os.path.basename(file_path)
776
+
777
+ if file_name == "docovw.xml":
778
+ logger.info(
779
+ "Load XML file -> '%s' of size -> %s", file_path, file_size
780
+ )
781
+ success = self.load_xml_data(file_path, xpath=xpath)
782
+ if success:
783
+ logger.info(
784
+ "Successfully loaded XML file -> '%s'", file_path
785
+ )
786
+
787
+ except NotADirectoryError as nde:
788
+ logger.error("Error -> %s", str(nde))
789
+ except FileNotFoundError as fnfe:
790
+ logger.error("Error -> %s", str(fnfe))
791
+ except PermissionError as pe:
792
+ logger.error("Error -> %s", str(pe))
793
+
794
+ return True
795
+
796
+ # end method definition
797
+
798
+ def partitionate(self, number: int) -> list:
799
+ """Partition a data frame into equally sized
800
+ partitions
801
+
802
+ Args:
803
+ number (int): Number of partitions
804
+
805
+ Returns:
806
+ list: List of partitions
807
+ """
808
+
809
+ # Calculate the approximate size of each partition
810
+ size = len(self._df)
811
+
812
+ if size >= number:
813
+ partition_size = size // number
814
+ remainder = size % number
815
+ else:
816
+ partition_size = size
817
+ number = 1
818
+ remainder = 0
819
+
820
+ logger.info(
821
+ "Data set has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
822
+ str(size),
823
+ str(number),
824
+ str(partition_size),
825
+ str(remainder),
826
+ )
827
+
828
+ # Initialize a list to store partitions
829
+ partitions = []
830
+ start_index = 0
831
+
832
+ # Slice the DataFrame into equally sized partitions
833
+ for i in range(number):
834
+ # start_index = i * partition_size
835
+ # end_index = (i + 1) * partition_size if i < number - 1 else None
836
+ # partition = self._df.iloc[start_index:end_index]
837
+ # partitions.append(partition)
838
+ # Calculate the end index for this partition
839
+ end_index = start_index + partition_size + (1 if i < remainder else 0)
840
+ partition = self._df.iloc[start_index:end_index]
841
+ partitions.append(partition)
842
+ start_index = end_index
843
+
844
+ return partitions
845
+
846
+ # end method definition
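# Example (illustrative sketch, assuming a Data object with 10 rows): splitting
# into 3 partitions gives sizes 4, 3 and 3 - the remainder of the integer
# division is distributed to the first partitions one extra row at a time.
#
# >>> [len(p) for p in data.partitionate(3)]
# [4, 3, 3]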
847
+
848
+ def partitionate_by_column(self, column_name: str) -> list | None:
849
+ """Partition a data frame based on equal values in a specified column.
850
+
851
+ Args:
852
+ column_name (str): The column name to partition by
853
+
854
+ Returns:
855
+ list | None: List of partitions or None in case of an error (e.g. column name does not exist).
856
+ """
857
+
858
+ if column_name not in self._df.columns:
859
+ logger.error(
860
+ "Column -> '%s' does not exist in the Data Frame. Data Frame has these columns -> %s",
861
+ column_name,
862
+ str(self._df.columns),
863
+ )
864
+ return None
865
+
866
+ # Separate rows with NaN or None values in the specified column
867
+ nan_partitions = self._df[self._df[column_name].isna()]
868
+ non_nan_df = self._df.dropna(subset=[column_name])
869
+
870
+ # Group by the specified column and create a list of DataFrames for each group
871
+ grouped = non_nan_df.groupby(column_name)
872
+ partitions = [group for _, group in grouped]
873
+
874
+ # Add each row with NaN or None values as its own partition
875
+ for i in range(len(nan_partitions)):
876
+ partitions.append(nan_partitions.iloc[[i]])
877
+
878
+ logger.info(
879
+ "Data Frame has been partitioned into -> %s partitions based on the values in column '%s'...",
880
+ str(len(partitions)),
881
+ column_name,
882
+ )
883
+
884
+ return partitions
885
+
886
+ # end method definition
887
+
888
+ def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
889
+ """Remove dupclicate rows that have all fields in
890
+ unique_fields in common.
891
+
892
+ Args:
893
+ unique_fields (list): Defines the fields for which we want a unique
894
+ combination.
895
+ inplace (bool, optional): True if the deduplication happens in-place.
896
+ Defaults to True.
897
+ Returns:
898
+ pd.DataFrame | None: If inplace is False then a new deduplicated DataFrame
899
+ is returned. Otherwise the object is modified in place
900
+ and self._df is returned.
901
+ """
902
+
903
+ if inplace:
904
+ self._df.drop_duplicates(subset=unique_fields, inplace=True)
905
+ self._df.reset_index(drop=True, inplace=True)
906
+ return self._df
907
+ else:
908
+ df = self._df.drop_duplicates(subset=unique_fields, inplace=False)
909
+ df = df.reset_index(drop=True, inplace=False)
910
+ return df
911
+
912
+ # end method definition
913
+
914
+ def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
915
+ """Sort the data frame based on one or multiple fields -
916
+ either in place or return it as a new data frame (e.g. not modifying self._df)
917
+
918
+ Args:
919
+ sort_fields (list): Columns / fields to be used for sorting
920
+ inplace (bool, optional): If the sorting should be inplace, i.e. modifying self._df.
921
+ Defaults to True.
922
+ Returns:
923
+ pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
924
+ """
925
+
926
+ if self._df is None:
927
+ return None
928
+
929
+ if not all(sort_field in self._df.columns for sort_field in sort_fields):
930
+ logger.warning(
931
+ "Not all of the given sort fields -> %s do exist in the Data Frame.",
932
+ str(sort_fields),
933
+ )
934
+ # Reduce the sort fields to those that really exist in the DataFrame:
935
+ sort_fields = [
936
+ sort_field
937
+ for sort_field in sort_fields
938
+ if sort_field in self._df.columns
939
+ ]
940
+ logger.warning(
941
+ "Only these given sort fields -> %s do exist as columns in the Data Frame.",
942
+ str(sort_fields),
943
+ )
944
+
945
+ if inplace:
946
+ self._df.sort_values(by=sort_fields, inplace=True)
947
+ self._df.reset_index(drop=True, inplace=True)
948
+ return self._df
949
+ else:
950
+ df = self._df.sort_values(by=sort_fields, inplace=False)
951
+ df = df.reset_index(drop=True, inplace=False)
952
+ return df
953
+
954
+ # end method definition
955
+
956
+ def flatten(
957
+ self,
958
+ parent_field: str,
959
+ flatten_fields: list,
960
+ ):
961
+ """Flatten a sub-dictionary by copying selected fields to the
962
+ parent dictionary. This is e.g. useful for de-duplicating
964
+ a data set.
964
+
965
+ Args:
966
+ parent_field (str): name of the field in the parent dictionary
967
+ flatten_fields (list): fields in the sub-dictionary to copy
968
+ into the parent dictionary.
969
+ """
970
+
971
+ for flatten_field in flatten_fields:
972
+ flat_field = parent_field + "_" + flatten_field
973
+ # The following expression generates a new column in the
974
+ # data frame with the name of 'flat_field'.
975
+ # In the lambda function x is a dictionary that includes the subvalues
976
+ # and it returns the value of the given flatten field
977
+ # (if it exists, otherwise None). So x is self._df[parent_field], i.e.
978
+ # what the lambda function gets 'applied' on.
979
+ self._df[flat_field] = self._df[parent_field].apply(
980
+ lambda x, sub_field=flatten_field: (
981
+ x.get(sub_field, None) if isinstance(x, dict) else None
982
+ )
983
+ )
984
+
985
+ # end method definition
986
+
987
+ def explode_and_flatten(
988
+ self,
989
+ explode_field: str | list,
990
+ flatten_fields: list | None = None,
991
+ make_unique: bool = False,
992
+ reset_index: bool = False,
993
+ split_string_to_list: bool = False,
994
+ separator: str = ";,",
995
+ ) -> pd.DataFrame:
996
+ """Explode a substructure in the Data Frame
997
+
998
+ Args:
999
+ explode_field (str | list): Field(s) to explode which each has/have a list structure.
1000
+ Exploding multiple columns at once is possible. This delivers
1001
+ a very different result compared to exploding one column after
1002
+ the other!
1003
+ flatten_fields (list): Fields in the exploded substructure to include
1004
+ in the main dictionaries for easier processing.
1005
+ make_unique (bool, optional): if True deduplicate the exploded data frame.
1006
+ reset_index (bool, optional): True = index is reset, False = index is not reset
1007
+ split_string_to_list (bool, optional): if True, split delimiter-separated string values in the column into lists before exploding.
1008
+ separator (str, optional): characters used to split the string values in the given column into a list
1009
+ Returns:
1010
+ pd.DataFrame: Pointer to the Pandas DataFrame
1011
+ """
1012
+
1013
+ def update_column(row):
1014
+ try:
1015
+ if sub in row:
1016
+ return row[sub]
1017
+ except (IndexError, KeyError, ValueError):
1018
+ return ""
1019
+
1020
+ # Define a function to split a string into a list
1021
+ def string_to_list(string: str | None) -> list:
1022
+ # Do nothing if the string is already a list
1023
+ if isinstance(string, list):
1024
+ return_list = string
1025
+ elif not string or pd.isna(string):
1026
+ return_list = []
1027
+ else:
1028
+ # Use regular expression to split by comma, semicolon, or comma followed by space
1029
+ return_list = re.split(rf"[{separator}]\s*", str(string))
1030
+
1031
+ return return_list
1032
+
1033
+ if isinstance(explode_field, list):
1034
+ logger.info("Explode multiple columns -> %s", str(explode_field))
1035
+ elif isinstance(explode_field, str):
1036
+ logger.info("Explode single column -> '%s'", explode_field)
1037
+ else:
1038
+ logger.error(
1039
+ "Illegal explode field(s) data type provided -> %s", type(explode_field)
1040
+ )
1041
+ return self._df
1042
+
1043
+ try:
1044
+ # remove the sub dictionary that sometimes is introduced by
1045
+ # XML loading. We just want the main part.
1046
+ if "." in explode_field:
1047
+ main = explode_field.split(".")[0]
1048
+ sub = explode_field.split(".")[1]
1049
+ self._df[main] = self._df[main].apply(update_column)
1050
+ explode_field = main
1051
+
1052
+ # Now that we have the right explode column
1053
+ # we need to convert it to a list if it is inside a string (with delimiters)
1054
+ if split_string_to_list:
1055
+ logger.info(
1056
+ "Split the string values of column -> '%s' into a list using separator -> '%s'",
1057
+ explode_field,
1058
+ separator,
1059
+ )
1060
+ # Apply the function to convert the string values in the column (given by the name in explode_field) to lists
1061
+ # The string_to_list() sub-method above also considers the separator parameter.
1062
+ self._df[explode_field] = self._df[explode_field].apply(string_to_list)
1063
+
1064
+ # Explode the field that has list values
1065
+ self._df = self._df.explode(column=explode_field)
1066
+ except KeyError:
1067
+ logger.error("Column -> '%s' not found in Data Frame!", str(explode_field))
1068
+ except ValueError:
1069
+ logger.error(
1070
+ "Unable to explode the specified column -> '%s'!", str(explode_field)
1071
+ )
1072
+
1073
+ if flatten_fields:
1074
+ self.flatten(parent_field=explode_field, flatten_fields=flatten_fields)
1075
+
1076
+ if make_unique:
1077
+ self._df.drop_duplicates(subset=flatten_fields, inplace=True)
1078
+
1079
+ if reset_index:
1080
+ self._df.reset_index(inplace=True)
1081
+
1082
+ return self._df
1083
+
1084
+ # end method definition
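# Example (illustrative sketch, "colors" is a made-up column name): a column
# holding delimiter-separated strings such as "red;green;blue" can be exploded
# into one row per value:
#
# >>> data.explode_and_flatten(
# ...     explode_field="colors",
# ...     split_string_to_list=True,
# ...     separator=";,",
# ... )
#
# Each original row is repeated once per list element, so "colors" then holds
# a single value ("red", "green" or "blue") per row.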
1085
+
1086
+ def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1087
+ """Drop selected columns from the Data Frame
1088
+
1089
+ Args:
1090
+ column_names (list): list of column names to drop.
1091
+ inplace (bool, optional): If the dropping should be inplace, i.e. modifying self._df.
1092
+ Defaults to True.
1093
+ Returns:
1094
+ pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1095
+ """
1096
+
1097
+ if not all(column_name in self._df.columns for column_name in column_names):
1098
+ # Reduce the column names to those that really exist in the DataFrame:
1099
+ column_names = [
1100
+ column_name
1101
+ for column_name in column_names
1102
+ if column_name in self._df.columns
1103
+ ]
1104
+ logger.warning(
1105
+ "Reduce to these columns -> %s that do exist in the Data Frame.",
1106
+ str(column_names),
1107
+ )
1108
+
1109
+ if inplace:
1110
+ self._df.drop(column_names, axis=1, inplace=True)
1111
+ return self._df
1112
+ else:
1113
+ df = self._df.drop(column_names, axis=1, inplace=False)
1114
+ return df
1115
+
1116
+ # end method definition
1117
+
1118
+ def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1119
+ """Keep only selected columns from the Data Frame. Drop the rest.
1120
+
1121
+ Args:
1122
+ column_names (list): list of column names to keep.
1123
+ inplace (bool, optional): If the keeping should be inplace, i.e. modifying self._df.
1124
+ Defaults to True.
1125
+ Returns:
1126
+ pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1127
+ """
1128
+
1129
+ if not all(column_name in self._df.columns for column_name in column_names):
1130
+ # Reduce the column names to those that really exist in the DataFrame:
1131
+ column_names = [
1132
+ column_name
1133
+ for column_name in column_names
1134
+ if column_name in self._df.columns
1135
+ ]
1136
+ logger.warning(
1137
+ "Reduce to these columns -> %s that do exist in the Data Frame.",
1138
+ column_names,
1139
+ )
1140
+
1141
+ if inplace:
1142
+ # keep only those columns which are in column_names:
1143
+ if column_names != []:
1144
+ self._df = self._df[column_names]
1145
+ return self._df
1146
+ else:
1147
+ # keep only those columns which are in column_names:
1148
+ if column_names != []:
1149
+ df = self._df[column_names]
1150
+ return df
1151
+ return None
1152
+
1153
+ # end method definition
1154
+
1155
+ def cleanse(self, cleansings: dict):
1156
+ """Cleanse data with regular expressions and upper/lower case conversion.
1157
+
1158
+ Args:
1159
+ cleansings (dict): Dictionary with keys that equal the column names.
1160
+ The dictionary values are dictionaries itself with
1161
+ these fields:
1162
+ * replacements (dict): name of a column in the data frame
1163
+ * upper (bool): change the value to uppercase
1164
+ * lower (bool): change the value to lowercase
1165
+ Example:
1166
+ cleansings = {
1167
+ "airportName": {
1168
+ "upper": true
1169
+ "replacements" : {
1170
+ "-": " ", # replace hypen with space
1171
+ ",\s*": " ", # remove commas followed by on or more spaces with a single space
1172
+ "\s+$": "", # remove trailing spaces at the end of the name
1173
+ "^\s+": "", # remove spaces at the beginning of the name
1174
+ }
1175
+ "length": 10
1176
+ }
1177
+ "airportId": {
1178
+ "upper": true
1179
+ "replacements" : {
1180
+ "K(.{3})": "\1", # if the airport has 4 charters and starts with a 'K' we remove the 'K'
1181
+ "\/": "", # remove forward slashes - this helps to have consistency with N/A, NA, n/a, na
1182
+ }
1183
+ }
1184
+ }
1185
+ """
1186
+
1187
+ # Iterate over each column in regex_dict
1188
+ for column, cleansing in cleansings.items():
1189
+ # "colum" is the name of the field we want to cleanse.
1190
+ # "cleansing" is a dict with
1191
+ if "." in column:
1192
+ # Handle columns with subfields
1193
+ main_field, sub_field = column.split(".")
1194
+ if not main_field in self._df.columns:
1195
+ continue
1196
+ # we use the additional parameters for lambda (beside x)
1197
+ # to avoid linter warning W0640
1198
+ self._df[main_field] = self._df[main_field].apply(
1199
+ lambda x, sub_field=sub_field, cleansing=cleansing: self._cleanse_subfield(
1200
+ data=x,
1201
+ sub_field=sub_field,
1202
+ replacements=cleansing.get("replacements", {}),
1203
+ upper=cleansing.get("upper", False),
1204
+ lower=cleansing.get("lower", False),
1205
+ length=cleansing.get("length", 0),
1206
+ )
1207
+ )
1208
+ else:
1209
+ if not column in self._df.columns:
1210
+ continue
1211
+
1212
+ logger.debug("\nBEFORE:\n%s\n", self._df[column])
1213
+
1214
+ if cleansing.get("upper", False) and self._df[column].dtype == "object":
1215
+ self._df[column] = self._df[column].str.upper()
1216
+ if cleansing.get("lower", False) and self._df[column].dtype == "object":
1217
+ self._df[column] = self._df[column].str.lower()
1218
+
1219
+ # Handle regular columns. regexp_pattern is on the left side
1220
+ # of the colon, and replacement the string on the right side of
1221
+ # the colon:
1222
+ for regex_pattern, replacement in cleansing.get(
1223
+ "replacements", {}
1224
+ ).items():
1225
+ if not regex_pattern:
1226
+ logger.error("Empty search / regexp pattern!")
1227
+ continue
1228
+ # \b is a word boundary anchor in regular expressions.
1229
+ # It matches a position where one side is a word character
1230
+ # (like a letter or digit) and the other side is a non-word character
1231
+ # (like whitespace or punctuation). It's used to match whole words.
1232
+ # We want to have this to e.g. not replace "INT" with "INTERNATIONAL"
1233
+ # if the word is already "INTERNATIONAL". It is important
1234
+ # that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
1235
+ # a regular expression but just a normal string.
1236
+ # Check if the pattern does NOT contain any regex special characters
1237
+ # (excluding dot and ampersand) and ONLY then use \b ... \b
1238
+ # Special regexp characters include: ^ $ * + ? ( ) [ ] { } | \
1239
+ if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
1240
+ # Wrap with word boundaries for whole-word matching
1241
+ regex_pattern = rf"\b{regex_pattern}\b"
1242
+ self._df[column] = self._df[column].str.replace(
1243
+ pat=regex_pattern, repl=replacement, regex=True
1244
+ )
1245
+
1246
+ if (
1247
+ cleansing.get("length", 0) > 0
1248
+ and self._df[column].dtype == "object"
1249
+ ):
1250
+ self._df[column] = self._df[column].str.slice(
1251
+ 0, cleansing["length"]
1252
+ )
1253
+
1254
+ logger.debug("\nAFTER:\n%s\n", self._df[column])
1255
+
1256
+ # end method definition
1257
+
1258
+ def _cleanse_subfield(
1259
+ self,
1260
+ data: list | dict,
1261
+ sub_field: str,
1262
+ replacements: dict,
1263
+ upper: bool,
1264
+ lower: bool,
1265
+ length: int = 0,
1266
+ ) -> list | dict:
1267
+ """Helper function to cleanse subfield data
1268
+
1269
+ Args:
1270
+ data (list | dict): sub data - either a list of dictionaries or a dictionary
1271
+ sub_field (str): defines which field in the sub data should be updated
1272
+ replacements (dict): Dictionary of regular expression patterns (keys) and their replacement strings (values)
1273
+ upper (bool): if True transform value in subfield to upper-case
1274
+ lower (bool): if True, transform value in subfield to lower-case
1275
+ length (int, optional): maximum length of the strings
1276
+ Returns:
1277
+ list | dict: Updated data
1278
+ """
1279
+
1280
+ if isinstance(data, list):
1281
+ # If data is a list, apply cleansing to each dictionary in the list
1282
+ for i, item in enumerate(data):
1283
+ if (
1284
+ item is not None
1285
+ and sub_field in item
1286
+ and not pd.isnull(item[sub_field])
1287
+ ):
1288
+ if upper:
1289
+ item[sub_field] = item[sub_field].upper()
1290
+ elif lower:
1291
+ item[sub_field] = item[sub_field].lower()
1292
+ for regex_pattern, replacement in replacements.items():
1293
+ if replacement:
1294
+ regex_pattern = rf"\b{regex_pattern}\b"
1295
+ item[sub_field] = re.sub(
1296
+ regex_pattern, replacement, item[sub_field]
1297
+ )
1298
+ if length > 0:
1299
+ item[sub_field] = item[sub_field][:length]
1300
+ data[i] = item
1301
+ elif isinstance(data, dict):
1302
+ # If data is a dictionary, apply cleansing directly to the subfield
1303
+ if sub_field in data and not pd.isnull(data[sub_field]):
1304
+ if upper:
1305
+ data[sub_field] = data[sub_field].upper()
1306
+ elif lower:
1307
+ data[sub_field] = data[sub_field].lower()
1308
+ for regex_pattern, replacement in replacements.items():
1309
+ if replacement:
1310
+ regex_pattern = rf"\b{regex_pattern}\b"
1311
+ data[sub_field] = re.sub(
1312
+ regex_pattern, replacement, data[sub_field]
1313
+ )
1314
+ if length > 0:
1315
+ data[sub_field] = data[sub_field][:length]
1316
+ return data
1317
+
1318
+ # end method definition
1319
+
1320
+ def filter(self, conditions: list, inplace: bool = True) -> pd.DataFrame:
1321
+ """Filter the DataFrame based on (multiple) conditions.
1322
+
1323
+ Args:
1324
+ conditions (list): Conditions are a list of dictionaries with 3 items:
1325
+ * field (str): name of a column in the data frame
1326
+ value (str or list): expected value (filter criterion).
1327
+ If it is a list then one of
1328
+ the list elements must match the field value (OR)
1329
+ * regex (bool): this flag controls if the value is interpreted as a
1330
+ regular expression. If there is no regex item in the
1331
+ dictionary then the default is False (= values is NOT regex).
1332
+ If there are multiple conditions in the list each has to evaluate to True (AND)
1333
+ inplace (bool, optional): Defines if the self._df is modified (inplace) or just
1334
+ a new DataFrame is returned. Defaults to True.
1335
+ Returns:
1336
+ pd.DataFrame: new data frame or pointer to self._df (depending on the value of 'inplace')
1337
+ """
1338
+
1339
+ if self._df is None:
1340
+ logger.error("DataFrame is not initialized.")
1341
+ return None
1342
+
1343
+ if self._df.empty:
1344
+ logger.error("DataFrame is empty.")
1345
+ return None
1346
+
1347
+ # first filtered_df is the full DataFrame.
1349
+ # then it is subsequently reduced by each condition
1349
+ # at the end it is just those rows that match all conditions.
1350
+ filtered_df = self._df
1351
+
1352
+ # We traverse a list of conditions. Each condition must evaluate to true
1353
+ # otherwise the current workspace or document (i.e. the data set for these objects)
1354
+ # will be skipped. The variable filtered_df is reduced step by step with each condition.
1355
+ for condition in conditions:
1356
+ field = condition.get("field", None)
1357
+ if not field:
1358
+ logger.error("Missing value for filter condition 'field' in payload!")
1359
+ continue
1360
+ if field not in self._df.columns:
1361
+ logger.warning(
1362
+ "Filter condition field -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
1363
+ field,
1364
+ str(self._df.columns),
1365
+ )
1366
+ continue # Skip filtering for columns not present in DataFrame
1367
+ value = condition.get("value", None)
1368
+ if not value:
1369
+ logger.error(
1370
+ "Missing filter value of for filter condition field -> '%s'!", field
1371
+ )
1372
+ continue
1373
+ regex = condition.get("regex", False)
1374
+
1375
+ logger.info(
1376
+ "Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
1377
+ filtered_df.shape[0],
1378
+ filtered_df.shape[1],
1379
+ str(condition),
1380
+ )
1381
+
1382
+ filtered_dfs = []
1383
+
1384
+ # if a single string is passed as value we put
1385
+ # it into an 1-item list to simplify the following code:
1386
+ if not isinstance(value, list):
1387
+ value = [value]
1388
+
1389
+ # multiple values are treated like a logical "or" condition
1390
+ for value_item in value:
1391
+ if regex:
1392
+ filtered_dfs.append(
1393
+ filtered_df[
1394
+ ~filtered_df[field].isna()
1395
+ & filtered_df[field].str.contains(value_item, regex=True)
1396
+ ]
1397
+ )
1398
+ else:
1399
+ result_df = filtered_df[
1400
+ ~filtered_df[field].isna() & filtered_df[field].eq(value_item)
1401
+ ]
1402
+ if not result_df.empty:
1403
+ filtered_dfs.append(result_df)
1404
+ # end for values
1405
+
1406
+ if not filtered_dfs:
1407
+ logger.warning(
1408
+ "Filter with field -> '%s' and value -> '%s' delivered an empty Data Frame",
1409
+ field,
1410
+ str(value),
1411
+ )
1412
+ filtered_df.drop(filtered_df.index, inplace=True)
1413
+ else:
1414
+ # Concatenate the filtered DataFrames for each value in the list
1415
+ filtered_df = pd.concat(filtered_dfs, ignore_index=True)
1416
+
1417
+ logger.info(
1418
+ "Data Frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
1419
+ filtered_df.shape[0],
1420
+ filtered_df.shape[1],
1421
+ str(condition),
1422
+ )
1423
+ # end for condition
1424
+
1425
+ if inplace:
1426
+ self._df = filtered_df
1427
+
1428
+ return filtered_df
1429
+
1430
+ # end method definition
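# Example (illustrative sketch, column names are made up): keep only rows
# where "type" is "document" or "workspace" AND "name" matches a regular
# expression - values given as a list are ORed, separate conditions are ANDed:
#
# >>> data.filter(
# ...     conditions=[
# ...         {"field": "type", "value": ["document", "workspace"]},
# ...         {"field": "name", "value": r"^Project .*", "regex": True},
# ...     ]
# ... )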
1431
+
1432
+ def fill_na_in_column(self, column_name: str, default_value: str | int):
1433
+ """Replace NA values in a column with a defined new default value
1434
+
1435
+ Args:
1436
+ column_name (str): name of the column in the DataFrame
1437
+ default_value (str | int): value to replace NA with
1438
+ """
1439
+
1440
+ if column_name in self._df.columns:
1441
+ self._df[column_name] = self._df[column_name].fillna(value=default_value)
1442
+ else:
1443
+ logger.error(
1444
+ "Cannot replace NA values as column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
1445
+ column_name,
1446
+ str(self._df.columns),
1447
+ )
1448
+
1449
+ # end method definition
1450
+
1451
+ def fill_forward(self, inplace: bool) -> pd.DataFrame:
1452
+ """Fill the missing cells appropriately by carrying forward
1453
+ the values from the previous rows where necessary.
1454
+ This has applications if a hierarchy is represented by
1455
+ nested cells e.g. in an Excel sheet.
1456
+
1457
+ Args:
1458
+ inplace (bool): Should the modification happen inplace or not.
1459
+
1460
+ Returns:
1461
+ pd.DataFrame: Resulting dataframe
1462
+ """
1463
+
1464
+ # To convert an Excel representation of a folder structure with nested
1465
+ # columns into a format appropriate for Pandas,
1466
+ # where all cells should be filled
1467
+ df_filled = self._df.ffill(inplace=inplace)
+ # Note: ffill(inplace=True) returns None, so fall back to the modified frame itself
+ if df_filled is None:
+ df_filled = self._df
1468
+
1469
+ return df_filled
1470
+
1471
+ # end method definition
1472
+
1473
+ def lookup_value(
1474
+ self, lookup_column: str, lookup_value: str, separator: str = "|"
1475
+ ) -> pd.Series | None:
1476
+ """Lookup a row that includes a lookup value in the value of a given column.
1477
+
1478
+ Args:
1479
+ lookup_column (str): name of the column to search in
1480
+ lookup_value (str): value to search for
1481
+ separator (str): string list delimiter / separator
1482
+
1483
+ Returns:
1484
+ pd.Series | None: data frame row that matches or None if no match was found.
1485
+ """
1486
+
1487
+ # Use the `apply` function to filter rows where the lookup value matches a whole item in the comma-separated list
1488
+ def match_lookup_value(string_list: str) -> bool:
1489
+ """Spilt delimiter-separated list into a python list
1490
+
1491
+ Args:
1492
+ string_list (str): delimiter-separated string list like "a, b, c" or "a | b | c"
1493
+
1494
+ Returns:
1495
+ bool: True if lookup_value is equal to one of the delimiter-separated terms
1496
+ """
1497
+ # Ensure that the string is a string
1498
+ string_list = str(string_list)
1499
+
1500
+ return lookup_value in [
1501
+ item.strip() for item in string_list.split(separator)
1502
+ ]
1503
+
1504
+ df = self._df
1505
+
1506
+ if self._df is None:
1507
+ return None
1508
+
1509
+ if lookup_column not in self._df.columns:
1510
+ logger.error(
1511
+ "Column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
1512
+ lookup_column,
1513
+ str(self._df.columns),
1514
+ )
1515
+ return None
1516
+
1517
+ # Fill NaN or None values in the lookup column with empty strings
1518
+ df[lookup_column] = df[lookup_column].fillna("")
1519
+
1520
+ # Use the `apply` function to filter rows where the lookup value is in the delimiter-separated list
1521
+ matched_row = df[df[lookup_column].apply(match_lookup_value)]
1522
+
1523
+ # Return the first matched row, if any
1524
+ if not matched_row.empty:
1525
+ return matched_row.iloc[0]
1526
+
1527
+ return None
1528
+
1529
+ # end method definition
1530
+
1531
+ def add_column(
1532
+ self,
1533
+ source_column: str,
1534
+ reg_exp: str,
1535
+ new_column: str,
1536
+ prefix="",
1537
+ suffix="",
1538
+ length: int | None = None,
1539
+ group_chars: int | None = None,
1540
+ group_separator: str = ".",
1541
+ group_remove_leading_zero: bool = True,
1542
+ ) -> bool:
1543
+ """Add additional column to the data frame.
1544
+
1545
+ Args:
1546
+ source_column (str): name of the source column
1547
+ reg_exp (str): regular expression to apply on the content of the source column
1548
+ new_column (str): name of the column to add
1549
+ prefix (str, optional): Prefix to add in front of the value. Defaults to "".
1550
+ suffix (str, optional): Suffix to add at the end of the value. Defaults to "".
1551
+ length (int | None, optional): Length to reduce to. Defaults to None (= unlimited).
1552
+ group_chars (int | None, optional): group the resulting string in characters of group_chars. Defaults to None.
1553
+ Usable e.g. for a thousands separator "."
1554
+ group_separator (str, optional): Separator string for the grouping. Defaults to ".".
1555
+ group_remove_leading_zero (bool, optional): Remove leading zeros from the groups. Defaults to True.
1556
+
1557
+ Returns:
1558
+ bool: True = Success, False = Failure
1559
+ """
1560
+
1561
+ if self._df is None:
1562
+ return False
1563
+
1564
+ # Use str.extract to apply the regular expression to the source column
1565
+ # and then assign this modified colum to the variable extracted:
1566
+ extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)
1567
+
1568
+ # Limit the result to the specified length
1569
+ if length is not None:
1570
+ extracted = extracted.str[:length]
1571
+
1572
+ if group_chars is not None:
1573
+
1574
+ def process_grouping(x):
1575
+ if pd.isna(x):
1576
+ return x
1577
+ # Split into groups
1578
+ groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
1579
+ if group_remove_leading_zero:
1580
+ # Remove leading zeros from each group
1581
+ groups = [group.lstrip("0") or "0" for group in groups]
1582
+ # Join groups with separator
1583
+ return group_separator.join(groups)
1584
+
1585
+ extracted = extracted.apply(process_grouping)
1586
+
1587
+ # Add prefix and suffix
1588
+ if prefix or suffix:
1589
+ extracted = prefix + extracted.astype(str) + suffix
1590
+
1591
+ self._df[new_column] = extracted
1592
+
1593
+ return True
1594
+
1595
+ # end method definition
1596
+
1597
+ def convert_to_lists(self, columns: list, delimiter: str = ","):
1598
+ """Method to intelligently convert strings to lists, with a configurable delimiter,
1599
+ ignoring delimiters inside quotes
1600
+
1601
+ Args:
1602
+ columns (list): name of the columns whose values should be converted to lists.
1603
+ It is expected that the column values are delimiter-separated strings.
1604
+ delimiter (str, optional): Character that delimits list items. Defaults to ",".
1605
+
1606
+ Returns:
1607
+ None. self._df is modified in place.
1608
+ """
1609
+
1610
+ # Regex to split by the delimiter, ignoring those inside quotes or double quotes
1611
+ def split_string_ignoring_quotes(s, delimiter):
1612
+ # Escaping the delimiter in case it's a special regex character
1613
+ delimiter = re.escape(delimiter)
1614
+ # Match quoted strings and unquoted delimiters separately
1615
+ pattern = rf'(?:"[^"]*"|\'[^\']*\'|[^{delimiter}]+)'
1616
+ return re.findall(pattern, s)
1617
+
1618
+ for col in columns:
1619
+ self._df[col] = self._df[col].apply(
1620
+ lambda x: (
1621
+ split_string_ignoring_quotes(x, delimiter)
1622
+ if isinstance(x, str) and delimiter in x
1623
+ else x
1624
+ )
1625
+ )
1626
+
1627
+ # end method definition
1628
+
1629
+ def add_column_list(self, source_columns: list, new_column: str):
1630
+ """Add a column with list objects. The list items are taken from a list of
1631
+ source columns (row by row).
1632
+
1633
+ Args:
1634
+ source_columns (list): column names the list values are taken from
1635
+ new_column (str): name of the new column
1636
+ Returns:
1637
+ None. self._df is modified in place.
1638
+ """
1639
+
1640
+ def create_list(row):
1641
+ return [row[col] for col in source_columns]
1642
+
1643
+ self._df[new_column] = self._df.apply(create_list, axis=1)
1644
+
1645
+ # end method definition
1646
+
1647
+ def add_column_table(
1648
+ self, source_columns: list, new_column: str, delimiter: str = ","
1649
+ ):
1650
+ """Add a column with tabular objects (list of dictionaris). The
1651
+ source columns should include lists. The resulting dictionary
1652
+ keys are the column names for the source columns.
1653
+
1654
+ Example:
1655
+ X[1] = 1, 2, 3
1656
+ Y[1] = A, B, C
1657
+ X[2] = 4, 5, 6
1658
+ Y[2] = D, E, F
1659
+
1660
+ Table[1] = [
1661
+ {
1662
+ "X": "1"
1663
+ "Y": "A"
1664
+ },
1665
+ {
1666
+ "X": "2"
1667
+ "Y": "B"
1668
+ }
1669
+ {
1670
+ "X": "3"
1671
+ "Y": "C"
1672
+ }
1673
+ ]
1674
+ Table[2] = [
1675
+ {
1676
+ "X": "4"
1677
+ "Y": "D"
1678
+ },
1679
+ {
1680
+ "X": "5"
1681
+ "Y": "E"
1682
+ }
1683
+ {
1684
+ "X": "6"
1685
+ "Y": "F"
1686
+ }
1687
+ ]
1688
+
1689
+ Args:
1690
+ source_columns (list): column names the list values are taken from
1691
+ new_column (str): name of the new column
1692
+ delimiter (str, optional): Character that delimits list items. Defaults to ",".
1693
+
1694
+ Returns:
1695
+ None. self._df is modified in place.
1696
+ """
1697
+
1698
+ # Call the convert_to_lists method to ensure the columns are converted
1699
+ self.convert_to_lists(columns=source_columns, delimiter=delimiter)
1700
+
1701
+ # Sub-method to pad lists to the same length
1702
+ def pad_list(lst: list, max_len: int):
1703
+ return lst + [None] * (max_len - len(lst))
1704
+
1705
+ def create_table(row) -> list:
1706
+ max_len = max(
1707
+ len(row[col]) if isinstance(row[col], list) else 1
1708
+ for col in source_columns
1709
+ )
1710
+
1711
+ # Pad lists to the maximum length, leave scalars as they are
1712
+ for col in source_columns:
1713
+ if isinstance(row[col], list):
1714
+ row[col] = pad_list(row[col], max_len)
1715
+ else:
1716
+ if not pd.isna(row[col]):
1717
+ row[col] = [
1718
+ row[col]
1719
+ ] * max_len # Repeat scalar to match the max length
1720
+ else:
1721
+ row[col] = [None] * max_len
1722
+ # Create a list of dictionaries for each row
1723
+ table = []
1724
+ for i in range(max_len):
1725
+ table.append({col: row[col][i] for col in source_columns})
1726
+ return table
1727
+
1728
+ # Apply the function to create a new column with a table
1729
+ self._df[new_column] = self._df.apply(create_table, axis=1)
1730
+
1731
+ # end method definition
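# Example (illustrative sketch, column names are made up): combine two
# delimiter-separated columns into a table column, as described in the
# docstring above:
#
# >>> data = Data([{"X": "1,2,3", "Y": "A,B,C"}])
# >>> data.add_column_table(source_columns=["X", "Y"], new_column="Table")
# >>> data["Table"][0]
# [{'X': '1', 'Y': 'A'}, {'X': '2', 'Y': 'B'}, {'X': '3', 'Y': 'C'}]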