pyxecm 1.3.0__py3-none-any.whl → 1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyxecm might be problematic.

pyxecm/helper/data.py ADDED
@@ -0,0 +1,1527 @@
1
+ """
2
+ Data Module to implement functions to leverage Pandas to
3
+ manipulate data structures used for bulk generation of Extended ECM items.
4
+
5
+ This code implements a class called Data which wraps
6
+ a Pandas DataFrame object.
7
+
8
+ Class: Data
9
+ Methods:
10
+
11
+ __init__ : class initializer
12
+ __len__: Length of the embedded DataFrame object.
13
+ __str__: Print the DataFrame of the class
14
+ get_data_frame: Get the Pandas DataFrame object
15
+ set_data_frame: Set the Pandas DataFrame object
16
+ append: Append additional data to the data frame.
17
+
18
+ load_json_data: Load JSON data into DataFrame
19
+ save_json_data: Save JSON data from DataFrame to file
20
+ load_excel_data: Load Excel file into DataFrame
21
+ load_csv_data: Load CSV data into DataFrame
22
+ load_directory: Load directory structure into Pandas Data Frame
23
+
24
+ partitionate: Partition a data frame into equally sized partitions
25
+ deduplicate: Remove duplicate rows that have all fields in unique_fields in common
26
+ sort: Sort the data frame based on one or multiple fields.
27
+ flatten: Flatten a sub-dictionary by copying selected fields to the
28
+ parent dictionary.
29
+ explode_and_flatten: Explode a substructure in the Data Frame
30
+ drop_columns: Drop selected columns from the Data Frame
31
+ keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
32
+ cleanse: Cleanse data with regular expressions and upper/lower case conversion.
33
+ filter: Filter the DataFrame based on conditions
34
+
35
+ fill_forward: Fill the missing cells appropriately by carrying forward
36
+ the values from the previous rows where necessary.
37
+ fill_na_in_column: Replace NA values in a column with a defined new default value
38
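+
+ Example (illustrative usage sketch only; the JSON file name is hypothetical):
+
+ data = Data([{"name": "n1", "size": 10}, {"name": "n2", "size": 20}])
+ data.load_json_data("more_items.json")
+ data.deduplicate(unique_fields=["name"])
+ data.print_info(show_columns=True)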
+ """
39
+
40
+ __author__ = "Dr. Marc Diefenbruch"
41
+ __copyright__ = "Copyright 2024, OpenText"
42
+ __credits__ = ["Kai-Philip Gatzweiler"]
43
+ __maintainer__ = "Dr. Marc Diefenbruch"
44
+ __email__ = "mdiefenb@opentext.com"
45
+
46
+ import logging
47
+ import json
48
+ import os
49
+ import re
50
+ import threading
51
+
52
+ import pandas as pd
53
+
54
+ logger = logging.getLogger("pyxecm.helper.data")
55
+
56
+
57
+ class Data:
58
+ """Used to automate data loading for the customizer."""
59
+
60
+ _df: pd.DataFrame
61
+ _lock = threading.Lock()
62
+
63
+ def __init__(self, init_data: pd.DataFrame | list | dict | None = None):
64
+ """Initialize the Data object.
65
+
66
+ Args:
67
+ init_data (pd.DataFrame | Data | list | dict, optional): Data to initialize the data frame. Can either be
68
+ another data frame / Data object (that gets copied), a list of dictionaries or a single dictionary.
69
+ Defaults to None.
70
+ """
71
+
72
+ if init_data is not None:
73
+ # if a data frame is passed to the constructor we
74
+ # copy its content to the new Data object
75
+
76
+ if isinstance(init_data, pd.DataFrame):
77
+ self._df: pd.DataFrame = init_data.copy()
78
+ elif isinstance(init_data, Data):
79
+ if init_data.get_data_frame() is not None:
80
+ self._df: pd.DataFrame = init_data.get_data_frame().copy()
81
+ elif isinstance(init_data, list):
82
+ self._df: pd.DataFrame = pd.DataFrame(init_data)
83
+ elif isinstance(init_data, dict):
84
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
85
+ self._df: pd.DataFrame = pd.DataFrame([init_data])
86
+ else:
87
+ logger.error("Illegal initialization data for 'Data' class!")
88
+ self._df = None
89
+ else:
90
+ self._df = None
91
+
92
+ # end method definition
93
+
94
+ def __len__(self) -> int:
95
+ """Lenght of the embedded DataFrame object.
96
+ This is basically a convenience method.
97
+
98
+ Returns:
99
+ int: Length of the DataFrame
100
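+
+ Example (illustrative sketch of the expected behaviour):
+ >>> len(Data([{"a": 1}, {"a": 2}]))
+ 2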
+ """
101
+
102
+ if self._df is not None:
103
+ return len(self._df)
104
+ return 0
105
+
106
+ # end method definition
107
+
108
+ def __str__(self) -> str:
109
+ """Print the DataFrame of the class.
110
+
111
+ Returns:
112
+ str: String representation.
113
+ """
114
+
115
+ # if data frame is initialized we return
116
+ # the string representation of pd.DataFrame
117
+ if self._df is not None:
118
+ return str(self._df)
119
+
120
+ # data frame is not initialized - avoid endless recursion of __str__:
+ return "None"
121
+
122
+ # end method definition
123
+
124
+ def lock(self):
125
+ """Return the threading lock object.
126
+
127
+ Returns:
128
+ threading.Lock: The threading lock object
129
+ """
130
+ return self._lock
131
+
132
+ # end method definition
133
+
134
+ def get_data_frame(self) -> pd.DataFrame:
135
+ """Get the Pandas DataFrame object
136
+
137
+ Returns:
138
+ pd.DataFrame: Pandas DataFrame object
139
+ """
140
+
141
+ return self._df
142
+
143
+ # end method definition
144
+
145
+ def set_data_frame(self, df: pd.DataFrame):
146
+ """Set the Pandas DataFrame object
147
+
148
+ Args:
149
+ df (pd.DataFrame): Pandas DataFrame object
150
+ """
151
+
152
+ self._df = df
153
+
154
+ # end method definition
155
+
156
+ def print_info(
157
+ self,
158
+ show_size: bool = True,
159
+ show_info: bool = False,
160
+ show_columns: bool = False,
161
+ show_first: bool = False,
162
+ show_last: bool = False,
163
+ show_sample: bool = False,
164
+ show_statistics: bool = False,
165
+ row_num: int = 10,
166
+ ):
167
+ """Log information about the data frame
168
+
169
+ Args:
170
+ show_size (bool, optional): Show size of data frame. Defaults to True.
171
+ show_info (bool, optional): Show information for data frame. Defaults to False.
172
+ show_columns (bool, optional): Show columns of data frame. Defaults to False.
173
+ show_first (bool, optional): Show first 10 items. Defaults to False.
174
+ show_last (bool, optional): Show last 10 items. Defaults to False.
175
+ show_sample (bool, optional): Show 10 sample items. Defaults to False.
176
+ show_statistics (bool, optional): Show data frame statistics. Defaults to False.
+ row_num (int, optional): Number of rows used for show_first / show_last / show_sample. Defaults to 10.
177
+ """
178
+
179
+ if self._df is None:
180
+ logger.warning("Data Frame is not initialized!")
181
+ return
182
+
183
+ if show_size:
184
+ logger.info(
185
+ "Data Frame has %s row(s) and %s column(s)",
186
+ self._df.shape[0],
187
+ self._df.shape[1],
188
+ )
189
+
190
+ if show_info:
191
+ # df.info() can not easily be embedded into a string
192
+ self._df.info()
193
+
194
+ if show_columns:
195
+ logger.info("Columns:\n%s", self._df.columns)
196
+ logger.info(
197
+ "Columns with number of null values:\n%s", self._df.isnull().sum()
198
+ )
199
+ logger.info(
200
+ "Columns with number of non-null values:\n%s", self._df.notnull().sum()
201
+ )
202
+ logger.info("Columns with number of NaN values:\n%s", self._df.isna().sum())
203
+ logger.info(
204
+ "Columns with number of non-NaN values:\n%s", self._df.notna().sum()
205
+ )
206
+
207
+ if show_first:
208
+ # the default for head is n = 5:
209
+ logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
210
+
211
+ if show_last:
212
+ # the default for tail is n = 5:
213
+ logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
214
+
215
+ if show_sample:
216
+ # the default for sample is n = 1:
217
+ logger.info("%s Sample rows:\n%s", str(row_num), self._df.sample(n=row_num))
218
+
219
+ if show_statistics:
220
+ logger.info(
221
+ "Description of statistics for data frame:\n%s", self._df.describe()
222
+ )
223
+ logger.info(
224
+ "Description of statistics for data frame (Transformed):\n%s",
225
+ self._df.describe().T,
226
+ )
227
+ logger.info(
228
+ "Description of statistics for data frame (objects):\n%s",
229
+ self._df.describe(include="object"),
230
+ )
231
+
232
+ # end method definition
233
+
234
+ def append(self, add_data: pd.DataFrame | list | dict) -> bool:
235
+ """Append additional data to the data frame.
236
+
237
+ Args:
238
+ add_data (pd.DataFrame | list | dict): Additional data. Can be pd.DataFrame or list of dicts (or Data)
239
+
240
+ Returns:
241
+ bool: True = Success, False = Error
242
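+
+ Example (illustrative sketch of the expected behaviour):
+ >>> d = Data([{"a": 1}])
+ >>> d.append({"a": 2})
+ True
+ >>> len(d)
+ 2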
+ """
243
+
244
+ # Does the data frame already have content?
245
+ # Then we need to concat / append. Otherwise
246
+ # we just initialize self._df
247
+ if self._df is not None:
248
+ if isinstance(add_data, pd.DataFrame):
249
+ self._df = pd.concat([self._df, add_data], ignore_index=True)
250
+ return True
251
+ elif isinstance(add_data, Data):
252
+ df = add_data.get_data_frame()
253
+ if df is not None and not df.empty:
254
+ self._df = pd.concat([self._df, df], ignore_index=True)
255
+ return True
256
+ elif isinstance(add_data, list):
257
+ if add_data:
258
+ df = Data(add_data)
259
+ self._df = pd.concat(
260
+ [self._df, df.get_data_frame()], ignore_index=True
261
+ )
262
+ return True
263
+ elif isinstance(add_data, dict):
264
+ if add_data:
265
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
266
+ df = Data([add_data])
267
+ self._df = pd.concat(
268
+ [self._df, df.get_data_frame()], ignore_index=True
269
+ )
270
+ return True
271
+ else:
272
+ logger.error("Illegal data type -> '%s'", type(add_data))
273
+ return False
274
+ else: # self._df is None (initial state)
275
+ if isinstance(add_data, pd.DataFrame):
276
+ self._df = add_data
277
+ return True
278
+ elif isinstance(add_data, Data):
279
+ self._df = add_data.get_data_frame()
280
+ return True
281
+ elif isinstance(add_data, list):
282
+ self._df = pd.DataFrame(add_data)
283
+ return True
284
+ elif isinstance(add_data, dict):
285
+ # it is important to wrap the dict in a list to avoid that more than 1 row is created
286
+ self._df = pd.DataFrame([add_data])
287
+ return True
288
+ else:
289
+ logger.error("Illegal data type -> '%s'", type(add_data))
290
+ return False
291
+
292
+ # end method definition
293
+
294
+ def load_json_data(self, json_path: str, convert_dates: bool = False) -> bool:
295
+ """Load JSON data into DataFrame
296
+
297
+ Args:
298
+ json_path (str): Path to the JSON file.
299
+ convert_dates (bool, optional): whether or not dates should be converted
300
+ Returns:
301
+ bool: False in case an error occurred, True otherwise.
302
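+
+ Example (illustrative sketch; "workspaces.json" is a hypothetical file):
+ data = Data()
+ if data.load_json_data(json_path="workspaces.json"):
+ data.print_info(show_columns=True)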
+ """
303
+
304
+ if json_path is not None and os.path.exists(json_path):
305
+ # Load data from JSON file
306
+ try:
307
+ df = pd.read_json(path_or_buf=json_path, convert_dates=convert_dates)
308
+ if self._df is None:
309
+ self._df = df
310
+ else:
311
+ self._df = pd.concat([self._df, df])
312
+ logger.info(
313
+ "After loading -> '%s' the Data Frame has %s row(s) and %s column(s)",
314
+ json_path,
315
+ self._df.shape[0],
316
+ self._df.shape[1],
317
+ )
318
+ except FileNotFoundError:
319
+ logger.error(
320
+ "File -> %s not found. Please check the file path.", json_path
321
+ )
322
+ return False
323
+ except PermissionError:
324
+ logger.error("Permission denied to access the file -> %s.", json_path)
325
+ return False
326
+ except IOError as e:
327
+ logger.error("An I/O error occurred -> %s", str(e))
328
+ return False
329
+ except json.JSONDecodeError as e:
330
+ logger.error("Error: Unable to decode JSON -> %s", str(e))
331
+ return False
332
+ except ValueError as e:
333
+ logger.error("Invalid JSON input -> %s", str(e))
334
+ return False
335
+ except AttributeError as e:
336
+ logger.error("Unexpected data structure -> %s", str(e))
337
+ return False
338
+ except TypeError as e:
339
+ logger.error("Unexpected data type -> %s", str(e))
340
+ return False
341
+ except KeyError as e:
342
+ logger.error("Missing key in JSON data -> %s", str(e))
343
+ return False
344
+
345
+ else:
346
+ logger.error(
347
+ "Missing JSON file - you have not specified a valid path -> %s.",
348
+ json_path,
349
+ )
350
+ return False
351
+ return True
352
+
353
+ # end method definition
354
+
355
+ def save_json_data(
356
+ self, json_path: str, orient: str = "records", preserve_index: bool = False
357
+ ) -> bool:
358
+ """Save JSON data from DataFrame to file
359
+
360
+ Args:
361
+ json_path (str): Path to the JSON file.
362
+ orient (str, optional): Structure of the generated JSON, e.g. "records", "columns", "index", "table" or "split". Defaults to "records".
363
+ preserve_index (bool, optional): Whether to write the index. Only applied for orient values "columns", "index", "table" or "split". Defaults to False.
364
+ Returns:
365
+ bool: False in case an error occurred, True otherwise.
366
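+
+ Example (illustrative sketch; the output path is hypothetical):
+ data = Data([{"id": 1, "name": "n1"}])
+ data.save_json_data(json_path="/tmp/export.json", orient="records")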
+ """
367
+
368
+ if json_path is not None and os.path.exists(os.path.dirname(json_path)):
369
+ # Load data from JSON file
370
+ try:
371
+ if self._df is not None:
372
+ # index parameter is only allowed if orient has one of the following values:
373
+ if orient in ("columns", "index", "table", "split"):
379
+ self._df.to_json(
380
+ path_or_buf=json_path,
381
+ index=preserve_index,
382
+ orient=orient,
383
+ indent=2,
384
+ )
385
+ else:
386
+ self._df.to_json(path_or_buf=json_path, orient=orient, indent=2)
387
+ else:
388
+ logger.warning("Data Frame is empty. Cannot write it to JSON")
389
+ return False
390
+ except FileNotFoundError:
391
+ logger.error(
392
+ "File -> '%s' not found. Please check the file path.", json_path
393
+ )
394
+ return False
395
+ except PermissionError:
396
+ logger.error("Permission denied to access the file -> '%s'.", json_path)
397
+ return False
398
+ except IOError as e:
399
+ logger.error("An I/O error occurred -> %s", str(e))
400
+ return False
401
+ except ValueError as e:
402
+ logger.error("Value Error -> %s", str(e))
403
+ return False
404
+
405
+ else:
406
+ logger.error(
407
+ "Missing JSON file -> '%s' you have not specified a valid path!",
408
+ json_path,
409
+ )
410
+ return False
411
+ return True
412
+
413
+ # end method definition
414
+
415
+ def load_excel_data(
416
+ self,
417
+ xlsx_path: str,
418
+ sheet_names: str | int | list | None = 0,
419
+ usecols: str | list | None = None,
420
+ skip_rows: int | None = None,
421
+ header: int | None = 0,
422
+ names: list | None = None,
423
+ na_values: list | None = None,
424
+ ) -> bool:
425
+ """Load Excel (xlsx) data into DataFrame. Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
426
+ read from a local filesystem or URL. Supports an option to read a single sheet or a list of sheets.
427
+
428
+ Args:
429
+ xlsx_path (str): Path to the Excel file.
430
+ sheet_names (list | str | int, optional): Name or Index of the sheet in the Excel workbook to load.
431
+ If 'None' then all sheets will be loaded.
432
+ If 0 then first sheet in workbook will be loaded (this is the Default)
433
+ If string then this is interpreted as the name of the sheet to load.
434
+ If a list is passed, this can be a list of index values (int) or
435
+ a list of strings with the sheet names to load.
436
+ usecols (list | str, optional): List of columns to load, specified by general column names in Excel,
437
+ e.g. usecols='B:D', usecols=['A', 'C', 'F']
438
+ skip_rows (int, optional): Number of rows to skip at the top of the sheet (e.g. to not read headlines)
439
+ header (int | None, optional): Excel Row (0-indexed) to use for the column labels of the parsed DataFrame.
440
+ If file contains no header row, then you should explicitly pass header=None.
441
+ Default is 0.
442
+ names (list): List of column names to use. Default is None
443
+ na_values (list, optional): List of values in the Excel that should become the Pandas NA value.
444
+ Returns:
445
+ bool: False in case an error occurred, True otherwise.
446
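+
+ Example (illustrative sketch; the file and sheet names are hypothetical):
+ data = Data()
+ data.load_excel_data(
+ xlsx_path="sites.xlsx",
+ sheet_names=["Plants", "Warehouses"],
+ usecols="A:D",
+ na_values=["N/A", "-"],
+ )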
+ """
447
+
448
+ if xlsx_path is not None and os.path.exists(xlsx_path):
449
+ # Load data from Excel file
450
+ try:
451
+ df = pd.read_excel(
452
+ io=xlsx_path,
453
+ sheet_name=sheet_names,
454
+ usecols=usecols,
455
+ skiprows=skip_rows,
456
+ header=header,
457
+ names=names,
458
+ na_values=na_values,
459
+ )
460
+ # if multiple sheets from an Excel workbook are loaded,
461
+ # then read_excel() returns a dictionary. The keys are
462
+ # the names of the sheets and the values are the Data Frames.
463
+ # we handle this case as follows:
464
+ if isinstance(df, dict):
465
+ logger.info("Loading multiple Excel sheets from the workbook!")
466
+ multi_sheet_df = pd.DataFrame()
467
+ for sheet in df.keys():
468
+ multi_sheet_df = pd.concat(
469
+ [multi_sheet_df, df[sheet]], ignore_index=True
470
+ )
471
+ df = multi_sheet_df
472
+ if self._df is None:
473
+ self._df = df
474
+ else:
475
+ self._df = pd.concat([self._df, df], ignore_index=True)
476
+ except FileNotFoundError:
477
+ logger.error(
478
+ "File -> '%s' not found. Please check the file path.", xlsx_path
479
+ )
480
+ return False
481
+ except PermissionError:
482
+ logger.error("Permission denied to access the file -> '%s'.", xlsx_path)
483
+ return False
484
+ except IOError as e:
485
+ logger.error("An I/O error occurred -> %s", str(e))
486
+ return False
487
+ except ValueError as e:
488
+ logger.error("Invalid Excel input -> %s", str(e))
489
+ return False
490
+ except AttributeError as e:
491
+ logger.error("Unexpected data structure -> %s", str(e))
492
+ return False
493
+ except TypeError as e:
494
+ logger.error("Unexpected data type -> %s", str(e))
495
+ return False
496
+ except KeyError as e:
497
+ logger.error("Missing key in Excel data -> %s", str(e))
498
+ return False
499
+
500
+ else:
501
+ logger.error(
502
+ "Missing Excel file -> '%s' you have not specified a valid path!",
503
+ xlsx_path,
504
+ )
505
+ return False
506
+ return True
507
+
508
+ # end method definition
509
+
510
+ def save_excel_data(
511
+ self, excel_path: str, sheet_name: str = "Pandas Export", index: bool = False
512
+ ) -> bool:
513
+ """
514
+ Save the DataFrame to an Excel file, with robust error handling and logging.
515
+
516
+ Args:
517
+ excel_path (str): The file path to save the Excel file.
518
+ sheet_name (str): The sheet name where data will be saved. Default is 'Pandas Export'.
519
+ index (bool): Whether to write the row names (index). Default is False.
+
+ Returns:
+ bool: True = Success, False = Failure
520
+ """
521
+ try:
522
+ # Check if the directory exists
523
+ directory = os.path.dirname(excel_path)
524
+ if directory and not os.path.exists(directory):
525
+ raise FileNotFoundError(
526
+ "The directory -> '%s' does not exist." % directory
527
+ )
528
+
529
+ # Attempt to save the DataFrame to Excel
530
+ self._df.to_excel(excel_path, sheet_name=sheet_name, index=index)
531
+ logger.info("Data saved successfully to -> %s", excel_path)
532
+
533
+ except FileNotFoundError as e:
534
+ logger.error("Error: %s", e)
535
+ return False
536
+ except PermissionError:
537
+ logger.error(
538
+ "Error: Permission denied. You do not have permission to write to '%s'.",
539
+ excel_path,
540
+ )
541
+ return False
542
+ except ValueError as ve:
543
+ logger.error("Error: Invalid data for Excel format -> %s", ve)
544
+ return False
545
+ except OSError as oe:
546
+ logger.error("Error: OS error occurred while saving file -> %s", oe)
547
+ return False
548
+ except Exception as e:
549
+ # Catch-all for any other unexpected errors
550
+ logger.error("An unexpected error occurred -> %s", e)
551
+ return False
552
+
553
+ return True
554
+
555
+ # end method definition
556
+
557
+ def load_csv_data(self, csv_path: str) -> bool:
558
+ """Load CSV (Comma separated values) data into DataFrame
559
+
560
+ Args:
561
+ csv_path (str): Path to the CSV file.
562
+ Returns:
563
+ bool: False in case an error occurred, True otherwise.
564
+ """
565
+
566
+ if csv_path is not None and os.path.exists(csv_path):
567
+ # Load data from CSV file
568
+ try:
569
+ df = pd.read_csv(csv_path)
570
+ if self._df is None:
571
+ self._df = df
572
+ else:
573
+ self._df = pd.concat([self._df, df])
574
+ except FileNotFoundError:
575
+ logger.error(
576
+ "File -> '%s' not found. Please check the file path.", csv_path
577
+ )
578
+ return False
579
+ except PermissionError:
580
+ logger.error("Permission denied to access the file -> %s.", csv_path)
581
+ return False
582
+ except IOError as e:
583
+ logger.error("An I/O error occurred -> %s", str(e))
584
+ return False
585
+ except ValueError as e:
586
+ logger.error("Invalid CSV input -> %s", str(e))
587
+ return False
588
+ except AttributeError as e:
589
+ logger.error("Unexpected data structure -> %s", str(e))
590
+ return False
591
+ except TypeError as e:
592
+ logger.error("Unexpected data type -> %s", str(e))
593
+ return False
594
+ except KeyError as e:
595
+ logger.error("Missing key in CSV data -> %s", str(e))
596
+ return False
597
+
598
+ else:
599
+ logger.error(
600
+ "Missing CSV file -> '%s' you have not specified a valid path!",
601
+ csv_path,
602
+ )
603
+ return False
604
+ return True
605
+
606
+ # end method definition
607
+
608
+ def load_xml_data(
609
+ self, xml_path: str, xpath: str | None = None, xslt_path: str | None = None
610
+ ) -> bool:
611
+ """Load XML data into DataFrame
612
+
613
+ Args:
614
+ xml_path (str): Path to the XML file.
615
+ xpath (str, optional): XPath to the elements we want to select
616
+ xslt_path (str, optional): XSLT transformation file
617
+ Returns:
618
+ bool: False in case an error occurred, True otherwise.
619
+ """
620
+
621
+ try:
622
+ df = pd.read_xml(path_or_buffer=xml_path, xpath=xpath, stylesheet=xslt_path)
623
+ # Process the loaded data as needed
624
+ if self._df is None:
625
+ self._df = df
626
+ else:
627
+ self._df = pd.concat([self._df, df])
628
+ logger.info("XML file loaded successfully!")
629
+ return True
630
+ except FileNotFoundError:
631
+ print("File not found.")
632
+ return False
633
+ except PermissionError:
634
+ logger.error("Permission denied to access the file -> %s.", xml_path)
635
+ return False
636
+ except IOError as e:
637
+ logger.error("An I/O error occurred -> %s", str(e))
638
+ return False
639
+ except ValueError as e:
640
+ logger.error("Invalid CSV input -> %s", str(e))
641
+ return False
642
+ except AttributeError as e:
643
+ logger.error("Unexpected data structure -> %s", str(e))
644
+ return False
645
+ except TypeError as e:
646
+ logger.error("Unexpected data type -> %s", str(e))
647
+ return False
648
+ except KeyError as e:
649
+ logger.error("Missing key in CSV data -> %s", str(e))
650
+ return False
651
+
652
+ # end method definition
653
+
654
+ def load_directory(self, path_to_root: str) -> bool:
655
+ """Load directory structure into Pandas Data Frame
656
+
657
+ Args:
658
+ path_to_root (str): Path to the root element of the
659
+ directory structure
660
+
661
+ Returns:
662
+ bool: True = Success, False = Failure
663
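+
+ Example (illustrative sketch; the directory path is hypothetical):
+ data = Data()
+ if data.load_directory(path_to_root="/tmp/import"):
+ data.print_info(show_first=True)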
+ """
664
+
665
+ try:
666
+ # Check if the provided path is a directory
667
+ if not os.path.isdir(path_to_root):
668
+ logger.error(
669
+ "The provided path -> '%s' is not a valid directory.", path_to_root
670
+ )
671
+ return False
672
+
673
+ # Initialize a list to hold file information
674
+ data = []
675
+
676
+ # Walk through the directory
677
+ for root, _, files in os.walk(path_to_root):
678
+ for file in files:
679
+ file_path = os.path.join(root, file)
680
+ file_size = os.path.getsize(file_path)
681
+ relative_path = os.path.relpath(file_path, path_to_root)
682
+ path_parts = relative_path.split(os.sep)
683
+
684
+ # Create a dictionary with the path parts and file details
685
+ entry = {
686
+ "level {}".format(i): part
687
+ for i, part in enumerate(path_parts[:-1], start=1)
688
+ }
689
+ entry.update({"filename": path_parts[-1], "size": file_size})
690
+ data.append(entry)
691
+
692
+ # Create DataFrame from list of dictionaries
693
+ self._df = pd.DataFrame(data)
694
+
695
+ # Determine the maximum number of levels
696
+ max_levels = max((len(entry) - 2 for entry in data), default=0)
697
+
698
+ # Ensure all entries have the same number of levels
699
+ for entry in data:
700
+ for i in range(1, max_levels + 1):
701
+ entry.setdefault("level {}".format(i), "")
702
+
703
+ # Convert to DataFrame again to make sure all columns are consistent
704
+ self._df = pd.DataFrame(data)
705
+
706
+ except NotADirectoryError as nde:
+ logger.error("Error -> %s", str(nde))
+ return False
+ except FileNotFoundError as fnfe:
+ logger.error("Error -> %s", str(fnfe))
+ return False
+ except PermissionError as pe:
+ logger.error("Error -> %s", str(pe))
+ return False
712
+
713
+ return True
714
+
715
+ # end method definition
716
+
717
+ def load_xml_directory(self, path_to_root: str, xpath: str | None = None) -> bool:
718
+ """Load directory structure into Pandas Data Frame
719
+
720
+ Args:
721
+ path_to_root (str): Path to the root element of the
722
+ directory structure
723
+ xpath (str, optional): XPath to the elements we want to select
724
+
725
+ Returns:
726
+ bool: True = Success, False = Failure
727
+ """
728
+
729
+ try:
730
+ # Check if the provided path is a directory
731
+ if not os.path.isdir(path_to_root):
732
+ logger.error(
733
+ "The provided path -> '%s' is not a valid directory.", path_to_root
734
+ )
735
+ return False
736
+
737
+ # Walk through the directory
738
+ for root, _, files in os.walk(path_to_root):
739
+ for file in files:
740
+ file_path = os.path.join(root, file)
741
+ file_size = os.path.getsize(file_path)
742
+ file_name = os.path.basename(file_path)
743
+
744
+ if file_name == "docovw.xml":
745
+ logger.info(
746
+ "Load XML file -> '%s' of size -> %s", file_path, file_size
747
+ )
748
+ success = self.load_xml_data(file_path, xpath=xpath)
749
+ if success:
750
+ logger.info(
751
+ "Successfully loaded XML file -> '%s'", file_path
752
+ )
753
+
754
+ except NotADirectoryError as nde:
755
+ logger.error("Error -> %s", str(nde))
756
+ except FileNotFoundError as fnfe:
757
+ logger.error("Error -> %s", str(fnfe))
758
+ except PermissionError as pe:
759
+ logger.error("Error -> %s", str(pe))
760
+
761
+ return True
762
+
763
+ # end method definition
764
+
765
+ def partitionate(self, number: int) -> list:
766
+ """Partition a data frame into equally sized
767
+ partitions.
768
+
769
+ Args:
770
+ number (int): Number of partitions
771
+
772
+ Returns:
773
+ list: List of partitions
774
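+
+ Example (illustrative sketch - 10 rows split into 3 partitions of sizes 4, 3 and 3):
+ >>> d = Data([{"n": i} for i in range(10)])
+ >>> [len(p) for p in d.partitionate(3)]
+ [4, 3, 3]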
+ """
775
+
776
+ # Calculate the approximate size of each partition
777
+ size = len(self._df)
778
+
779
+ if size >= number:
780
+ partition_size = size // number
781
+ remainder = size % number
782
+ else:
783
+ partition_size = size
784
+ number = 1
785
+ remainder = 0
786
+
787
+ logger.info(
788
+ "Data set has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
789
+ str(size),
790
+ str(number),
791
+ str(partition_size),
792
+ str(remainder),
793
+ )
794
+
795
+ # Initialize a list to store partitions
796
+ partitions = []
797
+ start_index = 0
798
+
799
+ # Slice the DataFrame into equally sized partitions
800
+ for i in range(number):
801
+ # start_index = i * partition_size
802
+ # end_index = (i + 1) * partition_size if i < number - 1 else None
803
+ # partition = self._df.iloc[start_index:end_index]
804
+ # partitions.append(partition)
805
+ # Calculate the end index for this partition
806
+ end_index = start_index + partition_size + (1 if i < remainder else 0)
807
+ partition = self._df.iloc[start_index:end_index]
808
+ partitions.append(partition)
809
+ start_index = end_index
810
+
811
+ return partitions
812
+
813
+ # end method definition
814
+
815
+ def partitionate_by_column(self, column_name: str) -> list | None:
816
+ """Partition a data frame based on equal values in a specified column.
817
+
818
+ Args:
819
+ column_name (str): The column name to partition by
820
+
821
+ Returns:
822
+ list: List of partitions
823
+ """
824
+
825
+ if column_name not in self._df.columns:
826
+ logger.error(
827
+ "Column -> '%s' does not exist in the Data Frame. Data Frame has these columns -> %s",
828
+ column_name,
829
+ str(self._df.columns),
830
+ )
831
+ return None
832
+
833
+ # Separate rows with NaN or None values in the specified column
834
+ nan_partitions = self._df[self._df[column_name].isna()]
835
+ non_nan_df = self._df.dropna(subset=[column_name])
836
+
837
+ # Group by the specified column and create a list of DataFrames for each group
838
+ grouped = non_nan_df.groupby(column_name)
839
+ partitions = [group for _, group in grouped]
840
+
841
+ # Add each row with NaN or None values as its own partition
842
+ for i in range(len(nan_partitions)):
843
+ partitions.append(nan_partitions.iloc[[i]])
844
+
845
+ logger.info(
846
+ "Data Frame has been partitioned into -> %s partitions based on the values in column '%s'...",
847
+ str(len(partitions)),
848
+ column_name,
849
+ )
850
+
851
+ return partitions
852
+
853
+ # end method definition
854
+
855
+ def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
856
+ """Remove dupclicate rows that have all fields in
857
+ unique_fields in common.
858
+
859
+ Args:
860
+ unique_fields (list): Defines the fields for which we want a unique
861
+ combination.
862
+ inplace (bool, optional): True if the deduplication happens in-place.
863
+ Defaults to True.
864
+ Returns:
865
+ pd.DataFrame | None: If inplace is False then a new deduplicated DataFrame
866
+ is returned. Otherwise the object is modified in place
867
+ and self._df is returned.
868
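+
+ Example (illustrative sketch of the expected behaviour):
+ >>> d = Data([{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 1, "b": 1}])
+ >>> len(d.deduplicate(unique_fields=["a", "b"]))
+ 2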
+ """
869
+
870
+ if inplace:
871
+ self._df.drop_duplicates(subset=unique_fields, inplace=True)
872
+ self._df.reset_index(drop=True, inplace=True)
873
+ return self._df
874
+ else:
875
+ df = self._df.drop_duplicates(subset=unique_fields, inplace=False)
876
+ df = df.reset_index(drop=True, inplace=False)
877
+ return df
878
+
879
+ # end method definition
880
+
881
+ def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
882
+ """Sort the data frame based on one or multiple fields -
883
+ either in place or return it as a new data frame (e.g. not modifying self._df)
884
+
885
+ Args:
886
+ sort_fields (list): Columns / fields to be used for sorting
887
+ inplace (bool, optional): If the sorting should be inplace, i.e. modifying self._df.
888
+ Defaults to True.
889
+ Returns:
890
+ pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
891
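+
+ Example (illustrative sketch of the expected behaviour):
+ >>> d = Data([{"name": "b"}, {"name": "a"}])
+ >>> d.sort(sort_fields=["name"])["name"].tolist()
+ ['a', 'b']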
+ """
892
+
893
+ if self._df is None:
894
+ return None
895
+
896
+ if not all(sort_field in self._df.columns for sort_field in sort_fields):
897
+ logger.warning(
898
+ "Not all of the given sort fields -> %s do exist in the Data Frame.",
899
+ str(sort_fields),
900
+ )
901
+ # Reduce the sort fields to those that really exist in the DataFrame:
902
+ sort_fields = [
903
+ sort_field
904
+ for sort_field in sort_fields
905
+ if sort_field in self._df.columns
906
+ ]
907
+ logger.warning(
908
+ "Only these given sort fields -> %s do exist as columns in the Data Frame.",
909
+ str(sort_fields),
910
+ )
911
+
912
+ if inplace:
913
+ self._df.sort_values(by=sort_fields, inplace=True)
914
+ self._df.reset_index(drop=True, inplace=True)
915
+ return self._df
916
+ else:
917
+ df = self._df.sort_values(by=sort_fields, inplace=False)
918
+ df = df.reset_index(drop=True, inplace=False)
919
+ return df
920
+
921
+ # end method definition
922
+
923
+ def flatten(
924
+ self,
925
+ parent_field: str,
926
+ flatten_fields: list,
927
+ ):
928
+ """Flatten a sub-dictionary by copying selected fields to the
929
+ parent dictionary. This is e.g. useful for then de-duplicating
930
+ a data set.
931
+
932
+ Args:
933
+ parent_field (str): name of the field in the parent dictionary
934
+ flatten_fields (list): fields in the sub-dictionary to copy
935
+ into the parent dictionary.
936
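+
+ Example (illustrative sketch of the expected behaviour):
+ >>> d = Data([{"user": {"id": 7, "name": "n1"}}])
+ >>> d.flatten(parent_field="user", flatten_fields=["id"])
+ >>> d.get_data_frame()["user_id"].tolist()
+ [7]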
+ """
937
+
938
+ for flatten_field in flatten_fields:
939
+ flat_field = parent_field + "_" + flatten_field
940
+ # The following expression generates a new column in the
941
+ # data frame with the name of 'flat_field'.
942
+ # In the lambda function x is a dictionary that includes the subvalues
943
+ # and it returns the value of the given flatten field
944
+ # (if it exists, otherwise None). So x is self._df[parent_field], i.e.
945
+ # what the lambda function gets 'applied' on.
946
+ self._df[flat_field] = self._df[parent_field].apply(
947
+ lambda x, sub_field=flatten_field: (
948
+ x.get(sub_field, None) if isinstance(x, dict) else None
949
+ )
950
+ )
951
+
952
+ # end method definition
953
+
954
+ def explode_and_flatten(
955
+ self,
956
+ explode_field: str | list,
957
+ flatten_fields: list | None = None,
958
+ make_unique: bool = False,
959
+ reset_index: bool = False,
960
+ split_string_to_list: bool = False,
961
+ ) -> pd.DataFrame:
962
+ """Explode a substructure in the Data Frame
963
+
964
+ Args:
965
+ explode_field (str | list): Field(s) to explode which each has/have a list structure.
966
+ Exploding multiple columns at once is possible. This delivers
967
+ a very different result compared to exploding one column after
968
+ the other!
969
+ flatten_fields (list): Fields in the exploded substructure to include
970
+ in the main dictionaries for easier processing.
971
+ make_unique (bool, optional): if True deduplicate the exploded data frame.
972
+ reset_index (bool, optional): if True reset the index of the exploded data frame.
+ split_string_to_list (bool, optional): if True split delimiter-separated strings into lists before exploding.
973
+ Returns:
974
+ pd.DataFrame: Pointer to the Pandas DataFrame
975
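+
+ Example (illustrative sketch; the column names are hypothetical):
+ data.explode_and_flatten(
+ explode_field="categories",
+ flatten_fields=["name"],
+ make_unique=True,
+ reset_index=True,
+ )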
+ """
976
+
977
+ def update_column(row):
978
+ try:
979
+ if sub in row:
980
+ return row[sub]
+ return ""
981
+ except (IndexError, KeyError, ValueError):
982
+ return ""
983
+
984
+ # Define a function to split a string into a list
985
+ def string_to_list(string: str | None) -> list:
986
+ if not string or pd.isna(string):
987
+ return []
988
+ # Use a regular expression to split by comma or semicolon, each optionally followed by whitespace
989
+ return re.split(r"[;,]\s*", str(string))
990
+
991
+ if isinstance(explode_field, list):
992
+ logger.info("Explode multiple columns -> %s", str(explode_field))
993
+ elif isinstance(explode_field, str):
994
+ logger.info("Explode single column -> '%s'", explode_field)
995
+ else:
996
+ logger.error(
997
+ "Illegal explode field(s) data type provided -> %s", type(explode_field)
998
+ )
999
+ return self._df
1000
+
1001
+ if split_string_to_list:
1002
+ # Apply the function to convert the 'string_column' values to lists
1003
+ self._df[explode_field] = self._df[explode_field].apply(string_to_list)
1004
+
1005
+ try:
1006
+ # remove the sub dictionary that sometimes is introduced by
1007
+ # XML loading
1008
+ if "." in explode_field:
1009
+ main = explode_field.split(".")[0]
1010
+ sub = explode_field.split(".")[1]
1011
+ self._df[main] = self._df[main].apply(update_column)
1012
+ explode_field = main
1013
+ # Explode the field that has list values
1014
+ self._df = self._df.explode(column=explode_field)
1015
+ except KeyError:
1016
+ logger.error("Column -> '%s' not found in Data Frame!", str(explode_field))
1017
+ except ValueError:
1018
+ logger.error(
1019
+ "Unable to explode the specified column -> '%s'!", str(explode_field)
1020
+ )
1021
+
1022
+ if flatten_fields:
1023
+ self.flatten(parent_field=explode_field, flatten_fields=flatten_fields)
1024
+
1025
+ if make_unique:
1026
+ self._df.drop_duplicates(subset=flatten_fields, inplace=True)
1027
+
1028
+ if reset_index:
1029
+ self._df.reset_index(inplace=True)
1030
+
1031
+ return self._df
1032
+
1033
+ # end method definition
1034
+
1035
+ def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1036
+ """Drop selected columns from the Data Frame
1037
+
1038
+ Args:
1039
+ column_names (list): list of column names to drop.
1040
+ inplace (bool, optional): If the dropping should be inplace, i.e. modifying self._df.
1041
+ Defaults to True.
1042
+ Returns:
1043
+ pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1044
+ """
1045
+
1046
+ if not all(column_name in self._df.columns for column_name in column_names):
1047
+ # Reduce the column names to those that really exist in the DataFrame:
1048
+ column_names = [
1049
+ column_name
1050
+ for column_name in column_names
1051
+ if column_name in self._df.columns
1052
+ ]
1053
+ logger.warning(
1054
+ "Reduce to these columns -> %s that do exist in the Data Frame.",
1055
+ str(column_names),
1056
+ )
1057
+
1058
+ if inplace:
1059
+ self._df.drop(column_names, axis=1, inplace=True)
1060
+ return self._df
1061
+ else:
1062
+ df = self._df.drop(column_names, axis=1, inplace=False)
1063
+ return df
1064
+
1065
+ # end method definition
1066
+
1067
+ def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
1068
+ """Keep only selected columns from the Data Frame. Drop the rest.
1069
+
1070
+ Args:
1071
+ column_names (list): list of column names to keep.
1072
+ inplace (bool, optional): If the keeping should be inplace, i.e. modifying self._df.
1073
+ Defaults to True.
1074
+ Returns:
1075
+ pd.DataFrame: New DataFrame (if inplace = False) or self._df (if inplace = True)
1076
+ """
1077
+
1078
+ if not all(column_name in self._df.columns for column_name in column_names):
1079
+ # Reduce the column names to those that really exist in the DataFrame:
1080
+ column_names = [
1081
+ column_name
1082
+ for column_name in column_names
1083
+ if column_name in self._df.columns
1084
+ ]
1085
+ logger.warning(
1086
+ "Reduce to these columns -> %s that do exist in the Data Frame.",
1087
+ column_names,
1088
+ )
1089
+
1090
+ if inplace:
1091
+ # keep only those columns which are in column_names:
1092
+ if column_names != []:
1093
+ self._df = self._df[column_names]
1094
+ return self._df
1095
+ else:
1096
+ # keep only those columns which are in column_names:
1097
+ if column_names != []:
1098
+ df = self._df[column_names]
1099
+ return df
1100
+ return None
1101
+
1102
+ # end method definition
1103
+
1104
+ def cleanse(self, cleansings: dict):
1105
+ """Cleanse data with regular expressions and upper/lower case conversion.
1106
+
1107
+ Args:
1108
+ cleansings (dict): Dictionary with keys that equal the column names.
1109
+ The dictionary values are dictionaries itself with
1110
+ these fields:
1111
+ * replacements (dict): Dictionary of regex patterns (keys) and replacement strings (values)
1112
+ * upper (bool): change the value to uppercase
1113
+ * lower (bool): change the value to lowercase
+ * length (int): maximum length to truncate the value to
1114
+ Example:
1115
+ cleansings = {
1116
+ "airportName": {
1117
+ "upper": true
1118
+ "replacements" : {
1119
+ "-": " ", # replace hypen with space
1120
+ ",\s*": " ", # remove commas followed by on or more spaces with a single space
1121
+ "\s+$": "", # remove trailing spaces at the end of the name
1122
+ "^\s+": "", # remove spaces at the beginning of the name
1123
+ }
1124
+ "length": 10
1125
+ }
1126
+ "airportId": {
1127
+ "upper": true
1128
+ "replacements" : {
1129
+ "K(.{3})": "\1", # if the airport has 4 charters and starts with a 'K' we remove the 'K'
1130
+ "\/": "", # remove forward slashes - this helps to have consistency with N/A, NA, n/a, na
1131
+ }
1132
+ }
1133
+ }
1134
+ """
1135
+
1136
+ # Iterate over each column in regex_dict
1137
+ for column, cleansing in cleansings.items():
1138
+ # "colum" is the name of the field we want to cleanse.
1139
+ # "cleansing" is a dict with
1140
+ if "." in column:
1141
+ # Handle columns with subfields
1142
+ main_field, sub_field = column.split(".")
1143
+ if not main_field in self._df.columns:
1144
+ continue
1145
+ # we use the additional parameters for lambda (beside x)
1146
+ # to avoid linter warning W0640
1147
+ self._df[main_field] = self._df[main_field].apply(
1148
+ lambda x, sub_field=sub_field, cleansing=cleansing: self._cleanse_subfield(
1149
+ data=x,
1150
+ sub_field=sub_field,
1151
+ replacements=cleansing.get("replacements", {}),
1152
+ upper=cleansing.get("upper", False),
1153
+ lower=cleansing.get("lower", False),
1154
+ length=cleansing.get("length", 0),
1155
+ )
1156
+ )
1157
+ else:
1158
+ if not column in self._df.columns:
1159
+ continue
1160
+
1161
+ logger.debug("\nBEFORE:\n%s\n", self._df[column])
1162
+
1163
+ if cleansing.get("upper", False) and self._df[column].dtype == "object":
1164
+ self._df[column] = self._df[column].str.upper()
1165
+ if cleansing.get("lower", False) and self._df[column].dtype == "object":
1166
+ self._df[column] = self._df[column].str.lower()
1167
+
1168
+ # Handle regular columns
1169
+ for regex_pattern, replacement in cleansing.get(
1170
+ "replacements", {}
1171
+ ).items():
1172
+ # if replacement:
1173
+ # \b is a word boundary anchor in regular expressions.
1174
+ # It matches a position where one side is a word character
1175
+ # (like a letter or digit) and the other side is a non-word character
1176
+ # (like whitespace or punctuation). It's often used to match whole words.
1177
+ # regex_pattern = rf"\b{regex_pattern}\b"
1178
+ # self._df[column] = self._df[column].replace(
1179
+ # regex=regex_pattern, value=replacement
1180
+ # )
1181
+ self._df[column] = self._df[column].str.replace(
1182
+ pat=regex_pattern, repl=replacement, regex=True
1183
+ )
1184
+
1185
+ if (
1186
+ cleansing.get("length", 0) > 0
1187
+ and self._df[column].dtype == "object"
1188
+ ):
1189
+ self._df[column] = self._df[column].str.slice(
1190
+ 0, cleansing["length"]
1191
+ )
1192
+
1193
+ logger.debug("\nAFTER:\n%s\n", self._df[column])
1194
+
1195
+ # end method definition
1196
+
1197
+ def _cleanse_subfield(
1198
+ self,
1199
+ data: list | dict,
1200
+ sub_field: str,
1201
+ replacements: dict,
1202
+ upper: bool,
1203
+ lower: bool,
1204
+ length: int = 0,
1205
+ ) -> list | dict:
1206
+ """Helper function to cleanse subfield data
1207
+
1208
+ Args:
1209
+ data (list | dict): sub data - either a list of dictionaries or a dictionary
1210
+ sub_field (str): defines which field in the sub data should be updated
1211
+ replacements (dict): Dictionary of regex patterns and their replacement strings
1212
+ upper (bool): if True transform value in subfield to upper-case
1213
+ lower (bool): if True, transform value in subfield to lower-case
1214
+ length (int, optional): maximum length of the strings
1215
+ Returns:
1216
+ list | dict: Updated data
1217
+ """
1218
+
1219
+ if isinstance(data, list):
1220
+ # If data is a list, apply cleansing to each dictionary in the list
1221
+ for i, item in enumerate(data):
1222
+ if (
1223
+ item is not None
1224
+ and sub_field in item
1225
+ and not pd.isnull(item[sub_field])
1226
+ ):
1227
+ if upper:
1228
+ item[sub_field] = item[sub_field].upper()
1229
+ elif lower:
1230
+ item[sub_field] = item[sub_field].lower()
1231
+ for regex_pattern, replacement in replacements.items():
1232
+ if replacement:
1233
+ regex_pattern = rf"\b{regex_pattern}\b"
1234
+ item[sub_field] = re.sub(
1235
+ regex_pattern, replacement, item[sub_field]
1236
+ )
1237
+ if length > 0:
1238
+ item[sub_field] = item[sub_field][:length]
1239
+ data[i] = item
1240
+ elif isinstance(data, dict):
1241
+ # If data is a dictionary, apply cleansing directly to the subfield
1242
+ if sub_field in data and not pd.isnull(data[sub_field]):
1243
+ if upper:
1244
+ data[sub_field] = data[sub_field].upper()
1245
+ elif lower:
1246
+ data[sub_field] = data[sub_field].lower()
1247
+ for regex_pattern, replacement in replacements.items():
1248
+ if replacement:
1249
+ regex_pattern = rf"\b{regex_pattern}\b"
1250
+ data[sub_field] = re.sub(
1251
+ regex_pattern, replacement, data[sub_field]
1252
+ )
1253
+ if length > 0:
1254
+ data[sub_field] = data[sub_field][:length]
1255
+ return data
1256
+
1257
+ # end method definition
1258
+
1259
+ def filter(self, conditions: list, inplace: bool = True) -> pd.DataFrame:
1260
+ """Filter the DataFrame based on (multiple) conditions.
1261
+
1262
+ Args:
1263
+ conditions (list): Conditions are a list of dictionaries with 3 items:
1264
+ * field (str): name of a column in the data frame
1265
+ * value (str or list): expected value (filter criterion).
1266
+ If it is a list then one of
1267
+ the list elements must match the field value (OR)
1268
+ * regex (bool): this flag controls if the value is interpreted as a
1269
+ regular expression. If there is no regex item in the
1270
+ dictionary then the default is False (= value is NOT interpreted as a regex).
1271
+ If there are multiple conditions in the list each has to evaluate to True (AND)
1272
+ inplace (bool, optional): Defines if the self._df is modified (inplace) or just
1273
+ a new DataFrame is returned. Defaults to True.
1274
+ Returns:
1275
+ pd.DataFrame: new data frame or pointer to self._df (depending on the value of 'inplace')
1276
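+
+ Example (illustrative sketch - keep only rows whose 'type' is "A" or "C"):
+ >>> d = Data([{"type": "A", "name": "n1"}, {"type": "B", "name": "n2"}])
+ >>> len(d.filter(conditions=[{"field": "type", "value": ["A", "C"]}]))
+ 1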
+ """
1277
+
1278
+ if self._df is None:
1279
+ logger.error("DataFrame is not initialized.")
1280
+ return None
1281
+
1282
+ if self._df.empty:
1283
+ logger.error("DataFrame is empty.")
1284
+ return None
1285
+
1286
+ # first filtered_df is the full DataFrame.
1287
+ # then it is subsequently reduced by each condition
1288
+ # at the end it is just those rows that match all conditions.
1289
+ filtered_df = self._df
1290
+
1291
+ # We traverse a list of conditions. Each condition must evaluate to true
1292
+ # otherwise the current workspace or document (i.e. the data set for these objects)
1293
+ # will be skipped. The variable filtered_df is narrowed down step by step with each condition.
1294
+ for condition in conditions:
1295
+ field = condition.get("field", None)
1296
+ if not field:
1297
+ logger.error("Missing value for filter condition field in payload!")
1298
+ continue
1299
+ if field not in self._df.columns:
1300
+ logger.warning(
1301
+ "Filter condition field -> %s does not exist as column in data frame! Data frame has these columns -> %s",
1302
+ field,
1303
+ str(self._df.columns),
1304
+ )
1305
+ continue # Skip filtering for columns not present in DataFrame
1306
+ value = condition.get("value", None)
1307
+ if not value:
1308
+ logger.error(
1309
+ "Missing filter value of for filter condition field -> '%s'!", field
1310
+ )
1311
+ continue
1312
+ regex = condition.get("regex", False)
1313
+
1314
+ logger.info(
1315
+ "Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
1316
+ filtered_df.shape[0],
1317
+ filtered_df.shape[1],
1318
+ str(condition),
1319
+ )
1320
+
1321
+ filtered_dfs = []
1322
+
1323
+ # if a single string is passed as value we put
1324
+ # it into an 1-item list to simplify the following code:
1325
+ if not isinstance(value, list):
1326
+ value = [value]
1327
+
1328
+ # multiple values are treated like a logical "or" condition
1329
+ for value_item in value:
1330
+ if regex:
1331
+ filtered_dfs.append(
1332
+ filtered_df[
1333
+ ~filtered_df[field].isna()
1334
+ & filtered_df[field].str.contains(value_item, regex=True)
1335
+ ]
1336
+ )
1337
+ else:
1338
+ result_df = filtered_df[
1339
+ ~filtered_df[field].isna() & filtered_df[field].eq(value_item)
1340
+ ]
1341
+ if not result_df.empty:
1342
+ filtered_dfs.append(result_df)
1343
+ # end for values
1344
+
1345
+ if not filtered_dfs:
1346
+ logger.warning(
1347
+ "Filter with field -> '%s' and value -> '%s' delivered an empty Data Frame",
1348
+ field,
1349
+ str(value),
1350
+ )
1351
+ filtered_df.drop(filtered_df.index, inplace=True)
1352
+ else:
1353
+ # Concatenate the filtered DataFrames for each value in the list
1354
+ filtered_df = pd.concat(filtered_dfs, ignore_index=True)
1355
+
1356
+ logger.info(
1357
+ "Data Frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
1358
+ filtered_df.shape[0],
1359
+ filtered_df.shape[1],
1360
+ str(condition),
1361
+ )
1362
+ # end for condition
1363
+
1364
+ if inplace:
1365
+ self._df = filtered_df
1366
+
1367
+ return filtered_df
1368
+
1369
+ # end method definition
1370
+
1371
+ def fill_na_in_column(self, column_name: str, default_value: str | int):
1372
+ """Replace NA values in a column with a defined new default value
1373
+
1374
+ Args:
1375
+ column_name (str): name of the column in the DataFrame
1376
+ default_value (str | int): value to replace NA with
1377
+ """
1378
+
1379
+ if column_name in self._df.columns:
1380
+ self._df[column_name] = self._df[column_name].fillna(value=default_value)
1381
+ else:
1382
+ logger.error(
1383
+ "Cannot replace NA values as column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
1384
+ column_name,
1385
+ str(self._df.columns),
1386
+ )
1387
+
1388
+ # end method definition
1389
+
1390
+ def fill_forward(self, inplace: bool) -> pd.DataFrame:
1391
+ """Fill the missing cells appropriately by carrying forward
1392
+ the values from the previous rows where necessary.
1393
+ This has applications if a hierarchy is represented by
1394
+ nested cells e.g. in an Excel sheet.
1395
+
1396
+ Args:
1397
+ inplace (bool): Should the modification happen inplace or not.
1398
+
1399
+ Returns:
1400
+ pd.DataFrame: Resulting dataframe
1401
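+
+ Example (illustrative sketch of the expected behaviour):
+ >>> d = Data([{"level 1": "A", "filename": "f1"}, {"level 1": None, "filename": "f2"}])
+ >>> d.fill_forward(inplace=False)["level 1"].tolist()
+ ['A', 'A']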
+ """
1402
+
1403
+ # To convert an Excel representation of a folder structure with nested
1404
+ # columns into a format appropriate for Pandas,
1405
+ # where all cells should be filled
1406
+ # note: with inplace=True, ffill() returns None, so we return self._df in that case
+ if inplace:
+ self._df.ffill(inplace=True)
+ return self._df
+
+ return self._df.ffill(inplace=False)
1409
+
1410
+ # end method definition
1411
+
1412
+ def lookup_value(
1413
+ self, lookup_column: str, lookup_value: str, separator: str = "|"
1414
+ ) -> pd.Series | None:
1415
+ """Lookup a row that includes a lookup value in the value of a given column.
1416
+
1417
+ Args:
1418
+ lookup_column (str): name of the column to search in
1419
+ lookup_value (str): value to search for
1420
+ separator (str): string list delimiter / separator
1421
+
1422
+ Returns:
1423
+ pd.Series | None: data frame row that matches or None if no match was found.
1424
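+
+ Example (illustrative sketch - 'synonyms' is a hypothetical column holding "|"-separated values):
+ >>> d = Data([{"name": "n1", "synonyms": "alpha | beta"}, {"name": "n2", "synonyms": "gamma"}])
+ >>> d.lookup_value(lookup_column="synonyms", lookup_value="beta")["name"]
+ 'n1'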
+ """
1425
+
1426
+ # Use the `apply` function to filter rows where the lookup value matches a whole item in the comma-separated list
1427
+ def match_lookup_value(string_list: str) -> bool:
1428
+ """Spilt delimiter-separated list into a python list
1429
+
1430
+ Args:
1431
+ string_list (str): delimiter-separated string list like "a, b, c" or "a | b | c"
1432
+
1433
+ Returns:
1434
+ bool: True if lookup_value is equal to one of the delimiter-separated terms
1435
+ """
1436
+ return lookup_value in [
1437
+ item.strip() for item in string_list.split(separator)
1438
+ ]
1439
+
1440
+ df = self._df
1441
+
1442
+ if self._df is None:
1443
+ return None
1444
+
1445
+ if lookup_column not in self._df.columns:
1446
+ logger.error(
1447
+ "Column -> '%s' does not exist in the Data Frame! Data Frame has these columns -> %s",
1448
+ lookup_column,
1449
+ str(self._df.columns),
1450
+ )
1451
+ return None
1452
+
1453
+ # Fill NaN or None values in the lookup column with empty strings
1454
+ df[lookup_column] = df[lookup_column].fillna("")
1455
+
1456
+ # Use the `apply` function to filter rows where the lookup value is in the Synonyms list
1457
+ matched_row = df[df[lookup_column].apply(match_lookup_value)]
1458
+
1459
+ # Return the first matched row, if any
1460
+ if not matched_row.empty:
1461
+ return matched_row.iloc[0]
1462
+
1463
+ return None
1464
+
1465
+ # end method definition
1466
+
1467
+ def add_column(
1468
+ self,
1469
+ source_column: str,
1470
+ reg_exp: str,
1471
+ new_column: str,
1472
+ prefix="",
1473
+ suffix="",
1474
+ length: int | None = None,
1475
+ group_chars: int | None = None,
1476
+ group_separator: str = ".",
1477
+ group_remove_leading_zero: bool = True,
1478
+ ) -> bool:
1479
+ """Add additional column to the data frame.
1480
+
1481
+ Args:
1482
+ source_column (str): name of the source column
1483
+ reg_exp (str): regular expression to apply on the content of the source column
1484
+ new_column (str): name of the column to add
1485
+ prefix (str, optional): Prefix to add in front of the value. Defaults to "".
1486
+ suffix (str, optional): Suffix to add at the end of the value. Defaults to "".
1487
+ length (int | None, optional): Length to reduce to. Defaults to None.
1488
+ group_chars (int | None, optional): Group the resulting string into chunks of group_chars characters. Defaults to None.
1489
+ group_separator (str, optional): Separator string for the grouping. Defaults to ".".
1490
+ group_remove_leading_zero (bool, optional): Remove leading zeros from the groups. Defaults to True.
1491
+
1492
+ Returns:
1493
+ bool: True = Success, False = Failure
1494
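+
+ Example (illustrative sketch; column names and the pattern are hypothetical):
+ >>> d = Data([{"material": "MAT-0012345"}])
+ >>> d.add_column(source_column="material", reg_exp="MAT-([0-9]+)", new_column="material_number")
+ True
+ >>> d.get_data_frame()["material_number"].tolist()
+ ['0012345']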
+ """
1495
+
1496
+ if self._df is None:
1497
+ return False
1498
+
1499
+ # Use str.extract to apply the regular expression to the source column
1500
+ extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)
1501
+
1502
+ # Limit the result to the specified length
1503
+ if length is not None:
1504
+ extracted = extracted.str[:length]
1505
+
1506
+ if group_chars is not None:
1507
+
1508
+ def process_grouping(x):
1509
+ if pd.isna(x):
1510
+ return x
1511
+ # Split into groups
1512
+ groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
1513
+ if group_remove_leading_zero:
1514
+ # Remove leading zeros from each group
1515
+ groups = [group.lstrip("0") or "0" for group in groups]
1516
+ # Join groups with separator
1517
+ return group_separator.join(groups)
1518
+
1519
+ extracted = extracted.apply(process_grouping)
1520
+
1521
+ # Add prefix and suffix
1522
+ if prefix or suffix:
1523
+ extracted = prefix + extracted.astype(str) + suffix
1524
+
1525
+ self._df[new_column] = extracted
1526
+
1527
+ return True