pyxecm 1.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyxecm might be problematic. Click here for more details.
- pyxecm/__init__.py +6 -4
- pyxecm/avts.py +673 -246
- pyxecm/coreshare.py +686 -467
- pyxecm/customizer/__init__.py +16 -4
- pyxecm/customizer/__main__.py +58 -0
- pyxecm/customizer/api/__init__.py +5 -0
- pyxecm/customizer/api/__main__.py +6 -0
- pyxecm/customizer/api/app.py +914 -0
- pyxecm/customizer/api/auth.py +154 -0
- pyxecm/customizer/api/metrics.py +92 -0
- pyxecm/customizer/api/models.py +13 -0
- pyxecm/customizer/api/payload_list.py +865 -0
- pyxecm/customizer/api/settings.py +103 -0
- pyxecm/customizer/browser_automation.py +332 -139
- pyxecm/customizer/customizer.py +1007 -1130
- pyxecm/customizer/exceptions.py +35 -0
- pyxecm/customizer/guidewire.py +322 -0
- pyxecm/customizer/k8s.py +713 -378
- pyxecm/customizer/log.py +107 -0
- pyxecm/customizer/m365.py +2867 -909
- pyxecm/customizer/nhc.py +1169 -0
- pyxecm/customizer/openapi.py +258 -0
- pyxecm/customizer/payload.py +16817 -7467
- pyxecm/customizer/pht.py +699 -285
- pyxecm/customizer/salesforce.py +516 -342
- pyxecm/customizer/sap.py +58 -41
- pyxecm/customizer/servicenow.py +593 -371
- pyxecm/customizer/settings.py +442 -0
- pyxecm/customizer/successfactors.py +408 -346
- pyxecm/customizer/translate.py +83 -48
- pyxecm/helper/__init__.py +5 -2
- pyxecm/helper/assoc.py +83 -43
- pyxecm/helper/data.py +2406 -870
- pyxecm/helper/logadapter.py +27 -0
- pyxecm/helper/web.py +229 -101
- pyxecm/helper/xml.py +527 -171
- pyxecm/maintenance_page/__init__.py +5 -0
- pyxecm/maintenance_page/__main__.py +6 -0
- pyxecm/maintenance_page/app.py +51 -0
- pyxecm/maintenance_page/settings.py +28 -0
- pyxecm/maintenance_page/static/favicon.avif +0 -0
- pyxecm/maintenance_page/templates/maintenance.html +165 -0
- pyxecm/otac.py +234 -140
- pyxecm/otawp.py +1436 -557
- pyxecm/otcs.py +7716 -3161
- pyxecm/otds.py +2150 -919
- pyxecm/otiv.py +36 -21
- pyxecm/otmm.py +1272 -325
- pyxecm/otpd.py +231 -127
- pyxecm-2.0.0.dist-info/METADATA +145 -0
- pyxecm-2.0.0.dist-info/RECORD +54 -0
- {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info}/WHEEL +1 -1
- pyxecm-1.6.dist-info/METADATA +0 -53
- pyxecm-1.6.dist-info/RECORD +0 -32
- {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {pyxecm-1.6.dist-info → pyxecm-2.0.0.dist-info}/top_level.txt +0 -0
pyxecm/helper/data.py
CHANGED
|
@@ -1,74 +1,61 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
This code implements a class called
|
|
6
|
-
to Pandas
|
|
7
|
-
|
|
8
|
-
Class: Payload
|
|
9
|
-
Methods:
|
|
10
|
-
|
|
11
|
-
__init__ : class initializer
|
|
12
|
-
__len__: Lenght of the embedded DataFrame object.
|
|
13
|
-
__str__: Print the DataFrame of the class
|
|
14
|
-
get_data_frame: Get the Pandas DataFrame object
|
|
15
|
-
set_data_frame: Set the Pandas DataFrame object
|
|
16
|
-
append: Append additional data to the data frame.
|
|
17
|
-
|
|
18
|
-
load_json_data: Load JSON data into DataFrame
|
|
19
|
-
save_json_data: Save JSON data from DataFrame to file
|
|
20
|
-
load_excel_data: Load Excel file into DataFrame
|
|
21
|
-
load_csv_data: Load CSV data into DataFrame
|
|
22
|
-
load_directory: Load directory structure into Pandas Data Frame
|
|
23
|
-
|
|
24
|
-
partitionate: Partition a data frame into equally sized partions
|
|
25
|
-
deduplicate: Remove dupclicate rows that have all fields in unique_fields in common
|
|
26
|
-
sort: Sort the data frame based on one or multiple fields.
|
|
27
|
-
flatten: Flatten a sub-dictionary by copying selected fields to the
|
|
28
|
-
parent dictionary.
|
|
29
|
-
explode_and_flatten: Explode a substructure in the Data Frame
|
|
30
|
-
drop_columns: Drop selected columns from the Data Frame
|
|
31
|
-
keep_columns: Keep only selected columns from the Data Frame. Drop the rest.
|
|
32
|
-
cleanse: Cleanse data with regular expressions and upper/lower case conversion.
|
|
33
|
-
filter: Filter the DataFrame based on conditions
|
|
34
|
-
|
|
35
|
-
fill_forward: Fill the missing cells appropriately by carrying forward
|
|
36
|
-
the values from the previous rows where necessary.
|
|
37
|
-
fill_na_in_column: Replace NA values in a column with a defined new default value
|
|
1
|
+
"""Data Module leveraging Pandas to manipulte data sets read for bulk generation of Content Server items.
|
|
2
|
+
|
|
3
|
+
See: https://pandas.pydata.org
|
|
4
|
+
|
|
5
|
+
This code implements a class called "Data" which is a wrapper
|
|
6
|
+
to Pandas data frame.
|
|
38
7
|
"""
|
|
39
8
|
|
|
40
9
|
__author__ = "Dr. Marc Diefenbruch"
|
|
41
|
-
__copyright__ = "Copyright 2024, OpenText"
|
|
10
|
+
__copyright__ = "Copyright (C) 2024-2025, OpenText"
|
|
42
11
|
__credits__ = ["Kai-Philip Gatzweiler"]
|
|
43
12
|
__maintainer__ = "Dr. Marc Diefenbruch"
|
|
44
13
|
__email__ = "mdiefenb@opentext.com"
|
|
45
14
|
|
|
46
|
-
import logging
|
|
47
15
|
import json
|
|
16
|
+
import logging
|
|
48
17
|
import os
|
|
49
18
|
import re
|
|
50
19
|
import threading
|
|
20
|
+
from io import StringIO
|
|
51
21
|
|
|
52
22
|
import pandas as pd
|
|
23
|
+
import requests
|
|
53
24
|
|
|
54
|
-
|
|
25
|
+
default_logger = logging.getLogger("pyxecm.helper.data")
|
|
55
26
|
|
|
56
27
|
|
|
57
28
|
class Data:
|
|
58
29
|
"""Used to automate data loading for the customizer."""
|
|
59
30
|
|
|
31
|
+
logger: logging.Logger = default_logger
|
|
32
|
+
|
|
60
33
|
_df: pd.DataFrame
|
|
61
|
-
_lock = threading.Lock()
|
|
34
|
+
_lock: threading.Lock = threading.Lock()
|
|
62
35
|
|
|
63
|
-
def __init__(
|
|
36
|
+
def __init__(
|
|
37
|
+
self,
|
|
38
|
+
init_data: pd.DataFrame | list = None,
|
|
39
|
+
logger: logging.Logger = default_logger,
|
|
40
|
+
) -> None:
|
|
64
41
|
"""Initialize the Data object.
|
|
65
42
|
|
|
66
43
|
Args:
|
|
67
|
-
init_data (pd.DataFrame | list, optional):
|
|
68
|
-
|
|
69
|
-
|
|
44
|
+
init_data (pd.DataFrame | list, optional):
|
|
45
|
+
Data to initialize the data frame. Can either be
|
|
46
|
+
another data frame (that gets copied) or a list of dictionaries.
|
|
47
|
+
Defaults to None.
|
|
48
|
+
logger (logging.Logger, optional):
|
|
49
|
+
Pass a special logging object. This is optional. If not provided,
|
|
50
|
+
the default logger is used.
|
|
51
|
+
|
|
70
52
|
"""
|
|
71
53
|
|
|
54
|
+
if logger != default_logger:
|
|
55
|
+
self.logger = logger.getChild("data")
|
|
56
|
+
for logfilter in logger.filters:
|
|
57
|
+
self.logger.addFilter(logfilter)
|
|
58
|
+
|
|
72
59
|
if init_data is not None:
|
|
73
60
|
# if a data frame is passed to the constructor we
|
|
74
61
|
# copy its content to the new Data object
|
|
@@ -84,7 +71,7 @@ class Data:
|
|
|
84
71
|
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
85
72
|
self._df: pd.DataFrame = pd.DataFrame([init_data])
|
|
86
73
|
else:
|
|
87
|
-
logger.error("Illegal initialization data for 'Data' class!")
|
|
74
|
+
self.logger.error("Illegal initialization data for 'Data' class!")
|
|
88
75
|
self._df = None
|
|
89
76
|
else:
|
|
90
77
|
self._df = None
|
|
@@ -92,11 +79,14 @@ class Data:
|
|
|
92
79
|
# end method definition
|
|
93
80
|
|
|
94
81
|
def __len__(self) -> int:
|
|
95
|
-
"""
|
|
96
|
-
|
|
82
|
+
"""Return lenght of the embedded Pandas data frame object.
|
|
83
|
+
|
|
84
|
+
This is basically a convenience method.
|
|
97
85
|
|
|
98
86
|
Returns:
|
|
99
|
-
int:
|
|
87
|
+
int:
|
|
88
|
+
Lenght of the data frame.
|
|
89
|
+
|
|
100
90
|
"""
|
|
101
91
|
|
|
102
92
|
if self._df is not None:
|
|
@@ -106,10 +96,12 @@ class Data:
|
|
|
106
96
|
# end method definition
|
|
107
97
|
|
|
108
98
|
def __str__(self) -> str:
|
|
109
|
-
"""Print the
|
|
99
|
+
"""Print the Pandas data frame object.
|
|
110
100
|
|
|
111
101
|
Returns:
|
|
112
|
-
str:
|
|
102
|
+
str:
|
|
103
|
+
String representation.
|
|
104
|
+
|
|
113
105
|
"""
|
|
114
106
|
|
|
115
107
|
# if data frame is initialized we return
|
|
@@ -122,51 +114,72 @@ class Data:
|
|
|
122
114
|
# end method definition
|
|
123
115
|
|
|
124
116
|
def __getitem__(self, column: str) -> pd.Series:
|
|
125
|
-
"""Return the column corresponding to the key from the
|
|
117
|
+
"""Return the column corresponding to the key from the data frame.
|
|
126
118
|
|
|
127
119
|
Args:
|
|
128
|
-
column (str): name of the
|
|
120
|
+
column (str): The name of the data frame column.
|
|
129
121
|
|
|
130
122
|
Returns:
|
|
131
|
-
pd.Series: column of the
|
|
123
|
+
pd.Series: The column of the data frame with the given name.
|
|
124
|
+
|
|
132
125
|
"""
|
|
133
126
|
|
|
134
127
|
return self._df[column]
|
|
135
128
|
|
|
136
129
|
# end method definition
|
|
137
130
|
|
|
138
|
-
def lock(self):
|
|
131
|
+
def lock(self) -> threading.Lock:
|
|
139
132
|
"""Return the threading lock object.
|
|
140
133
|
|
|
141
134
|
Returns:
|
|
142
|
-
|
|
135
|
+
threading.Lock: The threading lock object.
|
|
136
|
+
|
|
143
137
|
"""
|
|
138
|
+
|
|
144
139
|
return self._lock
|
|
145
140
|
|
|
146
141
|
# end method definition
|
|
147
142
|
|
|
148
143
|
def get_data_frame(self) -> pd.DataFrame:
|
|
149
|
-
"""Get the Pandas
|
|
144
|
+
"""Get the Pandas data frame object.
|
|
150
145
|
|
|
151
146
|
Returns:
|
|
152
|
-
pd.DataFrame: Pandas
|
|
147
|
+
pd.DataFrame: The Pandas data frame object.
|
|
148
|
+
|
|
153
149
|
"""
|
|
154
150
|
|
|
155
151
|
return self._df
|
|
156
152
|
|
|
157
153
|
# end method definition
|
|
158
154
|
|
|
159
|
-
def set_data_frame(self, df: pd.DataFrame):
|
|
160
|
-
"""Set the Pandas
|
|
155
|
+
def set_data_frame(self, df: pd.DataFrame) -> None:
|
|
156
|
+
"""Set the Pandas data frame object.
|
|
161
157
|
|
|
162
158
|
Args:
|
|
163
|
-
df (pd.DataFrame): Pandas
|
|
159
|
+
df (pd.DataFrame): The new Pandas data frame object.
|
|
160
|
+
|
|
164
161
|
"""
|
|
165
162
|
|
|
166
163
|
self._df = df
|
|
167
164
|
|
|
168
165
|
# end method definition
|
|
169
166
|
|
|
167
|
+
def get_columns(self) -> list | None:
|
|
168
|
+
"""Get the list of column names of the data frame.
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
list | None:
|
|
172
|
+
The list of column names in the data frame.
|
|
173
|
+
|
|
174
|
+
"""
|
|
175
|
+
|
|
176
|
+
if self._df is None:
|
|
177
|
+
return None
|
|
178
|
+
|
|
179
|
+
return self._df.columns
|
|
180
|
+
|
|
181
|
+
# end method definition
|
|
182
|
+
|
|
170
183
|
def print_info(
|
|
171
184
|
self,
|
|
172
185
|
show_size: bool = True,
|
|
@@ -177,26 +190,40 @@ class Data:
|
|
|
177
190
|
show_sample: bool = False,
|
|
178
191
|
show_statistics: bool = False,
|
|
179
192
|
row_num: int = 10,
|
|
180
|
-
):
|
|
181
|
-
"""Log information about the data frame
|
|
193
|
+
) -> None:
|
|
194
|
+
"""Log information about the data frame.
|
|
182
195
|
|
|
183
196
|
Args:
|
|
184
|
-
show_size (bool, optional):
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
197
|
+
show_size (bool, optional):
|
|
198
|
+
Show size of data frame. Defaults to True.
|
|
199
|
+
show_info (bool, optional):
|
|
200
|
+
Show information for data frame. Defaults to False.
|
|
201
|
+
show_columns (bool, optional):
|
|
202
|
+
Show columns of data frame. Defaults to False.
|
|
203
|
+
show_first (bool, optional):
|
|
204
|
+
Show first N items. Defaults to False. N is defined
|
|
205
|
+
by the row_num parameter.
|
|
206
|
+
show_last (bool, optional):
|
|
207
|
+
Show last N items. Defaults to False. N is defined
|
|
208
|
+
by the row_num parameter.
|
|
209
|
+
show_sample (bool, optional):
|
|
210
|
+
Show N sample items. Defaults to False. N is defined
|
|
211
|
+
by the row_num parameter.
|
|
212
|
+
show_statistics (bool, optional):
|
|
213
|
+
Show data frame statistics. Defaults to False.
|
|
214
|
+
row_num (int, optional):
|
|
215
|
+
Used as the number of rows printed using show_first,
|
|
216
|
+
show_last, show_sample. Default is 10.
|
|
217
|
+
|
|
191
218
|
"""
|
|
192
219
|
|
|
193
220
|
if self._df is None:
|
|
194
|
-
logger.warning("Data
|
|
221
|
+
self.logger.warning("Data frame is not initialized!")
|
|
195
222
|
return
|
|
196
223
|
|
|
197
224
|
if show_size:
|
|
198
|
-
logger.info(
|
|
199
|
-
"Data
|
|
225
|
+
self.logger.info(
|
|
226
|
+
"Data frame has %s row(s) and %s column(s)",
|
|
200
227
|
self._df.shape[0],
|
|
201
228
|
self._df.shape[1],
|
|
202
229
|
)
|
|
@@ -206,39 +233,42 @@ class Data:
|
|
|
206
233
|
self._df.info()
|
|
207
234
|
|
|
208
235
|
if show_columns:
|
|
209
|
-
logger.info("Columns:\n%s", self._df.columns)
|
|
210
|
-
logger.info(
|
|
211
|
-
"Columns with number of
|
|
212
|
-
|
|
213
|
-
logger.info(
|
|
214
|
-
"Columns with number of non-null values:\n%s", self._df.notnull().sum()
|
|
236
|
+
self.logger.info("Columns:\n%s", self._df.columns)
|
|
237
|
+
self.logger.info(
|
|
238
|
+
"Columns with number of NaN values:\n%s",
|
|
239
|
+
self._df.isna().sum(),
|
|
215
240
|
)
|
|
216
|
-
logger.info(
|
|
217
|
-
|
|
218
|
-
|
|
241
|
+
self.logger.info(
|
|
242
|
+
"Columns with number of non-NaN values:\n%s",
|
|
243
|
+
self._df.notna().sum(),
|
|
219
244
|
)
|
|
220
245
|
|
|
221
246
|
if show_first:
|
|
222
247
|
# the default for head is n = 5:
|
|
223
|
-
logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
|
|
248
|
+
self.logger.info("First %s rows:\n%s", str(row_num), self._df.head(row_num))
|
|
224
249
|
|
|
225
250
|
if show_last:
|
|
226
251
|
# the default for tail is n = 5:
|
|
227
|
-
logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
|
|
252
|
+
self.logger.info("Last %s rows:\n%s", str(row_num), self._df.tail(row_num))
|
|
228
253
|
|
|
229
254
|
if show_sample:
|
|
230
255
|
# the default for sample is n = 1:
|
|
231
|
-
logger.info(
|
|
256
|
+
self.logger.info(
|
|
257
|
+
"%s Sample rows:\n%s",
|
|
258
|
+
str(row_num),
|
|
259
|
+
self._df.sample(n=row_num),
|
|
260
|
+
)
|
|
232
261
|
|
|
233
262
|
if show_statistics:
|
|
234
|
-
logger.info(
|
|
235
|
-
"Description of statistics for data frame:\n%s",
|
|
263
|
+
self.logger.info(
|
|
264
|
+
"Description of statistics for data frame:\n%s",
|
|
265
|
+
self._df.describe(),
|
|
236
266
|
)
|
|
237
|
-
logger.info(
|
|
238
|
-
"Description of statistics for data frame (
|
|
267
|
+
self.logger.info(
|
|
268
|
+
"Description of statistics for data frame (transformed):\n%s",
|
|
239
269
|
self._df.describe().T,
|
|
240
270
|
)
|
|
241
|
-
logger.info(
|
|
271
|
+
self.logger.info(
|
|
242
272
|
"Description of statistics for data frame (objects):\n%s",
|
|
243
273
|
self._df.describe(include="object"),
|
|
244
274
|
)
|
|
@@ -249,10 +279,13 @@ class Data:
|
|
|
249
279
|
"""Append additional data to the data frame.
|
|
250
280
|
|
|
251
281
|
Args:
|
|
252
|
-
add_data (pd.DataFrame | list | dict):
|
|
282
|
+
add_data (pd.DataFrame | list | dict):
|
|
283
|
+
Additional data. Can be pd.DataFrame or list of dicts (or Data).
|
|
253
284
|
|
|
254
285
|
Returns:
|
|
255
|
-
bool:
|
|
286
|
+
bool:
|
|
287
|
+
True = Success, False = Error
|
|
288
|
+
|
|
256
289
|
"""
|
|
257
290
|
|
|
258
291
|
# Does the data frame has already content?
|
|
@@ -264,166 +297,395 @@ class Data:
|
|
|
264
297
|
return True
|
|
265
298
|
elif isinstance(add_data, Data):
|
|
266
299
|
df = add_data.get_data_frame()
|
|
267
|
-
if df:
|
|
300
|
+
if df is not None and not df.empty:
|
|
268
301
|
self._df = pd.concat([self._df, df], ignore_index=True)
|
|
269
302
|
return True
|
|
270
303
|
elif isinstance(add_data, list):
|
|
271
304
|
if add_data:
|
|
272
|
-
df = Data(add_data)
|
|
305
|
+
df = Data(add_data, logger=self.logger)
|
|
273
306
|
self._df = pd.concat(
|
|
274
|
-
[self._df, df.get_data_frame()],
|
|
307
|
+
[self._df, df.get_data_frame()],
|
|
308
|
+
ignore_index=True,
|
|
275
309
|
)
|
|
276
310
|
return True
|
|
277
311
|
elif isinstance(add_data, dict):
|
|
278
312
|
if add_data:
|
|
279
313
|
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
280
|
-
df = Data([add_data])
|
|
314
|
+
df = Data([add_data], logger=self.logger)
|
|
281
315
|
self._df = pd.concat(
|
|
282
|
-
[self._df, df.get_data_frame()],
|
|
316
|
+
[self._df, df.get_data_frame()],
|
|
317
|
+
ignore_index=True,
|
|
283
318
|
)
|
|
284
319
|
return True
|
|
285
320
|
else:
|
|
286
|
-
logger.error("Illegal data type -> '%s'", type(add_data))
|
|
287
|
-
return False
|
|
288
|
-
else: # self._df is None (initial state)
|
|
289
|
-
if isinstance(add_data, pd.DataFrame):
|
|
290
|
-
self._df = add_data
|
|
291
|
-
return True
|
|
292
|
-
elif isinstance(add_data, Data):
|
|
293
|
-
self._df = add_data.get_data_frame()
|
|
294
|
-
return True
|
|
295
|
-
elif isinstance(add_data, list):
|
|
296
|
-
self._df = pd.DataFrame(add_data)
|
|
297
|
-
return True
|
|
298
|
-
elif isinstance(add_data, dict):
|
|
299
|
-
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
300
|
-
self._df = pd.DataFrame([add_data])
|
|
301
|
-
return True
|
|
302
|
-
else:
|
|
303
|
-
logger.error("Illegal data type -> '%s'", type(add_data))
|
|
321
|
+
self.logger.error("Illegal data type -> '%s'", type(add_data))
|
|
304
322
|
return False
|
|
323
|
+
elif isinstance(add_data, pd.DataFrame):
|
|
324
|
+
self._df = add_data
|
|
325
|
+
return True
|
|
326
|
+
elif isinstance(add_data, Data):
|
|
327
|
+
self._df = add_data.get_data_frame()
|
|
328
|
+
return True
|
|
329
|
+
elif isinstance(add_data, list):
|
|
330
|
+
self._df = pd.DataFrame(add_data)
|
|
331
|
+
return True
|
|
332
|
+
elif isinstance(add_data, dict):
|
|
333
|
+
# it is important to wrap the dict in a list to avoid that more than 1 row is created
|
|
334
|
+
self._df = pd.DataFrame([add_data])
|
|
335
|
+
return True
|
|
336
|
+
else:
|
|
337
|
+
self.logger.error("Illegal data type -> '%s'", type(add_data))
|
|
338
|
+
return False
|
|
339
|
+
|
|
340
|
+
# end method definition
|
|
341
|
+
|
|
342
|
+
def merge(
|
|
343
|
+
self,
|
|
344
|
+
merge_data: pd.DataFrame,
|
|
345
|
+
on: str | list[str] | None = None,
|
|
346
|
+
how: str = "inner",
|
|
347
|
+
left_on: str | list[str] | None = None,
|
|
348
|
+
right_on: str | list[str] | None = None,
|
|
349
|
+
left_index: bool = False,
|
|
350
|
+
right_index: bool = False,
|
|
351
|
+
suffixes: tuple[str, str] = ("_x", "_y"),
|
|
352
|
+
indicator: bool = False,
|
|
353
|
+
validate: str | None = None,
|
|
354
|
+
) -> pd.DataFrame | None:
|
|
355
|
+
"""Merge the current DataFrame (_df) with another DataFrame.
|
|
356
|
+
|
|
357
|
+
Args:
|
|
358
|
+
merge_data (pd.DataFrame | Data):
|
|
359
|
+
The DataFrame to merge with.
|
|
360
|
+
on (str | list[str]):
|
|
361
|
+
Column(s) to merge on. Defaults to None.
|
|
362
|
+
how (str, optional):
|
|
363
|
+
Type of merge ('inner', 'outer', 'left', 'right', 'cross'). Defaults to 'inner'.
|
|
364
|
+
left_on (str | list[str] | None, optional):
|
|
365
|
+
Column(s) from self._df to merge on. Defaults to None.
|
|
366
|
+
right_on (str | list[str] | None, optional):
|
|
367
|
+
Column(s) from other DataFrame to merge on. Defaults to None.
|
|
368
|
+
left_index (str | list[str], optional):
|
|
369
|
+
Whether to merge on the index of self._df. Defaults to False.
|
|
370
|
+
right_index (bool, optional):
|
|
371
|
+
Whether to merge on the index of other. Defaults to False.
|
|
372
|
+
suffixes (tuple[str, str]):
|
|
373
|
+
Suffixes for overlapping column names. Defaults to ('_x', '_y').
|
|
374
|
+
indicator (bool, optional):
|
|
375
|
+
If True, adds a column showing the merge source. Defaults to False.
|
|
376
|
+
validate ():
|
|
377
|
+
If provided, checks merge integrity
|
|
378
|
+
('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many'). Defaults to None.
|
|
379
|
+
|
|
380
|
+
Returns:
|
|
381
|
+
The merged DataFrame or None in case of an error.
|
|
382
|
+
|
|
383
|
+
Exceptions:
|
|
384
|
+
ValueError: If `other` is not a DataFrame.
|
|
385
|
+
KeyError: If required columns for merging are missing.
|
|
386
|
+
ValueError: If `validate` check fails.
|
|
387
|
+
|
|
388
|
+
"""
|
|
389
|
+
|
|
390
|
+
if self._df is None or self._df.empty:
|
|
391
|
+
self._df = merge_data
|
|
392
|
+
|
|
393
|
+
if isinstance(merge_data, Data):
|
|
394
|
+
merge_data = merge_data.get_data_frame() # Extract DataFrame from Data instance
|
|
395
|
+
|
|
396
|
+
try:
|
|
397
|
+
return self._df.merge(
|
|
398
|
+
merge_data,
|
|
399
|
+
how=how,
|
|
400
|
+
on=on,
|
|
401
|
+
left_on=left_on,
|
|
402
|
+
right_on=right_on,
|
|
403
|
+
left_index=left_index,
|
|
404
|
+
right_index=right_index,
|
|
405
|
+
suffixes=suffixes,
|
|
406
|
+
indicator=indicator,
|
|
407
|
+
validate=validate,
|
|
408
|
+
)
|
|
409
|
+
except KeyError:
|
|
410
|
+
self.logger.error("Column(s) not found for merging!")
|
|
411
|
+
except ValueError:
|
|
412
|
+
self.logger.error("Invalid merge operation!")
|
|
413
|
+
|
|
414
|
+
return None
|
|
415
|
+
|
|
416
|
+
# end method definition
|
|
417
|
+
|
|
418
|
+
def strip(self, columns: list | None = None, inplace: bool = True) -> pd.DataFrame:
|
|
419
|
+
"""Strip leading and trailing spaces from specified columns in a data frame.
|
|
420
|
+
|
|
421
|
+
Args:
|
|
422
|
+
columns (list | None):
|
|
423
|
+
The list of column names to strip. If None, it strips
|
|
424
|
+
leading and trailing spaces from _all_ string columns.
|
|
425
|
+
inplace (bool, optional):
|
|
426
|
+
If True, the data modification is done in place, i.e.
|
|
427
|
+
modifying the existing data frame of the object.
|
|
428
|
+
If False, the data frame is copied and the copy is modified
|
|
429
|
+
and returned.
|
|
430
|
+
|
|
431
|
+
Returns:
|
|
432
|
+
pd.DataFrame:
|
|
433
|
+
The modified data frame with stripped columns.
|
|
434
|
+
|
|
435
|
+
"""
|
|
436
|
+
|
|
437
|
+
df = self._df.copy() if not inplace else self._df
|
|
438
|
+
|
|
439
|
+
if columns is None:
|
|
440
|
+
# Strip spaces from all string columns
|
|
441
|
+
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
|
|
442
|
+
else:
|
|
443
|
+
# Strip spaces from specified columns
|
|
444
|
+
for col in columns:
|
|
445
|
+
if col in df.columns and df[col].dtype == "object": # Check if the column exists and is of string type
|
|
446
|
+
df[col] = df[col].str.strip()
|
|
447
|
+
|
|
448
|
+
if inplace:
|
|
449
|
+
self._df = df
|
|
450
|
+
|
|
451
|
+
return df
|
|
305
452
|
|
|
306
453
|
# end method definition
|
|
307
454
|
|
|
308
|
-
def load_json_data(
|
|
309
|
-
|
|
455
|
+
def load_json_data(
|
|
456
|
+
self,
|
|
457
|
+
json_path: str,
|
|
458
|
+
convert_dates: bool = False,
|
|
459
|
+
index_column: str | None = None,
|
|
460
|
+
compression: str | None = None,
|
|
461
|
+
) -> bool:
|
|
462
|
+
"""Load JSON data into a Pandas data frame.
|
|
310
463
|
|
|
311
464
|
Args:
|
|
312
|
-
json_path (str):
|
|
313
|
-
|
|
465
|
+
json_path (str):
|
|
466
|
+
The path to the JSON file.
|
|
467
|
+
convert_dates (bool, optional):
|
|
468
|
+
Defines whether or not dates should be converted.
|
|
469
|
+
The default is False = dates are NOT converted.
|
|
470
|
+
index_column (str | None, optional):
|
|
471
|
+
The Name of the column (i.e. JSON data field) that should
|
|
472
|
+
become the index in the loaded data frame.
|
|
473
|
+
compression (str | None):
|
|
474
|
+
Remove a compression:
|
|
475
|
+
* gzip (.gz)
|
|
476
|
+
* bz2 (.bz2)
|
|
477
|
+
* zip (.zip)
|
|
478
|
+
* xz (.xz)
|
|
479
|
+
The value for compression should not include the dot.
|
|
480
|
+
Default is None = no compression.
|
|
481
|
+
|
|
314
482
|
Returns:
|
|
315
483
|
bool: False in case an error occured, True otherwise.
|
|
484
|
+
|
|
316
485
|
"""
|
|
317
486
|
|
|
318
|
-
if
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
487
|
+
if not json_path:
|
|
488
|
+
self.logger.error(
|
|
489
|
+
"You have not specified a JSON path!",
|
|
490
|
+
)
|
|
491
|
+
return False
|
|
492
|
+
|
|
493
|
+
# If compression is enabled the file path should have
|
|
494
|
+
# the matching file name extension:
|
|
495
|
+
if compression:
|
|
496
|
+
compression = compression.lstrip(".") # remove a dot prefix if present
|
|
497
|
+
suffix = "." + compression if compression != "gzip" else "gz"
|
|
498
|
+
if not json_path.endswith(suffix):
|
|
499
|
+
json_path += suffix
|
|
500
|
+
|
|
501
|
+
if not os.path.exists(json_path):
|
|
502
|
+
self.logger.error(
|
|
503
|
+
"Missing JSON file - you have not specified a valid path -> '%s'.",
|
|
504
|
+
json_path,
|
|
505
|
+
)
|
|
506
|
+
return False
|
|
507
|
+
|
|
508
|
+
# Load data from JSON file
|
|
509
|
+
try:
|
|
510
|
+
df = pd.read_json(
|
|
511
|
+
path_or_buf=json_path,
|
|
512
|
+
convert_dates=convert_dates,
|
|
513
|
+
compression=compression,
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
if index_column and index_column not in df.columns:
|
|
517
|
+
self.logger.error(
|
|
518
|
+
"Specified index column -> '%s' not found in the JSON data.",
|
|
519
|
+
index_column,
|
|
340
520
|
)
|
|
341
521
|
return False
|
|
342
|
-
except IOError as e:
|
|
343
|
-
logger.error("An I/O error occurred -> %s", str(e))
|
|
344
|
-
return False
|
|
345
|
-
except json.JSONDecodeError as e:
|
|
346
|
-
logger.error("Error: Unable to decode JSON -> %s", str(e))
|
|
347
|
-
return False
|
|
348
|
-
except ValueError as e:
|
|
349
|
-
logger.error("Invalid JSON input -> %s", str(e))
|
|
350
|
-
return False
|
|
351
|
-
except AttributeError as e:
|
|
352
|
-
logger.error("Unexpected JSON data structure -> %s", str(e))
|
|
353
|
-
return False
|
|
354
|
-
except TypeError as e:
|
|
355
|
-
logger.error("Unexpected JSON data type -> %s", str(e))
|
|
356
|
-
return False
|
|
357
|
-
except KeyError as e:
|
|
358
|
-
logger.error("Missing key in JSON data -> %s", str(e))
|
|
359
|
-
return False
|
|
360
522
|
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
523
|
+
if index_column:
|
|
524
|
+
df = df.set_index(keys=index_column)
|
|
525
|
+
if self._df is None:
|
|
526
|
+
self._df = df
|
|
527
|
+
else:
|
|
528
|
+
self._df = pd.concat([self._df, df])
|
|
529
|
+
self.logger.info(
|
|
530
|
+
"After loading JSON file -> '%s', the data frame has %s row(s) and %s column(s)",
|
|
531
|
+
json_path,
|
|
532
|
+
self._df.shape[0],
|
|
533
|
+
self._df.shape[1],
|
|
534
|
+
)
|
|
535
|
+
except FileNotFoundError:
|
|
536
|
+
self.logger.error(
|
|
537
|
+
"JSON file -> '%s' not found. Please check the file path.",
|
|
538
|
+
json_path,
|
|
539
|
+
)
|
|
540
|
+
return False
|
|
541
|
+
except PermissionError:
|
|
542
|
+
self.logger.error(
|
|
543
|
+
"Missing permission to access the JSON file -> '%s'.",
|
|
364
544
|
json_path,
|
|
365
545
|
)
|
|
366
546
|
return False
|
|
547
|
+
except OSError:
|
|
548
|
+
self.logger.error("An I/O error occurred!")
|
|
549
|
+
return False
|
|
550
|
+
except json.JSONDecodeError:
|
|
551
|
+
self.logger.error(
|
|
552
|
+
"Unable to decode JSON file -> '%s'",
|
|
553
|
+
json_path,
|
|
554
|
+
)
|
|
555
|
+
return False
|
|
556
|
+
except ValueError:
|
|
557
|
+
self.logger.error("Invalid JSON input -> %s", json_path)
|
|
558
|
+
return False
|
|
559
|
+
except AttributeError:
|
|
560
|
+
self.logger.error("Unexpected JSON data structure in file -> %s", json_path)
|
|
561
|
+
return False
|
|
562
|
+
except TypeError:
|
|
563
|
+
self.logger.error("Unexpected JSON data type in file -> %s", json_path)
|
|
564
|
+
return False
|
|
565
|
+
except KeyError:
|
|
566
|
+
self.logger.error("Missing key in JSON data in file -> %s", json_path)
|
|
567
|
+
return False
|
|
568
|
+
|
|
367
569
|
return True
|
|
368
570
|
|
|
369
571
|
# end method definition
|
|
370
572
|
|
|
371
573
|
def save_json_data(
|
|
372
|
-
self,
|
|
574
|
+
self,
|
|
575
|
+
json_path: str,
|
|
576
|
+
orient: str = "records",
|
|
577
|
+
preserve_index: bool = False,
|
|
578
|
+
index_column: str = "index",
|
|
579
|
+
compression: str | None = None,
|
|
373
580
|
) -> bool:
|
|
374
|
-
"""Save JSON data from
|
|
581
|
+
"""Save JSON data from data frame to file.
|
|
375
582
|
|
|
376
583
|
Args:
|
|
377
|
-
json_path (str):
|
|
378
|
-
orient (str, optional):
|
|
379
|
-
|
|
584
|
+
json_path (str): The path to where the JSON file should be safed.
|
|
585
|
+
orient (str, optional):
|
|
586
|
+
The structure of the JSON. Possible values:
|
|
587
|
+
* "records" (this is the default)
|
|
588
|
+
* "columns"
|
|
589
|
+
* "index"
|
|
590
|
+
* "table"
|
|
591
|
+
* "split"
|
|
592
|
+
preserve_index (bool, optional):
|
|
593
|
+
Defines if the index column of the data frame should be exported as well.
|
|
594
|
+
The default is False (index is not exported).
|
|
595
|
+
index_column (str, optional):
|
|
596
|
+
The Name of the column (i.e. JSON data field) that should
|
|
597
|
+
become the index in the loaded data frame. The default is "index".
|
|
598
|
+
compression (str | None):
|
|
599
|
+
Apply a compression:
|
|
600
|
+
* gzip (.gz)
|
|
601
|
+
* bz2 (.bz2)
|
|
602
|
+
* zip (.zip)
|
|
603
|
+
* xz (.xz)
|
|
604
|
+
|
|
380
605
|
Returns:
|
|
381
|
-
bool:
|
|
606
|
+
bool:
|
|
607
|
+
False in case an error occured, True otherwise.
|
|
608
|
+
|
|
382
609
|
"""
|
|
383
610
|
|
|
384
|
-
if
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
611
|
+
if not json_path:
|
|
612
|
+
self.logger.error(
|
|
613
|
+
"You have not specified a JSON path!",
|
|
614
|
+
)
|
|
615
|
+
return False
|
|
616
|
+
|
|
617
|
+
# If compression is enabled the file path should have
|
|
618
|
+
# the matching file name extension:
|
|
619
|
+
if compression:
|
|
620
|
+
suffix = "." + compression if compression != "gzip" else ".gz"
|
|
621
|
+
if not json_path.endswith(suffix):
|
|
622
|
+
json_path += suffix
|
|
623
|
+
|
|
624
|
+
# Save data to JSON file
|
|
625
|
+
try:
|
|
626
|
+
if self._df is not None:
|
|
627
|
+
if not os.path.exists(os.path.dirname(json_path)):
|
|
628
|
+
os.makedirs(os.path.dirname(json_path), exist_ok=True)
|
|
629
|
+
|
|
630
|
+
# index parameter is only allowed if orient has one of the following values:
|
|
631
|
+
if orient in ("columns", "index", "table", "split"):
|
|
632
|
+
self._df.to_json(
|
|
633
|
+
path_or_buf=json_path,
|
|
634
|
+
index=preserve_index,
|
|
635
|
+
orient=orient,
|
|
636
|
+
indent=2,
|
|
637
|
+
compression=compression,
|
|
638
|
+
date_format="iso",
|
|
639
|
+
)
|
|
640
|
+
# In this case we cannot use the index parameter as this would give this error:
|
|
641
|
+
# Value Error -> 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'
|
|
642
|
+
# So we create a new column that preserves the original row IDs from the index. The nasme
|
|
643
|
+
|
|
644
|
+
elif preserve_index:
|
|
645
|
+
df_with_index = self._df.reset_index(
|
|
646
|
+
names=index_column,
|
|
647
|
+
inplace=False,
|
|
648
|
+
)
|
|
649
|
+
df_with_index.to_json(
|
|
650
|
+
path_or_buf=json_path,
|
|
651
|
+
orient=orient,
|
|
652
|
+
indent=2,
|
|
653
|
+
compression=compression,
|
|
654
|
+
date_format="iso",
|
|
655
|
+
)
|
|
403
656
|
else:
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
657
|
+
self._df.to_json(
|
|
658
|
+
path_or_buf=json_path,
|
|
659
|
+
orient=orient,
|
|
660
|
+
indent=2,
|
|
661
|
+
compression=compression,
|
|
662
|
+
date_format="iso",
|
|
663
|
+
)
|
|
664
|
+
else:
|
|
665
|
+
self.logger.warning(
|
|
666
|
+
"Data frame is empty. Cannot write it to JSON file -> '%s'.",
|
|
667
|
+
json_path,
|
|
409
668
|
)
|
|
410
669
|
return False
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
else:
|
|
422
|
-
logger.error(
|
|
423
|
-
"Missing JSON file -> '%s' you have not specified a valid path!",
|
|
670
|
+
except FileNotFoundError:
|
|
671
|
+
self.logger.error(
|
|
672
|
+
"File -> '%s' not found. Please check the file path.",
|
|
673
|
+
json_path,
|
|
674
|
+
)
|
|
675
|
+
return False
|
|
676
|
+
except PermissionError:
|
|
677
|
+
self.logger.error(
|
|
678
|
+
"Permission denied to access the file -> '%s'.",
|
|
424
679
|
json_path,
|
|
425
680
|
)
|
|
426
681
|
return False
|
|
682
|
+
except OSError:
|
|
683
|
+
self.logger.error("An I/O error occurred accessing file -> %s", json_path)
|
|
684
|
+
return False
|
|
685
|
+
except ValueError:
|
|
686
|
+
self.logger.error("Value error!")
|
|
687
|
+
return False
|
|
688
|
+
|
|
427
689
|
return True
|
|
428
690
|
|
|
429
691
|
# end method definition
|
|
@@ -438,27 +700,40 @@ class Data:
|
|
|
438
700
|
names: list | None = None,
|
|
439
701
|
na_values: list | None = None,
|
|
440
702
|
) -> bool:
|
|
441
|
-
"""Load Excel (xlsx) data into
|
|
442
|
-
|
|
703
|
+
"""Load Excel (xlsx) data into Pandas data frame.
|
|
704
|
+
|
|
705
|
+
Supports xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
|
|
706
|
+
read from a local filesystem or URL. Supports an option to read a
|
|
707
|
+
single sheet or a list of sheets.
|
|
443
708
|
|
|
444
709
|
Args:
|
|
445
|
-
xlsx_path (str):
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
710
|
+
xlsx_path (str):
|
|
711
|
+
The path to the Excel file to load.
|
|
712
|
+
sheet_names (list | str | int, optional):
|
|
713
|
+
Name or Index of the sheet in the Excel workbook to load.
|
|
714
|
+
If 'None' then all sheets will be loaded.
|
|
715
|
+
If 0 then first sheet in workbook will be loaded (this is the Default).
|
|
716
|
+
If string then this is interpreted as the name of the sheet to load.
|
|
717
|
+
If a list is passed, this can be a list of index values (int) or
|
|
718
|
+
a list of strings with the sheet names to load.
|
|
719
|
+
usecols (list | str, optional):
|
|
720
|
+
A list of columns to load, specified by general column names in Excel,
|
|
721
|
+
e.g. usecols='B:D', usecols=['A', 'C', 'F']
|
|
722
|
+
skip_rows (int, optional):
|
|
723
|
+
List of rows to skip on top of the sheet (e.g. to not read headlines)
|
|
724
|
+
header (int | None, optional):
|
|
725
|
+
Excel Row (0-indexed) to use for the column labels of the parsed data frame.
|
|
726
|
+
If file contains no header row, then you should explicitly pass header=None.
|
|
727
|
+
Default is 0.
|
|
728
|
+
names (list, optional):
|
|
729
|
+
A list of column names to use. Default is None.
|
|
730
|
+
na_values (list, optional):
|
|
731
|
+
A list of values in the Excel that should become the Pandas NA value.
|
|
732
|
+
|
|
460
733
|
Returns:
|
|
461
|
-
bool:
|
|
734
|
+
bool:
|
|
735
|
+
False in case an error occured, True otherwise.
|
|
736
|
+
|
|
462
737
|
"""
|
|
463
738
|
|
|
464
739
|
if xlsx_path is not None and os.path.exists(xlsx_path):
|
|
@@ -473,16 +748,21 @@ class Data:
|
|
|
473
748
|
names=names,
|
|
474
749
|
na_values=na_values,
|
|
475
750
|
)
|
|
476
|
-
#
|
|
751
|
+
# If multiple sheets from an Excel workbook are loaded,
|
|
477
752
|
# then read_excel() returns a dictionary. The keys are
|
|
478
|
-
# the names of the sheets and the values are the
|
|
479
|
-
#
|
|
753
|
+
# the names of the sheets and the values are the data frames.
|
|
754
|
+
# As this class can only handle one data frame per object,
|
|
755
|
+
# We handle this case by concatenating the different sheets.
|
|
756
|
+
# If you don't want this make sure your Excel workbook has only
|
|
757
|
+
# one sheet or use the "sheet_name" parameter to select the one(s)
|
|
758
|
+
# you want to load.
|
|
480
759
|
if isinstance(df, dict):
|
|
481
|
-
logger.info("Loading multiple Excel sheets from the workbook!")
|
|
760
|
+
self.logger.info("Loading multiple Excel sheets from the workbook!")
|
|
482
761
|
multi_sheet_df = pd.DataFrame()
|
|
483
|
-
for sheet in df
|
|
762
|
+
for sheet in df:
|
|
484
763
|
multi_sheet_df = pd.concat(
|
|
485
|
-
[multi_sheet_df, df[sheet]],
|
|
764
|
+
[multi_sheet_df, df[sheet]],
|
|
765
|
+
ignore_index=True,
|
|
486
766
|
)
|
|
487
767
|
df = multi_sheet_df
|
|
488
768
|
if self._df is None:
|
|
@@ -490,89 +770,127 @@ class Data:
|
|
|
490
770
|
else:
|
|
491
771
|
self._df = pd.concat([self._df, df], ignore_index=True)
|
|
492
772
|
except FileNotFoundError:
|
|
493
|
-
logger.error(
|
|
773
|
+
self.logger.error(
|
|
494
774
|
"Excel file -> '%s' not found. Please check the file path.",
|
|
495
775
|
xlsx_path,
|
|
496
776
|
)
|
|
497
777
|
return False
|
|
498
778
|
except PermissionError:
|
|
499
|
-
logger.error(
|
|
500
|
-
"
|
|
779
|
+
self.logger.error(
|
|
780
|
+
"Missing permission to access the Excel file -> '%s'.",
|
|
781
|
+
xlsx_path,
|
|
501
782
|
)
|
|
502
783
|
return False
|
|
503
|
-
except
|
|
504
|
-
logger.error(
|
|
505
|
-
"An I/O error occurred
|
|
506
|
-
str(e),
|
|
784
|
+
except OSError:
|
|
785
|
+
self.logger.error(
|
|
786
|
+
"An I/O error occurred while reading the Excel file -> '%s'",
|
|
507
787
|
xlsx_path,
|
|
508
788
|
)
|
|
509
789
|
return False
|
|
510
|
-
except ValueError
|
|
511
|
-
logger.error(
|
|
512
|
-
"Invalid Excel input
|
|
790
|
+
except ValueError:
|
|
791
|
+
self.logger.error(
|
|
792
|
+
"Invalid Excel input in file -> '%s'",
|
|
793
|
+
xlsx_path,
|
|
513
794
|
)
|
|
514
795
|
return False
|
|
515
|
-
except AttributeError
|
|
516
|
-
logger.error("Unexpected data structure -> %s",
|
|
796
|
+
except AttributeError:
|
|
797
|
+
self.logger.error("Unexpected data structure in file -> %s", xlsx_path)
|
|
517
798
|
return False
|
|
518
|
-
except TypeError
|
|
519
|
-
logger.error("Unexpected data type -> %s",
|
|
799
|
+
except TypeError:
|
|
800
|
+
self.logger.error("Unexpected data type in file -> %s", xlsx_path)
|
|
520
801
|
return False
|
|
521
|
-
except KeyError
|
|
522
|
-
logger.error("Missing key in Excel data -> %s",
|
|
802
|
+
except KeyError:
|
|
803
|
+
self.logger.error("Missing key in Excel data in file -> %s", xlsx_path)
|
|
523
804
|
return False
|
|
524
805
|
|
|
525
806
|
else:
|
|
526
|
-
logger.error(
|
|
527
|
-
"Missing Excel file -> '%s'
|
|
807
|
+
self.logger.error(
|
|
808
|
+
"Missing Excel file -> '%s'. You have not specified a valid path!",
|
|
528
809
|
xlsx_path,
|
|
529
810
|
)
|
|
530
811
|
return False
|
|
812
|
+
|
|
531
813
|
return True
|
|
532
814
|
|
|
533
815
|
# end method definition
|
|
534
816
|
|
|
535
817
|
def save_excel_data(
|
|
536
|
-
self,
|
|
818
|
+
self,
|
|
819
|
+
excel_path: str,
|
|
820
|
+
sheet_name: str = "Pandas Export",
|
|
821
|
+
index: bool = False,
|
|
822
|
+
columns: list | None = None,
|
|
537
823
|
) -> bool:
|
|
538
|
-
"""
|
|
539
|
-
Save the DataFrame to an Excel file, with robust error handling and logging.
|
|
824
|
+
"""Save the data frame to an Excel file, with robust error handling and logging.
|
|
540
825
|
|
|
541
826
|
Args:
|
|
542
|
-
excel_path (str):
|
|
543
|
-
|
|
544
|
-
|
|
827
|
+
excel_path (str):
|
|
828
|
+
The file path to save the Excel file.
|
|
829
|
+
sheet_name (str):
|
|
830
|
+
The sheet name where data will be saved. Default is 'Sheet1'.
|
|
831
|
+
index (bool, optional):
|
|
832
|
+
Whether to write the row names (index). Default is False.
|
|
833
|
+
columns (list | None, optional):
|
|
834
|
+
A list of column names to write into the excel file.
|
|
835
|
+
|
|
836
|
+
Returns:
|
|
837
|
+
bool:
|
|
838
|
+
True = success, False = error.
|
|
839
|
+
|
|
545
840
|
"""
|
|
841
|
+
|
|
546
842
|
try:
|
|
547
843
|
# Check if the directory exists
|
|
548
844
|
directory = os.path.dirname(excel_path)
|
|
549
845
|
if directory and not os.path.exists(directory):
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
846
|
+
os.makedirs(directory)
|
|
847
|
+
|
|
848
|
+
# Validate columns if provided
|
|
849
|
+
if columns:
|
|
850
|
+
existing_columns = [col for col in columns if col in self._df.columns]
|
|
851
|
+
missing_columns = set(columns) - set(existing_columns)
|
|
852
|
+
if missing_columns:
|
|
853
|
+
self.logger.warning(
|
|
854
|
+
"The following columns do not exist in the data frame and cannot be saved to Excel -> %s",
|
|
855
|
+
", ".join(missing_columns),
|
|
856
|
+
)
|
|
857
|
+
columns = existing_columns
|
|
553
858
|
|
|
554
|
-
# Attempt to save the
|
|
555
|
-
self._df.to_excel(
|
|
556
|
-
|
|
859
|
+
# Attempt to save the data frame to Excel:
|
|
860
|
+
self._df.to_excel(
|
|
861
|
+
excel_path,
|
|
862
|
+
sheet_name=sheet_name,
|
|
863
|
+
index=index,
|
|
864
|
+
columns=columns or None, # Pass None if no columns provided
|
|
865
|
+
)
|
|
866
|
+
self.logger.info(
|
|
867
|
+
"Data frame saved successfully to Excel file -> '%s'.",
|
|
868
|
+
excel_path,
|
|
869
|
+
)
|
|
557
870
|
|
|
558
|
-
except FileNotFoundError
|
|
559
|
-
logger.error(
|
|
871
|
+
except FileNotFoundError:
|
|
872
|
+
self.logger.error(
|
|
873
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
874
|
+
excel_path,
|
|
875
|
+
)
|
|
560
876
|
return False
|
|
561
877
|
except PermissionError:
|
|
562
|
-
logger.error(
|
|
563
|
-
"
|
|
878
|
+
self.logger.error(
|
|
879
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
564
880
|
excel_path,
|
|
565
881
|
)
|
|
566
882
|
return False
|
|
567
|
-
except ValueError
|
|
568
|
-
logger.error(
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
883
|
+
except ValueError:
|
|
884
|
+
self.logger.error(
|
|
885
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
886
|
+
excel_path,
|
|
887
|
+
)
|
|
572
888
|
return False
|
|
573
|
-
except
|
|
574
|
-
|
|
575
|
-
|
|
889
|
+
except OSError:
|
|
890
|
+
self.logger.error(
|
|
891
|
+
"Cannot write data frame to Excel file -> '%s'",
|
|
892
|
+
excel_path,
|
|
893
|
+
)
|
|
576
894
|
return False
|
|
577
895
|
|
|
578
896
|
return True
|
|
@@ -580,130 +898,266 @@ class Data:
|
|
|
580
898
|
# end method definition
|
|
581
899
|
|
|
582
900
|
def load_csv_data(
|
|
583
|
-
self,
|
|
901
|
+
self,
|
|
902
|
+
csv_path: str,
|
|
903
|
+
delimiter: str = ",",
|
|
904
|
+
names: list | None = None,
|
|
905
|
+
header: int | None = 0,
|
|
906
|
+
usecols: list | None = None,
|
|
907
|
+
encoding: str = "utf-8",
|
|
584
908
|
) -> bool:
|
|
585
|
-
"""Load CSV (Comma separated values) data into
|
|
909
|
+
"""Load CSV (Comma separated values) data into data frame.
|
|
586
910
|
|
|
587
911
|
Args:
|
|
588
|
-
csv_path (str):
|
|
589
|
-
|
|
590
|
-
|
|
912
|
+
csv_path (str):
|
|
913
|
+
The path to the CSV file.
|
|
914
|
+
delimiter (str, optional, length = 1):
|
|
915
|
+
The character used to delimit values. Default is "," (comma).
|
|
916
|
+
names (list | None, optional):
|
|
917
|
+
The list of column names. This is useful if file does not have a header line
|
|
918
|
+
but just the data.
|
|
919
|
+
header (int | None, optional):
|
|
920
|
+
The index of the header line. Default is 0 (first line). None indicates
|
|
921
|
+
that the file does not have a header line
|
|
922
|
+
usecols (list | None, optional):
|
|
923
|
+
There are three possible list values types:
|
|
924
|
+
1. int:
|
|
925
|
+
These values are treated as column indices for columns to keep
|
|
926
|
+
(first column has index 0).
|
|
927
|
+
2. str:
|
|
928
|
+
The names of the columns to keep. For this to work the file needs
|
|
929
|
+
either a header line (i.e. 'header != None') or the 'names'
|
|
930
|
+
parameter must be specified.
|
|
931
|
+
3. bool:
|
|
932
|
+
The length of the list must match the number of columns. Only
|
|
933
|
+
columns that have a value of True are kept.
|
|
934
|
+
encoding (str, optional):
|
|
935
|
+
The encoding of the file. Default = "utf-8".
|
|
936
|
+
|
|
591
937
|
Returns:
|
|
592
|
-
bool:
|
|
938
|
+
bool:
|
|
939
|
+
False in case an error occured, True otherwise.
|
|
940
|
+
|
|
593
941
|
"""
|
|
594
942
|
|
|
595
|
-
if csv_path
|
|
596
|
-
#
|
|
943
|
+
if csv_path.startswith("http"):
|
|
944
|
+
# Download file from remote location specified by the packageUrl
|
|
945
|
+
# this must be a public place without authentication:
|
|
946
|
+
self.logger.debug("Download CSV file from URL -> '%s'.", csv_path)
|
|
947
|
+
|
|
597
948
|
try:
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
self._df = df
|
|
603
|
-
else:
|
|
604
|
-
self._df = pd.concat([self._df, df])
|
|
605
|
-
except FileNotFoundError:
|
|
606
|
-
logger.error(
|
|
607
|
-
"CSV file -> '%s' not found. Please check the file path.", csv_path
|
|
608
|
-
)
|
|
609
|
-
return False
|
|
610
|
-
except PermissionError:
|
|
611
|
-
logger.error(
|
|
612
|
-
"Permission denied to access the CSV file -> %s.", csv_path
|
|
613
|
-
)
|
|
949
|
+
response = requests.get(url=csv_path, timeout=1200)
|
|
950
|
+
response.raise_for_status()
|
|
951
|
+
except requests.exceptions.HTTPError:
|
|
952
|
+
self.logger.error("HTTP error with -> %s", csv_path)
|
|
614
953
|
return False
|
|
615
|
-
except
|
|
616
|
-
logger.error("
|
|
954
|
+
except requests.exceptions.ConnectionError:
|
|
955
|
+
self.logger.error("Connection error with -> %s", csv_path)
|
|
617
956
|
return False
|
|
618
|
-
except
|
|
619
|
-
logger.error("
|
|
957
|
+
except requests.exceptions.Timeout:
|
|
958
|
+
self.logger.error("Timeout error with -> %s", csv_path)
|
|
620
959
|
return False
|
|
621
|
-
except
|
|
622
|
-
logger.error("
|
|
623
|
-
return False
|
|
624
|
-
except TypeError as e:
|
|
625
|
-
logger.error("Unexpected data type -> %s", str(e))
|
|
626
|
-
return False
|
|
627
|
-
except KeyError as e:
|
|
628
|
-
logger.error("Missing key in CSV data -> %s", str(e))
|
|
960
|
+
except requests.exceptions.RequestException:
|
|
961
|
+
self.logger.error("Request error with -> %s", csv_path)
|
|
629
962
|
return False
|
|
630
963
|
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
"Missing CSV file -> '%s' you have not specified a valid path!",
|
|
964
|
+
self.logger.debug(
|
|
965
|
+
"Successfully downloaded CSV file -> %s; status code -> %s",
|
|
634
966
|
csv_path,
|
|
967
|
+
response.status_code,
|
|
635
968
|
)
|
|
636
|
-
return False
|
|
637
|
-
return True
|
|
638
969
|
|
|
639
|
-
|
|
970
|
+
# Convert bytes to a string using utf-8 and create a file-like object
|
|
971
|
+
csv_file = StringIO(response.content.decode(encoding))
|
|
640
972
|
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
"""Load XML data into DataFrame
|
|
973
|
+
elif os.path.exists(csv_path):
|
|
974
|
+
self.logger.debug("Using local CSV file -> '%s'.", csv_path)
|
|
975
|
+
csv_file = csv_path
|
|
645
976
|
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
"""
|
|
977
|
+
else:
|
|
978
|
+
self.logger.error(
|
|
979
|
+
"Missing CSV file -> '%s' you have not specified a valid path!",
|
|
980
|
+
csv_path,
|
|
981
|
+
)
|
|
982
|
+
return False
|
|
653
983
|
|
|
984
|
+
# Load data from CSV file or buffer
|
|
654
985
|
try:
|
|
655
|
-
df = pd.
|
|
656
|
-
|
|
986
|
+
df = pd.read_csv(
|
|
987
|
+
filepath_or_buffer=csv_file,
|
|
988
|
+
delimiter=delimiter,
|
|
989
|
+
names=names,
|
|
990
|
+
header=header,
|
|
991
|
+
usecols=usecols,
|
|
992
|
+
encoding=encoding,
|
|
993
|
+
skipinitialspace=True,
|
|
994
|
+
)
|
|
657
995
|
if self._df is None:
|
|
658
996
|
self._df = df
|
|
659
997
|
else:
|
|
660
998
|
self._df = pd.concat([self._df, df])
|
|
661
|
-
logger.info("XML file loaded successfully!")
|
|
662
|
-
return True
|
|
663
999
|
except FileNotFoundError:
|
|
664
|
-
|
|
1000
|
+
self.logger.error(
|
|
1001
|
+
"CSV file -> '%s' not found. Please check the file path.",
|
|
1002
|
+
csv_path,
|
|
1003
|
+
)
|
|
665
1004
|
return False
|
|
666
1005
|
except PermissionError:
|
|
667
|
-
logger.error(
|
|
1006
|
+
self.logger.error(
|
|
1007
|
+
"Permission denied to access the CSV file -> '%s'.",
|
|
1008
|
+
csv_path,
|
|
1009
|
+
)
|
|
668
1010
|
return False
|
|
669
|
-
except
|
|
670
|
-
logger.error("An I/O error occurred
|
|
1011
|
+
except OSError:
|
|
1012
|
+
self.logger.error("An I/O error occurred!")
|
|
671
1013
|
return False
|
|
672
|
-
except ValueError
|
|
673
|
-
logger.error("Invalid CSV input -> %s",
|
|
1014
|
+
except ValueError:
|
|
1015
|
+
self.logger.error("Invalid CSV input in file -> %s", csv_path)
|
|
674
1016
|
return False
|
|
675
|
-
except AttributeError
|
|
676
|
-
logger.error("Unexpected data structure -> %s",
|
|
1017
|
+
except AttributeError:
|
|
1018
|
+
self.logger.error("Unexpected data structure in file -> %s", csv_path)
|
|
677
1019
|
return False
|
|
678
|
-
except TypeError
|
|
679
|
-
logger.error("Unexpected data type -> %s",
|
|
1020
|
+
except TypeError:
|
|
1021
|
+
self.logger.error("Unexpected data type in file -> %s", csv_path)
|
|
680
1022
|
return False
|
|
681
|
-
except KeyError
|
|
682
|
-
logger.error("Missing key in CSV data -> %s",
|
|
1023
|
+
except KeyError:
|
|
1024
|
+
self.logger.error("Missing key in CSV data -> %s", csv_path)
|
|
683
1025
|
return False
|
|
684
1026
|
|
|
1027
|
+
return True
|
|
1028
|
+
|
|
685
1029
|
# end method definition
|
|
686
1030
|
|
|
687
|
-
def
|
|
688
|
-
|
|
1031
|
+
def load_xml_data(
|
|
1032
|
+
self,
|
|
1033
|
+
xml_path: str,
|
|
1034
|
+
xpath: str | None = None,
|
|
1035
|
+
xslt_path: str | None = None,
|
|
1036
|
+
encoding: str = "utf-8",
|
|
1037
|
+
) -> bool:
|
|
1038
|
+
"""Load XML data into a Pandas data frame.
|
|
689
1039
|
|
|
690
1040
|
Args:
|
|
691
|
-
|
|
692
|
-
|
|
1041
|
+
xml_path (str):
|
|
1042
|
+
The path to the XML file to load.
|
|
1043
|
+
xpath (str, optional):
|
|
1044
|
+
An XPath to the elements we want to select.
|
|
1045
|
+
xslt_path (str, optional):
|
|
1046
|
+
An XSLT transformation file to convert the XML data.
|
|
1047
|
+
encoding (str, optional):
|
|
1048
|
+
The encoding of the file. Default is UTF-8.
|
|
693
1049
|
|
|
694
1050
|
Returns:
|
|
695
|
-
bool:
|
|
1051
|
+
bool:
|
|
1052
|
+
False in case an error occured, True otherwise.
|
|
1053
|
+
|
|
696
1054
|
"""
|
|
697
1055
|
|
|
698
|
-
|
|
699
|
-
#
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
"The provided path -> '%s' is not a valid directory.", path_to_root
|
|
703
|
-
)
|
|
704
|
-
return False
|
|
1056
|
+
if xml_path.startswith("http"):
|
|
1057
|
+
# Download file from remote location specified by the packageUrl
|
|
1058
|
+
# this must be a public place without authentication:
|
|
1059
|
+
self.logger.debug("Download XML file from URL -> '%s'.", xml_path)
|
|
705
1060
|
|
|
706
|
-
|
|
1061
|
+
try:
|
|
1062
|
+
response = requests.get(url=xml_path, timeout=1200)
|
|
1063
|
+
response.raise_for_status()
|
|
1064
|
+
except requests.exceptions.HTTPError:
|
|
1065
|
+
self.logger.error("HTTP error with -> %s", xml_path)
|
|
1066
|
+
return False
|
|
1067
|
+
except requests.exceptions.ConnectionError:
|
|
1068
|
+
self.logger.error("Connection error with -> %s", xml_path)
|
|
1069
|
+
return False
|
|
1070
|
+
except requests.exceptions.Timeout:
|
|
1071
|
+
self.logger.error("Timeout error with -> %s", xml_path)
|
|
1072
|
+
return False
|
|
1073
|
+
except requests.exceptions.RequestException:
|
|
1074
|
+
self.logger.error("Request error with -> %s", xml_path)
|
|
1075
|
+
return False
|
|
1076
|
+
|
|
1077
|
+
self.logger.debug(
|
|
1078
|
+
"Successfully downloaded XML file -> '%s'; status code -> %s",
|
|
1079
|
+
xml_path,
|
|
1080
|
+
response.status_code,
|
|
1081
|
+
)
|
|
1082
|
+
# Convert bytes to a string using utf-8 and create a file-like object
|
|
1083
|
+
xml_file = StringIO(response.content.decode(encoding))
|
|
1084
|
+
|
|
1085
|
+
elif os.path.exists(xml_path):
|
|
1086
|
+
self.logger.debug("Using local XML file -> '%s'.", xml_path)
|
|
1087
|
+
xml_file = xml_path
|
|
1088
|
+
|
|
1089
|
+
else:
|
|
1090
|
+
self.logger.error(
|
|
1091
|
+
"Missing XML file -> '%s'. You have not specified a valid path or URL!",
|
|
1092
|
+
xml_path,
|
|
1093
|
+
)
|
|
1094
|
+
return False
|
|
1095
|
+
|
|
1096
|
+
# Load data from XML file or buffer
|
|
1097
|
+
try:
|
|
1098
|
+
df = pd.read_xml(
|
|
1099
|
+
path_or_buffer=xml_file,
|
|
1100
|
+
xpath=xpath,
|
|
1101
|
+
stylesheet=xslt_path,
|
|
1102
|
+
encoding=encoding,
|
|
1103
|
+
)
|
|
1104
|
+
# Process the loaded data as needed
|
|
1105
|
+
if self._df is None:
|
|
1106
|
+
self._df = df
|
|
1107
|
+
else:
|
|
1108
|
+
self._df = pd.concat([self._df, df])
|
|
1109
|
+
self.logger.info("XML file -> '%s' loaded successfully!", xml_path)
|
|
1110
|
+
except FileNotFoundError:
|
|
1111
|
+
self.logger.error("XML file -> '%s' not found.", xml_path)
|
|
1112
|
+
return False
|
|
1113
|
+
except PermissionError:
|
|
1114
|
+
self.logger.error(
|
|
1115
|
+
"Missing permission to access the XML file -> '%s'.",
|
|
1116
|
+
xml_path,
|
|
1117
|
+
)
|
|
1118
|
+
return False
|
|
1119
|
+
except OSError:
|
|
1120
|
+
self.logger.error("An I/O error occurred loading from -> %s", xml_path)
|
|
1121
|
+
return False
|
|
1122
|
+
except ValueError:
|
|
1123
|
+
self.logger.error("Invalid XML data in file -> %s", xml_path)
|
|
1124
|
+
return False
|
|
1125
|
+
except AttributeError:
|
|
1126
|
+
self.logger.error("Unexpected data structure in XML file -> %s", xml_path)
|
|
1127
|
+
return False
|
|
1128
|
+
except TypeError:
|
|
1129
|
+
self.logger.error("Unexpected data type in XML file -> %s", xml_path)
|
|
1130
|
+
return False
|
|
1131
|
+
except KeyError:
|
|
1132
|
+
self.logger.error("Missing key in XML file -> %s", xml_path)
|
|
1133
|
+
return False
|
|
1134
|
+
|
|
1135
|
+
return True
|
|
1136
|
+
|
|
1137
|
+
# end method definition
|
|
1138
|
+
|
|
1139
|
+
def load_directory(self, path_to_root: str) -> bool:
|
|
1140
|
+
"""Load directory structure into Pandas data frame.
|
|
1141
|
+
|
|
1142
|
+
Args:
|
|
1143
|
+
path_to_root (str):
|
|
1144
|
+
Path to the root element of the directory structure.
|
|
1145
|
+
|
|
1146
|
+
Returns:
|
|
1147
|
+
bool: True = Success, False = Failure
|
|
1148
|
+
|
|
1149
|
+
"""
|
|
1150
|
+
|
|
1151
|
+
try:
|
|
1152
|
+
# Check if the provided path is a directory
|
|
1153
|
+
if not os.path.isdir(path_to_root):
|
|
1154
|
+
self.logger.error(
|
|
1155
|
+
"The provided path -> '%s' is not a valid directory.",
|
|
1156
|
+
path_to_root,
|
|
1157
|
+
)
|
|
1158
|
+
return False
|
|
1159
|
+
|
|
1160
|
+
# Initialize a list to hold file information
|
|
707
1161
|
data = []
|
|
708
1162
|
|
|
709
1163
|
# Walk through the directory
|
|
@@ -715,55 +1169,88 @@ class Data:
|
|
|
715
1169
|
path_parts = relative_path.split(os.sep)
|
|
716
1170
|
|
|
717
1171
|
# Create a dictionary with the path parts and file details
|
|
718
|
-
entry = {
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
1172
|
+
entry = {"level {}".format(i): part for i, part in enumerate(path_parts[:-1], start=1)}
|
|
1173
|
+
|
|
1174
|
+
entry.update(
|
|
1175
|
+
{
|
|
1176
|
+
"filename": path_parts[-1],
|
|
1177
|
+
"size": file_size,
|
|
1178
|
+
"path": path_parts[1:-1],
|
|
1179
|
+
"relative_path": relative_path,
|
|
1180
|
+
"download_dir": root,
|
|
1181
|
+
},
|
|
1182
|
+
)
|
|
723
1183
|
data.append(entry)
|
|
724
1184
|
|
|
725
|
-
# Create
|
|
1185
|
+
# Create data frame from list of dictionaries:
|
|
726
1186
|
self._df = pd.DataFrame(data)
|
|
727
1187
|
|
|
728
1188
|
# Determine the maximum number of levels
|
|
729
1189
|
max_levels = max((len(entry) - 2 for entry in data), default=0)
|
|
730
1190
|
|
|
731
|
-
# Ensure all entries have the same number of levels
|
|
1191
|
+
# Ensure all entries have the same number of levels:
|
|
732
1192
|
for entry in data:
|
|
733
1193
|
for i in range(1, max_levels + 1):
|
|
734
1194
|
entry.setdefault("level {}".format(i), "")
|
|
735
1195
|
|
|
736
|
-
# Convert to
|
|
1196
|
+
# Convert to data frame again to make sure all columns are consistent:
|
|
737
1197
|
self._df = pd.DataFrame(data)
|
|
738
1198
|
|
|
739
|
-
except NotADirectoryError
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
1199
|
+
except NotADirectoryError:
|
|
1200
|
+
self.logger.error(
|
|
1201
|
+
"Provided path -> '%s' is not a directory!",
|
|
1202
|
+
path_to_root,
|
|
1203
|
+
)
|
|
1204
|
+
return False
|
|
1205
|
+
except FileNotFoundError:
|
|
1206
|
+
self.logger.error(
|
|
1207
|
+
"Provided path -> '%s' does not exist in file system!",
|
|
1208
|
+
path_to_root,
|
|
1209
|
+
)
|
|
1210
|
+
return False
|
|
1211
|
+
except PermissionError:
|
|
1212
|
+
self.logger.error(
|
|
1213
|
+
"Permission error accessing path -> '%s'!",
|
|
1214
|
+
path_to_root,
|
|
1215
|
+
)
|
|
1216
|
+
return False
|
|
745
1217
|
|
|
746
1218
|
return True
|
|
747
1219
|
|
|
748
1220
|
# end method definition
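
A minimal sketch of what load_directory() builds, using plain pandas and the same "level N" entry construction as above. The directory names are invented for illustration and the sketch omits the size/path/download_dir fields:

import pandas as pd

# Hypothetical relative paths as os.walk() + os.path.relpath() would yield them:
relative_paths = ["reports/2023/summary.pdf", "reports/2024/q1.xlsx", "readme.txt"]

data = []
for relative_path in relative_paths:
    path_parts = relative_path.split("/")  # os.sep on the actual platform
    # One "level N" column per folder level, plus the file name:
    entry = {"level {}".format(i): part for i, part in enumerate(path_parts[:-1], start=1)}
    entry.update({"filename": path_parts[-1], "relative_path": relative_path})
    data.append(entry)

df = pd.DataFrame(data)
print(df[["level 1", "filename"]])
# Rows without a deeper level simply get NaN in the missing "level N" columns.
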
|
|
749
1221
|
|
|
750
|
-
def load_xml_directory(
|
|
751
|
-
|
|
1222
|
+
def load_xml_directory(
|
|
1223
|
+
self,
|
|
1224
|
+
path_to_root: str,
|
|
1225
|
+
xpath: str | None = None,
|
|
1226
|
+
xml_files: list | None = None,
|
|
1227
|
+
) -> bool:
|
|
1228
|
+
"""Load XML files from a directory structure into Pandas data frame.
|
|
752
1229
|
|
|
753
1230
|
Args:
|
|
754
|
-
path_to_root (str):
|
|
755
|
-
|
|
756
|
-
xpath (str, optional):
|
|
1231
|
+
path_to_root (str):
|
|
1232
|
+
Path to the root element of the directory structure.
|
|
1233
|
+
xpath (str, optional):
|
|
1234
|
+
XPath to the XML elements we want to select.
|
|
1235
|
+
xml_files (list | None, optional):
|
|
1236
|
+
Names of the XML files to load from the directory.
|
|
757
1237
|
|
|
758
1238
|
Returns:
|
|
759
|
-
bool:
|
|
1239
|
+
bool:
|
|
1240
|
+
True = Success, False = Failure
|
|
1241
|
+
|
|
760
1242
|
"""
|
|
761
1243
|
|
|
1244
|
+
# Establish a default if None is passed via the parameter:
|
|
1245
|
+
if not xml_files:
|
|
1246
|
+
xml_files = ["docovw.xml"]
|
|
1247
|
+
|
|
762
1248
|
try:
|
|
763
1249
|
# Check if the provided path is a directory
|
|
764
1250
|
if not os.path.isdir(path_to_root):
|
|
765
|
-
logger.error(
|
|
766
|
-
"The provided path -> '%s' is not a valid directory.",
|
|
1251
|
+
self.logger.error(
|
|
1252
|
+
"The provided path -> '%s' is not a valid directory.",
|
|
1253
|
+
path_to_root,
|
|
767
1254
|
)
|
|
768
1255
|
return False
|
|
769
1256
|
|
|
@@ -774,36 +1261,223 @@ class Data:
|
|
|
774
1261
|
file_size = os.path.getsize(file_path)
|
|
775
1262
|
file_name = os.path.basename(file_path)
|
|
776
1263
|
|
|
777
|
-
if file_name
|
|
778
|
-
logger.info(
|
|
779
|
-
"Load XML file -> '%s' of size -> %s",
|
|
1264
|
+
if file_name in xml_files:
|
|
1265
|
+
self.logger.info(
|
|
1266
|
+
"Load XML file -> '%s' of size -> %s from -> '%s'...",
|
|
1267
|
+
file_name,
|
|
1268
|
+
file_size,
|
|
1269
|
+
file_path,
|
|
780
1270
|
)
|
|
781
1271
|
success = self.load_xml_data(file_path, xpath=xpath)
|
|
782
1272
|
if success:
|
|
783
|
-
logger.info(
|
|
784
|
-
"Successfully loaded XML file -> '%s'",
|
|
1273
|
+
self.logger.info(
|
|
1274
|
+
"Successfully loaded XML file -> '%s'.",
|
|
1275
|
+
file_path,
|
|
785
1276
|
)
|
|
786
1277
|
|
|
787
|
-
except NotADirectoryError
|
|
788
|
-
logger.error(
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
1278
|
+
except NotADirectoryError:
|
|
1279
|
+
self.logger.error(
|
|
1280
|
+
"Provided path -> '%s' is not a directory",
|
|
1281
|
+
path_to_root,
|
|
1282
|
+
)
|
|
1283
|
+
return False
|
|
1284
|
+
except FileNotFoundError:
|
|
1285
|
+
self.logger.error(
|
|
1286
|
+
"Provided path -> '%s' does not exist in file system!",
|
|
1287
|
+
path_to_root,
|
|
1288
|
+
)
|
|
1289
|
+
return False
|
|
1290
|
+
except PermissionError:
|
|
1291
|
+
self.logger.error(
|
|
1292
|
+
"Missing permission to access path -> '%s'",
|
|
1293
|
+
path_to_root,
|
|
1294
|
+
)
|
|
1295
|
+
return False
|
|
1296
|
+
|
|
1297
|
+
return True
|
|
1298
|
+
|
|
1299
|
+
# end method definition
|
|
1300
|
+
|
|
1301
|
+
def load_web_links(
|
|
1302
|
+
self,
|
|
1303
|
+
url: str,
|
|
1304
|
+
common_data: dict | None = None,
|
|
1305
|
+
pattern: str = r"",
|
|
1306
|
+
) -> list | None:
|
|
1307
|
+
"""Get all linked file URLs on a given web page (url) that are following a given pattern.
|
|
1308
|
+
|
|
1309
|
+
Construct a list of dictionaries based on this. This method is a helper method for load_web() below.
|
|
1310
|
+
|
|
1311
|
+
Args:
|
|
1312
|
+
url (str):
|
|
1313
|
+
The web page URL.
|
|
1314
|
+
common_data (dict | None, optional):
|
|
1315
|
+
Fields that should be added to each dictionary item. Defaults to None.
|
|
1316
|
+
pattern (str, optional):
|
|
1317
|
+
Regular Expression. Defaults to r"".
|
|
1318
|
+
|
|
1319
|
+
Returns:
|
|
1320
|
+
list | None:
|
|
1321
|
+
List of links on the web page that are complying with the given regular expression.
|
|
1322
|
+
|
|
1323
|
+
"""
|
|
1324
|
+
|
|
1325
|
+
try:
|
|
1326
|
+
response = requests.get(url, timeout=300)
|
|
1327
|
+
response.raise_for_status()
|
|
1328
|
+
except requests.RequestException:
|
|
1329
|
+
self.logger.error("Failed to retrieve page at %s", url)
|
|
1330
|
+
return []
|
|
1331
|
+
|
|
1332
|
+
# Find all file links (hyperlinks) on the page (no file extension assumed)
|
|
1333
|
+
# Example filename pattern: "al022023.public.005"
|
|
1334
|
+
file_links = re.findall(r'href="([^"]+)"', response.text)
|
|
1335
|
+
if not file_links:
|
|
1336
|
+
self.logger.warning("No file links found on the web page -> %s", url)
|
|
1337
|
+
return []
|
|
1338
|
+
|
|
1339
|
+
result_list = []
|
|
1340
|
+
base_url = url if url.endswith("/") else url + "/"
|
|
1341
|
+
|
|
1342
|
+
for link in file_links:
|
|
1343
|
+
data = common_data.copy() if common_data else {}
|
|
1344
|
+
|
|
1345
|
+
# Construct the full URL
|
|
1346
|
+
full_url = base_url + link.lstrip("/")
|
|
1347
|
+
|
|
1348
|
+
if pattern:
|
|
1349
|
+
# Filter by expected naming pattern for links
|
|
1350
|
+
match = re.search(pattern, link)
|
|
1351
|
+
if not match:
|
|
1352
|
+
continue
|
|
1353
|
+
|
|
1354
|
+
# Extract and assign groups if they exist
|
|
1355
|
+
# TODO(mdiefenb): these names are currently hard-coded
|
|
1356
|
+
# for the National Hurricane Center Dataset (NHC)
|
|
1357
|
+
if len(match.groups()) >= 1:
|
|
1358
|
+
data["Code"] = match.group(1).upper()
|
|
1359
|
+
if len(match.groups()) >= 2:
|
|
1360
|
+
data["Type"] = match.group(2)
|
|
1361
|
+
if len(match.groups()) >= 3:
|
|
1362
|
+
data["Message ID"] = match.group(3)
|
|
1363
|
+
|
|
1364
|
+
data["URL"] = full_url
|
|
1365
|
+
data["Filename"] = link
|
|
1366
|
+
|
|
1367
|
+
result_list.append(data)
|
|
1368
|
+
|
|
1369
|
+
return result_list
|
|
1370
|
+
|
|
1371
|
+
# end method definition
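
The pattern parameter drives both the filtering of links and the extraction of up to three pieces of metadata from the capture groups. A small sketch with a hypothetical link list and an NHC-style pattern shows how the groups map to "Code", "Type" and "Message ID":

import re

# Hypothetical hrefs as they might appear on an index page:
file_links = ["al022023.public.005", "al022023.discus.003", "index.html"]

# Hypothetical pattern with three capture groups, mirroring the group handling above:
pattern = r"(al\d{6})\.(public|discus)\.(\d{3})"

for link in file_links:
    match = re.search(pattern, link)
    if not match:
        continue  # links that do not match the pattern are skipped
    data = {
        "Code": match.group(1).upper(),
        "Type": match.group(2),
        "Message ID": match.group(3),
        "Filename": link,
    }
    print(data)
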
|
|
1372
|
+
|
|
1373
|
+
def load_web(
|
|
1374
|
+
self,
|
|
1375
|
+
values: list,
|
|
1376
|
+
value_name: str,
|
|
1377
|
+
url_templates: list,
|
|
1378
|
+
special_values: list | None = None,
|
|
1379
|
+
special_url_templates: dict | None = None,
|
|
1380
|
+
pattern: str = r"",
|
|
1381
|
+
) -> bool:
|
|
1382
|
+
"""Traverse years and bulletin types to collect all bulletin URLs.
|
|
1383
|
+
|
|
1384
|
+
Args:
|
|
1385
|
+
values (list):
|
|
1386
|
+
List of values to traverse over.
|
|
1387
|
+
value_name (str):
|
|
1388
|
+
Dictionary key to construct an item in combination with a value from values
|
|
1389
|
+
url_templates (list):
|
|
1390
|
+
URLs to traverse per value. The URLs should contain one {} that is
|
|
1391
|
+
replaced by the current value.
|
|
1392
|
+
special_values (list | None, optional):
|
|
1393
|
+
List of values (a subset of the other values list)
|
|
1394
|
+
that we want to handle in a special way. Defaults to None.
|
|
1395
|
+
special_url_templates (dict | None, optional):
|
|
1396
|
+
URLs for the special values. Defaults to None.
|
|
1397
|
+
The dictionary keys are the special values. The
|
|
1398
|
+
dictionary values are lists of special URLs with placeholders.
|
|
1399
|
+
pattern (str, optional):
|
|
1400
|
+
Regular expression to find the proper links on the page. Defaults to r"".
|
|
1401
|
+
|
|
1402
|
+
Returns:
|
|
1403
|
+
bool:
|
|
1404
|
+
True for success, False in case of an error.
|
|
1405
|
+
|
|
1406
|
+
"""
|
|
1407
|
+
|
|
1408
|
+
result_list = []
|
|
1409
|
+
|
|
1410
|
+
# We have two nested for loops below. The outer one traverses over all placeholder values.
|
|
1411
|
+
# These could be the calendar years, e.g. [2003,...,2024]
|
|
1412
|
+
# The inner for loop traverses over the list of specified URLs. We can have multiple for
|
|
1413
|
+
# each value.
|
|
1414
|
+
|
|
1415
|
+
# Do we have a list of placeholder values we want to iterate over?
|
|
1416
|
+
if values:
|
|
1417
|
+
# Traverse all values in the values list:
|
|
1418
|
+
for value in values:
|
|
1419
|
+
# Do we want a special treatment for this value (e.g. the current year)
|
|
1420
|
+
if special_values and value in special_values:
|
|
1421
|
+
self.logger.info("Processing special value -> '%s'...", value)
|
|
1422
|
+
if value not in special_url_templates and str(value) not in special_url_templates:
|
|
1423
|
+
self.logger.error(
|
|
1424
|
+
"Cannot find key -> '%s' in special URL templates dictionary -> %s! Skipping...",
|
|
1425
|
+
value,
|
|
1426
|
+
str(special_url_templates),
|
|
1427
|
+
)
|
|
1428
|
+
continue
|
|
1429
|
+
# If the dictionary uses string keys then we need to convert the value
|
|
1430
|
+
# to a string as well to avoid key errors:
|
|
1431
|
+
if str(value) in special_url_templates:
|
|
1432
|
+
value = str(value)
|
|
1433
|
+
special_url_template_list = special_url_templates[value]
|
|
1434
|
+
for special_url_template in special_url_template_list:
|
|
1435
|
+
# Now the value is inserted into the placeholder in the URL:
|
|
1436
|
+
special_url = special_url_template.format(value)
|
|
1437
|
+
common_data = {value_name: value} if value_name else None
|
|
1438
|
+
result_list += self.load_web_links(
|
|
1439
|
+
url=special_url,
|
|
1440
|
+
common_data=common_data,
|
|
1441
|
+
pattern=pattern,
|
|
1442
|
+
)
|
|
1443
|
+
else: # normal URLs
|
|
1444
|
+
self.logger.info("Processing value -> '%s'...", value)
|
|
1445
|
+
for url_template in url_templates:
|
|
1446
|
+
# Now the value is inserted into the placeholder in the URL:
|
|
1447
|
+
url = url_template.format(value)
|
|
1448
|
+
common_data = {value_name: value} if value_name else None
|
|
1449
|
+
result_list += self.load_web_links(
|
|
1450
|
+
url=url,
|
|
1451
|
+
common_data=common_data,
|
|
1452
|
+
pattern=pattern,
|
|
1453
|
+
)
|
|
1454
|
+
else:
|
|
1455
|
+
for url_template in url_templates:
|
|
1456
|
+
url = url_template
|
|
1457
|
+
result_list += self.load_web_links(
|
|
1458
|
+
url=url,
|
|
1459
|
+
common_data=None,
|
|
1460
|
+
pattern=pattern,
|
|
1461
|
+
)
|
|
1462
|
+
|
|
1463
|
+
# Add the data list to the data frame:
|
|
1464
|
+
self.append(result_list)
|
|
793
1465
|
|
|
794
1466
|
return True
|
|
795
1467
|
|
|
796
1468
|
# end method definition
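
A usage sketch for load_web(), assuming the Data class is importable from pyxecm.helper.data with a default constructor; the URLs, the year range and the pattern are hypothetical placeholders:

from pyxecm.helper.data import Data  # assumed import path

data = Data()  # assumed default constructor
data.load_web(
    values=list(range(2020, 2025)),                       # e.g. calendar years
    value_name="Year",                                     # becomes a field in each collected row
    url_templates=["https://example.com/archive/{}/"],     # {} is replaced by the current year
    special_values=[2024],                                 # values handled via their own templates
    special_url_templates={2024: ["https://example.com/current/{}/"]},
    pattern=r"(al\d{6})\.(public)\.(\d{3})",               # hypothetical link pattern
)
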
|
|
797
1469
|
|
|
798
1470
|
def partitionate(self, number: int) -> list:
|
|
799
|
-
"""Partition a data frame into equally sized
|
|
800
|
-
partions
|
|
1471
|
+
"""Partition a data frame into equally sized partitions.
|
|
801
1472
|
|
|
802
1473
|
Args:
|
|
803
|
-
number (int):
|
|
1474
|
+
number (int):
|
|
1475
|
+
The number of desired partitions.
|
|
804
1476
|
|
|
805
1477
|
Returns:
|
|
806
|
-
list:
|
|
1478
|
+
list:
|
|
1479
|
+
A list of created partitions.
|
|
1480
|
+
|
|
807
1481
|
"""
|
|
808
1482
|
|
|
809
1483
|
# Calculate the approximate size of each partition
|
|
@@ -817,24 +1491,20 @@ class Data:
|
|
|
817
1491
|
number = 1
|
|
818
1492
|
remainder = 0
|
|
819
1493
|
|
|
820
|
-
logger.info(
|
|
821
|
-
"Data
|
|
1494
|
+
self.logger.info(
|
|
1495
|
+
"Data frame has -> %s elements. We split it into -> %s partitions with -> %s rows and remainder -> %s...",
|
|
822
1496
|
str(size),
|
|
823
1497
|
str(number),
|
|
824
1498
|
str(partition_size),
|
|
825
1499
|
str(remainder),
|
|
826
1500
|
)
|
|
827
1501
|
|
|
828
|
-
# Initialize a list to store partitions
|
|
1502
|
+
# Initialize a list to store partitions:
|
|
829
1503
|
partitions = []
|
|
830
1504
|
start_index = 0
|
|
831
1505
|
|
|
832
|
-
# Slice the
|
|
1506
|
+
# Slice the data frame into equally sized partitions:
|
|
833
1507
|
for i in range(number):
|
|
834
|
-
# start_index = i * partition_size
|
|
835
|
-
# end_index = (i + 1) * partition_size if i < number - 1 else None
|
|
836
|
-
# partition = self._df.iloc[start_index:end_index]
|
|
837
|
-
# partitions.append(partition)
|
|
838
1508
|
# Calculate the end index for this partition
|
|
839
1509
|
end_index = start_index + partition_size + (1 if i < remainder else 0)
|
|
840
1510
|
partition = self._df.iloc[start_index:end_index]
|
|
@@ -849,34 +1519,44 @@ class Data:
|
|
|
849
1519
|
"""Partition a data frame based on equal values in a specified column.
|
|
850
1520
|
|
|
851
1521
|
Args:
|
|
852
|
-
column_name (str):
|
|
1522
|
+
column_name (str):
|
|
1523
|
+
The column name to partition by.
|
|
853
1524
|
|
|
854
1525
|
Returns:
|
|
855
|
-
list | None:
|
|
1526
|
+
list | None:
|
|
1527
|
+
List of partitions or None in case of an error (e.g. column name does not exist).
|
|
1528
|
+
|
|
856
1529
|
"""
|
|
857
1530
|
|
|
858
1531
|
if column_name not in self._df.columns:
|
|
859
|
-
logger.error(
|
|
860
|
-
"
|
|
1532
|
+
self.logger.error(
|
|
1533
|
+
"Cannot partitionate by column -> '%s'. Column does not exist in the data frame. Data frame has these columns -> %s",
|
|
861
1534
|
column_name,
|
|
862
1535
|
str(self._df.columns),
|
|
863
1536
|
)
|
|
864
1537
|
return None
|
|
865
1538
|
|
|
866
|
-
# Separate rows with NaN or None values in the specified column
|
|
1539
|
+
# Separate rows with NaN or None values in the specified column:
|
|
867
1540
|
nan_partitions = self._df[self._df[column_name].isna()]
|
|
1541
|
+
|
|
1542
|
+
# Keep only rows where the specified column has valid (non-NaN) values:
|
|
868
1543
|
non_nan_df = self._df.dropna(subset=[column_name])
|
|
869
1544
|
|
|
870
|
-
# Group by the specified column
|
|
1545
|
+
# Group the non-NaN DataFrame by the specified column's values:
|
|
871
1546
|
grouped = non_nan_df.groupby(column_name)
|
|
1547
|
+
|
|
1548
|
+
# Create a list of partitions (DataFrames) for each unique value in the column:
|
|
872
1549
|
partitions = [group for _, group in grouped]
|
|
873
1550
|
|
|
874
|
-
# Add each row with NaN
|
|
875
|
-
|
|
876
|
-
|
|
1551
|
+
# Add each row with NaN/None as its own partition
|
|
1552
|
+
# iterrows() returns each row as a Series. To convert it back to a DataFrame:
|
|
1553
|
+
# 1. .to_frame() turns the Series into a DataFrame, but with the original column names as rows.
|
|
1554
|
+
# 2. .T (transpose) flips it back, turning the original row into a proper DataFrame row.
|
|
1555
|
+
# This ensures that even rows with NaN values are treated as DataFrame partitions.
|
|
1556
|
+
partitions.extend([row.to_frame().T for _, row in nan_partitions.iterrows()])
|
|
877
1557
|
|
|
878
|
-
logger.info(
|
|
879
|
-
"Data
|
|
1558
|
+
self.logger.info(
|
|
1559
|
+
"Data frame has been partitioned into -> %s partitions based on the values in column -> '%s'...",
|
|
880
1560
|
str(len(partitions)),
|
|
881
1561
|
column_name,
|
|
882
1562
|
)
|
|
@@ -886,18 +1566,19 @@ class Data:
|
|
|
886
1566
|
# end method definition
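
A pandas-only sketch of the grouping behaviour described above: non-NaN rows are grouped by the column value, and each NaN row becomes its own single-row partition.

import pandas as pd

df = pd.DataFrame(
    {"owner": ["alice", "bob", "alice", None], "doc": ["a.pdf", "b.pdf", "c.pdf", "d.pdf"]}
)

# Group the rows with a valid value in the partition column:
non_nan_df = df.dropna(subset=["owner"])
partitions = [group for _, group in non_nan_df.groupby("owner")]

# Add every NaN row as its own single-row partition (Series -> DataFrame via to_frame().T):
nan_rows = df[df["owner"].isna()]
partitions.extend([row.to_frame().T for _, row in nan_rows.iterrows()])

print(len(partitions))  # 3: one for 'alice', one for 'bob', one for the NaN row
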
|
|
887
1567
|
|
|
888
1568
|
def deduplicate(self, unique_fields: list, inplace: bool = True) -> pd.DataFrame:
|
|
889
|
-
"""Remove dupclicate rows that have all fields in
|
|
890
|
-
unique_fields in common.
|
|
1569
|
+
"""Remove dupclicate rows that have all fields in unique_fields in common.
|
|
891
1570
|
|
|
892
1571
|
Args:
|
|
893
|
-
unique_fields (list):
|
|
894
|
-
|
|
895
|
-
inplace (bool, optional):
|
|
896
|
-
|
|
1572
|
+
unique_fields (list):
|
|
1573
|
+
Defines the fields for which we want a unique combination.
|
|
1574
|
+
inplace (bool, optional):
|
|
1575
|
+
True if the deduplication happens in-place. Defaults to True.
|
|
1576
|
+
|
|
897
1577
|
Returns:
|
|
898
|
-
pd.DataFrame
|
|
899
|
-
|
|
900
|
-
|
|
1578
|
+
pd.DataFrame:
|
|
1579
|
+
If inplace is False then a new deduplicated data frame is returned.
|
|
1580
|
+
Otherwise the object is modified in place and self._df is returned.
|
|
1581
|
+
|
|
901
1582
|
"""
|
|
902
1583
|
|
|
903
1584
|
if inplace:
|
|
@@ -911,34 +1592,38 @@ class Data:
|
|
|
911
1592
|
|
|
912
1593
|
# end method definition
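
The in-place branch essentially delegates to pandas drop_duplicates() on the given subset; a minimal sketch of that core step:

import pandas as pd

df = pd.DataFrame({"Code": ["AL02", "AL02", "AL03"], "Type": ["public", "public", "public"]})

# Keep only one row per unique ("Code", "Type") combination - the essence of deduplicate():
deduplicated = df.drop_duplicates(subset=["Code", "Type"])
print(len(deduplicated))  # 2
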
|
|
913
1594
|
|
|
914
|
-
def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame:
|
|
915
|
-
"""Sort the data frame based on one or multiple fields
|
|
916
|
-
|
|
1595
|
+
def sort(self, sort_fields: list, inplace: bool = True) -> pd.DataFrame | None:
|
|
1596
|
+
"""Sort the data frame based on one or multiple fields.
|
|
1597
|
+
|
|
1598
|
+
Sorting can either happen in place or return a new data frame
|
|
1599
|
+
(i.e. not modifying self._df).
|
|
917
1600
|
|
|
918
1601
|
Args:
|
|
919
|
-
sort_fields (list):
|
|
920
|
-
|
|
921
|
-
|
|
1602
|
+
sort_fields (list):
|
|
1603
|
+
The columns / fields to be used for sorting.
|
|
1604
|
+
inplace (bool, optional):
|
|
1605
|
+
If the sorting should be inplace, i.e. modifying self._df.
|
|
1606
|
+
Defaults to True.
|
|
1607
|
+
|
|
922
1608
|
Returns:
|
|
923
|
-
pd.DataFrame
|
|
1609
|
+
pd.DataFrame | None:
|
|
1610
|
+
New data frame (if inplace = False) or self._df (if inplace = True).
|
|
1611
|
+
None in case of an error.
|
|
1612
|
+
|
|
924
1613
|
"""
|
|
925
1614
|
|
|
926
1615
|
if self._df is None:
|
|
927
1616
|
return None
|
|
928
1617
|
|
|
929
1618
|
if not all(sort_field in self._df.columns for sort_field in sort_fields):
|
|
930
|
-
logger.warning(
|
|
931
|
-
"Not all of the given sort fields -> %s do exist in the
|
|
1619
|
+
self.logger.warning(
|
|
1620
|
+
"Not all of the given sort fields -> %s do exist in the data frame.",
|
|
932
1621
|
str(sort_fields),
|
|
933
1622
|
)
|
|
934
|
-
# Reduce the sort fields to those that really exist in the
|
|
935
|
-
sort_fields = [
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
if sort_field in self._df.columns
|
|
939
|
-
]
|
|
940
|
-
logger.warning(
|
|
941
|
-
"Only these given sort fields -> %s do exist as columns in the Data Frame.",
|
|
1623
|
+
# Reduce the sort fields to those that really exist in the data frame:
|
|
1624
|
+
sort_fields = [sort_field for sort_field in sort_fields if sort_field in self._df.columns]
|
|
1625
|
+
self.logger.warning(
|
|
1626
|
+
"Only these given sort fields -> %s do exist as columns in the data frame.",
|
|
942
1627
|
str(sort_fields),
|
|
943
1628
|
)
|
|
944
1629
|
|
|
@@ -953,156 +1638,278 @@ class Data:
|
|
|
953
1638
|
|
|
954
1639
|
# end method definition
|
|
955
1640
|
|
|
956
|
-
def flatten(
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
1641
|
+
def flatten(self, parent_field: str, flatten_fields: list, concatenator: str = "_") -> None:
|
|
1642
|
+
"""Flatten a sub-dictionary by copying selected fields to the parent dictionary.
|
|
1643
|
+
|
|
1644
|
+
This is e.g. useful to then de-duplicate a data frame.
|
|
1645
|
+
Flattening a data frame makes sense in situations where a column used
|
|
1646
|
+
to have a list of dictionaries and got "exploded" (see explode_and_flatten()
|
|
1647
|
+
method below). In this case the column has dictionary values that can then
|
|
1648
|
+
be flattened.
|
|
964
1649
|
|
|
965
1650
|
Args:
|
|
966
|
-
parent_field (str):
|
|
967
|
-
|
|
968
|
-
|
|
1651
|
+
parent_field (str):
|
|
1652
|
+
Name prefix of the new column in the data frame. The flattened field
|
|
1653
|
+
names are added with a leading underscore.
|
|
1654
|
+
flatten_fields (list):
|
|
1655
|
+
Fields in the dictionary of the source column that are copied
|
|
1656
|
+
as new columns into the data frame.
|
|
1657
|
+
concatenator (str, optional):
|
|
1658
|
+
Character or string used to concatenate the parent field with the flattened field
|
|
1659
|
+
to create a unique name.
|
|
1660
|
+
|
|
969
1661
|
"""
|
|
970
1662
|
|
|
1663
|
+
# First do a sanity check if the data frame is not yet initialized.
|
|
1664
|
+
if self._df is None:
|
|
1665
|
+
self.logger.error(
|
|
1666
|
+
"The data frame is not initialized or empty. Cannot flatten field(s) -> '%s' in the data frame.",
|
|
1667
|
+
flatten_fields,
|
|
1668
|
+
)
|
|
1669
|
+
return
|
|
1670
|
+
|
|
1671
|
+
if parent_field not in self._df.columns:
|
|
1672
|
+
self.logger.warning(
|
|
1673
|
+
"The parent field -> '%s' cannot be flattened as it doesn't exist as column in the data frame!",
|
|
1674
|
+
parent_field,
|
|
1675
|
+
)
|
|
1676
|
+
return
|
|
1677
|
+
|
|
971
1678
|
for flatten_field in flatten_fields:
|
|
972
|
-
flat_field = parent_field +
|
|
1679
|
+
flat_field = parent_field + concatenator + flatten_field
|
|
973
1680
|
# The following expression generates a new column in the
|
|
974
1681
|
# data frame with the name of 'flat_field'.
|
|
975
|
-
# In the
|
|
1682
|
+
# In the lambda function x is a dictionary that includes the subvalues
|
|
976
1683
|
# and it returns the value of the given flatten field
|
|
977
1684
|
# (if it exists, otherwise None). So x is self._df[parent_field], i.e.
|
|
978
1685
|
# what the lambda function gets 'applied' on.
|
|
979
1686
|
self._df[flat_field] = self._df[parent_field].apply(
|
|
980
|
-
lambda x, sub_field=flatten_field: (
|
|
981
|
-
x.get(sub_field, None) if isinstance(x, dict) else None
|
|
982
|
-
)
|
|
1687
|
+
lambda x, sub_field=flatten_field: (x.get(sub_field, None) if isinstance(x, dict) else None),
|
|
983
1688
|
)
|
|
984
1689
|
|
|
985
1690
|
# end method definition
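
A sketch of the flattening mechanism with plain pandas: a dict-valued column (hypothetical name "properties") is turned into new "<parent>_<key>" columns via apply, just as the method does.

import pandas as pd

df = pd.DataFrame({"properties": [{"size": 10, "owner": "alice"}, {"size": 20}, None]})

# Same mechanism as flatten(parent_field="properties", flatten_fields=["size", "owner"]):
for flatten_field in ["size", "owner"]:
    flat_field = "properties" + "_" + flatten_field
    df[flat_field] = df["properties"].apply(
        lambda x, sub_field=flatten_field: x.get(sub_field, None) if isinstance(x, dict) else None
    )

print(df[["properties_size", "properties_owner"]])
# Missing keys and non-dict values (like the None row) simply become None.
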
|
|
986
1691
|
|
|
987
1692
|
def explode_and_flatten(
|
|
988
1693
|
self,
|
|
989
|
-
|
|
1694
|
+
explode_fields: str | list,
|
|
990
1695
|
flatten_fields: list | None = None,
|
|
991
1696
|
make_unique: bool = False,
|
|
992
1697
|
reset_index: bool = False,
|
|
993
1698
|
split_string_to_list: bool = False,
|
|
994
1699
|
separator: str = ";,",
|
|
995
|
-
) -> pd.DataFrame:
|
|
996
|
-
"""Explode a substructure in the
|
|
1700
|
+
) -> pd.DataFrame | None:
|
|
1701
|
+
"""Explode a substructure in the Pandas data frame.
|
|
997
1702
|
|
|
998
1703
|
Args:
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
flatten_fields (list):
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1704
|
+
explode_fields (str | list):
|
|
1705
|
+
Field(s) to explode. Each field to explode should have a list structure.
|
|
1706
|
+
Exploding multiple columns at once is possible. This delivers
|
|
1707
|
+
a very different result compared to exploding one column after the other!
|
|
1708
|
+
flatten_fields (list):
|
|
1709
|
+
Fields in the exploded substructure to include
|
|
1710
|
+
in the main dictionaries for easier processing.
|
|
1711
|
+
make_unique (bool, optional):
|
|
1712
|
+
If True, deduplicate the exploded data frame.
|
|
1713
|
+
reset_index (bool, False):
|
|
1714
|
+
If True, then the index is reset, False = Index is not reset.
|
|
1715
|
+
split_string_to_list (bool, optional):
|
|
1716
|
+
If True, split string values in the column(s) to explode into lists using the given separator.
|
|
1717
|
+
separator (str, optional):
|
|
1718
|
+
Characters used to split the string values in the given column into a list.
|
|
1719
|
+
|
|
1009
1720
|
Returns:
|
|
1010
|
-
pd.DataFrame
|
|
1721
|
+
pd.DataFrame | None:
|
|
1722
|
+
Pointer to the Pandas data frame.
|
|
1723
|
+
|
|
1011
1724
|
"""
|
|
1012
1725
|
|
|
1013
|
-
def update_column(row):
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1726
|
+
def update_column(row: pd.Series, sub: str) -> str:
|
|
1727
|
+
"""Extract the value of a sub-column from a nested dictionary within a Pandas Series.
|
|
1728
|
+
|
|
1729
|
+
Args:
|
|
1730
|
+
row (pd.Series):
|
|
1731
|
+
A row from the data frame.
|
|
1732
|
+
sub (str):
|
|
1733
|
+
The sub-column name to extract.
|
|
1734
|
+
|
|
1735
|
+
Returns:
|
|
1736
|
+
str:
|
|
1737
|
+
The value of the sub-column, or an empty string if not found.
|
|
1738
|
+
|
|
1739
|
+
"""
|
|
1740
|
+
|
|
1741
|
+
if isinstance(row, dict) and sub in row:
|
|
1742
|
+
return row[sub]
|
|
1743
|
+
return ""
|
|
1744
|
+
|
|
1745
|
+
# end def update_column()
|
|
1746
|
+
|
|
1747
|
+
def string_to_list(value: str) -> list:
|
|
1748
|
+
"""Convert a string to a list by splitting it using a specified separator.
|
|
1749
|
+
|
|
1750
|
+
If the input is already a list, it is returned as-is. If the input is `None` or a missing value,
|
|
1751
|
+
an empty list is returned. Otherwise, the string is split into a list of substrings using
|
|
1752
|
+
the given separator. Leading and trailing spaces in the resulting substrings are removed.
|
|
1753
|
+
|
|
1754
|
+
Args:
|
|
1755
|
+
value (str):
|
|
1756
|
+
The input string to be converted into a list. Can also be a list, `None`,
|
|
1757
|
+
or a missing value (e.g., NaN).
|
|
1758
|
+
|
|
1759
|
+
Returns:
|
|
1760
|
+
list:
|
|
1761
|
+
A list of substrings if the input is a string, or an empty list if the input
|
|
1762
|
+
is `None` or a missing value. If the input is already a list, it is returned unchanged.
|
|
1763
|
+
|
|
1764
|
+
"""
|
|
1765
|
+
|
|
1766
|
+
# Check if the value is already a list; if so, return it directly
|
|
1767
|
+
if isinstance(value, list):
|
|
1768
|
+
return value
|
|
1769
|
+
|
|
1770
|
+
# If the value is None or a missing value (e.g., NaN), return an empty list
|
|
1771
|
+
if not value or pd.isna(value):
|
|
1772
|
+
return []
|
|
1773
|
+
|
|
1774
|
+
# Use a regular expression to split the string by the separator
|
|
1775
|
+
# and remove leading/trailing spaces from each resulting substring
|
|
1776
|
+
return_list = re.split(rf"[{separator}]\s*", str(value))
|
|
1030
1777
|
|
|
1031
1778
|
return return_list
|
|
1032
1779
|
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1780
|
+
# end def string_to_list()
|
|
1781
|
+
|
|
1782
|
+
#
|
|
1783
|
+
# Start of main method:
|
|
1784
|
+
#
|
|
1785
|
+
|
|
1786
|
+
# First do a sanity check if the data frame is not yet initialized.
|
|
1787
|
+
if self._df is None:
|
|
1788
|
+
self.logger.error(
|
|
1789
|
+
"The data frame is not initialized or empty. Cannot explode data frame.",
|
|
1790
|
+
)
|
|
1791
|
+
return None
|
|
1792
|
+
|
|
1793
|
+
# Next do a sanity check for the given explode_field. It should
|
|
1794
|
+
# either be a string (single column name) or a list (multiple column names):
|
|
1795
|
+
if isinstance(explode_fields, list):
|
|
1796
|
+
self.logger.info("Exploding list of columns -> %s", str(explode_fields))
|
|
1797
|
+
elif isinstance(explode_fields, str):
|
|
1798
|
+
self.logger.info("Exploding single column -> '%s'", explode_fields)
|
|
1037
1799
|
else:
|
|
1038
|
-
logger.error(
|
|
1039
|
-
"Illegal explode field(s) data type
|
|
1800
|
+
self.logger.error(
|
|
1801
|
+
"Illegal explode field(s) data type -> %s. Explode field must either be a string or a list of strings.",
|
|
1802
|
+
type(explode_fields),
|
|
1040
1803
|
)
|
|
1041
1804
|
return self._df
|
|
1042
1805
|
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1806
|
+
# Ensure explode_fields is a list for uniform processing:
|
|
1807
|
+
if isinstance(explode_fields, str):
|
|
1808
|
+
explode_fields = [explode_fields]
|
|
1809
|
+
|
|
1810
|
+
# Process nested field names with '.'
|
|
1811
|
+
processed_fields = []
|
|
1812
|
+
for field in explode_fields:
|
|
1813
|
+
# The "." indicates that the column has dictionary values:
|
|
1814
|
+
if "." in field:
|
|
1815
|
+
main, sub = field.split(".", 1)
|
|
1816
|
+
if main not in self._df.columns:
|
|
1817
|
+
self.logger.error(
|
|
1818
|
+
"The column -> '%s' does not exist in the data frame! Cannot explode it. Data frame has these columns -> %s",
|
|
1819
|
+
main,
|
|
1820
|
+
str(self._df.columns.tolist()),
|
|
1821
|
+
)
|
|
1822
|
+
continue
|
|
1823
|
+
|
|
1824
|
+
# Use update_column to extract the dictionary key specified by the sub value:
|
|
1825
|
+
self.logger.info(
|
|
1826
|
+
"Extracting dictionary value for key -> '%s' from column -> '%s'.",
|
|
1827
|
+
sub,
|
|
1828
|
+
main,
|
|
1829
|
+
)
|
|
1830
|
+
self._df[main] = self._df[main].apply(update_column, args=(sub,))
|
|
1831
|
+
processed_fields.append(main)
|
|
1832
|
+
else:
|
|
1833
|
+
processed_fields.append(field)
|
|
1834
|
+
|
|
1835
|
+
# Verify all processed fields exist in the data frame:
|
|
1836
|
+
missing_columns = [col for col in processed_fields if col not in self._df.columns]
|
|
1837
|
+
if missing_columns:
|
|
1838
|
+
self.logger.error(
|
|
1839
|
+
"The following columns are missing in the data frame and cannot be exploded -> %s. Data frame has these columns -> %s",
|
|
1840
|
+
missing_columns,
|
|
1841
|
+
str(self._df.columns.tolist()),
|
|
1842
|
+
)
|
|
1843
|
+
return self._df
|
|
1844
|
+
|
|
1845
|
+
# Handle splitting strings into lists if required:
|
|
1846
|
+
if split_string_to_list:
|
|
1847
|
+
for field in processed_fields:
|
|
1848
|
+
self.logger.info(
|
|
1849
|
+
"Splitting strings in column -> '%s' into lists using separator -> '%s'",
|
|
1850
|
+
field,
|
|
1058
1851
|
separator,
|
|
1059
1852
|
)
|
|
1060
1853
|
# Apply the function to convert the string values in the column (given by the name in explode_field) to lists
|
|
1061
1854
|
# The string_to_list() sub-method above also considers the separator parameter.
|
|
1062
|
-
self._df[
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1855
|
+
self._df[field] = self._df[field].apply(string_to_list)
|
|
1856
|
+
|
|
1857
|
+
# Explode all specified columns at once.
|
|
1858
|
+
# explode() can either take a string field or a list of fields.
|
|
1859
|
+
# It is VERY important to do the explosion of multiple columns together -
|
|
1860
|
+
# otherwise we get combinatorial explosion. Explosion of multiple columns 1-by-1
|
|
1861
|
+
# is VERY different from doing the explosion together!
|
|
1862
|
+
self.logger.info("Validated column(s) to explode -> %s", processed_fields)
|
|
1863
|
+
try:
|
|
1864
|
+
self._df = self._df.explode(
|
|
1865
|
+
column=processed_fields,
|
|
1866
|
+
ignore_index=reset_index,
|
|
1867
|
+
)
|
|
1068
1868
|
except ValueError:
|
|
1069
|
-
logger.error(
|
|
1070
|
-
"
|
|
1869
|
+
self.logger.error(
|
|
1870
|
+
"Error exploding columns -> %s",
|
|
1871
|
+
processed_fields,
|
|
1071
1872
|
)
|
|
1873
|
+
return self._df
|
|
1072
1874
|
|
|
1073
1875
|
if flatten_fields:
|
|
1074
|
-
|
|
1876
|
+
# Ensure that flatten() is called for each exploded column
|
|
1877
|
+
for field in processed_fields:
|
|
1878
|
+
self.flatten(parent_field=field, flatten_fields=flatten_fields)
|
|
1075
1879
|
|
|
1880
|
+
# Deduplicate rows if required
|
|
1076
1881
|
if make_unique:
|
|
1077
1882
|
self._df.drop_duplicates(subset=flatten_fields, inplace=True)
|
|
1078
1883
|
|
|
1884
|
+
# Reset index explicitly if not handled during explode
|
|
1079
1885
|
if reset_index:
|
|
1080
|
-
self._df.reset_index(inplace=True)
|
|
1886
|
+
self._df.reset_index(drop=True, inplace=True)
|
|
1081
1887
|
|
|
1082
1888
|
return self._df
|
|
1083
1889
|
|
|
1084
1890
|
# end method definition
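
The comments above stress exploding multiple list columns together rather than one after the other; a small pandas sketch shows why the results differ:

import pandas as pd

df = pd.DataFrame({"a": [[1, 2]], "b": [["x", "y"]]})

# Exploding both columns together keeps the positional pairing (1/x, 2/y) -> 2 rows:
together = df.explode(column=["a", "b"], ignore_index=True)
print(len(together))  # 2

# Exploding one column after the other builds the cartesian product -> 4 rows:
one_by_one = df.explode("a").explode("b")
print(len(one_by_one))  # 4
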
|
|
1085
1891
|
|
|
1086
1892
|
def drop_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
|
|
1087
|
-
"""Drop selected columns from the
|
|
1893
|
+
"""Drop selected columns from the Pandas data frame.
|
|
1088
1894
|
|
|
1089
1895
|
Args:
|
|
1090
|
-
column_names (list):
|
|
1091
|
-
|
|
1092
|
-
|
|
1896
|
+
column_names (list):
|
|
1897
|
+
The list of column names to drop.
|
|
1898
|
+
inplace (bool, optional):
|
|
1899
|
+
Whether or not the dropping should be inplace, i.e. modifying self._df.
|
|
1900
|
+
Defaults to True.
|
|
1901
|
+
|
|
1093
1902
|
Returns:
|
|
1094
|
-
pd.DataFrame:
|
|
1903
|
+
pd.DataFrame:
|
|
1904
|
+
New data frame (if inplace = False) or self._df (if inplace = True)
|
|
1905
|
+
|
|
1095
1906
|
"""
|
|
1096
1907
|
|
|
1097
1908
|
if not all(column_name in self._df.columns for column_name in column_names):
|
|
1098
|
-
# Reduce the column names to those that really exist in the
|
|
1099
|
-
column_names = [
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
if column_name in self._df.columns
|
|
1103
|
-
]
|
|
1104
|
-
logger.warning(
|
|
1105
|
-
"Reduce to these columns -> %s that do exist in the Data Frame.",
|
|
1909
|
+
# Reduce the column names to those that really exist in the data frame:
|
|
1910
|
+
column_names = [column_name for column_name in column_names if column_name in self._df.columns]
|
|
1911
|
+
self.logger.info(
|
|
1912
|
+
"Drop columns -> %s from the data frame.",
|
|
1106
1913
|
str(column_names),
|
|
1107
1914
|
)
|
|
1108
1915
|
|
|
@@ -1116,25 +1923,26 @@ class Data:
|
|
|
1116
1923
|
# end method definition
|
|
1117
1924
|
|
|
1118
1925
|
def keep_columns(self, column_names: list, inplace: bool = True) -> pd.DataFrame:
|
|
1119
|
-
"""Keep only selected columns
|
|
1926
|
+
"""Keep only selected columns in the data frame. Drop the rest.
|
|
1120
1927
|
|
|
1121
1928
|
Args:
|
|
1122
|
-
column_names (list):
|
|
1123
|
-
|
|
1124
|
-
|
|
1929
|
+
column_names (list):
|
|
1930
|
+
A list of column names to keep.
|
|
1931
|
+
inplace (bool, optional):
|
|
1932
|
+
If the keeping should be inplace, i.e. modifying self._df.
|
|
1933
|
+
Defaults to True.
|
|
1934
|
+
|
|
1125
1935
|
Returns:
|
|
1126
|
-
pd.DataFrame:
|
|
1936
|
+
pd.DataFrame:
|
|
1937
|
+
New data frame (if inplace = False) or self._df (if inplace = True).
|
|
1938
|
+
|
|
1127
1939
|
"""
|
|
1128
1940
|
|
|
1129
1941
|
if not all(column_name in self._df.columns for column_name in column_names):
|
|
1130
|
-
# Reduce the column names to those that really exist in the
|
|
1131
|
-
column_names = [
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
if column_name in self._df.columns
|
|
1135
|
-
]
|
|
1136
|
-
logger.warning(
|
|
1137
|
-
"Reduce to these columns -> %s that do exist in the Data Frame.",
|
|
1942
|
+
# Reduce the column names to those that really exist in the data frame:
|
|
1943
|
+
column_names = [column_name for column_name in column_names if column_name in self._df.columns]
|
|
1944
|
+
self.logger.info(
|
|
1945
|
+
"Reduce columns to keep to these columns -> %s that do exist in the data frame.",
|
|
1138
1946
|
column_names,
|
|
1139
1947
|
)
|
|
1140
1948
|
|
|
@@ -1152,272 +1960,797 @@ class Data:
|
|
|
1152
1960
|
|
|
1153
1961
|
# end method definition
|
|
1154
1962
|
|
|
1155
|
-
def
|
|
1156
|
-
"""
|
|
1963
|
+
def rename_column(self, old_column_name: str, new_column_name: str) -> bool:
|
|
1964
|
+
"""Rename a data frame column.
|
|
1965
|
+
|
|
1966
|
+
Args:
|
|
1967
|
+
old_column_name (str):
|
|
1968
|
+
The old name of the column.
|
|
1969
|
+
new_column_name (str):
|
|
1970
|
+
The new name of the column.
|
|
1971
|
+
|
|
1972
|
+
Returns:
|
|
1973
|
+
bool:
|
|
1974
|
+
True = Success, False = Error
|
|
1975
|
+
|
|
1976
|
+
"""
|
|
1977
|
+
|
|
1978
|
+
if self._df is None:
|
|
1979
|
+
return False
|
|
1980
|
+
|
|
1981
|
+
if old_column_name not in self._df.columns:
|
|
1982
|
+
self.logger.error(
|
|
1983
|
+
"Cannot rename column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
1984
|
+
old_column_name,
|
|
1985
|
+
str(self._df.columns),
|
|
1986
|
+
)
|
|
1987
|
+
return False
|
|
1988
|
+
|
|
1989
|
+
if new_column_name in self._df.columns:
|
|
1990
|
+
self.logger.error(
|
|
1991
|
+
"Cannot rename column -> '%s' to -> '%s'. New name does already exist as column in the data frame! Data frame has these columns -> %s",
|
|
1992
|
+
old_column_name,
|
|
1993
|
+
new_column_name,
|
|
1994
|
+
str(self._df.columns),
|
|
1995
|
+
)
|
|
1996
|
+
return False
|
|
1997
|
+
|
|
1998
|
+
self._df.rename(columns={old_column_name: new_column_name}, inplace=True)
|
|
1999
|
+
|
|
2000
|
+
return True
|
|
2001
|
+
|
|
2002
|
+
# end method definition
|
|
2003
|
+
|
|
2004
|
+
def is_dict_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
|
|
2005
|
+
"""Safely checks if a column predominantly contains dictionary-like objects.
|
|
2006
|
+
|
|
2007
|
+
Args:
|
|
2008
|
+
column (pd.Series):
|
|
2009
|
+
The pandas Series (column) to check.
|
|
2010
|
+
threshold (float, optional):
|
|
2011
|
+
0.0 < threshold <= 1.0. Float representation of the percentage.
|
|
2012
|
+
Default = 0.5 (50%).
|
|
2013
|
+
|
|
2014
|
+
Returns:
|
|
2015
|
+
bool:
|
|
2016
|
+
True if the column contains mostly dictionary-like objects, False otherwise.
|
|
2017
|
+
|
|
2018
|
+
"""
|
|
2019
|
+
|
|
2020
|
+
if not isinstance(column, pd.Series):
|
|
2021
|
+
self.logger.error(
|
|
2022
|
+
"Expected Pandas series, but got -> %s",
|
|
2023
|
+
str(type(column)),
|
|
2024
|
+
)
|
|
2025
|
+
return False
|
|
2026
|
+
if not 0.0 < threshold <= 1.0:
|
|
2027
|
+
self.logger.error(
|
|
2028
|
+
"Threshold must be between 0.0 and 1.0, but got -> %s",
|
|
2029
|
+
str(threshold),
|
|
2030
|
+
)
|
|
2031
|
+
return False
|
|
2032
|
+
|
|
2033
|
+
# Drop null values (NaN or None) and check types of remaining values
|
|
2034
|
+
non_null_values = column.dropna()
|
|
2035
|
+
dict_count = non_null_values.apply(lambda x: isinstance(x, dict)).sum()
|
|
2036
|
+
|
|
2037
|
+
# If more than threshold % of non-null values are dictionaries, return True.
|
|
2038
|
+
# Else return False.
|
|
2039
|
+
return dict_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
|
|
2040
|
+
|
|
2041
|
+
# end method definition
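
The threshold check compares the share of dictionary values among the non-null values of the column; a short sketch of the same computation:

import pandas as pd

column = pd.Series([{"a": 1}, {"b": 2}, None, "not a dict"])

non_null_values = column.dropna()
dict_count = non_null_values.apply(lambda x: isinstance(x, dict)).sum()

# 2 of 3 non-null values are dictionaries -> ratio ~0.67 exceeds the 0.5 default threshold:
print(dict_count / len(non_null_values) > 0.5)  # True
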
|
|
2042
|
+
|
|
2043
|
+
def is_list_column(self, column: pd.Series, threshold: float = 0.5) -> bool:
|
|
2044
|
+
"""Safely checks if a column predominantly contains list-like objects.
|
|
2045
|
+
|
|
2046
|
+
Args:
|
|
2047
|
+
column (pd.Series):
|
|
2048
|
+
The pandas Series (column) to check.
|
|
2049
|
+
threshold (float, optional):
|
|
2050
|
+
0.0 < threshold <= 1.0. Float representation of the percentage. Default = 0.5 (50%).
|
|
2051
|
+
|
|
2052
|
+
Returns:
|
|
2053
|
+
bool:
|
|
2054
|
+
True if the column contains list-like objects, False otherwise.
|
|
2055
|
+
|
|
2056
|
+
"""
|
|
2057
|
+
|
|
2058
|
+
if not isinstance(column, pd.Series):
|
|
2059
|
+
self.logger.error(
|
|
2060
|
+
"Expected pandas series, but got -> %s",
|
|
2061
|
+
str(type(column)),
|
|
2062
|
+
)
|
|
2063
|
+
return False
|
|
2064
|
+
if not 0.0 < threshold <= 1.0:
|
|
2065
|
+
self.logger.error(
|
|
2066
|
+
"Threshold must be between 0.0 and 1.0, but got -> %s",
|
|
2067
|
+
str(threshold),
|
|
2068
|
+
)
|
|
2069
|
+
return False
|
|
2070
|
+
|
|
2071
|
+
# Drop null values (NaN or None) and check types of remaining values
|
|
2072
|
+
non_null_values = column.dropna()
|
|
2073
|
+
list_count = non_null_values.apply(lambda x: isinstance(x, list)).sum()
|
|
2074
|
+
|
|
2075
|
+
# If more than threshold % of non-null values are lists, return True.
|
|
2076
|
+
# Else return False.
|
|
2077
|
+
return list_count / len(non_null_values) > threshold if len(non_null_values) > 0 else False
|
|
2078
|
+
|
|
2079
|
+
# end method definition
|
|
2080
|
+
|
|
2081
|
+
def is_string_column(self, column: pd.Series) -> bool:
|
|
2082
|
+
"""Determine if a Pandas series predominantly contains string values, ignoring NaN values.
|
|
1157
2083
|
|
|
1158
2084
|
Args:
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
cleansings = {
|
|
1167
|
-
"airportName": {
|
|
1168
|
-
"upper": true
|
|
1169
|
-
"replacements" : {
|
|
1170
|
-
"-": " ", # replace hypen with space
|
|
1171
|
-
",\s*": " ", # remove commas followed by on or more spaces with a single space
|
|
1172
|
-
"\s+$": "", # remove trailing spaces at the end of the name
|
|
1173
|
-
"^\s+": "", # remove spaces at the beginning of the name
|
|
1174
|
-
}
|
|
1175
|
-
"length": 10
|
|
1176
|
-
}
|
|
1177
|
-
"airportId": {
|
|
1178
|
-
"upper": true
|
|
1179
|
-
"replacements" : {
|
|
1180
|
-
"K(.{3})": "\1", # if the airport has 4 charters and starts with a 'K' we remove the 'K'
|
|
1181
|
-
"\/": "", # remove forward slashes - this helps to have consistency with N/A, NA, n/a, na
|
|
1182
|
-
}
|
|
1183
|
-
}
|
|
1184
|
-
}
|
|
2085
|
+
column (pd.Series):
|
|
2086
|
+
The Pandas Series to check.
|
|
2087
|
+
|
|
2088
|
+
Returns:
|
|
2089
|
+
bool:
|
|
2090
|
+
True if all non-NaN values in the column are strings, False otherwise.
|
|
2091
|
+
|
|
1185
2092
|
"""
|
|
1186
2093
|
|
|
1187
|
-
#
|
|
2094
|
+
# Drop NaN values and check if remaining values are strings
|
|
2095
|
+
return column.dropna().map(lambda x: isinstance(x, str)).all()
|
|
2096
|
+
|
|
2097
|
+
# end method definition
|
|
2098
|
+
|
|
2099
|
+
def cleanse(self, cleansings: dict) -> None:
|
|
2100
|
+
"""Cleanse data with regular expressions and upper/lower case conversions.
|
|
2101
|
+
|
|
2102
|
+
Args:
|
|
2103
|
+
cleansings (dict):
|
|
2104
|
+
Dictionary with keys that equal the column names.
|
|
2105
|
+
The dictionary values are dictionaries themselves with
|
|
2106
|
+
these fields:
|
|
2107
|
+
* replacements (dict): regular expression patterns (keys) and replacement strings (values)
|
|
2108
|
+
* upper (bool, optional, default = False): change the value to uppercase
|
|
2109
|
+
* lower (bool, optional, default = False): change the value to lowercase
|
|
2110
|
+
* capitalize (bool, optional, default = False) - first character upper case, rest lower-case
|
|
2111
|
+
* title (bool, optional, default = False) - first character of each word upper case
|
|
2112
|
+
* length (int, optional, default = 0): truncate to max length
|
|
2113
|
+
|
|
2114
|
+
"""
|
|
2115
|
+
|
|
2116
|
+
# Iterate over each column in the cleansing dictionary
|
|
1188
2117
|
for column, cleansing in cleansings.items():
|
|
1189
|
-
#
|
|
1190
|
-
|
|
2118
|
+
# Read the cleansing parameters:
|
|
2119
|
+
replacements = cleansing.get("replacements", {})
|
|
2120
|
+
upper = cleansing.get("upper", False)
|
|
2121
|
+
lower = cleansing.get("lower", False)
|
|
2122
|
+
capitalize = cleansing.get("capitalize", False)
|
|
2123
|
+
title = cleansing.get("title", False)
|
|
2124
|
+
length = cleansing.get("length", 0)
|
|
2125
|
+
|
|
2126
|
+
# Handle dict columns - we expect the column name to separate
|
|
2127
|
+
# main field from sub field using a dot syntax (e.g., "column.subfield")
|
|
1191
2128
|
if "." in column:
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
2129
|
+
column, dict_key = column.split(".", 1)
|
|
2130
|
+
if column not in self._df.columns:
|
|
2131
|
+
self.logger.error(
|
|
2132
|
+
"Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
2133
|
+
column,
|
|
2134
|
+
str(self._df.columns),
|
|
2135
|
+
)
|
|
1195
2136
|
continue
|
|
1196
|
-
#
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
2137
|
+
# Apply cleansing to dictionary values in the main column
|
|
2138
|
+
self.logger.info(
|
|
2139
|
+
"Cleansing for column -> '%s' has a subfield -> '%s' configured. Do cleansing for dictionary items with key -> '%s'...",
|
|
2140
|
+
column,
|
|
2141
|
+
dict_key,
|
|
2142
|
+
dict_key,
|
|
2143
|
+
)
|
|
2144
|
+
self._df[column] = self._df[column].apply(
|
|
2145
|
+
lambda x,
|
|
2146
|
+
dict_key=dict_key,
|
|
2147
|
+
replacements=replacements,
|
|
2148
|
+
upper=upper,
|
|
2149
|
+
lower=lower,
|
|
2150
|
+
capitalize=capitalize,
|
|
2151
|
+
title=title,
|
|
2152
|
+
length=length: self._cleanse_subfield(
|
|
1200
2153
|
data=x,
|
|
1201
|
-
|
|
1202
|
-
replacements=
|
|
1203
|
-
upper=
|
|
1204
|
-
lower=
|
|
1205
|
-
|
|
1206
|
-
|
|
2154
|
+
dict_key=dict_key,
|
|
2155
|
+
replacements=replacements,
|
|
2156
|
+
upper=upper,
|
|
2157
|
+
lower=lower,
|
|
2158
|
+
capitalize=capitalize,
|
|
2159
|
+
title=title,
|
|
2160
|
+
length=length,
|
|
2161
|
+
),
|
|
1207
2162
|
)
|
|
1208
|
-
|
|
1209
|
-
|
|
2163
|
+
# end if "." in column
|
|
2164
|
+
else: # the else case handles strings and list columns
|
|
2165
|
+
if column not in self._df.columns:
|
|
2166
|
+
self.logger.error(
|
|
2167
|
+
"Cannot cleanse column -> '%s'. It does not exist in the data frame! Data frame has these columns -> %s",
|
|
2168
|
+
column,
|
|
2169
|
+
str(self._df.columns),
|
|
2170
|
+
)
|
|
1210
2171
|
continue
|
|
1211
2172
|
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
self.
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
|
|
1240
|
-
# Wrap with word boundaries for whole-word matching
|
|
1241
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1242
|
-
self._df[column] = self._df[column].str.replace(
|
|
1243
|
-
pat=regex_pattern, repl=replacement, regex=True
|
|
2173
|
+
# Handle string columns:
|
|
2174
|
+
if self.is_string_column(self._df[column]):
|
|
2175
|
+
# Apply cleansing operations on string column
|
|
2176
|
+
self.logger.info(
|
|
2177
|
+
"Column -> '%s' has string values. Do cleansing for string values...",
|
|
2178
|
+
column,
|
|
2179
|
+
)
|
|
2180
|
+
self._df[column] = self._df[column].apply(
|
|
2181
|
+
lambda x,
|
|
2182
|
+
replacements=replacements,
|
|
2183
|
+
upper=upper,
|
|
2184
|
+
lower=lower,
|
|
2185
|
+
capitalize=capitalize,
|
|
2186
|
+
title=title,
|
|
2187
|
+
length=length: (
|
|
2188
|
+
self._apply_string_cleansing(
|
|
2189
|
+
value=x,
|
|
2190
|
+
replacements=replacements,
|
|
2191
|
+
upper=upper,
|
|
2192
|
+
lower=lower,
|
|
2193
|
+
capitalize=capitalize,
|
|
2194
|
+
title=title,
|
|
2195
|
+
length=length,
|
|
2196
|
+
)
|
|
2197
|
+
if isinstance(x, str)
|
|
2198
|
+
else x
|
|
2199
|
+
),
|
|
1244
2200
|
)
|
|
1245
2201
|
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
self.
|
|
1251
|
-
|
|
2202
|
+
# Handle list columns:
|
|
2203
|
+
elif self.is_list_column(self._df[column]):
|
|
2204
|
+
# Handle list-like columns for this we iterate over each list item
|
|
2205
|
+
# and apply the cleansing by calling _apply_string_cleansing() for item:
|
|
2206
|
+
self.logger.info(
|
|
2207
|
+
"Column -> '%s' has list values. Do cleansing for each list item...",
|
|
2208
|
+
column,
|
|
2209
|
+
)
|
|
2210
|
+
self._df[column] = self._df[column].apply(
|
|
2211
|
+
lambda x,
|
|
2212
|
+
replacements=replacements,
|
|
2213
|
+
upper=upper,
|
|
2214
|
+
lower=lower,
|
|
2215
|
+
capitalize=capitalize,
|
|
2216
|
+
title=title,
|
|
2217
|
+
length=length: (
|
|
2218
|
+
[
|
|
2219
|
+
(
|
|
2220
|
+
self._apply_string_cleansing(
|
|
2221
|
+
value=item,
|
|
2222
|
+
replacements=replacements,
|
|
2223
|
+
upper=upper,
|
|
2224
|
+
lower=lower,
|
|
2225
|
+
capitalize=capitalize,
|
|
2226
|
+
title=title,
|
|
2227
|
+
length=length,
|
|
2228
|
+
)
|
|
2229
|
+
if isinstance(
|
|
2230
|
+
item,
|
|
2231
|
+
str,
|
|
2232
|
+
) # we just change string list items
|
|
2233
|
+
else item
|
|
2234
|
+
)
|
|
2235
|
+
for item in x
|
|
2236
|
+
]
|
|
2237
|
+
if isinstance(x, list)
|
|
2238
|
+
else x
|
|
2239
|
+
),
|
|
2240
|
+
)
|
|
2241
|
+
|
|
2242
|
+
else:
|
|
2243
|
+
self.logger.error(
|
|
2244
|
+
"Column -> '%s' is not a string, list, or dict-like column. Skipping cleansing...",
|
|
2245
|
+
column,
|
|
2246
|
+
)
|
|
2247
|
+
# end else handling strings and lists
|
|
2248
|
+
# for column, cleansing in cleansings.items()
|
|
2249
|
+
|
|
2250
|
+
# end method definition
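
A hypothetical cleansings dictionary illustrating the supported keys; the column names are invented for illustration, and the call assumes the Data class is importable from pyxecm.helper.data with a default constructor:

from pyxecm.helper.data import Data  # assumed import path

cleansings = {
    # plain string column (hypothetical name):
    "airportName": {
        "upper": True,                # convert to upper case first
        "replacements": {"-": " "},   # then replace hyphens with spaces
        "length": 30,                 # finally truncate to 30 characters
    },
    # dict column: cleanse only the value stored under the key 'code' (hypothetical):
    "airport.code": {
        "upper": True,
    },
}

data = Data()  # assumed default constructor
# ... load some data into the frame first, e.g. via load_json_data() or load_directory() ...
data.cleanse(cleansings=cleansings)
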
|
|
2251
|
+
|
|
2252
|
+
def _cleanse_dictionary(
|
|
2253
|
+
self,
|
|
2254
|
+
data: dict,
|
|
2255
|
+
dict_key: str,
|
|
2256
|
+
replacements: dict[str, str],
|
|
2257
|
+
upper: bool,
|
|
2258
|
+
lower: bool,
|
|
2259
|
+
capitalize: bool = False,
|
|
2260
|
+
title: bool = False,
|
|
2261
|
+
length: int = 0,
|
|
2262
|
+
) -> dict:
|
|
2263
|
+
"""Cleanse dictionary data within a single column value that has a given key.
|
|
2264
|
+
|
|
2265
|
+
Args:
|
|
2266
|
+
data (dict):
|
|
2267
|
+
The column dictionary value.
|
|
2268
|
+
dict_key (str):
|
|
2269
|
+
The dictionary key whose value should be cleansed in the row to cleanse.
|
|
2270
|
+
replacements (dict):
|
|
2271
|
+
Dictionary of regex replacements to apply to the subfield value.
|
|
2272
|
+
upper (bool):
|
|
2273
|
+
If True, convert value in subfield to upper-case.
|
|
2274
|
+
lower (bool):
|
|
2275
|
+
If True, convert value in subfield to lower-case.
|
|
2276
|
+
capitalize (bool, optional):
|
|
2277
|
+
If True, capitalize the first letter of the subfield value.
|
|
2278
|
+
title (bool, optional):
|
|
2279
|
+
If True, title-case the subfield value.
|
|
2280
|
+
length (int, optional):
|
|
2281
|
+
The maximum length for the subfield value.
|
|
2282
|
+
|
|
2283
|
+
Returns:
|
|
2284
|
+
dict:
|
|
2285
|
+
The updated data with the cleansing applied to the dictionary item with the given key.
|
|
2286
|
+
|
|
2287
|
+
"""
|
|
2288
|
+
|
|
2289
|
+
if pd.isna(data):
|
|
2290
|
+
return data
|
|
2291
|
+
|
|
2292
|
+
if dict_key not in data:
|
|
2293
|
+
self.logger.warning(
|
|
2294
|
+
"The dictionary key -> '%s' (field) is not in the data frame row! Cleansing skipped!",
|
|
2295
|
+
dict_key,
|
|
2296
|
+
)
|
|
2297
|
+
return data
|
|
2298
|
+
|
|
2299
|
+
# 1. Read the value to be cleansed from the data dict:
|
|
2300
|
+
value = data[dict_key]
|
|
2301
|
+
|
|
2302
|
+
# 2. Apply string operations based on the type of the value (str, list, or dict)
|
|
2303
|
+
|
|
2304
|
+
if isinstance(value, str):
|
|
2305
|
+
# If the value is a string, apply the string operations directly
|
|
2306
|
+
value: str = self._apply_string_cleansing(
|
|
2307
|
+
value=value,
|
|
2308
|
+
replacements=replacements,
|
|
2309
|
+
upper=upper,
|
|
2310
|
+
lower=lower,
|
|
2311
|
+
capitalize=capitalize,
|
|
2312
|
+
title=title,
|
|
2313
|
+
length=length,
|
|
2314
|
+
)
|
|
2315
|
+
elif isinstance(value, list):
|
|
2316
|
+
# If the value is a list, apply string operations to each element
|
|
2317
|
+
value: list = [
|
|
2318
|
+
(
|
|
2319
|
+
self._apply_string_cleansing(
|
|
2320
|
+
value=item,
|
|
2321
|
+
replacements=replacements,
|
|
2322
|
+
upper=upper,
|
|
2323
|
+
lower=lower,
|
|
2324
|
+
capitalize=capitalize,
|
|
2325
|
+
title=title,
|
|
2326
|
+
length=length,
|
|
2327
|
+
)
|
|
2328
|
+
if isinstance(item, str)
|
|
2329
|
+
else item
|
|
2330
|
+
)
|
|
2331
|
+
for item in value
|
|
2332
|
+
]
|
|
2333
|
+
elif isinstance(value, dict):
|
|
2334
|
+
# If the value is a dictionary, apply string operations to each value
|
|
2335
|
+
value: dict = {
|
|
2336
|
+
k: (
|
|
2337
|
+
self._apply_string_cleansing(
|
|
2338
|
+
value=v,
|
|
2339
|
+
replacements=replacements,
|
|
2340
|
+
upper=upper,
|
|
2341
|
+
lower=lower,
|
|
2342
|
+
capitalize=capitalize,
|
|
2343
|
+
title=title,
|
|
2344
|
+
length=length,
|
|
1252
2345
|
)
|
|
2346
|
+
if isinstance(v, str)
|
|
2347
|
+
else v
|
|
2348
|
+
)
|
|
2349
|
+
for k, v in value.items()
|
|
2350
|
+
}
|
|
2351
|
+
|
|
2352
|
+
# 3. Write back the cleansed value to the data dict:
|
|
2353
|
+
data[dict_key] = value
|
|
1253
2354
|
|
|
1254
|
-
|
|
2355
|
+
return data
|
|
1255
2356
|
|
|
1256
2357
|
# end method definition
|
|
1257
2358
|
|
|
1258
2359
|
def _cleanse_subfield(
|
|
1259
2360
|
self,
|
|
1260
|
-
data:
|
|
1261
|
-
|
|
1262
|
-
replacements: dict,
|
|
2361
|
+
data: dict | list,
|
|
2362
|
+
dict_key: str,
|
|
2363
|
+
replacements: dict[str, str],
|
|
1263
2364
|
upper: bool,
|
|
1264
2365
|
lower: bool,
|
|
2366
|
+
capitalize: bool = False,
|
|
2367
|
+
title: bool = False,
|
|
1265
2368
|
length: int = 0,
|
|
1266
|
-
) ->
|
|
1267
|
-
"""
|
|
2369
|
+
) -> dict | list:
|
|
2370
|
+
"""Cleanse subfield data within a single column value.
|
|
2371
|
+
|
|
2372
|
+
This is NOT a pd.Series but either a dictionary or a list of dictionaries.
|
|
1268
2373
|
|
|
1269
2374
|
Args:
|
|
1270
|
-
data (
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
2375
|
+
data (dict | list):
|
|
2376
|
+
The column value. Can be a dictionary or a list of dictionaries
|
|
2377
|
+
dict_key (str):
|
|
2378
|
+
The dictionary key whose value should be cleansed in the data to cleanse.
|
|
2379
|
+
replacements (dict):
|
|
2380
|
+
Dictionary of regex replacements to apply to the subfield value.
|
|
2381
|
+
upper (bool):
|
|
2382
|
+
If True, convert value in subfield to upper-case.
|
|
2383
|
+
lower (bool):
|
|
2384
|
+
If True, convert value in subfield to lower-case.
|
|
2385
|
+
capitalize (bool, optional):
|
|
2386
|
+
If True, capitalize the first letter of the subfield value.
|
|
2387
|
+
title (bool, optional):
|
|
2388
|
+
If True, title-case the subfield value.
|
|
2389
|
+
length (int, optional):
|
|
2390
|
+
The maximum length for the subfield value.
|
|
2391
|
+
|
|
1276
2392
|
Returns:
|
|
1277
|
-
|
|
2393
|
+
dict | list:
|
|
2394
|
+
The updated data with the cleansing applied to the subfield.
|
|
2395
|
+
|
|
1278
2396
|
"""
|
|
1279
2397
|
|
|
1280
2398
|
if isinstance(data, list):
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
for regex_pattern, replacement in replacements.items():
|
|
1293
|
-
if replacement:
|
|
1294
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1295
|
-
item[sub_field] = re.sub(
|
|
1296
|
-
regex_pattern, replacement, item[sub_field]
|
|
1297
|
-
)
|
|
1298
|
-
if length > 0:
|
|
1299
|
-
item[sub_field] = item[sub_field][:length]
|
|
1300
|
-
data[i] = item
|
|
1301
|
-
elif isinstance(data, dict):
|
|
1302
|
-
# If data is a dictionary, apply cleansing directly to the subfield
|
|
1303
|
-
if sub_field in data and not pd.isnull(data[sub_field]):
|
|
1304
|
-
if upper:
|
|
1305
|
-
data[sub_field] = data[sub_field].upper()
|
|
1306
|
-
elif lower:
|
|
1307
|
-
data[sub_field] = data[sub_field].lower()
|
|
1308
|
-
for regex_pattern, replacement in replacements.items():
|
|
1309
|
-
if replacement:
|
|
1310
|
-
regex_pattern = rf"\b{regex_pattern}\b"
|
|
1311
|
-
data[sub_field] = re.sub(
|
|
1312
|
-
regex_pattern, replacement, data[sub_field]
|
|
2399
|
+
data = [
|
|
2400
|
+
(
|
|
2401
|
+
self._cleanse_dictionary(
|
|
2402
|
+
data=item,
|
|
2403
|
+
dict_key=dict_key,
|
|
2404
|
+
replacements=replacements,
|
|
2405
|
+
upper=upper,
|
|
2406
|
+
lower=lower,
|
|
2407
|
+
capitalize=capitalize,
|
|
2408
|
+
title=title,
|
|
2409
|
+
length=length,
|
|
1313
2410
|
)
|
|
1314
|
-
|
|
1315
|
-
|
|
2411
|
+
if item is not None and dict_key in item and not pd.isna(item[dict_key])
|
|
2412
|
+
else item
|
|
2413
|
+
)
|
|
2414
|
+
for item in data
|
|
2415
|
+
]
|
|
2416
|
+
elif isinstance(data, dict):
|
|
2417
|
+
data = self._cleanse_dictionary(
|
|
2418
|
+
data=data,
|
|
2419
|
+
dict_key=dict_key,
|
|
2420
|
+
replacements=replacements,
|
|
2421
|
+
upper=upper,
|
|
2422
|
+
lower=lower,
|
|
2423
|
+
capitalize=capitalize,
|
|
2424
|
+
title=title,
|
|
2425
|
+
length=length,
|
|
2426
|
+
)
|
|
2427
|
+
|
|
1316
2428
|
return data
|
|
1317
2429
|
|
|
1318
2430
|
# end method definition
|
|
1319
2431
|
|
|
1320
|
-
def
|
|
1321
|
-
|
|
2432
|
+
def _apply_string_cleansing(
|
|
2433
|
+
self,
|
|
2434
|
+
value: str,
|
|
2435
|
+
replacements: dict[str, str],
|
|
2436
|
+
upper: bool,
|
|
2437
|
+
lower: bool,
|
|
2438
|
+
capitalize: bool,
|
|
2439
|
+
title: bool,
|
|
2440
|
+
length: int,
|
|
2441
|
+
) -> str | None:
|
|
2442
|
+
"""Apply string operations (upper, lower, capitalize, title-case, replacements) to a string.
|
|
2443
|
+
|
|
2444
|
+
Args:
|
|
2445
|
+
value (str):
|
|
2446
|
+
The string value to which the operations will be applied.
|
|
2447
|
+
replacements (dict[str, str]):
|
|
2448
|
+
A dictionary of regular expression patterns (keys) and replacement strings (values) to apply to the string.
|
|
2449
|
+
upper (bool):
|
|
2450
|
+
If True, convert the string to uppercase.
|
|
2451
|
+
lower (bool):
|
|
2452
|
+
If True, convert the string to lowercase.
|
|
2453
|
+
capitalize (bool):
|
|
2454
|
+
If True, capitalize the first letter of the string and lowercase the rest. Default is False.
|
|
2455
|
+
title (bool):
|
|
2456
|
+
If True, convert the string to title-case (first letter of each word is capitalized). Default is False.
|
|
2457
|
+
length (int):
|
|
2458
|
+
If greater than 0, truncate the string to this length. Default is 0 (no truncation).
|
|
2459
|
+
|
|
2460
|
+
Returns:
|
|
2461
|
+
str | None:
|
|
2462
|
+
The updated string with all the applied operations. None in case an error occurred.
|
|
2463
|
+
|
|
2464
|
+
Example:
|
|
2465
|
+
value = "hello world"
|
|
2466
|
+
replacements = {r"world": "there"}
|
|
2467
|
+
upper = True
|
|
2468
|
+
length = 5
|
|
2469
|
+
|
|
2470
|
+
result = _apply_string_cleansing(value, replacements, upper, length=length)
|
|
2471
|
+
# result would be "HELLO"
|
|
2472
|
+
|
|
2473
|
+
"""
|
|
2474
|
+
|
|
2475
|
+
if not isinstance(
|
|
2476
|
+
value,
|
|
2477
|
+
str,
|
|
2478
|
+
): # Only apply string operations if the value is a string
|
|
2479
|
+
return None
|
|
2480
|
+
|
|
2481
|
+
if upper:
|
|
2482
|
+
value = value.upper()
|
|
2483
|
+
if lower:
|
|
2484
|
+
value = value.lower()
|
|
2485
|
+
if capitalize:
|
|
2486
|
+
value = value.capitalize()
|
|
2487
|
+
if title:
|
|
2488
|
+
value = value.title()
|
|
2489
|
+
|
|
2490
|
+
# Handle regex replacements
|
|
2491
|
+
for regex_pattern, replacement in replacements.items():
|
|
2492
|
+
if regex_pattern:
|
|
2493
|
+
# Check if the pattern does NOT contain any regex special characters
|
|
2494
|
+
# (excluding dot and ampersand) and ONLY then use \b ... \b
|
|
2495
|
+
# Special regexp characters include: ^ $ * + ? ( ) | [ ] { } \
|
|
2496
|
+
if not re.search(r"[\\^$*+?()|[\]{}]", regex_pattern):
|
|
2497
|
+
# Wrap with word boundaries for whole-word matching
|
|
2498
|
+
# \b is a word boundary anchor in regular expressions.
|
|
2499
|
+
# It matches a position where one side is a word character
|
|
2500
|
+
# (like a letter or digit) and the other side is a non-word character
|
|
2501
|
+
# (like whitespace or punctuation). It's used to match whole words.
|
|
2502
|
+
# We want to have this to e.g. not replace "INT" with "INTERNATIONAL"
|
|
2503
|
+
# if the word is already "INTERNATIONAL". It is important
|
|
2504
|
+
# that the \b ... \b enclosure is ONLY used if regex_pattern is NOT
|
|
2505
|
+
# a regular expression but just a normal string.
|
|
2506
|
+
# TODO: we may reconsider if re.escape() is required or not:
|
|
2507
|
+
regex_pattern = re.escape(regex_pattern)
|
|
2508
|
+
regex_pattern = rf"\b{regex_pattern}\b"
|
|
2509
|
+
try:
|
|
2510
|
+
value = re.sub(regex_pattern, replacement, value)
|
|
2511
|
+
except re.error:
|
|
2512
|
+
self.logger.error(
|
|
2513
|
+
"Invalid regex pattern -> '%s' in replacement processing!",
|
|
2514
|
+
regex_pattern,
|
|
2515
|
+
)
|
|
2516
|
+
continue
|
|
2517
|
+
|
|
2518
|
+
# Truncate to the specified length, starting from index 0
|
|
2519
|
+
if 0 < length < len(value):
|
|
2520
|
+
value = value[:length]
|
|
2521
|
+
|
|
2522
|
+
return value
|
|
2523
|
+
|
|
2524
|
+
# end method definition
|
|
2525
|
+
|
|
2526
|
+
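The replacement loop added above only wraps a pattern in `\b` word boundaries (after `re.escape()`) when the pattern contains no regex metacharacters, so plain strings are matched as whole words while real regular expressions pass through untouched. A minimal, standalone sketch of that rule — not code from the package, with invented patterns and values:

```python
import re

# Invented replacement table: a plain string and a genuine regular expression.
replacements = {"INT": "INTERNATIONAL", r" \d{4}$": ""}

def cleanse(value: str) -> str:
    for pattern, replacement in replacements.items():
        # Plain strings (no regex metacharacters) get word boundaries, regexes are used as-is.
        if not re.search(r"[\\^$*+?()|[\]{}]", pattern):
            pattern = rf"\b{re.escape(pattern)}\b"
        value = re.sub(pattern, replacement, value)
    return value

print(cleanse("ACME INT 2024"))            # -> ACME INTERNATIONAL
print(cleanse("ACME INTERNATIONAL 2024"))  # -> ACME INTERNATIONAL (no double replacement)
```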
+    def filter(
+        self,
+        conditions: list,
+        inplace: bool = True,
+        reset_index: bool = True,
+    ) -> pd.DataFrame | None:
+        """Filter the data frame based on (multiple) conditions.

         Args:
-            conditions (list):
-
-
-
-
-
-
-
-
-
-
+            conditions (list):
+                Conditions are a list of dictionaries with 3 items:
+                * field (str): The name of a column in the data frame
+                * value (str or list):
+                    Expected value (filter criterium).
+                    If it is a list then one of the list elements must match the field value (OR)
+                * equal (bool):
+                    Whether to test for equal or non-equal. If not specified equal is treated as True.
+                * regex (bool):
+                    This flag controls if the value is interpreted as a
+                    regular expression. If there is no regex item in the
+                    dictionary then the default is False (= values is NOT regex).
+                * enabled (bool):
+                    True or False. The filter is only applied if 'enabled = True'
+                If there are multiple conditions in the list each has to evaluate to True (AND)
+            inplace (bool, optional):
+                Defines if the self._df is modified (inplace) or just
+                a new data frame is returned. Defaults to True.
+            reset_index (bool, optional):
+                Filter removes rows. If filter_index = True then the numbering
+                of the index is newly calculated
+
         Returns:
-            pd.DataFrame
+            pd.DataFrame | None:
+                A new data frame or pointer to self._df (depending on the value of 'inplace').
+                None in case of an error.
+
         """

         if self._df is None:
-            logger.error("
+            self.logger.error("Data frame is not initialized.")
             return None

         if self._df.empty:
-            logger.error("
+            self.logger.error("Data frame is empty.")
             return None

-        #
-        #
+        # First filtered_df is the full data frame.
+        # Then it is subsequentially reduced by each condition
         # at the end it is just those rows that match all conditions.
-        filtered_df = self._df
+        filtered_df = self._df if inplace else self._df.copy()
+
+        def list_matches(row: list, values: list) -> bool:
+            """Check if any item in the 'values' list is present in the given 'row' list.
+
+            Args:
+                row (list):
+                    A list of items from the data frame column.
+                values (list):
+                    A list of values to check for in the 'row'.
+
+            Returns:
+                bool:
+                    True if any item in 'values' is found in 'row', otherwise False.
+
+            """
+
+            return any(item in values for item in row)
+
+        def dict_matches(row: dict, key: str, values: list) -> bool:
+            """Check if the value for the dictionary 'key' is in 'values'.

-
+            Args:
+                row (dict):
+                    A dictionary from the data frame column.
+                key (str):
+                    The key to lookup in the dictionary.
+                values (list):
+                    A list of values to check for in the 'row'.
+
+            Returns:
+                bool:
+                    True, if the value for the dictionary key is in 'values', otherwise False.
+
+            """
+
+            if not row or key not in row:
+                return False
+
+            return row[key] in values
+
+        # We traverse a list of conditions. Each condition must evaluate to True
         # otherwise the current workspace or document (i.e. the data set for these objects)
-        # will be skipped.
+        # will be skipped.
         for condition in conditions:
+            # Check if the condition is enabled. If 'enabled' is not
+            # in the condition dict then we assume it is enabled.
+            if not condition.get("enabled", True):
+                continue
             field = condition.get("field", None)
             if not field:
-                logger.error(
+                self.logger.error(
+                    "Missing value for filter condition 'field' in payload!",
+                )
                 continue
+            if "." in field:
+                field, sub = field.split(".", 1)
+            else:
+                sub = None
+
             if field not in self._df.columns:
-                logger.warning(
-                    "Filter condition field -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
+                self.logger.warning(
+                    "Filter condition field -> '%s' does not exist as column in the data frame! Data frame has these columns -> %s",
                     field,
                     str(self._df.columns),
                 )
-                continue  # Skip filtering for columns not present in
+                continue  # Skip filtering for columns not present in data frame
+
+            regex = condition.get("regex", False)
+            # We need the column to be of type string if we want to use regular expressions
+            # so if the column is not yet a string we convert the column to string:
+            if regex and filtered_df[field].dtype != "object":
+                # Change type of column to string:
+                filtered_df[field] = filtered_df[field].astype(str)
+                filtered_df[field] = filtered_df[field].fillna("")
+
             value = condition.get("value", None)
-            if
-
-
+            if value is None:
+                # Support alternative syntax using plural.
+                value = condition.get("values", None)
+                if value is None:
+                    self.logger.error(
+                        "Missing filter value(s) for filter condition field -> '%s'!",
+                        field,
                     )
                     continue
-            regex = condition.get("regex", False)
-
-            logger.info(
-                "Data Frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
-                filtered_df.shape[0],
-                filtered_df.shape[1],
-                str(condition),
-            )
-
-            filtered_dfs = []

             # if a single string is passed as value we put
             # it into an 1-item list to simplify the following code:
             if not isinstance(value, list):
                 value = [value]

-            #
-
-
-
-
-
-
-
+            # If all values in the condition are strings then we
+            # want the column also to be of type string:
+            if all(isinstance(v, str) for v in value):
+                # Change type of column to string:
+                # filtered_df[field] = filtered_df[field].astype(str)
+                # filtered_df[field] = filtered_df[field].fillna("").astype(str)
+                # filtered_df[field] = filtered_df[field].fillna("")
+
+                # When inplace == True, filtered_df is just a reference to self._df.
+                # Using .loc[:, field] ensures that Pandas updates the column correctly in self._df.
+                # When inplace == False, filtered_df is a full copy (self._df.copy() above),
+                # so modifications remain in filtered_df.
+                # .loc[:, field] ensures no SettingWithCopyWarning, since filtered_df is now a separate DataFrame.
+                filtered_df.loc[:, field] = filtered_df[field].fillna("").astype(str)
+
+            self.logger.info(
+                "Data frame has %s row(s) and %s column(s) before filter -> %s has been applied.",
+                str(filtered_df.shape[0]),
+                str(filtered_df.shape[1]),
+                str(condition),
+            )
+
+            # Check if the column is boolean
+            if pd.api.types.is_bool_dtype(filtered_df[field]):
+                # Convert string representations of booleans to actual booleans
+                value = [v.lower() in ["true", "1"] if isinstance(v, str) else bool(v) for v in value]
+
+            # Do we want to test for equalitiy or non-equality?
+            # For lists equality means: value is in the list
+            # For lists non-equality means: value is NOT in the list
+            test_for_equal = condition.get("equal", True)
+
+            # Check if the column contains only lists (every non-empty element in the column is a list).
+            # `filtered_df[field]`: Access the column with the name specified in 'field'.
+            # `.dropna()`: Drop None or NaN rows for the test.
+            # `.apply(lambda x: isinstance(x, list))`: For each element in the column, check if it is a list.
+            # `.all()`: Ensure that all elements in the column satisfy the condition of being a list.
+            if filtered_df[field].dropna().apply(lambda x: isinstance(x, list)).all():
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].apply(list_matches, values=value)]
+                else:
+                    filtered_df = filtered_df[filtered_df[field].apply(list_matches, values=value)]
+            # Check if the column contains only dictionaries (every non-empty element in the column is a dict).
+            # `filtered_df[field]`: Access the column with the name specified in 'field'.
+            # `.dropna()`: Drop None or NaN rows for the test.
+            # `.apply(lambda x: isinstance(x, dict))`: For each element in the column, check if it is a dict.
+            # `.all()`: Ensure that all elements in the column satisfy the condition of being a dictionary.
+            elif filtered_df[field].dropna().apply(lambda x: isinstance(x, dict)).all():
+                if not sub:
+                    self.logger.error(
+                        "Filtering on dictionary values need a key. This needs to be provided with 'field.key' syntax!",
                     )
+                    continue
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].apply(dict_matches, key=sub, values=value)]
                 else:
-
-
-
-
-
-
-
-
-
-
-                        str(
-
-
+                    filtered_df = filtered_df[filtered_df[field].apply(dict_matches, key=sub, values=value)]
+            # Check if the column has boolean values:
+            elif pd.api.types.is_bool_dtype(filtered_df[field]):
+                # For a boolean filter we can drop NA values:
+                filtered_df = filtered_df.dropna(subset=[field])
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].isin(value)]
+                else:
+                    filtered_df = filtered_df[filtered_df[field].isin(value)]
+            elif not regex:
+                if pd.api.types.is_string_dtype(filtered_df[field]):
+                    filtered_df[field] = filtered_df[field].str.strip()
+                if not test_for_equal:
+                    filtered_df = filtered_df[~filtered_df[field].isin(value)]
+                else:
+                    filtered_df = filtered_df[filtered_df[field].isin(value)]
             else:
-                #
-
-
-
-
-
+                # Create a pure boolean pd.Series as a filter criterium:
+                regex_condition = filtered_df[field].str.contains(
+                    "|".join(value),
+                    regex=True,
+                    na=False,
+                )
+                # Apply the boolean pd.Series named 'regex_condition' as
+                # a filter - either non-negated or negated (using ~):
+                filtered_df = filtered_df[~regex_condition] if not test_for_equal else filtered_df[regex_condition]
+
+            self.logger.info(
+                "Data frame has %s row(s) and %s column(s) after filter -> %s has been applied.",
+                str(filtered_df.shape[0]),
+                str(filtered_df.shape[1]),
                 str(condition),
             )
         # end for condition
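For orientation, a hypothetical `conditions` payload matching the docstring above — column names and values are invented, and `data` stands for an instance of this `Data` class:

```python
conditions = [
    {"field": "status", "value": ["active", "pending"]},       # OR across the list values
    {"field": "type", "value": "workspace", "equal": False},   # keep rows that do NOT match
    {"field": "name", "value": r"^Project\s", "regex": True},  # regular expression match
    {"field": "owner.id", "value": "4711", "enabled": False},  # disabled, therefore skipped
]

# All enabled conditions must hold (AND); inplace=False leaves self._df untouched:
# filtered = data.filter(conditions=conditions, inplace=False)
```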
@@ -1425,23 +2758,29 @@ class Data:
         if inplace:
             self._df = filtered_df

+            if reset_index:
+                self._df.reset_index(inplace=True, drop=True)
+
         return filtered_df

     # end method definition

-    def fill_na_in_column(self, column_name: str, default_value: str | int):
-        """Replace NA values in a column with a defined new default value
+    def fill_na_in_column(self, column_name: str, default_value: str | int) -> None:
+        """Replace NA values in a column with a defined new default value.

         Args:
-            column_name (str):
-
+            column_name (str):
+                The name of the column in the data frame.
+            default_value (str | int):
+                The value to replace NA with.
+
         """

         if column_name in self._df.columns:
             self._df[column_name] = self._df[column_name].fillna(value=default_value)
         else:
-            logger.error(
-                "Cannot replace NA values as column -> '%s' does not exist in the
+            self.logger.error(
+                "Cannot replace NA values as column -> '%s' does not exist in the data frame! Available columns -> %s",
                 column_name,
                 str(self._df.columns),
             )
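A small illustration of what `fill_na_in_column()` does to a single column; the column name and default value are invented, and plain pandas mirrors the internal `fillna()` call:

```python
import pandas as pd

df = pd.DataFrame({"department": ["IT", None, "HR"]})
df["department"] = df["department"].fillna(value="Unknown")  # same operation the method applies to self._df
print(df["department"].tolist())  # ['IT', 'Unknown', 'HR']
```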
@@ -1449,16 +2788,19 @@ class Data:
     # end method definition

     def fill_forward(self, inplace: bool) -> pd.DataFrame:
-        """Fill the missing cells appropriately by carrying forward
-
-
-
+        """Fill the missing cells appropriately by carrying forward the values from the previous rows where necessary.
+
+        This has applications if a hierarchy is represented by
+        nested cells e.g. in an Excel sheet.

         Args:
-            inplace (bool):
+            inplace (bool):
+                Should the modification happen inplace or not.

         Returns:
-            pd.DataFrame:
+            pd.DataFrame:
+                The resulting data frame.
+
         """

         # To convert an Excel representation of a folder structure with nested
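The reworded docstring describes forward-filling nested hierarchy cells (e.g. from an Excel export). A sketch of the effect with plain pandas — the assumption that the method relies on a forward fill such as `ffill()` is mine, and the sample data is invented:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "level1": ["Folder A", None, None, "Folder B"],
        "level2": ["Sub 1", "Sub 2", "Sub 3", "Sub 4"],
    }
)
print(df.ffill())
#      level1 level2
# 0  Folder A  Sub 1
# 1  Folder A  Sub 2
# 2  Folder A  Sub 3
# 3  Folder B  Sub 4
```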
@@ -1471,70 +2813,137 @@ class Data:
     # end method definition

     def lookup_value(
-        self,
-
-
+        self,
+        lookup_column: str,
+        lookup_value: str,
+        separator: str = "|",
+        single_row: bool = True,
+    ) -> pd.Series | pd.DataFrame | None:
+        """Lookup row(s) that includes a lookup value in the value of a given column.

         Args:
-            lookup_column (str):
-
-
+            lookup_column (str):
+                The name of the column to search in.
+            lookup_value (str):
+                The value to search for.
+            separator (str):
+                The string list delimiter / separator. The pipe symbol | is the default
+                as it is unlikely to appear in a normal string (other than a plain comma).
+                The separator is NOT looked for in the lookup_value but in the column that
+                is given by lookup_column!
+            single_row (bool, optional):
+                This defines if we just return the first matching row if multiple matching rows
+                are found. Default is True (= single row).

         Returns:
-            pd.Series |
+            pd.Series | pd.DataFrame | None:
+                Data frame (multiple rows) or Series (row) that matches the lookup value.
+                None if no match was found.
+
         """

-        # Use the `apply` function to filter rows where the lookup value matches a
-
-
+        # Use the `apply` function to filter rows where the lookup value matches a
+        # whole item in the separator-divided list:
+        def match_lookup_value(string_list: str | None) -> bool:
+            """Check if the lookup value is in a string list.
+
+            For this the string list is converted to a python
+            list. A separator is used for the splitting.

             Args:
-                string_list (str):
+                string_list (str):
+                    Delimiter-separated string list like "a, b, c" or "a | b | c"

             Returns:
-                bool:
+                bool:
+                    True if lookup_value is equal to one of the delimiter-separated terms.
+
             """
+
+            if pd.isna(string_list):  # Handle None/NaN safely
+                return False
+
             # Ensure that the string is a string
             string_list = str(string_list)

-            return lookup_value in [
-                item.strip() for item in string_list.split(separator)
-            ]
+            return lookup_value in [item.strip() for item in string_list.split(separator)]

-
+        # end method definition

         if self._df is None:
             return None

+        df = self._df
+
         if lookup_column not in self._df.columns:
-            logger.error(
-                "
+            self.logger.error(
+                "Cannot lookup value in column -> '%s'. Column does not exist in the data frame! Data frame has these columns -> %s",
                 lookup_column,
                 str(self._df.columns),
             )
             return None

         # Fill NaN or None values in the lookup column with empty strings
-        df[lookup_column] = df[lookup_column].fillna("")
+        # df[lookup_column] = df[lookup_column].fillna("")
+
+        # Use the `apply` function to filter rows where the lookup value is in row cell
+        # of column given by lookup_column. match_lookup_value() is called with
+        # the content of the individual cell contents:
+        matched_rows = df[df[lookup_column].apply(match_lookup_value)]
+
+        # If nothing was found we return None:
+        if matched_rows.empty:
+            return None

-        #
-
+        # If it is OK to have multiple matches (= multiple rows = pd.DataFrame).
+        # We can just return the matched_rows now which should be a pd.DataFrame:
+        if not single_row:
+            return matched_rows
+
+        # Check if more than one row matches, and log a warning if so
+        if len(matched_rows) > 1:
+            self.logger.warning(
+                "More than one match found for lookup value -> '%s' in column -> '%s'. Returning the first match.",
+                lookup_value,
+                lookup_column,
+            )

         # Return the first matched row, if any
-
-        return matched_row.iloc[0]
+        return matched_rows.iloc[0]

-
+    # end method definition
+
+    def set_value(self, column: str, value, condition: pd.Series | None = None) -> None:  # noqa: ANN001
+        """Set the value in the data frame based on a condition.
+
+        Args:
+            column (str):
+                The name of the column.
+            value (Any):
+                The value to set for those rows that fulfill the condition.
+            condition (pd.Series, optional):
+                This should be a boolean Series where each element is True or False,
+                representing rows in the data frame that meet a certain condition.
+                If None is provided then ALL rows get the 'value' in the given
+                column.
+
+        """
+
+        if condition is None:
+            self._df[column] = value  # Set value unconditionally
+        else:
+            self._df.loc[condition, column] = value  # Set value based on condition

     # end method definition

     def add_column(
         self,
-        source_column: str,
-        reg_exp: str,
         new_column: str,
-
-
+        data_type: str = "string",
+        source_column: str = "",
+        reg_exp: str = "",
+        prefix: str = "",
+        suffix: str = "",
         length: int | None = None,
         group_chars: int | None = None,
         group_separator: str = ".",
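A hypothetical usage of the `lookup_value()` and `set_value()` methods shown above; the column names, values and the `data` instance are invented:

```python
# Find the row whose "aliases" cell contains the whole item "HR-0815" in a "|"-separated list:
row = data.lookup_value(lookup_column="aliases", lookup_value="HR-0815", separator="|")
if row is not None:
    print(row["name"])

# Set "synced" to True for all rows (condition=None) or only for rows matching a boolean Series:
data.set_value(column="synced", value=True)
data.set_value(column="synced", value=False, condition=data.get_data_frame()["name"] == "Archive")
```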
@@ -1543,26 +2952,78 @@ class Data:
         """Add additional column to the data frame.

         Args:
-
-
-
-
-
-
-
-
-
-
+            new_column (str):
+                The name of the column to add.
+            data_type (str, optional):
+                The data type of the new column.
+            source_column (str, optional):
+                The name of the source column.
+            reg_exp (str, optional):
+                A regular expression to apply on the content of the source column.
+            prefix (str, optional):
+                Prefix to add in front of the value. Defaults to "".
+            suffix (str, optional):
+                Suffix to add at the end of the value. Defaults to "".
+            length (int | None, optional):
+                Length to reduce to. Defaults to None (= unlimited).
+            group_chars (int | None, optional):
+                Group the resulting string in characters of group_chars. Defaults to None.
+                Usable e.g. for thousand seperator "."
+            group_separator (str, optional):
+                Separator string for the grouping. Defaults to ".".
+            group_remove_leading_zero (bool, optional):
+                Remove leading zeros from the groups. Defaults to True.

         Returns:
-            bool:
+            bool:
+                True = Success, False = Failure
+
         """

         if self._df is None:
             return False

+        # Check that the new column does not yet exist
+        if new_column in self._df.columns:
+            self.logger.error(
+                "New column -> '%s' does already exist in data frame! Cannot add it. Data frame has these columns -> %s",
+                new_column,
+                str(self._df.columns),
+            )
+            return False
+
+        # first we handle the very simple case to not have
+        # a source column but just add an empty new column:
+        if not source_column:
+            self._df[new_column] = pd.Series(dtype=data_type)
+            return True
+
+        # Check if the source column exists
+        if source_column not in self._df.columns:
+            self.logger.error(
+                "Source column -> '%s' does not exist as column in data frame! Data frame has these columns -> %s",
+                source_column,
+                str(self._df.columns),
+            )
+            return False
+
+        # Validate the regex pattern
+        try:
+            re.compile(reg_exp)  # Check if the pattern is a valid regex
+        except re.error:
+            self.logger.error(
+                "Invalid regular expression -> %s. Cannot extract data for new column -> '%s'!",
+                reg_exp,
+                new_column,
+            )
+            return False
+
+        # Ensure the source column is of type string (convert it, if necessary)
+        if self._df[source_column].dtype != "object":
+            self._df[source_column] = self._df[source_column].astype(str)
+
         # Use str.extract to apply the regular expression to the source column
-        # and then assign this modified
+        # and then assign this modified column to the variable "extracted":
         extracted = self._df[source_column].str.extract(pat=reg_exp, expand=False)

         # Limit the result to the specified length
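A hypothetical call of the extended `add_column()` signature documented above — the column names and the regular expression are invented:

```python
success = data.add_column(
    new_column="cost_center",
    data_type="string",
    source_column="description",
    reg_exp=r"CC-(\d+)",  # the extracted capture group becomes the new column value
    prefix="CC ",
    length=9,
)
```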
@@ -1571,9 +3032,9 @@ class Data:

         if group_chars is not None:

-            def process_grouping(x):
+            def process_grouping(x) -> str | None:  # noqa: ANN001
                 if pd.isna(x):
-                    return
+                    return None
                 # Split into groups
                 groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
                 if group_remove_leading_zero:
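A standalone sketch of the grouping idea `process_grouping()` implements: split a string into fixed-size chunks and join them with a separator (e.g. as a thousands separator). The helper below is illustrative only; the real code additionally strips leading zeros and handles NaN:

```python
def group_digits(x: str, group_chars: int = 3, group_separator: str = ".") -> str:
    groups = [x[i : i + group_chars] for i in range(0, len(x), group_chars)]
    return group_separator.join(groups)

print(group_digits("20240001"))  # -> 202.400.01
```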
@@ -1594,21 +3055,36 @@ class Data:

     # end method definition

-    def convert_to_lists(self, columns: list, delimiter: str = ","):
-        """
-
+    def convert_to_lists(self, columns: list, delimiter: str = ",") -> None:
+        """Intelligently convert string values to list values, in defined data frame columns.
+
+        The delimiter to separate values in the string value can be configured.
+        The method is ignoring delimiters that are inside quotes.

         Args:
-            columns (list):
-
-            delimiter (str, optional):
+            columns (list):
+                The name of the columns whose values should be converted to lists.
+            delimiter (str, optional):
+                Character that delimits list items. Defaults to ",".

         Returns:
             None. self._df is modified in place.
+
         """

         # Regex to split by the delimiter, ignoring those inside quotes or double quotes
-        def split_string_ignoring_quotes(s, delimiter):
+        def split_string_ignoring_quotes(s: str, delimiter: str) -> list:
+            """Split a string into a list at positions that have a delimiter character.
+
+            Args:
+                s (str): the string to split
+                delimiter (str): The single character that is used for splitting.
+
+            Returns:
+                A list of splitted values.
+
+            """
+
             # Escaping the delimiter in case it's a special regex character
             delimiter = re.escape(delimiter)
             # Match quoted strings and unquoted delimiters separately
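A minimal, self-contained sketch of the quote-aware splitting described above — the regular expression here is an illustrative variant, not necessarily the one used in `split_string_ignoring_quotes()`:

```python
import re

def split_ignoring_quotes(s: str, delimiter: str = ",") -> list:
    d = re.escape(delimiter)
    # Split on the delimiter only when an even number of double quotes follows (i.e. outside quotes).
    parts = re.split(rf'{d}(?=(?:[^"]*"[^"]*")*[^"]*$)', s)
    return [p.strip() for p in parts]

print(split_ignoring_quotes('red, "blue, navy", green'))
# -> ['red', '"blue, navy"', 'green']
```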
@@ -1617,27 +3093,84 @@ class Data:

         for col in columns:
             self._df[col] = self._df[col].apply(
-                lambda x: (
-
-
-
-
+                lambda x: (split_string_ignoring_quotes(x, delimiter) if isinstance(x, str) and delimiter in x else x),
+            )
+
+    # end method definition
+
+    def add_column_concat(
+        self,
+        source_columns: list,
+        new_column: str,
+        concat_char: str = "",
+        upper: bool = False,
+        lower: bool = False,
+        capitalize: bool = False,
+        title: bool = False,
+    ) -> None:
+        """Add a column as a concatenation of the values of multiple source columns.
+
+        Args:
+            source_columns (list):
+                The column names the list values are taken from.
+            new_column (str):
+                The name of the new column.
+            concat_char (str, optional):
+                Character to insert between the concatenated values. Default is "".
+            upper (bool, optional):
+                Convert result to uppercase if True.
+            lower (bool, optional):
+                Convert result to lowercase if True.
+            capitalize (bool, optional):
+                Capitalize the result if True.
+            title (bool, optional):
+                Convert result to title case if True.
+
+        Returns:
+            None. self._df is modified in place.
+
+        """
+
+        def concatenate(row: pd.Series) -> str:
+            # Comprehension to create a list from all source column values:
+            concatenated = concat_char.join(
+                [str(row[col]) for col in source_columns if pd.notna(row[col])],
             )

+            # Apply case transformations based on parameters
+            if upper:
+                concatenated = concatenated.upper()
+            elif lower:
+                concatenated = concatenated.lower()
+            elif capitalize:
+                concatenated = concatenated.capitalize()
+            elif title:
+                concatenated = concatenated.title()
+
+        # end method definition
+
+        self._df[new_column] = self._df.apply(concatenate, axis=1)
+
     # end method definition

-    def add_column_list(self, source_columns: list, new_column: str):
-        """Add a column with list objects.
-
+    def add_column_list(self, source_columns: list, new_column: str) -> None:
+        """Add a column with list objects.
+
+        The list items are taken from a list of source columns (row by row).

         Args:
-            source_columns (list):
-
+            source_columns (list):
+                The column names the list values are taken from.
+            new_column (str):
+                The name of the new column.
+
         Returns:
             None. self._df is modified in place.
+
         """

-        def create_list(row):
+        def create_list(row: pd.Series) -> list:
+            # Comprehension to create a list from all source column values:
             return [row[col] for col in source_columns]

         self._df[new_column] = self._df.apply(create_list, axis=1)
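Two hypothetical calls of the helpers added above (column names are invented):

```python
# Concatenate first and last name into a new upper-case column:
data.add_column_concat(
    source_columns=["first_name", "last_name"],
    new_column="full_name",
    concat_char=" ",
    upper=True,
)

# Collect several phone columns into one list-valued column:
data.add_column_list(
    source_columns=["phone_private", "phone_work"],
    new_column="phones",
)
```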
@@ -1645,87 +3178,90 @@ class Data:
     # end method definition

     def add_column_table(
-        self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        source_columns: list,
+        new_column: str,
+        delimiter: str = ",",
+    ) -> None:
+        """Add a column with tabular objects (list of dictionaries).
+
+        The source columns should include lists. The resulting dictionary
+        keys are the column names for the source columns.
+
+        Example (["X", "Y"] are the source_columns, "Table" is the new_column):
+            X[1] = [1, 2, 3] # row 1
+            Y[1] = ["A", "B", "C"] # row 1
+            X[2] = [4, 5, 6] # row 2
+            Y[2] = ["D", "E", "F"] # row 2
+
+            Table[1] = [
+                {
+                    "X": "1"
+                    "Y": "A"
+                },
+                {
+                    "X": "2"
+                    "Y": "B"
+                }
+                {
+                    "X": "3"
+                    "Y": "C"
+                }
+            ]
+            Table[2] = [
+                {
+                    "X": "4"
+                    "Y": "D"
+                },
+                {
+                    "X": "5"
+                    "Y": "E"
+                }
+                {
+                    "X": "6"
+                    "Y": "F"
+                }
+            ]

         Args:
-            source_columns (list):
-
-
+            source_columns (list):
+                The column names the list values are taken from.
+            new_column (str):
+                The name of the new column.
+            delimiter (str, optional):
+                Character that delimits list items. Defaults to ",".

         Returns:
             None. self._df is modified in place.
+
         """

         # Call the convert_to_lists method to ensure the columns are converted
         self.convert_to_lists(columns=source_columns, delimiter=delimiter)

         # Sub-method to pad lists to the same length
-        def pad_list(lst: list, max_len: int):
+        def pad_list(lst: list, max_len: int) -> list:
             return lst + [None] * (max_len - len(lst))

-        def create_table(row) -> list:
-            max_len = max(
-                len(row[col]) if isinstance(row[col], list) else 1
-                for col in source_columns
-            )
+        def create_table(row: pd.Series) -> list:
+            max_len = max(len(row[col]) if isinstance(row[col], list) else 1 for col in source_columns)

-            # Pad lists to the maximum length, leave
+            # Pad lists to the maximum length, leave scalar values as they are
             for col in source_columns:
                 if isinstance(row[col], list):
                     row[col] = pad_list(row[col], max_len)
+                elif not pd.isna(row[col]):
+                    row[col] = [
+                        row[col],
+                    ] * max_len  # Repeat scalar value to match the max length
                 else:
-
-
-
-
-                else:
-                    row[col] = [None] * max_len
-            # Create a list of dictionaries for each row
-            table = []
-            for i in range(max_len):
-                table.append({col: row[col][i] for col in source_columns})
+                    row[col] = [None] * max_len
+            # Create a list of dictionaries for each row:
+            table = [{col: row[col][i] for col in source_columns} for i in range(max_len)]
+
             return table

-        # Apply the function to create a new column with
+        # Apply the function to create a new column with table values:
         self._df[new_column] = self._df.apply(create_table, axis=1)

     # end method definition
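Finally, a hypothetical call of `add_column_table()` that follows the docstring example above (source columns "X" and "Y", new column "Table"):

```python
data.add_column_table(source_columns=["X", "Y"], new_column="Table", delimiter=",")
# Each "Table" cell now holds a list of {"X": ..., "Y": ...} dictionaries built from the
# padded list values of the source columns, as illustrated in the docstring.
```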