PyPI - sdg-core-lib - Versions diffs - 0.1.0__py3-none-any.whl - Mend

sdg-core-lib 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

sdg_core_lib/job.py ADDED Viewed

@@ -0,0 +1,56 @@
+import copy
+import pandas as pd
+from sdg_core_lib.evaluate.TabularComparison import TabularComparisonEvaluator
+from sdg_core_lib.NumericDataset import NumericDataset
+from sdg_core_lib.data_generator.model_factory import model_factory
+from sdg_core_lib.data_generator.models.UnspecializedModel import UnspecializedModel
+def job(
+    model_info: dict, dataset: list, n_rows: int, save_filepath: str, train: bool
+) -> tuple[list[dict], dict, UnspecializedModel, NumericDataset]:
+    """
+    Main function to run the job.
+    This function will run the Synthetic Data Generation job. It will create an instance of the specified model or
+    load the specified dataset, pre-process the data, train the model (if specified to do so), generate synthetic
+    data, evaluate the generated data and save the results to the specified location.
+    :param model_info: a dictionary containing the model's information
+    :param dataset: a list of dataframes
+    :param n_rows: the number of rows to generate
+    :param save_filepath: the path to save the results
+    :param train: a boolean indicating if the model should be trained
+    :return: a tuple containing a list of metrics, a dictionary with the model's info, the trained model, and the generated dataset
+    """
+    if len(dataset) == 0:
+        data_info = model_info.get("training_data_info", [])
+        data = NumericDataset(dataset=data_info)
+    else:
+        data = NumericDataset(dataset=dataset)
+    model = model_factory(model_info, data.input_shape)
+    if train:
+        model.train(data=data)
+        model.save(save_filepath)
+    predicted_data = model.infer(n_rows)
+    df_predict = pd.DataFrame(data=predicted_data.tolist(), columns=data.columns)
+    report = {"available": False}
+    if len(data.dataframe) > 0:
+        evaluator = TabularComparisonEvaluator(
+            real_data=data.dataframe,
+            synthetic_data=df_predict,
+            numerical_columns=data.continuous_columns,
+            categorical_columns=data.categorical_columns,
+        )
+        report = evaluator.compute()
+    generated = copy.deepcopy(data)
+    generated.dataframe = df_predict
+    results = generated.parse_tabular_data_json()
+    return results, report, model, data

sdg_core_lib/post_process/FunctionApplier.py ADDED Viewed

@@ -0,0 +1,14 @@
+import numpy as np
+from sdg_core_lib.post_process.functions.UnspecializedFunction import (
+    UnspecializedFunction,
+)
+class FunctionApplier:
+    def __init__(self, functions: list[UnspecializedFunction], data: list[np.array]):
+        self.functions = functions
+        self.data = data
+    def apply(self):
+        pass

sdg_core_lib/post_process/__init__.py ADDED Viewed

File without changes

sdg_core_lib/post_process/function_factory.py ADDED Viewed

@@ -0,0 +1,41 @@
+import importlib
+from sdg_core_lib.post_process.functions.UnspecializedFunction import (
+    UnspecializedFunction,
+)
+def dynamic_import(class_name: str):
+    """
+    Dynamically imports a class given its name.
+    :param class_name: a string with the full name of the class to import
+    :return: the class itself
+    """
+    module_name, class_name = class_name.rsplit(".", 1)
+    module = importlib.import_module(module_name)
+    return getattr(module, class_name)
+def function_factory(function_dict: dict) -> UnspecializedFunction:
+    """
+    This function is a generic model factory. Takes a dictionary containing useful model information and plugs
+    them in the model itself.
+    Input shape may be passed as an argument (i.e) from the request data itself, or [alternatively] may be present in
+    model dictionary. If not explicitly passed, it will use the model dictionary
+    :return: An instance of a BaseModel class or any subclass
+    """
+    function_name, parameter_list = parse_function_info(function_dict)
+    function_class = dynamic_import(function_name)
+    function = function_class(parameters=parameter_list)
+    return function
+def parse_function_info(function_dict: dict):
+    """ """
+    function_name = function_dict["function_reference"]
+    parameter_list = function_dict["parameters"]
+    return function_name, parameter_list

sdg_core_lib/post_process/functions/FunctionInfo.py ADDED Viewed

@@ -0,0 +1,25 @@
+from sdg_core_lib.post_process.functions.Parameter import Parameter
+class FunctionInfo:
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        function_reference: str,
+        parameters: list[Parameter],
+    ):
+        self.name = name
+        self.description = description
+        self.function_reference = function_reference
+        self.Parameters = parameters
+    def get_function_info(self):
+        return {
+            "function": {
+                "name": self.name,
+                "description": self.description,
+                "function_reference": self.function_reference,
+            },
+            "parameters": [param.to_json() for param in self.Parameters],
+        }

sdg_core_lib/post_process/functions/FunctionResult.py ADDED Viewed

@@ -0,0 +1,15 @@
+import numpy as np
+class FunctionResult:
+    def __init__(self, result: np.array, indexes: np.array, evaluation_result: bool):
+        self.indexes = indexes
+        self.result = result
+        self.evaluation_result = evaluation_result
+    def to_dict(self):
+        return {
+            "indexes": self.indexes,
+            "results": self.result,
+            "evaluation_results": self.evaluation_result,
+        }

sdg_core_lib/post_process/functions/Parameter.py ADDED Viewed

@@ -0,0 +1,33 @@
+import ast
+import builtins
+class Parameter:
+    def __init__(self, name: str, value: str, parameter_type: str):
+        self.name = name
+        self.value = value
+        self.parameter_type = parameter_type
+    def to_json(self):
+        return {
+            "name": self.name,
+            "value": str(self.value),
+            "parameter_type": self.parameter_type,
+        }
+    @staticmethod
+    def _convert_type(stringed_value: str, parameter_type: str):
+        converted_value = ast.literal_eval(stringed_value)
+        target_type = getattr(builtins, parameter_type)
+        if not isinstance(converted_value, target_type):
+            raise ValueError(
+                f"Type inference went wrong: expected type {target_type} but got {type(converted_value)}"
+            )
+        return converted_value
+    @classmethod
+    def from_json(cls, json_data):
+        converted_value = cls._convert_type(
+            json_data["value"], json_data["parameter_type"]
+        )
+        return cls(json_data["name"], converted_value, json_data["parameter_type"])

sdg_core_lib/post_process/functions/UnspecializedFunction.py ADDED Viewed

@@ -0,0 +1,42 @@
+import numpy as np
+from abc import ABC, abstractmethod
+from sdg_core_lib.post_process.functions.FunctionResult import FunctionResult
+from sdg_core_lib.post_process.functions.Parameter import Parameter
+class UnspecializedFunction(ABC):
+    def __init__(self, parameters: list[dict]):
+        self.parameters = [Parameter.from_json(param) for param in parameters]
+    @abstractmethod
+    def _check_parameters(self):
+        raise NotImplementedError
+    @abstractmethod
+    def _compute(self, data: np.array) -> tuple[np.array, np.array]:
+        """
+        Applies a data transformation function on a given set of generated data
+        :param data: a numpy array of data from a single feature
+        :return: transformed data and affected indexes
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def _evaluate(self, data: np.array) -> bool:
+        """
+        Applies an evaluation function on a given set of generated data
+        :param data: a numpy array of data from a single feature
+        :return: a single boolean value evaluating id data meets evaluation criteria
+        """
+        raise NotImplementedError
+    @classmethod
+    def self_describe(cls):
+        raise NotImplementedError
+    def get_results(self, data: np.array) -> dict:
+        results, indexes = self._compute(data)
+        evaluation_results = self._evaluate(data)
+        report = FunctionResult(results, indexes, evaluation_results)
+        return report.to_dict()

sdg_core_lib/post_process/functions/__init__.py ADDED Viewed

File without changes

sdg_core_lib/post_process/functions/distribution_evaluator/__init__.py ADDED Viewed

File without changes

sdg_core_lib/post_process/functions/distribution_evaluator/implementation/NormalTester.py ADDED Viewed

@@ -0,0 +1,65 @@
+import numpy as np
+from scipy.stats import normaltest, ttest_1samp, kstest, norm
+from sdg_core_lib.post_process.functions.FunctionInfo import FunctionInfo
+from sdg_core_lib.post_process.functions.UnspecializedFunction import (
+    UnspecializedFunction,
+)
+from sdg_core_lib.post_process.functions.Parameter import Parameter
+class NormalTester(UnspecializedFunction):
+    def __init__(self, parameters: list[dict]):
+        super().__init__(parameters)
+        self.mean = None
+        self.std = None
+        self._check_parameters()
+    def _check_parameters(self):
+        param_mapping = {param.name: param for param in self.parameters}
+        self.mean = param_mapping["mean"].value
+        self.std = param_mapping["standard_deviation"].value
+    def _compute(self, data: np.array) -> tuple[np.array, np.array]:
+        """
+        Currently returns same data
+        :param data:
+        :return:
+        """
+        return data, np.array(range(len(data)))
+    def _evaluate(self, data: np.array) -> bool:
+        """
+        Checks if data is normally distributed.
+        Consider the null hypotesis that data is normally distributed.
+        If null hypotesis is rejected (p < 0.05), it means that data is not normally distributed
+        Evaluation is based on 3 tests:
+        1. D’Agostino and Pearson’s test
+        2. Student's t-test
+        3. Kolmogorov-Smirnov
+        :param data:
+        :return: False if null hypotesis is rejected (p < 0.05), True if it is failed to reject (p > 0.05)
+        """
+        def cdf_function(x):
+            return norm.cdf(x, loc=self.mean, scale=self.std)
+        _, p_normal = normaltest(data)
+        _, p_t = ttest_1samp(data, self.mean)
+        _, p_k = kstest(data, cdf_function)
+        p = min(p_normal, p_t, p_k)
+        return p > 0.05
+    @classmethod
+    def self_describe(cls):
+        return FunctionInfo(
+            name=f"{cls.__qualname__}",
+            function_reference=f"{cls.__module__}.{cls.__qualname__}",
+            parameters=[
+                Parameter("mean", 0.0, "float"),
+                Parameter("standard_deviation", 1.0, "float"),
+            ],
+            description="Checks if data is normally distributed given a desired mean and standard deviation",
+        ).get_function_info()

sdg_core_lib/post_process/functions/distribution_evaluator/implementation/__init__.py ADDED Viewed

File without changes

sdg_core_lib/post_process/functions/filter/IntervalThreshold.py ADDED Viewed

@@ -0,0 +1,32 @@
+from sdg_core_lib.post_process.functions.UnspecializedFunction import (
+    UnspecializedFunction,
+)
+import numpy as np
+class IntervalThreshold(UnspecializedFunction):
+    def __init__(self, parameters: list[dict]):
+        super().__init__(parameters)
+        self.upper_bound = None
+        self.lower_bound = None
+        self.upper_strict = None
+        self.lower_strict = None
+        self._check_parameters()
+    def _check_parameters(self):
+        param_mapping = {param.name: param for param in self.parameters}
+        self.upper_bound = param_mapping["upper_bound"].value
+        self.lower_bound = param_mapping["lower_bound"].value
+        self.upper_strict = param_mapping["upper_strict"].value
+        self.lower_strict = param_mapping["lower_strict"].value
+    def _compute(self, data: np.array):
+        pass
+    def _evaluate(self, data: np.array):
+        pass
+    @classmethod
+    def self_describe(cls):
+        raise NotImplementedError

sdg_core_lib/post_process/functions/filter/MonoThreshold.py ADDED Viewed

@@ -0,0 +1,28 @@
+from sdg_core_lib.post_process.functions.UnspecializedFunction import (
+    UnspecializedFunction,
+)
+import numpy as np
+class MonoThreshold(UnspecializedFunction):
+    def __init__(self, parameters: list[dict]):
+        super().__init__(parameters)
+        self.value = None
+        self.strict = None
+        self._check_parameters()
+    def _check_parameters(self):
+        param_mapping = {param.name: param for param in self.parameters}
+        self.value = param_mapping["value"].value
+        self.strict = param_mapping["strict"].value
+    def _compute(self, data: np.array):
+        pass
+    def _evaluate(self, data: np.array):
+        pass
+    @classmethod
+    def self_describe(cls):
+        raise NotImplementedError

sdg_core_lib/post_process/functions/filter/__init__.py ADDED Viewed

File without changes

sdg_core_lib/post_process/functions/filter/implementation/InnerThreshold.py ADDED Viewed

@@ -0,0 +1,43 @@
+import numpy as np
+from sdg_core_lib.post_process.functions.FunctionInfo import FunctionInfo
+from sdg_core_lib.post_process.functions.Parameter import Parameter
+from sdg_core_lib.post_process.functions.filter.IntervalThreshold import (
+    IntervalThreshold,
+)
+class InnerThreshold(IntervalThreshold):
+    def __init__(self, parameters: list[dict]):
+        super().__init__(parameters)
+    def _compute(self, data: np.array):
+        if self.lower_strict:
+            upper_indexes = np.greater_equal(data, self.lower_bound)
+        else:
+            upper_indexes = np.greater(data, self.lower_bound)
+        if self.upper_strict:
+            lower_indexes = np.less_equal(data, self.upper_bound)
+        else:
+            lower_indexes = np.less(self.upper_bound)
+        final_indexes = lower_indexes & upper_indexes
+        return data[final_indexes], final_indexes
+    def _evaluate(self, data: np.array):
+        return True
+    @classmethod
+    def self_describe(cls):
+        return FunctionInfo(
+            name=f"{cls.__qualname__}",
+            function_reference=f"{cls.__module__}.{cls.__qualname__}",
+            parameters=[
+                Parameter("lower_bound", 0.0, "float"),
+                Parameter("upper_bound", 1.0, "float"),
+                Parameter("lower_strict", True, "bool"),
+                Parameter("upper_strict", True, "bool"),
+            ],
+            description="Filters data between a given interval",
+        ).get_function_info()

sdg_core_lib/post_process/functions/filter/implementation/LowerThreshold.py ADDED Viewed

@@ -0,0 +1,32 @@
+import numpy as np
+from sdg_core_lib.post_process.functions.FunctionInfo import FunctionInfo
+from sdg_core_lib.post_process.functions.Parameter import Parameter
+from sdg_core_lib.post_process.functions.filter.MonoThreshold import MonoThreshold
+class LowerThreshold(MonoThreshold):
+    def __init__(self, parameters: list[dict]):
+        super().__init__(parameters)
+    def _compute(self, data: np.array):
+        if self.strict:
+            indexes = np.greater_equal(data, self.value)
+        else:
+            indexes = np.greater(data, self.value)
+        return data[indexes], indexes
+    def _evaluate(self, data: np.array):
+        return True
+    @classmethod
+    def self_describe(cls):
+        return FunctionInfo(
+            name=f"{cls.__qualname__}",
+            function_reference=f"{cls.__module__}.{cls.__qualname__}",
+            parameters=[
+                Parameter("value", 0.0, "float"),
+                Parameter("strict", True, "bool"),
+            ],
+            description="Mono-threshold function: pick values greater than a lower threshold",
+        ).get_function_info()

sdg_core_lib/post_process/functions/filter/implementation/OuterThreshold.py ADDED Viewed

@@ -0,0 +1,42 @@
+import numpy as np
+from sdg_core_lib.post_process.functions.FunctionInfo import FunctionInfo
+from sdg_core_lib.post_process.functions.Parameter import Parameter
+from sdg_core_lib.post_process.functions.filter.IntervalThreshold import (
+    IntervalThreshold,
+)
+class OuterThreshold(IntervalThreshold):
+    def __init__(self, parameters: list[dict]):
+        super().__init__(parameters)
+    def _compute(self, data: np.array):
+        if self.lower_strict:
+            upper_indexes = np.greater_equal(data, self.upper_bound)
+        else:
+            upper_indexes = np.greater(data, self.upper_bound)
+        if self.upper_strict:
+            lower_indexes = np.less_equal(data, self.lower_bound)
+        else:
+            lower_indexes = np.less(data, self.lower_bound)
+        final_indexes = lower_indexes | upper_indexes
+        return data[final_indexes], final_indexes
+    def _evaluate(self, data: np.array):
+        return True
+    @classmethod
+    def self_describe(cls):
+        return FunctionInfo(
+            name=f"{cls.__qualname__}",
+            function_reference=f"{cls.__module__}.{cls.__qualname__}",
+            parameters=[
+                Parameter("lower_bound", 0.0, "float"),
+                Parameter("upper_bound", 1.0, "float"),
+                Parameter("lower_strict", True, "bool"),
+                Parameter("upper_strict", True, "bool"),
+            ],
+            description="Filters data outside a given interval",
+        ).get_function_info()

sdg_core_lib/post_process/functions/filter/implementation/UpperThreshold.py ADDED Viewed

@@ -0,0 +1,32 @@
+import numpy as np
+from sdg_core_lib.post_process.functions.FunctionInfo import FunctionInfo
+from sdg_core_lib.post_process.functions.Parameter import Parameter
+from sdg_core_lib.post_process.functions.filter.MonoThreshold import MonoThreshold
+class UpperThreshold(MonoThreshold):
+    def __init__(self, parameters: list[dict]):
+        super().__init__(parameters)
+    def _compute(self, data: np.array):
+        if self.strict:
+            indexes = np.less_equal(data, self.value)
+        else:
+            indexes = np.less(data, self.value)
+        return data[indexes], indexes
+    def _evaluate(self, data: np.array):
+        return True
+    @classmethod
+    def self_describe(cls):
+        return FunctionInfo(
+            name=f"{cls.__qualname__}",
+            function_reference=f"{cls.__module__}.{cls.__qualname__}",
+            parameters=[
+                Parameter("value", 0.0, "float"),
+                Parameter("strict", True, "bool"),
+            ],
+            description="Mono-threshold function: picks value less than an upper threshold",
+        ).get_function_info()

sdg_core_lib/post_process/functions/filter/implementation/__init__.py ADDED Viewed

File without changes

sdg_core_lib/preprocess/__init__.py ADDED Viewed

File without changes

sdg_core_lib/preprocess/scale.py ADDED Viewed

@@ -0,0 +1,51 @@
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+import numpy as np
+def standardize_simple_tabular_input(
+    train_data: np.array, test_data: np.array = None
+) -> tuple[StandardScaler, np.array, np.array]:
+    """
+    Standardizes the tabular input data by scaling features to have zero mean and unit variance.
+    :param train_data: A numpy array of shape (batch, features) representing the training data.
+    :param test_data: An optional numpy array of shape (batch, features) representing the test data.
+    :return: A tuple containing the fitted StandardScaler, the standardized training data, and the standardized test data
+             if provided.
+    :raises DataException: If the input data does not have the expected shape.
+    """
+    scaler = StandardScaler()
+    train_data = scaler.fit_transform(train_data)
+    if test_data is not None:
+        test_data = scaler.transform(test_data)
+    return scaler, train_data, test_data
+def standardize_simple_tabular_time_series(
+    train_data: np.array, test_data: np.array = None
+) -> tuple[MinMaxScaler, np.array, np.array]:
+    """
+    Standardizes the time series data by scaling features to have zero mean and unit variance.
+    :param train_data: A numpy array of shape (batch, features, steps) representing the training data.
+    :param test_data: An optional numpy array of shape (batch, features, steps) representing the test data.
+    :return: A tuple containing the fitted StandardScaler, the standardized training data, and the standardized test data
+             if provided.
+    :raises DataException: If the input data does not have the expected shape.
+    """
+    scaler = MinMaxScaler()
+    batch, features, steps = train_data.shape
+    x_reshaped = train_data.transpose(0, 2, 1).reshape(-1, features)
+    x_scaled = scaler.fit_transform(x_reshaped)
+    train_data = x_scaled.reshape(batch, steps, features).transpose(0, 2, 1)
+    if test_data is not None:
+        t_reshaped = test_data.transpose(0, 2, 1).reshape(-1, features)
+        t_scaled = scaler.transform(t_reshaped)
+        test_data = t_scaled.reshape(batch, steps, features).transpose(0, 2, 1)
+    return scaler, train_data, test_data

sdg_core_lib/test/__init__.py ADDED Viewed

File without changes

sdg_core_lib/test/data_generator/__init__.py ADDED Viewed

File without changes

sdg_core_lib/test/data_generator/models/__init__.py ADDED Viewed

File without changes

sdg_core_lib/test/data_generator/models/keras/__init__.py ADDED Viewed

File without changes

sdg_core_lib/test/data_generator/models/keras/implementation/__init__.py ADDED Viewed

File without changes