PyPI - tarandm_analytics - Versions diffs - 1.0.0.dev1__tar.gz → 1.0.0.dev2__tar.gz - Mend

tarandm_analytics 1.0.0.dev1__tar.gz → 1.0.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: tarandm_analytics
-Version: 1.0.0.dev1
+Version: 1.0.0.dev2
 Summary: Package links analytics in Python with TaranDM software.
 Author: Marek Teller
 Author-email: mteller@taran.ai

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tarandm_analytics"
-version = "1.0.0.dev1"
+version = "1.0.0.dev2"
 description = "Package links analytics in Python with TaranDM software."
 authors = ["Marek Teller <mteller@taran.ai>"]
 readme = "README.md"

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/tarandm_analytics/predictive_models/builder.py RENAMED Viewed

@@ -20,10 +20,7 @@ from tarandm_analytics_utils.predictive_models.attribute_preprocessing.attribute
 from tarandm_analytics_utils.predictive_models.attribute_preprocessing.attribute_transformation import (
     AttributeTransformation,
 )
-from tarandm_analytics_utils.predictive_models.extended_predictive_model import (
-    ExtendedPredictiveModel,
-    PredictiveModelType,
-)
+from tarandm_analytics_utils.predictive_models.extended_predictive_model import PredictiveModelType
 from tarandm_analytics_utils.predictive_models.model_description.model_description import (
     AttachedImage,
     PredictiveModelDescription,
@@ -53,6 +50,7 @@ if TYPE_CHECKING:
     import pandas as pd
     import polars as pl
     import numpy as np
+    from tarandm_analytics.predictive_models.extended_predictive_model import ExtendedPredictiveModel
 else:
     RandomForestClassifier = Any
     LogisticRegression = Any
@@ -135,12 +133,7 @@ class PredictiveModelBuilder:
                 model=model,
             )
-        # 2. Serialize model
-        serialized_model = self._get_dumped_model(model, model_type_final)
-        if model_type_final == PredictiveModelType.PMML and "feature_names" not in serialized_model:
-            serialized_model["feature_names"] = attributes
-        # 3. Get descriptive data about data samples used in model development
+        # 2. Get descriptive data about data samples used in model development
         sample_description_data = self._get_data_sample_description(
             data=data,
             column_name_label=label_name,
@@ -148,7 +141,7 @@ class PredictiveModelBuilder:
             column_name_date=column_name_date,
         )
-        # 4. Get model performance over different samples
+        # 3. Get model performance over different samples
         model_performance = self._get_predictive_model_performance(
             data=data,
             column_name_sample=column_name_sample,
@@ -156,7 +149,7 @@ class PredictiveModelBuilder:
             evaluate_performance=evaluate_performance,
         )
-        # 5. Generate images
+        # 4. Generate images
         images = self._generate_images(
             data=data,
             model=model,
@@ -165,7 +158,7 @@ class PredictiveModelBuilder:
             learning_curves_data=learning_curves_data,
         )
-        # 6. Prepare request data
+        # 5. Prepare request data
         return RequestData(
             model_type=model_type_final,
             target_class=target_class,
@@ -481,7 +474,9 @@ class PredictiveModelBuilder:
         return binning
-    def create_model_from_data_frame(self, df: "pl.DataFrame") -> ExtendedPredictiveModel:
+    def create_model_from_data_frame(self, df: "pl.DataFrame") -> "ExtendedPredictiveModel":
+        from tarandm_analytics.predictive_models.extended_predictive_model import ExtendedPredictiveModel
         mandatory_columns = ["attribute", "bin_from", "bin_to", "categories", "value"]
         if any([p not in df.columns for p in mandatory_columns]):
             raise TypeError(f"Expert score csv missing one of the following columns: {mandatory_columns}.")
@@ -622,7 +617,9 @@ class PredictiveModelBuilder:
         return extended_predictive_model
-    def create_model_from_csv(self, filename: Union[io.StringIO, str], delimiter: str = ",") -> ExtendedPredictiveModel:
+    def create_model_from_csv(
+        self, filename: Union[io.StringIO, str], delimiter: str = ","
+    ) -> "ExtendedPredictiveModel":
         """Expert score model can be defined by csv file. Function 'create_model_from_csv' loads expert score model from csv
         and create internal representation of the model compatible with TaranDM.
@@ -652,33 +649,43 @@ class PredictiveModelBuilder:
     def _build_predictive_model(
         self, predictive_model: ModelType, request_data: RequestData
-    ) -> ExtendedPredictiveModel:
-        serialized_model = self._get_dumped_model(
-            model=predictive_model, model_type=request_data.model_type, attributes=request_data.predictors
-        )
+    ) -> "ExtendedPredictiveModel":
+        from tarandm_analytics.predictive_models.extended_predictive_model import ExtendedPredictiveModel
-        # 1. basic validation of provided data
-        if request_data.target_class is None and request_data.model_type == PredictiveModelType.RANDOM_FOREST:
-            try:
-                request_data.target_class = serialized_model["random_forest_model"]["classes_"][-1]
-                logger.warning(
-                    f"Parameter 'target_class' was not provided for RandomForest model. Value was automatically set "
-                    f"to '{request_data.target_class}'"
-                )
-            except Exception as e:
-                raise ValueError(
-                    f"Parameter 'target_class' was not provided for random forest model and could not be "
-                    f"auto-detected (Error in auto-detection: {e})."
-                )
+        if request_data.model_type == PredictiveModelType.EXPERT_SCORE:
+            extended_model = self.create_model_from_data_frame(df=cast("pl.DataFrame", predictive_model))
+            extended_model.target = request_data.label_name
+            extended_model.target_class = request_data.target_class
+            extended_model.description = request_data.description
+            extended_model.performance = request_data.model_performance
+        else:
+            serialized_model = self._get_dumped_model(
+                model=predictive_model, model_type=request_data.model_type, attributes=request_data.predictors
+            )
+            if request_data.model_type == PredictiveModelType.PMML and "feature_names" not in serialized_model:
+                serialized_model["feature_names"] = request_data.predictors
-        # 2. Prepare attribute preprocessing data
-        # for validating if the binning is defined for valid attribute, we first need to get available attributes
-        # (original attributes + attributes created in transformations)
-        attribute_binning_preprocessed = self._prepare_attribute_binning(
-            attribute_binning=request_data.attribute_binning
-        )
+            # 1. basic validation of provided data
+            if request_data.model_type == PredictiveModelType.RANDOM_FOREST and request_data.target_class is None:
+                try:
+                    request_data.target_class = serialized_model["random_forest_model"]["classes_"][-1]
+                    logger.warning(
+                        f"Parameter 'target_class' was not provided for RandomForest model. Value was automatically set "
+                        f"to '{request_data.target_class}'"
+                    )
+                except Exception as e:
+                    raise ValueError(
+                        f"Parameter 'target_class' was not provided for random forest model and could not be "
+                        f"auto-detected (Error in auto-detection: {e})."
+                    )
+            # 2. Prepare attribute preprocessing data
+            # for validating if the binning is defined for valid attribute, we first need to get available attributes
+            # (original attributes + attributes created in transformations)
+            attribute_binning_preprocessed = self._prepare_attribute_binning(
+                attribute_binning=request_data.attribute_binning
+            )
-        if request_data.model_type != PredictiveModelType.EXPERT_SCORE:
             # Detect attributes if not provided
             if request_data.predictors is None or len(request_data.predictors) == 0:
                 request_data.predictors = self.automated_attribute_detection(
@@ -709,36 +716,28 @@ class PredictiveModelBuilder:
                     "dummy_encoding": request_data.dummy_encoding or [],
                 }
             )
+            if request_data.model_type == PredictiveModelType.PMML:
+                extended_model_dict = {
+                    "external_model": serialized_model,
+                    "attributes": serialized_model["feature_names"],
+                    "predictive_model_type": request_data.model_type,
+                    "target": request_data.label_name,
+                }
-        # 3. Build extended model and send its content as response
-        if request_data.model_type == PredictiveModelType.EXPERT_SCORE:
-            extended_model = self.create_model_from_data_frame(df=cast("pl.DataFrame", predictive_model))
-            extended_model.target = request_data.label_name
-            extended_model.target_class = request_data.target_class
-            extended_model.description = request_data.description
-            extended_model.performance = request_data.model_performance
-        elif request_data.model_type == PredictiveModelType.PMML:
-            extended_model_dict = {
-                "external_model": serialized_model,
-                "attributes": serialized_model["feature_names"],
-                "predictive_model_type": request_data.model_type,
-                "target": request_data.label_name,
-            }
-            extended_model = ExtendedPredictiveModel.model_validate(extended_model_dict)
-        else:
-            extended_model_dict = {
-                "external_model": serialized_model,
-                "predictive_model_type": request_data.model_type,
-                "attributes": request_data.predictors,
-                "target": request_data.label_name,
-                "target_class": request_data.target_class,
-                "attribute_preprocessing": attribute_preprocessing,
-                "description": request_data.description,
-                "performance": request_data.model_performance,
-                "monitoring": request_data.monitoring_data,
-            }
-            extended_model = ExtendedPredictiveModel.model_validate(extended_model_dict)
+                extended_model = ExtendedPredictiveModel.model_validate(extended_model_dict)
+            else:
+                extended_model_dict = {
+                    "external_model": serialized_model,
+                    "predictive_model_type": request_data.model_type,
+                    "attributes": request_data.predictors,
+                    "target": request_data.label_name,
+                    "target_class": request_data.target_class,
+                    "attribute_preprocessing": attribute_preprocessing,
+                    "description": request_data.description,
+                    "performance": request_data.model_performance,
+                    "monitoring": request_data.monitoring_data,
+                }
+                extended_model = ExtendedPredictiveModel.model_validate(extended_model_dict)
         return extended_model
     def build(
@@ -767,7 +766,7 @@ class PredictiveModelBuilder:
         evaluate_performance: Optional[Dict[str, Union[str, List[str]]]] = None,
         learning_curves_data: Optional[Dict] = None,
         created_date: Optional[datetime.date] = None,
-    ) -> ExtendedPredictiveModel:
+    ) -> "ExtendedPredictiveModel":
         """
         Function prepares input data for build model zip file, that is ready to be implemented in TaranDM software.
         Created input data will be sent to the TaranDM endpoint, through which final model zip file is returned.
@@ -1152,13 +1151,13 @@ class PredictiveModelBuilder:
                 model_attributes = list(model.feature_names_in_)
             else:
                 raise ValueError(
-                    "Model predictors names were not provided and could not be detected automatically. Model "
-                    "was recognized as scikit-learn estimator. Tried to collect model predictors names from "
+                    "Model attribute names were not provided and could not be detected automatically. Model "
+                    "was recognized as scikit-learn estimator. Tried to collect model attribute names from "
                     "property 'feature_names_in_'. This property is available in scikit-learn since version "
                     "0.24."
                 )
         else:
-            raise ValueError("Model predictors names were not provided and could not be detected automatically.")
+            raise ValueError("Model attribute names were not provided and could not be detected automatically.")
         # We detected feature that enters the model. First, we detect features as they were before dummy encoding
         names_encoded_to_orig = {}
@@ -1176,7 +1175,7 @@ class PredictiveModelBuilder:
                 if binned_attribute_name is not None and binned_attribute_name != binning.attribute:
                     if binned_attribute_name not in model_attributes:
                         raise ValueError(
-                            "Model predictors names were not provided and could not be detected automatically."
+                            "Model attribute names were not provided and could not be detected automatically."
                         )
                     model_attributes = [a for a in model_attributes if a != binned_attribute_name]
                     if binning.attribute not in model_attributes:
@@ -1192,7 +1191,7 @@ class PredictiveModelBuilder:
                 if transformed_attribute_name is not None and transformed_attribute_name != transformation.attribute:
                     if transformed_attribute_name not in model_attributes:
                         raise ValueError(
-                            "Model predictors names were not provided and could not be detected automatically."
+                            "Model attribute names were not provided and could not be detected automatically."
                         )
                     model_attributes = [a for a in model_attributes if a != transformed_attribute_name]
                     if transformation.attribute not in model_attributes:
@@ -1995,7 +1994,7 @@ class PredictiveModelBuilder:
             if len(categories_str) > 30:
                 categories_str = categories_str[0:27] + "..."
             logger.info(
-                f"Target rate for predictor {col_attribute} and group of categories {{{categories_str}}} is zero."
+                f"Target rate for attribute {col_attribute} and group of categories {{{categories_str}}} is zero."
             )
             return bin_frequency / total_count, 0.0
@@ -2033,7 +2032,7 @@ class PredictiveModelBuilder:
                     encoded_feature_name = single_dummy.encoded_feature_name
                     if encoded_feature_name not in model_attrs:
                         logger.warning(
-                            f"Dummy encoding for predictor {attr} defines feature {encoded_feature_name}. This "
+                            f"Dummy encoding for attribute {attr} defines feature {encoded_feature_name}. This "
                             f"feature is not used in model. Please check dummy encoding for typos."
                         )
                     else:
@@ -2084,7 +2083,7 @@ class PredictiveModelBuilder:
         elif set(attributes) != set(orig_attrs):
             logger.warning(
                 f"Expected original features (features before transformation and encodings) are different "
-                f"from predictors provided in 'attributes' parameter. Expected: {orig_attrs};   Provided: "
+                f"from attributes provided in 'attributes' parameter. Expected: {orig_attrs};   Provided: "
                 f"{attributes}. Expected original features will be used in exported model. Please check "
                 f"that this is a correct behavior."
             )
@@ -2169,6 +2168,8 @@ class PredictiveModelBuilder:
         return unique_attributes
     def expert_score_model_dump(self, model: "pl.DataFrame") -> Dict[str, Any]:
+        import polars as pl
         unique_attributes = self._get_unique_values_from_df_col(model, "attribute")
         feature_names = self._get_unique_values_from_df_col(model.filter(pl.col("is_intercept") == 0), "attribute")

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/tarandm_analytics/predictive_models/expert_score.py RENAMED Viewed

@@ -6,7 +6,7 @@ from tarandm_analytics.predictive_models.abstract_predictive_model import Abstra
 class ModelExpertScore(AbstractPredictiveModel):
-    """Internal representation of expertly defined score. Expert score defines binning of predictors and assigns a value
+    """Internal representation of expertly defined score. Expert score defines binning of attributes and assigns a value
     to every bin. Prediction is simple sum of assigned values.
     """
@@ -48,13 +48,13 @@ class ModelExpertScore(AbstractPredictiveModel):
             values = attribute_values.get(feature, None)
             if not isinstance(values, list):
                 raise TypeError(
-                    f"'predict_batch' method expect values of each predictor provided in list. Values for "
+                    f"'predict_batch' method expect values of each attribute provided in list. Values for "
                     f"{feature} was provided as {type(values)}."
                 )
             elif not all(isinstance(val, (int, float)) for val in values):
                 raise TypeError("Some of values provided to predict_batch method are not numerical.")
             elif n_obs != len(values):
-                raise ValueError("Number of values provided to predict_batch is inconsistent across predictors.")
+                raise ValueError("Number of values provided to predict_batch is inconsistent across attributes.")
             data_for_predict.append(values)
         intercept = self.intercept or 0.0

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/tarandm_analytics/predictive_models/extended_predictive_model.py RENAMED Viewed

@@ -2,8 +2,9 @@
 #   duplication or any other usage without previous written agreement of Taran Advisory is
 #   prohibited.
+from enum import Enum
 from io import StringIO
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 from pydantic import Field
@@ -17,6 +18,10 @@ from tarandm_analytics_utils.predictive_models.extended_predictive_model import
     PredictiveModelType,
 )
 from tarandm_analytics_utils.utils.dump import safe_dumps_json
+from tarandm_analytics_utils.predictive_models.attribute_preprocessing.attribute_binning import AttributeDataType
+from tarandm_analytics_utils.predictive_models.attribute_preprocessing.attribute_transformation import (
+    AttributeTransformation,
+)
 class ExtendedPredictiveModel(AbstractExtendedPredictiveModel):
@@ -133,3 +138,100 @@ class ExtendedPredictiveModel(AbstractExtendedPredictiveModel):
             }
         return {}
+    def convert_attribute(self, attribute_value: Any) -> Any:
+        if isinstance(attribute_value, Enum):
+            return attribute_value.value
+        return attribute_value
+    def apply_attribute_transformation(
+        self, transformation: AttributeTransformation, attribute_value: Union[int, float]
+    ) -> Optional[Union[int, float]]:
+        raise NotImplementedError()  # Implemented only in core because of QueryEvaluator
+    def prepare_all_preprocessed_attributes(self, attribute_values: Dict[str, Any]) -> Dict[str, Any]:  # noqa: C901
+        """
+        Method to preprocess attributes - convert from Money and apply binning/transformations if needed
+        """
+        attribute_values_preprocessed = {}
+        # convert money to float
+        for attribute in attribute_values:
+            attribute_values_preprocessed[attribute] = self.convert_attribute(attribute_values[attribute])
+            # if boolean values convert to either "true"/"false" (CATEGORICAL) or 1/0 (NUMERICAL)
+            if isinstance(attribute_values_preprocessed[attribute], bool):
+                binning_found = False
+                for binning in self.attribute_preprocessing.binning:
+                    if binning.attribute == attribute:
+                        if binning.attribute_data_type == AttributeDataType.CATEGORICAL:
+                            attribute_values_preprocessed[attribute] = str(attribute_values_preprocessed[attribute])
+                            binning_found = True
+                        else:
+                            attribute_values_preprocessed[attribute] = int(attribute_values_preprocessed[attribute])
+                            binning_found = True
+                if not binning_found:
+                    attribute_values_preprocessed[attribute] = int(attribute_values_preprocessed[attribute])
+        # apply transformations
+        if self.attribute_preprocessing is not None and self.attribute_preprocessing.transformations is not None:
+            for transformation in self.attribute_preprocessing.transformations:
+                if transformation.attribute in attribute_values_preprocessed.keys():
+                    attribute_values_preprocessed[
+                        transformation.transformed_attribute_name or transformation.attribute
+                    ] = self.apply_attribute_transformation(
+                        transformation=transformation,
+                        attribute_value=attribute_values_preprocessed[transformation.attribute],
+                    )
+        # apply binning
+        if self.attribute_preprocessing is not None and self.attribute_preprocessing.binning is not None:
+            for binning in self.attribute_preprocessing.binning:
+                if binning.attribute in attribute_values_preprocessed.keys():
+                    attribute_values_preprocessed[binning.binned_attribute_name or binning.attribute] = (
+                        self.attribute_preprocessing.apply_attribute_binning(
+                            attribute=binning.attribute,
+                            attribute_value=attribute_values_preprocessed[binning.attribute],
+                        )
+                    )
+        # apply dummy encoding
+        if self.attribute_preprocessing is not None:
+            for dummy_encoding in self.attribute_preprocessing.dummy_encoding:
+                attribute_value = attribute_values_preprocessed[dummy_encoding.attribute]
+                encoding = self.attribute_preprocessing.apply_dummy_encoding(
+                    attribute=dummy_encoding.attribute, attribute_value=attribute_value
+                )
+                if encoding is not None:
+                    attribute_values_preprocessed.update(encoding)
+        return attribute_values_preprocessed
+    def filter_final_model_attributes(self, attribute_values_preprocessed: Dict[str, Any]) -> Dict[str, Any]:
+        # return only attributes that enter final model
+        model_attribute_values_preprocessed = {}
+        for attr in self.external_model.feature_names:
+            model_attribute_values_preprocessed[attr] = attribute_values_preprocessed[attr]
+        return model_attribute_values_preprocessed
+    def apply_preprocessing(self, attribute_values: Dict[str, Any]) -> Dict[str, Any]:  # noqa: C901
+        all_preprocessed_attributes = self.prepare_all_preprocessed_attributes(attribute_values=attribute_values)
+        return self.filter_final_model_attributes(attribute_values_preprocessed=all_preprocessed_attributes)
+    def predict(self, attribute_values: Dict[str, Any]) -> Tuple[Optional[float], Dict[str, Any]]:
+        """
+        Method predict is responsible for computing prediction of models.
+        :param attribute_values: Dictionary {attribute: its value}.
+        :return: Prediction of the models.
+        """
+        attribute_values_preprocessed = self.apply_preprocessing(attribute_values=attribute_values)
+        if self.external_model is not None:
+            return (
+                self.external_model.predict(attribute_values=attribute_values_preprocessed),
+                attribute_values_preprocessed,
+            )
+        raise Exception("Model is not defined")

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/tarandm_analytics/predictive_models/extreme_gradient_boosting.py RENAMED Viewed

@@ -89,14 +89,14 @@ class ModelXGBoost(AbstractPredictiveModel):
             values = attribute_values.get(feature, None)
             if not isinstance(values, list):
                 raise TypeError(
-                    f"'predict_batch' method expect values of each predictor provided in list. Values for "
+                    f"'predict_batch' method expect values of each attribute provided in list. Values for "
                     f"{feature} was provided as {type(values)}."
                 )
             elif not all(val is None or isinstance(val, (int, float)) for val in values):
                 raise TypeError("Some of values provided to predict_batch method are not numerical or None.")
             elif n_obs is not None and n_obs != len(values):
                 raise ValueError(
-                    "Values provided to predict_batch method do not have the same size for all predictors."
+                    "Values provided to predict_batch method do not have the same size for all attributes."
                 )
             data_for_predict.append(values)

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/tarandm_analytics/predictive_models/logistic_regression.py RENAMED Viewed

@@ -94,14 +94,14 @@ class ModelLogisticRegression(AbstractPredictiveModel):
             values = attribute_values.get(feature, None)
             if not isinstance(values, list):
                 raise TypeError(
-                    f"'predict_batch' method expect values of each predictor provided in list. Values for "
+                    f"'predict_batch' method expect values of each attribute provided in list. Values for "
                     f"{feature} was provided as {type(values)}."
                 )
             elif not all(isinstance(val, (int, float)) for val in values):
                 raise TypeError("Some of values provided to predict_batch method are not numerical.")
             elif n_obs is not None and n_obs != len(values):
                 raise ValueError(
-                    "Values provided to predict_batch method do not have the same size for all predictors."
+                    "Values provided to predict_batch method do not have the same size for all attributes."
                 )
             data_for_predict.append(values)

{tarandm_analytics-1.0.0.dev1 → tarandm_analytics-1.0.0.dev2}/tarandm_analytics/predictive_models/random_forest.py RENAMED Viewed

@@ -76,14 +76,14 @@ class ModelRandomForest(AbstractPredictiveModel):
             values = attribute_values.get(feature, None)
             if not isinstance(values, list):
                 raise TypeError(
-                    f"'predict_batch' method expect values of each predictor provided in list. Values for "
+                    f"'predict_batch' method expect values of each attribute provided in list. Values for "
                     f"{feature} was provided as {type(values)}."
                 )
             elif not all(isinstance(val, (int, float)) for val in values):
                 raise TypeError("Some of values provided to predict_batch method are not numerical.")
             elif n_obs is not None and n_obs != len(values):
                 raise ValueError(
-                    "Values provided to predict_batch method do not have the same size for all predictors."
+                    "Values provided to predict_batch method do not have the same size for all attributes."
                 )
             data_for_predict.append(values)