rock-physics-open 0.0 (rock_physics_open-0.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rock-physics-open might be problematic.
- rock_physics_open/__init__.py +0 -0
- rock_physics_open/equinor_utilities/__init__.py +0 -0
- rock_physics_open/equinor_utilities/anisotropy.py +162 -0
- rock_physics_open/equinor_utilities/classification_functions/__init__.py +17 -0
- rock_physics_open/equinor_utilities/classification_functions/class_stats.py +58 -0
- rock_physics_open/equinor_utilities/classification_functions/lin_class.py +47 -0
- rock_physics_open/equinor_utilities/classification_functions/mahal_class.py +56 -0
- rock_physics_open/equinor_utilities/classification_functions/norm_class.py +65 -0
- rock_physics_open/equinor_utilities/classification_functions/poly_class.py +40 -0
- rock_physics_open/equinor_utilities/classification_functions/post_prob.py +26 -0
- rock_physics_open/equinor_utilities/classification_functions/two_step_classification.py +46 -0
- rock_physics_open/equinor_utilities/conversions.py +10 -0
- rock_physics_open/equinor_utilities/gen_utilities/__init__.py +11 -0
- rock_physics_open/equinor_utilities/gen_utilities/dict_to_float.py +33 -0
- rock_physics_open/equinor_utilities/gen_utilities/dim_check_vector.py +83 -0
- rock_physics_open/equinor_utilities/gen_utilities/filter_input.py +126 -0
- rock_physics_open/equinor_utilities/gen_utilities/filter_output.py +78 -0
- rock_physics_open/equinor_utilities/machine_learning_utilities/__init__.py +14 -0
- rock_physics_open/equinor_utilities/machine_learning_utilities/dummy_vars.py +42 -0
- rock_physics_open/equinor_utilities/machine_learning_utilities/exponential_model.py +119 -0
- rock_physics_open/equinor_utilities/machine_learning_utilities/import_ml_models.py +61 -0
- rock_physics_open/equinor_utilities/machine_learning_utilities/run_regression.py +151 -0
- rock_physics_open/equinor_utilities/machine_learning_utilities/sigmoidal_model.py +188 -0
- rock_physics_open/equinor_utilities/snapshot_test_utilities/__init__.py +10 -0
- rock_physics_open/equinor_utilities/snapshot_test_utilities/compare_snapshots.py +145 -0
- rock_physics_open/equinor_utilities/snapshot_test_utilities/snapshots.py +54 -0
- rock_physics_open/equinor_utilities/std_functions/__init__.py +43 -0
- rock_physics_open/equinor_utilities/std_functions/backus_ave.py +53 -0
- rock_physics_open/equinor_utilities/std_functions/dvorkin_nur.py +69 -0
- rock_physics_open/equinor_utilities/std_functions/gassmann.py +140 -0
- rock_physics_open/equinor_utilities/std_functions/hashin_shtrikman.py +195 -0
- rock_physics_open/equinor_utilities/std_functions/hertz_mindlin.py +43 -0
- rock_physics_open/equinor_utilities/std_functions/moduli_velocity.py +51 -0
- rock_physics_open/equinor_utilities/std_functions/reflection_eq.py +98 -0
- rock_physics_open/equinor_utilities/std_functions/rho.py +59 -0
- rock_physics_open/equinor_utilities/std_functions/voigt_reuss_hill.py +128 -0
- rock_physics_open/equinor_utilities/std_functions/walton.py +38 -0
- rock_physics_open/equinor_utilities/std_functions/wood_brie.py +77 -0
- rock_physics_open/equinor_utilities/various_utilities/Equinor_logo.gif +0 -0
- rock_physics_open/equinor_utilities/various_utilities/Equinor_logo.ico +0 -0
- rock_physics_open/equinor_utilities/various_utilities/__init__.py +24 -0
- rock_physics_open/equinor_utilities/various_utilities/display_result_statistics.py +83 -0
- rock_physics_open/equinor_utilities/various_utilities/gassmann_dry_mod.py +37 -0
- rock_physics_open/equinor_utilities/various_utilities/gassmann_mod.py +37 -0
- rock_physics_open/equinor_utilities/various_utilities/gassmann_sub_mod.py +53 -0
- rock_physics_open/equinor_utilities/various_utilities/hs_average.py +40 -0
- rock_physics_open/equinor_utilities/various_utilities/pressure.py +88 -0
- rock_physics_open/equinor_utilities/various_utilities/reflectivity.py +85 -0
- rock_physics_open/equinor_utilities/various_utilities/timeshift.py +91 -0
- rock_physics_open/equinor_utilities/various_utilities/vp_vs_rho_set_statistics.py +154 -0
- rock_physics_open/equinor_utilities/various_utilities/vrh_3_min.py +61 -0
- rock_physics_open/fluid_models/__init__.py +9 -0
- rock_physics_open/fluid_models/brine_model/__init__.py +5 -0
- rock_physics_open/fluid_models/brine_model/brine_properties.py +143 -0
- rock_physics_open/fluid_models/gas_model/__init__.py +5 -0
- rock_physics_open/fluid_models/gas_model/gas_properties.py +277 -0
- rock_physics_open/fluid_models/oil_model/__init__.py +5 -0
- rock_physics_open/fluid_models/oil_model/dead_oil_density.py +60 -0
- rock_physics_open/fluid_models/oil_model/dead_oil_velocity.py +28 -0
- rock_physics_open/fluid_models/oil_model/live_oil_density.py +79 -0
- rock_physics_open/fluid_models/oil_model/live_oil_velocity.py +24 -0
- rock_physics_open/fluid_models/oil_model/oil_bubble_point.py +69 -0
- rock_physics_open/fluid_models/oil_model/oil_properties.py +114 -0
- rock_physics_open/sandstone_models/__init__.py +57 -0
- rock_physics_open/sandstone_models/cemented_shalysand_sandyshale_models.py +304 -0
- rock_physics_open/sandstone_models/constant_cement_models.py +204 -0
- rock_physics_open/sandstone_models/constant_cement_optimisation.py +122 -0
- rock_physics_open/sandstone_models/contact_cement_model.py +138 -0
- rock_physics_open/sandstone_models/curvefit_sandstone_models.py +143 -0
- rock_physics_open/sandstone_models/friable_models.py +178 -0
- rock_physics_open/sandstone_models/friable_optimisation.py +112 -0
- rock_physics_open/sandstone_models/friable_shalysand_sandyshale_models.py +235 -0
- rock_physics_open/sandstone_models/patchy_cement_fluid_substitution_model.py +477 -0
- rock_physics_open/sandstone_models/patchy_cement_model.py +286 -0
- rock_physics_open/sandstone_models/patchy_cement_optimisation.py +251 -0
- rock_physics_open/sandstone_models/unresolved_cemented_sandshale_models.py +134 -0
- rock_physics_open/sandstone_models/unresolved_friable_sandshale_models.py +126 -0
- rock_physics_open/shale_models/__init__.py +19 -0
- rock_physics_open/shale_models/dem.py +174 -0
- rock_physics_open/shale_models/dem_dual_por.py +61 -0
- rock_physics_open/shale_models/kus_tok.py +59 -0
- rock_physics_open/shale_models/multi_sca.py +133 -0
- rock_physics_open/shale_models/pq.py +102 -0
- rock_physics_open/shale_models/sca.py +90 -0
- rock_physics_open/shale_models/shale4_mineral.py +147 -0
- rock_physics_open/shale_models/shale4_mineral_dem_overlay.py +92 -0
- rock_physics_open/span_wagner/__init__.py +5 -0
- rock_physics_open/span_wagner/co2_properties.py +438 -0
- rock_physics_open/span_wagner/coefficients.py +165 -0
- rock_physics_open/span_wagner/equations.py +104 -0
- rock_physics_open/span_wagner/tables/__init__.py +0 -0
- rock_physics_open/span_wagner/tables/carbon_dioxide_density.npz +0 -0
- rock_physics_open/span_wagner/tables/lookup_table.py +33 -0
- rock_physics_open/t_matrix_models/Equinor_logo.ico +0 -0
- rock_physics_open/t_matrix_models/__init__.py +45 -0
- rock_physics_open/t_matrix_models/carbonate_pressure_substitution.py +124 -0
- rock_physics_open/t_matrix_models/curvefit_t_matrix_exp.py +124 -0
- rock_physics_open/t_matrix_models/curvefit_t_matrix_min.py +86 -0
- rock_physics_open/t_matrix_models/opt_subst_utilities.py +415 -0
- rock_physics_open/t_matrix_models/parse_t_matrix_inputs.py +297 -0
- rock_physics_open/t_matrix_models/run_t_matrix.py +243 -0
- rock_physics_open/t_matrix_models/t_matrix_C.py +210 -0
- rock_physics_open/t_matrix_models/t_matrix_opt_fluid_sub_exp.py +137 -0
- rock_physics_open/t_matrix_models/t_matrix_opt_fluid_sub_petec.py +163 -0
- rock_physics_open/t_matrix_models/t_matrix_opt_forward_model_exp.py +72 -0
- rock_physics_open/t_matrix_models/t_matrix_opt_forward_model_min.py +86 -0
- rock_physics_open/t_matrix_models/t_matrix_parameter_optimisation_exp.py +172 -0
- rock_physics_open/t_matrix_models/t_matrix_parameter_optimisation_min.py +159 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/__init__.py +12 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/array_functions.py +75 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_c_eff.py +163 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_isolated.py +95 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_kd.py +40 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_kd_eff.py +116 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_kd_uuv.py +18 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_pressure.py +140 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_t.py +71 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_td.py +42 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_theta.py +43 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_x.py +33 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/calc_z.py +50 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/check_and_tile.py +43 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/g_tensor.py +140 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/iso_av.py +60 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/iso_ave_all.py +55 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/pressure_input.py +44 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/t_matrix_vec.py +278 -0
- rock_physics_open/t_matrix_models/t_matrix_vector/velocity_vti_angles.py +81 -0
- rock_physics_open/t_matrix_models/tmatrix_python.dll +0 -0
- rock_physics_open/t_matrix_models/tmatrix_python.so +0 -0
- rock_physics_open/ternary_plots/__init__.py +3 -0
- rock_physics_open/ternary_plots/gen_ternary_plot.py +73 -0
- rock_physics_open/ternary_plots/shale_prop_ternary.py +337 -0
- rock_physics_open/ternary_plots/ternary_patches.py +277 -0
- rock_physics_open/ternary_plots/ternary_plot_utilities.py +197 -0
- rock_physics_open/ternary_plots/unconventionals_ternary.py +75 -0
- rock_physics_open/version.py +21 -0
- rock_physics_open-0.0.dist-info/METADATA +92 -0
- rock_physics_open-0.0.dist-info/RECORD +142 -0
- rock_physics_open-0.0.dist-info/WHEEL +5 -0
- rock_physics_open-0.0.dist-info/licenses/LICENSE +165 -0
- rock_physics_open-0.0.dist-info/top_level.txt +1 -0
rock_physics_open/equinor_utilities/gen_utilities/filter_input.py
@@ -0,0 +1,126 @@
+from sys import byteorder
+
+import numpy as np
+import pandas as pd
+
+WRONG_BYTEORDER = ">" if byteorder == "little" else "<"
+
+
+def filter_input_log(
+    args, working_int=None, negative=False, no_zero=False, positive=True
+):
+    """
+    Check for valid input values in numpy arrays or pandas data frames. Default behaviour is to
+    identify missing values - assumed to be NaN and Inf. Other conditions
+    can be stated in the keyword arguments. Unknown conditions are ignored and a warning
+    is issued. Run dim_check_vector to make sure that all inputs have the same length.
+    Erroneous values in a sample in one log will remove the sample from all the logs.
+    All inputs must have the same array length (data frames the same number of indices).
+
+    Parameters
+    ----------
+    args : list or tuple or np.ndarray or pd.DataFrame
+        Inputs to be filtered, single array or dataframe or lists of arrays or data frames.
+    working_int : np.ndarray
+        Valid positions are shown as values > 0.
+    negative : bool
+        Positive values are excluded (zero values are retained).
+    no_zero : bool
+        Zero values are excluded.
+    positive : bool
+        Negative values are excluded.
+
+    Returns
+    -------
+    tuple
+        idx, output_args : (np.ndarray, list)
+        indices of valid values [bool],
+        list of input arrays at valid indices.
+    """
+    type_error = "filter_input_log: unknown input data type: {}".format(type(args))
+    size_error = "filter_input_log: inputs of different length"
+
+    if not isinstance(args, (list, tuple, np.ndarray, pd.DataFrame)):
+        raise ValueError(type_error)
+
+    # Make sure that 'args' is iterable
+    if isinstance(args, (np.ndarray, pd.DataFrame)):
+        args = [args]
+
+    # Input tuple
+    if isinstance(args, tuple):
+        args = list(args)
+
+    # Need to preserve original inputs
+    input_args = args.copy()
+
+    # Test that inputs are of the right types and the same length
+    if not np.all([isinstance(log, (np.ndarray, pd.DataFrame)) for log in args]):
+        raise ValueError(type_error)
+    if not np.all([log.shape[0] == args[0].shape[0] for log in args]):
+        raise ValueError(size_error)
+
+    # Generate pandas series from numpy arrays
+    args = [pd.Series(log) if isinstance(log, np.ndarray) else log for log in args]
+    # Merge into a data frame
+    logs = pd.concat(args, axis=1)
+
+    # If any of the input logs are of type boolean, False means that they should not be included,
+    # regardless of filter flags
+    # https://github.com/pandas-dev/pandas/issues/32432
+    # idx = ~logs.any(bool_only=True, axis=1)
+    # Need to do it the cumbersome way for the time being
+    bool_col = logs.dtypes.apply(lambda dtype: dtype == "bool")
+    if any(bool_col):
+        idx = ~logs.loc[:, logs.columns[bool_col]].any(axis=1)
+        logs.drop(columns=logs.columns[bool_col], inplace=True)
+    else:
+        idx = pd.Series(index=logs.index, data=np.zeros_like(logs.index).astype(bool))
+
+    # Standard checks: NaN and Inf
+    idx = np.logical_or(idx, logs.isna().any(axis=1))
+    idx = np.logical_or(idx, logs.isin([np.inf, -np.inf]).any(axis=1))
+
+    # Remove columns with dtype that is not numeric
+    obj_col = [dd.kind not in ["i", "u", "f", "c"] for dd in logs.dtypes]
+    logs.drop(columns=logs.columns[obj_col], inplace=True)
+
+    # Checks according to the input options
+    # Only consider working interval if it is included or set to some value
+    if working_int is not None and not np.all(working_int == 0):
+        idx = np.logical_or(idx, working_int == 0)
+    if negative:
+        # noinspection PyTypeChecker
+        idx = np.logical_or(idx, (logs >= 0.0).all(axis=1))
+        # idx = np.logical_or(idx, logs.loc[logs > 0.0]).any(axis=1)
+    if no_zero:
+        idx = np.logical_or(idx, (logs == 0.0).any(axis=1))
+    if positive:
+        # noinspection PyTypeChecker
+        idx = np.logical_or(idx, (logs < 0.0).any(axis=1))
+
+    # Negate idx to identify samples to retain
+    idx = np.logical_not(idx)
+    num_valid_samples = idx.sum()
+    if num_valid_samples == 0:
+        raise ValueError("No acceptable input values")
+    for i in range(len(input_args)):
+        if isinstance(input_args[i], np.ndarray):
+            input_args[i] = input_args[i][idx]
+        else:  # data frame
+            # https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#byte-ordering-issues
+
+            check_type = (
+                np.array([col_type.byteorder for col_type in input_args[i].dtypes])
+                == WRONG_BYTEORDER
+            )
+            if np.any(check_type):
+                tmp_array = (
+                    input_args[i].to_numpy().byteswap().newbyteorder().astype(float)
+                )
+                cols = input_args[i].columns
+                for j in range(check_type.shape[0]):
+                    if check_type[j]:
+                        input_args[i][cols[j]] = tmp_array[:, j]
+            input_args[i] = input_args[i].loc[idx]
+    return np.array(idx), input_args
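To make the contract concrete, here is a minimal usage sketch. The arrays and values are invented for illustration, and the import path assumes filter_input_log is re-exported by the gen_utilities __init__, as the file listing above suggests:

import numpy as np

from rock_physics_open.equinor_utilities.gen_utilities import filter_input_log

# Two logs of equal length; sample 1 is NaN and sample 3 is negative
vp = np.array([2500.0, np.nan, 2700.0, -1.0, 2900.0])
rho = np.array([2.1, 2.2, 2.3, 2.4, 2.5])

# With the default positive=True, both the NaN and the negative sample
# are removed from every log
idx, (vp_ok, rho_ok) = filter_input_log([vp, rho])
print(idx)    # [ True False  True False  True]
print(vp_ok)  # [2500. 2700. 2900.]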
rock_physics_open/equinor_utilities/gen_utilities/filter_output.py
@@ -0,0 +1,78 @@
+from sys import byteorder
+
+import numpy as np
+import pandas as pd
+
+WRONG_BYTEORDER = ">" if byteorder == "little" else "<"
+
+
+def filter_output(idx_inp, inp_log):
+    """
+    Function to restore outputs from a plugin to original length and
+    with values at correct positions. The logs are assumed to have gone through
+    matching input filtering done by gen_utilities.filter_input_log earlier.
+
+    Parameters
+    ----------
+    idx_inp: np.ndarray
+        Boolean array that is True at locations to be filled; the length of idx_inp is the
+        length of the returned arrays or data frames.
+    inp_log: tuple or list or np.ndarray or pd.DataFrame
+        Input numpy array(s) or pandas data frame(s), single or in a list or tuple, that are
+        to be expanded to the original length.
+
+    Returns
+    -------
+    return_logs : list
+        Expanded inputs.
+    """
+
+    def _expand_array(idx, inp_single_log):
+        logs = np.ones(idx.shape, dtype=float) * np.nan
+        try:
+            logs[idx] = inp_single_log.flatten()
+        except ValueError:
+            # Assume that the dtype of the input log is not fit for casting to float, set to object and retry
+            logs = logs.astype(object)
+            logs[idx] = inp_single_log
+        return logs.reshape(idx.shape)
+
+    def _expand_df(idx, inp_df):
+        logs = pd.DataFrame(columns=inp_df.columns, index=np.arange(idx.shape[0]))
+        logs.loc[idx] = inp_df
+        return logs
+
+    if not isinstance(inp_log, (list, tuple, np.ndarray, pd.DataFrame)):
+        raise ValueError(
+            "filter_output: unknown input data type: {}".format(type(inp_log))
+        )
+    if not isinstance(idx_inp, (list, np.ndarray)):
+        raise ValueError(
+            "filter_output: unknown filter array data type: {}".format(type(idx_inp))
+        )
+
+    # Make iterable in case of single input
+    if isinstance(inp_log, (np.ndarray, pd.DataFrame)):
+        inp_log = [inp_log]
+    if isinstance(idx_inp, np.ndarray):
+        idx_inp = [idx_inp]
+
+    # Possible to simplify?
+    if len(idx_inp) != len(inp_log):
+        if len(idx_inp) == 1:
+            idx_inp = idx_inp * len(inp_log)
+        else:
+            raise ValueError(
+                "filter_output: mismatch between length of filter arrays and inputs: {} and {}".format(
+                    len(idx_inp), len(inp_log)
+                )
+            )
+
+    return_logs = []
+    for this_idx, this_log in zip(idx_inp, inp_log):
+        if isinstance(this_log, np.ndarray):
+            return_logs.append(_expand_array(this_idx, this_log))
+        elif isinstance(this_log, pd.DataFrame):
+            return_logs.append(_expand_df(this_idx, this_log))
+
+    return return_logs
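filter_input_log and filter_output are designed as a round trip: compute only on the valid samples, then expand the result back to the original length with NaN at the filtered positions. A small sketch with invented values:

import numpy as np

from rock_physics_open.equinor_utilities.gen_utilities import (
    filter_input_log,
    filter_output,
)

vp = np.array([2500.0, np.nan, 2700.0])
idx, (vp_ok,) = filter_input_log(vp)

# Some computation on the valid samples only
impedance_ok = vp_ok * 2.3

# Expanded back to length 3, with NaN where the input was rejected
(impedance,) = filter_output(idx, impedance_ok)
print(impedance)  # [5750.   nan 6210.]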
rock_physics_open/equinor_utilities/machine_learning_utilities/__init__.py
@@ -0,0 +1,14 @@
+from .dummy_vars import generate_dummy_vars
+from .exponential_model import CarbonateExponentialPressure
+from .import_ml_models import import_model
+from .run_regression import run_regression
+from .sigmoidal_model import CarbonateSigmoidalPressure, Sigmoid
+
+__all__ = [
+    "generate_dummy_vars",
+    "CarbonateExponentialPressure",
+    "import_model",
+    "run_regression",
+    "CarbonateSigmoidalPressure",
+    "Sigmoid",
+]
rock_physics_open/equinor_utilities/machine_learning_utilities/dummy_vars.py
@@ -0,0 +1,42 @@
+import numpy as np
+from sklearn.preprocessing import OneHotEncoder
+
+
+def generate_dummy_vars(inp_frame, class_var, ohe=None):
+    """
+    From categorical variables generate a one-hot-encoder, i.e. each value in the categorical variable becomes a binary
+    variable. See sklearn.preprocessing.OneHotEncoder.
+
+    Parameters
+    ----------
+    inp_frame : pd.DataFrame
+        Input data containing categorical variables.
+    class_var : str
+        Name of categorical variable.
+    ohe : preprocessing.OneHotEncoder
+        One-hot-encoder object.
+
+    Returns
+    -------
+    dum_features, no_dummy_cols, dum_var_names : (np.ndarray, int, np.ndarray)
+        dum_features: 2D array with transformed dummy variables, no_dummy_cols: number of columns in returned array,
+        dum_var_names: automatically generated feature names.
+    """
+    from pandas.api.types import is_numeric_dtype
+
+    if is_numeric_dtype(inp_frame[class_var]):
+        # Make sure that the chosen indicator variable contains discrete values
+        inp_frame = inp_frame.astype({class_var: "int32"})
+
+    features_in = np.array(inp_frame[class_var]).reshape(-1, 1)
+
+    if ohe is None:
+        classes = features_in
+        ohe = OneHotEncoder(categories="auto", sparse_output=False)
+        ohe.fit(classes)
+
+    dum_features = ohe.transform(features_in)
+    no_dummy_cols = dum_features.shape[1]
+    dum_var_names = ohe.get_feature_names_out()
+
+    return dum_features, no_dummy_cols, dum_var_names
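A short sketch of what generate_dummy_vars produces (values invented). Note that the feature names follow OneHotEncoder's auto-generated x0_* pattern, which is what run_regression later filters on with match(r"x\d", ...):

import pandas as pd

from rock_physics_open.equinor_utilities.machine_learning_utilities import (
    generate_dummy_vars,
)

df = pd.DataFrame({"facies": [0, 1, 2, 1, 0]})
dum, n_cols, names = generate_dummy_vars(df, "facies")
print(n_cols)  # 3 - one binary column per distinct facies value
print(names)   # ['x0_0' 'x0_1' 'x0_2']
print(dum[0])  # [1. 0. 0.]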
rock_physics_open/equinor_utilities/machine_learning_utilities/exponential_model.py
@@ -0,0 +1,119 @@
+import pickle
+from typing import Union
+
+import numpy as np
+
+
+def _verify_input(inp_arr):
+    if isinstance(inp_arr, np.ndarray) and not (
+        inp_arr.ndim == 2 and inp_arr.shape[1] == 3
+    ):
+        raise ValueError(
+            "Input to predict method should be an nx3 numpy array with columns velocity, in situ "
+            "pressure and depleted pressure"
+        )
+
+
+class CarbonateExponentialPressure:
+    def __init__(
+        self,
+        a_factor: float = None,
+        b_factor: float = None,
+        model_max_pressure: float = None,
+        description: str = "",
+    ):
+        self._a_factor = a_factor
+        self._b_factor = b_factor
+        self._model_max_pressure = model_max_pressure
+        self._description = description
+
+    def todict(self):
+        return {
+            "a_factor": self._a_factor,
+            "b_factor": self._b_factor,
+            "model_max_pressure": self._model_max_pressure,
+            "description": self._description,
+        }
+
+    @property
+    def a_factor(self) -> float:
+        return self._a_factor
+
+    @property
+    def b_factor(self) -> float:
+        return self._b_factor
+
+    @property
+    def max_pressure(self) -> float:
+        return self._model_max_pressure
+
+    @property
+    def description(self) -> str:
+        return self._description
+
+    def predict(self, inp_arr: np.ndarray) -> Union[np.ndarray, None]:
+        _verify_input(inp_arr)
+        if not self._valid():
+            return None
+        vel = inp_arr[:, 0]
+        eff_pres_in_situ = inp_arr[:, 1]
+        eff_pres_depl = inp_arr[:, 2]
+        # Return differential velocity to match alternative models
+        return (
+            vel
+            * (1.0 - self._a_factor * np.exp(-eff_pres_depl / self._b_factor))
+            / (1.0 - self._a_factor * np.exp(-eff_pres_in_situ / self._b_factor))
+            - vel
+        )
+
+    def predict_max(self, inp_arr: np.ndarray) -> Union[np.ndarray, None]:
+        _verify_input(inp_arr)
+        if not self._valid():
+            return None
+        vel = inp_arr[:, 0]
+        eff_pres_in_situ = inp_arr[:, 1]
+        return (
+            vel
+            * (
+                1.0
+                - self._a_factor * np.exp(-self._model_max_pressure / self._b_factor)
+            )
+            / (1.0 - self._a_factor * np.exp(-eff_pres_in_situ / self._b_factor))
+        )
+
+    def predict_abs(self, inp_arr: np.ndarray) -> Union[np.ndarray, None]:
+        _verify_input(inp_arr)
+        if not self._valid():
+            return None
+        vel = inp_arr[:, 0]
+        eff_pres_in_situ = inp_arr[:, 1]
+        eff_pres_depl = inp_arr[:, 2]
+        return (
+            vel
+            * (1.0 - self._a_factor * np.exp(-eff_pres_depl / self._b_factor))
+            / (1.0 - self._a_factor * np.exp(-eff_pres_in_situ / self._b_factor))
+        )
+
+    def save(self, file):
+        with open(file, "wb") as f_out:
+            pickle.dump(self.todict(), f_out)
+
+    @classmethod
+    def load(cls, file):
+        with open(file, "rb") as f_in:
+            inp_pcl = pickle.load(f_in)
+        return cls(
+            a_factor=inp_pcl["a_factor"],
+            b_factor=inp_pcl["b_factor"],
+            model_max_pressure=inp_pcl["model_max_pressure"],
+            description=inp_pcl["description"],
+        )
+
+    def _valid(self):
+        if self.a_factor is None:
+            raise ValueError('object field "a_factor" is not set')
+        if self.b_factor is None:
+            raise ValueError('object field "b_factor" is not set')
+        if self.max_pressure is None:
+            raise ValueError('object field "max_pressure" is not set')
+        return True
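All three predict variants scale the input velocity by a ratio of the pressure factor 1 - a * exp(-P / b) evaluated at two pressures: predict returns the velocity change from in-situ to depleted pressure, predict_abs the depleted velocity itself, and predict_max the velocity at the model's maximum pressure. A worked sketch with invented coefficients (real values come from calibration, and pressures are in whatever units the model was calibrated with):

import numpy as np

from rock_physics_open.equinor_utilities.machine_learning_utilities import (
    CarbonateExponentialPressure,
)

model = CarbonateExponentialPressure(
    a_factor=0.3, b_factor=10.0, model_max_pressure=50.0
)

# Columns: velocity, in situ effective pressure, depleted effective pressure
inp = np.array([[3000.0, 20.0, 30.0]])

dv = model.predict(inp)         # differential velocity: v_depleted - v_in_situ
v_abs = model.predict_abs(inp)  # absolute depleted velocity
assert np.allclose(v_abs - inp[:, 0], dv)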
rock_physics_open/equinor_utilities/machine_learning_utilities/import_ml_models.py
@@ -0,0 +1,61 @@
+from .exponential_model import CarbonateExponentialPressure
+from .sigmoidal_model import CarbonateSigmoidalPressure
+
+
+def import_model(model_file_name):
+    """
+    Utility to import a pickled dict containing information needed to run a classification or regression based on
+    a calibrated model.
+
+    Parameters
+    ----------
+    model_file_name : str
+        Full name including path for model file.
+
+    Returns
+    -------
+    models, scaler, ohe, label_var, label_units, feature_var, cat_var : Any
+        models: various regression or classification models from e.g. sklearn or tensorflow keras, scaler:
+        preprocessing RobustScaler, label_var: name(s) of label variable(s), label_units: unit(s) of label
+        variable(s), cat_var: categorical variables that should be encoded with a one-hot-encoder.
+    """
+
+    from pickle import load
+
+    with open(model_file_name, "rb") as fin:
+        # 11.04.2021 HFLE: There is an issue that is not connected to the local function, in that a warning is issued
+        # when the model is loaded, claiming that it is of an older version. This has been debugged in detail, and the
+        # model IS of the correct version, so the error arises elsewhere. To avoid confusion, the warning is suppressed
+        import warnings
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=UserWarning)
+            mod_dict = load(fin)
+
+    if mod_dict["model_type"] == "Sigmoid":
+        models = CarbonateSigmoidalPressure.load(mod_dict["nn_mod"])
+    elif mod_dict["model_type"] == "Exponential":
+        models = CarbonateExponentialPressure.load(mod_dict["nn_mod"])
+    else:
+        raise ValueError("unknown model type {}".format(mod_dict["model_type"]))
+
+    ohe = None
+    cat_var = []
+    try:
+        if mod_dict["ohe"]:
+            with open(mod_dict["ohe"], "rb") as f:
+                ohe_dict = load(f)
+                ohe = ohe_dict["ohe"]
+                cat_var = ohe_dict["cat_var"]
+    except (FileExistsError, FileNotFoundError):
+        pass
+
+    return (
+        models,
+        mod_dict["scaler"],
+        ohe,
+        mod_dict["label_var"],
+        mod_dict["label_units"],
+        mod_dict["feature_var"],
+        cat_var,
+    )
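The file format is not documented explicitly, but the keys import_model reads imply a layout. A hypothetical sketch of writing a model file this function could load (all file names, labels, and values are invented for illustration):

import pickle

from rock_physics_open.equinor_utilities.machine_learning_utilities import (
    CarbonateExponentialPressure,
)

# The pressure model itself is pickled separately and referenced by path
exp_model = CarbonateExponentialPressure(
    a_factor=0.3, b_factor=10.0, model_max_pressure=50.0
)
exp_model.save("pressure_model.pcl")

mod_dict = {
    "model_type": "Exponential",     # or "Sigmoid"
    "nn_mod": "pressure_model.pcl",  # passed to CarbonateExponentialPressure.load
    "scaler": None,                  # e.g. a fitted sklearn RobustScaler
    "label_var": "VP",
    "label_units": "m/s",
    "feature_var": ["VP", "P_IN_SITU", "P_DEPLETED"],
    "ohe": None,                     # optional path to a pickled one-hot-encoder dict
}
with open("vp_model.pcl", "wb") as f:
    pickle.dump(mod_dict, f)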
rock_physics_open/equinor_utilities/machine_learning_utilities/run_regression.py
@@ -0,0 +1,151 @@
+import os
+from re import match
+
+import numpy as np
+import pandas as pd
+
+from .dummy_vars import generate_dummy_vars
+from .import_ml_models import import_model
+
+
+def _read_models(*model_files, model_dir=None):
+    # Find the directory of the model files, change working directory, return to original directory at end of function
+    orig_dir = os.getcwd()
+    if model_dir is None:
+        model_dir, _ = os.path.split(model_files[0])
+    os.chdir(model_dir)
+    # Allocate lists and read model
+    reg_model, scaler, ohe, label_var, label_unit, feat_var, cat_var = (
+        [] for _ in range(7)
+    )
+    answer_lists = [reg_model, scaler, ohe, label_var, label_unit, feat_var, cat_var]
+    for mod_name in model_files:
+        answer = import_model(mod_name)
+        for ans, lst in zip(answer, answer_lists):
+            lst.append(ans)
+
+    # Need to modify names
+    col_names, col_units = ([] for _ in range(2))
+    for i in range(len(label_var)):
+        col_names.append(label_var[i] + "_" + model_files[i].replace(label_var[i], ""))
+        col_units.append(label_unit[i])
+
+    os.chdir(orig_dir)
+
+    return (
+        reg_model,
+        scaler,
+        ohe,
+        label_var,
+        label_unit,
+        feat_var,
+        cat_var,
+        col_names,
+        col_units,
+    )
+
+
+def _perform_regression(
+    inp_frame, col_names, feat_var, cat_var, ohe, scaler, reg_model
+):
+    depth = inp_frame.index.to_numpy()
+
+    res_frame = pd.DataFrame(index=depth, columns=col_names)
+
+    for j, model_name in enumerate(col_names):
+        tmp_frame = inp_frame.copy()
+
+        # Limit to columns used in estimation before dropping NaNs
+        num_var = [i for i in feat_var[j] if not bool(match(r"x\d", i))]
+        no_num_var = len(num_var)
+        if cat_var[j]:
+            num_var.append(cat_var[j])
+        tmp_frame = tmp_frame[num_var]
+        idx_na_n = tmp_frame.isna().any(axis=1)
+
+        if cat_var[j]:
+            dum_features, _, dum_var_names = generate_dummy_vars(
+                tmp_frame.loc[~idx_na_n], cat_var[j], ohe=ohe[j]
+            )
+            # Add dummy features to data frame
+            kept_dum_var = []
+            for i, name in enumerate(dum_var_names):
+                if name in feat_var[j]:
+                    tmp_frame.loc[~idx_na_n, name] = dum_features[:, i]
+                    kept_dum_var.append(name)
+            tmp_frame.drop(columns=[cat_var[j]], inplace=True)
+
+            # Need to assure that we have the correct sequence of features
+            tmp_frame = tmp_frame.reindex(columns=feat_var[j])
+
+            new_features = np.zeros((np.sum(~idx_na_n), tmp_frame.shape[1]))
+            # Make scaling optional
+            if scaler[j] is not None:
+                new_features[:, :no_num_var] = scaler[j].transform(
+                    tmp_frame.to_numpy()[~idx_na_n, :no_num_var]
+                )
+            else:
+                new_features[:, :no_num_var] = tmp_frame.to_numpy()[
+                    ~idx_na_n, :no_num_var
+                ]
+            new_features[:, no_num_var:] = tmp_frame.loc[
+                ~idx_na_n, kept_dum_var
+            ].to_numpy()
+        else:
+            # Much simpler if there are no dummy variables
+            # Need to assure that we have the correct sequence of features
+            tmp_frame = tmp_frame.reindex(columns=feat_var[j])
+            # Make scaling optional
+            if scaler[j] is not None:
+                new_features = scaler[j].transform(tmp_frame.to_numpy()[~idx_na_n, :])
+            else:
+                new_features = tmp_frame.to_numpy()[~idx_na_n, :]
+
+        new_var = np.ones(depth.shape[0]) * np.nan
+        new_var[~idx_na_n] = reg_model[j].predict(new_features).flatten()
+        res_frame[col_names[j]] = new_var
+
+    return res_frame
+
+
+def run_regression(inp_df, vp_model_file_name, vs_model_file_name, model_dir=None):
+    """
+    Estimate Vp and Vs by neural network regression with multiple inputs.
+
+    Parameters
+    ----------
+    inp_df : pd.DataFrame
+        Input logs required for the regression.
+    vp_model_file_name : str
+        Full file name for vp model.
+    vs_model_file_name : str
+        Full file name for vs model.
+    model_dir : str
+        Directory containing the model files. Defaults to the directory of vp_model_file_name.
+
+    Returns
+    -------
+    vp, vs : pd.DataFrame
+        Estimated vp and vs as series in a single Pandas DataFrame.
+    """
+
+    (
+        regression_model,
+        scaler_obj,
+        ohe_obj,
+        label_var,
+        label_var_unit,
+        feature_var,
+        category_var,
+        column_names,
+        column_units,
+    ) = _read_models(vp_model_file_name, vs_model_file_name, model_dir=model_dir)
+    return _perform_regression(
+        inp_df,
+        column_names,
+        feature_var,
+        category_var,
+        ohe_obj,
+        scaler_obj,
+        regression_model,
+    )
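Putting the pieces together, a hypothetical end-to-end call (file and column names invented; the input columns must match the feature_var lists stored in the model files):

import pandas as pd

from rock_physics_open.equinor_utilities.machine_learning_utilities import (
    run_regression,
)

# Input logs indexed by depth
logs = pd.DataFrame(
    {"PHIT": [0.12, 0.15], "VSH": [0.20, 0.30], "RHOB": [2.40, 2.35]},
    index=[1500.0, 1500.5],
)

res = run_regression(
    logs, "VP_model.pcl", "VS_model.pcl", model_dir="/path/to/models"
)
# res holds one estimated column per model file, named from its label_var
# and file name, with NaN wherever any required input feature was NaN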