PyPI - rm-utils - Versions diffs - 1.0.0__tar.gz - Mend

rm-utils 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

rm_utils-1.0.0/LICENSE +21 -0
rm_utils-1.0.0/PKG-INFO +74 -0
rm_utils-1.0.0/docs/PYPI.md +41 -0
rm_utils-1.0.0/pyproject.toml +46 -0
rm_utils-1.0.0/rm_utils/assets/logo.png +0 -0
rm_utils-1.0.0/rm_utils/metrics/__init__.py +1 -0
rm_utils-1.0.0/rm_utils/metrics/calculator.py +184 -0
rm_utils-1.0.0/rm_utils/metrics/metric_funcs.py +75 -0
rm_utils-1.0.0/rm_utils/psi/__init__.py +1 -0
rm_utils-1.0.0/rm_utils/psi/stability.py +1140 -0
rm_utils-1.0.0/rm_utils/reports/__init__.py +1 -0
rm_utils-1.0.0/rm_utils/reports/reporter.py +296 -0
rm_utils-1.0.0/rm_utils/sql/db_connectors.py +308 -0

rm_utils-1.0.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Nikita Emelyanov
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

rm_utils-1.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,74 @@
+Metadata-Version: 2.4
+Name: rm-utils
+Version: 1.0.0
+Summary:
+License: MIT
+License-File: LICENSE
+Author: n-emelianov
+Author-email: limbolume2023@gmail.com
+Requires-Python: >=3.12, <3.15
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Requires-Dist: clickhouse-connect (>=0.10.0,<0.11.0)
+Requires-Dist: matplotlib (>=3.10.8,<4.0.0)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
+Requires-Dist: pandas (>=3.0.0,<4.0.0)
+Requires-Dist: paramiko (<2.12.0)
+Requires-Dist: pillow (>=12.1.0,<13.0.0)
+Requires-Dist: psycopg2-binary (>=2.9.11,<3.0.0)
+Requires-Dist: scikit-learn (>=1.8.0,<2.0.0)
+Requires-Dist: sqlalchemy (>=2.0.46,<3.0.0)
+Requires-Dist: sqlparse (>=0.5.5,<0.6.0)
+Requires-Dist: sshtunnel (>=0.4.0,<0.5.0)
+Project-URL: Changelog, https://github.com/n-emelyanov/RM_UTILS/blob/master/docs/CHANGELOG.md
+Project-URL: Documentation, https://github.com/n-emelyanov/RM_UTILS#readme
+Project-URL: Homepage, https://github.com/n-emelyanov/RM_UTILS
+Project-URL: Issues, https://github.com/n-emelyanov/RM_UTILS/issues
+Project-URL: Repository, https://github.com/n-emelyanov/RM_UTILS.git
+Description-Content-Type: text/markdown
+# Библиотека утилит для риск-менеджмента ML-моделей
+![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)
+Библиотека разработана для помощи в реализации и мониторинге рисковых ML-моделей в финансовой и других риск-ориентированных областях. Включает инструменты для расчета метрик, анализа стабильности, генерации отчетов и работы с данными.
+## Ключевые возможности
+- Расчет PSI (Population Stability Index) и других метрик стабильности
+- Специализированные метрики для оценки качества ML-моделей в риск-менеджменте
+- Генерация автоматизированных отчетов
+- Утилиты для работы с SQL-запросами
+- Инструменты для обработки и анализа данных
+## Установка
+```bash
+pip install rm-utils
+```
+## Использование
+```python
+from rm_utils.reports import ExcelReporter
+# Пример создания отчета в Excel
+path = r"/path_to_excel/report.xlsx"
+writer = ExcelReporter(path)
+# Добавление датафреймов
+writer.add_dataframe(data=your_dataframe, row_offset=4, col_offset=2)
+# Сохранение отчета
+writer.save()
+```
+Подробные примеры использования смотрите в [examples/usage_examples.ipynb](./examples/usage_examples.ipynb).
+## История изменений
+Все изменения подробно описаны в [CHANGELOG.md](https://github.com/n-emelyanov/RM_UTILS/blob/master/docs/CHANGELOG.md).

rm_utils-1.0.0/docs/PYPI.md ADDED Viewed

@@ -0,0 +1,41 @@
+# Библиотека утилит для риск-менеджмента ML-моделей
+![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)
+Библиотека разработана для помощи в реализации и мониторинге рисковых ML-моделей в финансовой и других риск-ориентированных областях. Включает инструменты для расчета метрик, анализа стабильности, генерации отчетов и работы с данными.
+## Ключевые возможности
+- Расчет PSI (Population Stability Index) и других метрик стабильности
+- Специализированные метрики для оценки качества ML-моделей в риск-менеджменте
+- Генерация автоматизированных отчетов
+- Утилиты для работы с SQL-запросами
+- Инструменты для обработки и анализа данных
+## Установка
+```bash
+pip install rm-utils
+```
+## Использование
+```python
+from rm_utils.reports import ExcelReporter
+# Пример создания отчета в Excel
+path = r"/path_to_excel/report.xlsx"
+writer = ExcelReporter(path)
+# Добавление датафреймов
+writer.add_dataframe(data=your_dataframe, row_offset=4, col_offset=2)
+# Сохранение отчета
+writer.save()
+```
+Подробные примеры использования смотрите в [examples/usage_examples.ipynb](./examples/usage_examples.ipynb).
+## История изменений
+Все изменения подробно описаны в [CHANGELOG.md](https://github.com/n-emelyanov/RM_UTILS/blob/master/docs/CHANGELOG.md).

rm_utils-1.0.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,46 @@
+[project]
+name = "rm-utils"
+version = "1.0.0"
+description = ""
+authors = [
+    {name = "n-emelianov",email = "limbolume2023@gmail.com"}
+]
+license = {text = "MIT"}
+readme = "docs/PYPI.md"
+requires-python = ">=3.12, <3.15"
+dependencies = [
+    "pandas (>=3.0.0,<4.0.0)",
+    "openpyxl (>=3.1.5,<4.0.0)",
+    "pillow (>=12.1.0,<13.0.0)",
+    "scikit-learn (>=1.8.0,<2.0.0)",
+    "matplotlib (>=3.10.8,<4.0.0)",
+    "sqlparse (>=0.5.5,<0.6.0)",
+    "sqlalchemy (>=2.0.46,<3.0.0)",
+    "sshtunnel (>=0.4.0,<0.5.0)",
+    "clickhouse-connect (>=0.10.0,<0.11.0)",
+    "psycopg2-binary (>=2.9.11,<3.0.0)",
+    "paramiko (<2.12.0)"
+]
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
+[dependency-groups]
+dev = [
+    "pre-commit (>=4.5.1,<5.0.0)",
+    "pytest (>=9.0.2,<10.0.0)",
+    "ipykernel (>=7.1.0,<8.0.0)",
+    "seaborn (>=0.13.2,<0.14.0)"
+]
+[project.urls]
+Homepage = "https://github.com/n-emelyanov/RM_UTILS"           # Главная страница
+Documentation = "https://github.com/n-emelyanov/RM_UTILS#readme"  # Документация
+Repository = "https://github.com/n-emelyanov/RM_UTILS.git"    # Исходный код
+Changelog = "https://github.com/n-emelyanov/RM_UTILS/blob/master/docs/CHANGELOG.md"  # История изменений
+Issues = "https://github.com/n-emelyanov/RM_UTILS/issues"     # Баг-трекер
+[tool.setuptools]
+packages = ["rm_utils"]

rm_utils-1.0.0/rm_utils/assets/logo.png ADDED Viewed

Binary file

rm_utils-1.0.0/rm_utils/metrics/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .calculator import MetricCalculator

rm_utils-1.0.0/rm_utils/metrics/calculator.py ADDED Viewed

@@ -0,0 +1,184 @@
+import itertools
+import pandas as pd
+import numpy as np
+from typing import List
+from IPython.display import display
def all_combinations(my_list):
    """Return an iterator over every non-empty combination of ``my_list``.

    Combinations are yielded as tuples, ordered by size (singletons first,
    the full set last) and, within one size, in the order produced by
    ``itertools.combinations``.

    Parameters
    ----------
    my_list : iterable
        Items to combine. Any iterable is accepted, including one-shot
        iterators such as generators.

    Returns
    -------
    iterator of tuple
        Empty when ``my_list`` is empty.
    """
    # Materialise once: ``len`` is needed below and a generator can only be
    # consumed a single time, while it has to feed every combination size.
    items = tuple(my_list)
    return itertools.chain.from_iterable(
        itertools.combinations(items, size) for size in range(1, len(items) + 1)
    )
+class MetricCalculator:
+    """Source для расчета метрик по группам.
+    Parameters:
+    ---
+    metr_funcs : dict
+        Словарь функций, используемых для расчета метрик по группам.
+        Функции должны следовать конвенции sklearn - первые 2 аргумента = y_true, y_pred
+    stats_funcs : dict, default={}
+        Словарь функций для расчета метрик по группам.
+        В отличие от metr_funcs используется для расчета статистик по таргету или по другим столбцам.
+        Функции принимают два аргумента - (y_true, data=None)
+    """
+    def __init__(
+        self,
+        metr_funcs: dict,
+        funcs_params: dict = {},
+        stats_funcs: dict = {}
+    ):
+        self.metr_funcs = metr_funcs
+        self.funcs_params = funcs_params
+        self.stats_funcs = stats_funcs
+    def _partial_stack(
+        self, result: pd.DataFrame, pred_cols: List[str], group_cols: List[str]
+    ) -> pd.DataFrame:
+        """Преобразует DataFrame с MultiIndex колонками в частично 'stacked' формат."""
+        columns_to_stack = pd.MultiIndex.from_product(
+            [pred_cols, self.metr_funcs.keys()]
+        )
+        # Стакаем и меняем столбцы
+        stacked = (
+            result[columns_to_stack]
+            .stack(level=0)
+            .reindex(self.metr_funcs.keys(), axis=1)
+            .reset_index()
+            .rename(columns={f"level_{len(group_cols)}": "pred"})
+            .set_index(group_cols)
+        )
+        kept = result.drop(columns=columns_to_stack)
+        kept.columns = kept.columns.get_level_values(1)
+        if len(kept.columns) == 0:
+            return stacked.reset_index()
+        # return pd.concat([stacked, kept], axis=1).reset_index()
+        return stacked.join(kept).reset_index()
+    def _set_metr_funcs(self, data: pd.DataFrame, pred_cols: List[str]) -> dict:
+        # Используем pd.Series таргета для ускорения
+        true_values = data[self.true_col]
+        agg_funcs = {}
+        # Оборачиваем metr_funcs, подставляем параметры если имеются, добавляем в agg_funcs
+        for pred_col in pred_cols:
+            agg_funcs[pred_col] = [
+                (
+                    func_name,
+                    lambda x, f=func, params=self.funcs_params.get(func_name, {}): f(
+                        true_values[x.index], x, **params
+                    ),
+                )
+                for func_name, func in self.metr_funcs.items()
+            ]
+        # Оборачиваем stats_funcs и добавляем в agg_funcs
+        if self.stats_funcs:
+            agg_funcs[self.true_col] = [
+                (
+                    func_name,
+                    lambda x, func=func: func(true_values[x.index], data.loc[x.index])
+                )
+                    for func_name, func in self.stats_funcs.items()
+            ]
+        return agg_funcs
+    def calculate(
+        self,
+        data: pd.DataFrame,
+        true_col: str,
+        pred_cols: List[str] | str,
+        group_cols: List[str] | str,
+        groupby_exclude_combinations: list[str] | None = None,
+        pretify_one_func: bool = False,
+    ) -> pd.DataFrame:
+        """Расчет метрик по группам.
+        Parameters
+        ---
+        data : pd.DataFrame
+            Датафрейм с данными.
+        true_col : str
+            Имя столбца с истинными значениями (передается первым аргументом в metr_funcs).
+        pred_cols : List[str] | str
+            Список имен столбцов с предсказаниями или одно имя столбца.
+        group_cols : List[str]
+            Список имен столбцов для группировки.
+        groupby_exclude_combinations : list[str] or None, default=None
+            Список столбцов для дополнитльной агрегации с all.
+        pretify_one_func : bool, default=False
+            Если True, то возвращает DataFrame с одной функцией в колонке.
+        Returns
+        -------
+        pd.DataFrame
+            Датафрейм с рассчитанными метриками.
+        Examples
+        -------
+        >>> metr_calc = MetricCalculator(
+        ...     metr_funcs={'mae': mean_absolute_error},
+        ...     stats_funcs={'n_obs': lambda y_true, data: len(data)
+        ... )
+        >>> res = metr_calc.calculate(
+        ...     data=data,
+        ...     true_col='target',
+        ...     pred_cols=['pred1', 'pred2'],
+        ...     group_cols=['group1', 'group2']
+        ... )
+        """
+        self.true_col = true_col
+        # Конвертируем pred_cols/group_cols в список, если это строка
+        pred_cols = [pred_cols] if isinstance(pred_cols, str) else pred_cols
+        group_cols = [group_cols] if isinstance(group_cols, str) else group_cols
+        agg_funcs = self._set_metr_funcs(data, pred_cols)
+        result = data.groupby(group_cols).agg(agg_funcs)
+        if groupby_exclude_combinations is not None:
+            for idxs in all_combinations(range(len(groupby_exclude_combinations))):
+                # Исключенные группы
+                not_groupby_cols = list(
+                    np.array(groupby_exclude_combinations.copy())[list(idxs)]
+                )
+                groupby_cols = [
+                    col for col in group_cols if col not in not_groupby_cols
+                ]
+                result_temp = data.groupby(groupby_cols).agg(agg_funcs)
+                for _removed in not_groupby_cols:
+                    result_temp[_removed] = 'all'
+                result_temp = result_temp.reset_index().set_index(group_cols)
+                result = pd.concat([result, result_temp], axis=0)
+        # Красивый вывод, если была задействована одна функция
+        if pretify_one_func and len(self.metr_funcs) == 1:
+            result.columns = pred_cols + [*(self.stats_funcs or {})]
+            return result.reset_index()
+        result = self._partial_stack(result, pred_cols, group_cols)
+        return result

rm_utils-1.0.0/rm_utils/metrics/metric_funcs.py ADDED Viewed

@@ -0,0 +1,75 @@
+from sklearn.metrics import confusion_matrix, mean_squared_error
+from sklearn.metrics import roc_auc_score
+from pandas.api.types import is_numeric_dtype
+import pandas as pd
+import numpy as np
def mean_absolute_percentage_error(y_true, y_pred, sample_weight=None):
    """Mean absolute percentage error (MAPE).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, compared with ``y_true`` position by position.
    sample_weight : array-like, optional
        Per-sample weights forwarded to ``np.average``.

    Returns
    -------
    float or ndarray
        Weighted average of ``|y_pred - y_true| / max(|y_true|, eps)`` taken
        along axis 0. The result is a fraction, not multiplied by 100.
    """
    # Coerce to arrays so plain lists work and so pandas objects are compared
    # by position (sklearn convention) rather than aligned on their index.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    # Machine epsilon as denominator floor avoids division by zero.
    epsilon = np.finfo(np.float64).eps
    mape = np.abs(pred_arr - true_arr) / np.maximum(np.abs(true_arr), epsilon)
    return np.average(mape, weights=sample_weight, axis=0)
def shortfall(y_true, y_pred):
    """Shortfall: one minus the ratio of total predicted to total actual."""
    predicted_total = np.sum(y_pred)
    actual_total = np.sum(y_true)
    return 1 - predicted_total / actual_total
def pearson_corr(y_true, y_pred):
    """Pearson correlation coefficient between ``y_true`` and ``y_pred``."""
    # Off-diagonal entry of the 2x2 correlation matrix.
    return np.corrcoef(y_true, y_pred)[0, 1]
def pearson_nan_corr(y_true, y_pred):
    """Pearson correlation coefficient that ignores invalid (NaN/inf) values."""
    masked_true = np.ma.masked_invalid(y_true)
    masked_pred = np.ma.masked_invalid(y_pred)
    # Off-diagonal entry of the masked 2x2 correlation matrix.
    return np.ma.corrcoef(masked_true, masked_pred)[0, 1]
def spearman_corr(y_true, y_pred):
    """Spearman rank correlation coefficient ignoring NaN values.

    Pairs in which either value is NaN are dropped, the remaining values are
    replaced by their ranks (ties get the average rank) and the Pearson
    correlation of the ranks is returned.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, paired with ``y_true`` position by position.

    Returns
    -------
    float
        Rank correlation in ``[-1, 1]``; ``nan`` when fewer than two valid
        pairs remain.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    valid = ~(np.isnan(true_arr) | np.isnan(pred_arr))
    # A correlation needs at least two observations.
    if valid.sum() < 2:
        return np.nan
    # Rank-transforming first is what distinguishes Spearman from Pearson.
    true_ranks = pd.Series(true_arr[valid]).rank(method="average").to_numpy()
    pred_ranks = pd.Series(pred_arr[valid]).rank(method="average").to_numpy()
    return np.corrcoef(true_ranks, pred_ranks)[0, 1]
def root_mse(y_true, y_pred):
    """Root mean squared error (RMSE).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float or ndarray
        Square root of ``sklearn.metrics.mean_squared_error(y_true, y_pred)``.
    """
    # Take the root explicitly; mean_squared_error alone returns the MSE.
    return np.sqrt(mean_squared_error(y_true, y_pred))
def gini_score(y_true, y_pred):
    """Gini coefficient derived from ROC AUC as ``2 * AUC - 1``."""
    return roc_auc_score(y_true, y_pred) * 2 - 1
def gini_score_safe(y_true, y_pred):
    """Gini coefficient that returns 0 unless exactly two classes are present.

    Intended for unseasoned data where a group may contain a single class.
    """
    n_classes = len(np.unique(y_true))
    if n_classes == 2:
        return roc_auc_score(y_true, y_pred) * 2 - 1
    return 0
def roc_auc_score_nan(y_true, y_pred):
    """ROC AUC for bad/unseasoned data.

    Only positions where both the prediction and the label are not NaN are
    used. Returns NaN instead of raising when fewer than three such positions
    remain or when the remaining labels do not contain exactly two classes.
    """
    keep = ~(np.isnan(y_pred) | np.isnan(y_true))
    labels = y_true[keep]
    scores = y_pred[keep]
    too_few = len(labels) < 3
    if too_few or len(np.unique(labels)) != 2:
        return np.nan
    return roc_auc_score(labels, scores)
def gini_score_nan(y_true, y_pred):
    """Gini coefficient for bad/unseasoned data.

    Computed as ``2 * AUC - 1`` from ``roc_auc_score_nan``, so it uses only
    positions where both the prediction and the label are not NaN and yields
    NaN instead of raising whenever that AUC is NaN.
    """
    auc = roc_auc_score_nan(y_true, y_pred)
    return auc * 2 - 1

rm_utils-1.0.0/rm_utils/psi/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .stability import StabilityIndexCalculator, psi_plot