PyPI - pycreditools - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pycreditools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

pycreditools/__init__.py +43 -0
pycreditools/_kernels/__init__.py +5 -0
pycreditools/_kernels/iv.py +167 -0
pycreditools/_kernels/tier_metrics.py +103 -0
pycreditools/_kernels/ward.py +155 -0
pycreditools/_parallel.py +32 -0
pycreditools/_types.py +28 -0
pycreditools/analysis.py +96 -0
pycreditools/grouping.py +222 -0
pycreditools/performance.py +141 -0
pycreditools/policy.py +133 -0
pycreditools/py.typed +1 -0
pycreditools/sample_data.py +98 -0
pycreditools/screening.py +224 -0
pycreditools/simulation.py +185 -0
pycreditools/stages.py +175 -0
pycreditools/stress.py +119 -0
pycreditools-0.1.0.dist-info/METADATA +155 -0
pycreditools-0.1.0.dist-info/RECORD +21 -0
pycreditools-0.1.0.dist-info/WHEEL +4 -0
pycreditools-0.1.0.dist-info/licenses/LICENSE +21 -0

pycreditools/__init__.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""
+pycreditools: A Python library for credit risk policy simulation and analysis.
+"""
+from ._types import SimulationMethod, ClusteringMethod, Quadrant, StageDirection, PolicySummary
+from .stages import Stage, CutoffStage, FilterStage, RateStage
+from .stress import StressScenario, AggravationStress, MonotonicStress, CustomStress
+from .policy import CreditPolicy
+from .simulation import CreditSimResults, run_simulation
+from .performance import summarize_results, compare_policies
+from .analysis import run_tradeoff_analysis
+from .grouping import find_risk_groups, RiskGroupResult, GroupingRecipe
+from .screening import screen_risk_segments, ScreeningResult, ScreeningRecipe
+from .sample_data import generate_sample_data
+# Public API of the package: every name imported above is re-exported here,
+# grouped by the submodule it comes from.
+__all__ = [
+    # ._types
+    "SimulationMethod",
+    "ClusteringMethod",
+    "Quadrant",
+    "StageDirection",
+    "PolicySummary",
+    # .stages
+    "Stage",
+    "CutoffStage",
+    "FilterStage",
+    "RateStage",
+    # .stress
+    "StressScenario",
+    "AggravationStress",
+    "MonotonicStress",
+    "CustomStress",
+    # .policy
+    "CreditPolicy",
+    # .simulation
+    "CreditSimResults",
+    "run_simulation",
+    # .performance
+    "summarize_results",
+    "compare_policies",
+    # .analysis
+    "run_tradeoff_analysis",
+    # .grouping
+    "find_risk_groups",
+    "RiskGroupResult",
+    "GroupingRecipe",
+    # .screening
+    "screen_risk_segments",
+    "ScreeningResult",
+    "ScreeningRecipe",
+    # .sample_data
+    "generate_sample_data",
+]

pycreditools/_kernels/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .ward import ward_cluster
+from .iv import iv_cluster
+from .tier_metrics import calculate_tier_metrics
+# Kernel entry points re-exported from the ward, iv and tier_metrics modules.
+__all__ = ["ward_cluster", "iv_cluster", "calculate_tier_metrics"]

pycreditools/_kernels/iv.py ADDED Viewed

@@ -0,0 +1,167 @@
+from __future__ import annotations
+import numpy as np
+def iv_cluster(
+    pd_values: np.ndarray,
+    volumes: np.ndarray,
+    max_groups: int,
+    min_vol_ratio: float,
+    lambda_cross: float = 0.5,
+    lambda_vol: float = 0.2,
+    monthly_vols: np.ndarray | None = None,
+    monthly_bads: np.ndarray | None = None,
+) -> np.ndarray:
+    """
+    IV-based agglomerative clustering of adjacent bins with constraints.
+    Adjacent bins are merged greedily, cheapest pair first. The cost of a merge
+    is the Information Value lost by it plus the optional vintage penalties.
+    Pairs that violate a constraint receive a large negative cost so that they
+    are merged before anything else: a zero-volume bin (-1e9), a pair whose PD
+    does not strictly increase (-1e6), or a bin below ``min_vol_ratio``
+    (regular cost minus 1000). Merging stops as soon as no forced merge is
+    pending and at most ``max_groups`` clusters remain.
+    Args:
+        pd_values: float64[n_bins] - mean PD per bin
+        volumes: int64[n_bins] - volume per bin
+        max_groups: upper bound on the number of output clusters; fewer are
+            returned when forced merges are needed or n_bins is smaller
+        min_vol_ratio: min fraction of total volume per cluster
+        lambda_cross: penalty weight for vintage crossings
+        lambda_vol: penalty weight for PD volatility
+        monthly_vols: int64[n_bins, n_months]
+        monthly_bads: int64[n_bins, n_months]
+    Returns:
+        int64[n_bins] - 1-based group assignments
+    """
+    n_bins = len(pd_values)
+    if n_bins == 0:
+        return np.array([], dtype=np.int64)
+    active = np.ones(n_bins, dtype=bool)
+    # np.array copies, so the caller's arrays are never modified by the merges.
+    current_vol = np.array(volumes, dtype=np.float64)
+    current_bads = np.asarray(pd_values, dtype=np.float64) * current_vol
+    total_vol = current_vol.sum()
+    total_bads = current_bads.sum()
+    total_goods = total_vol - total_bads
+    # Vintage penalties are only applied when both monthly matrices are given.
+    if monthly_vols is not None and monthly_bads is not None:
+        curr_m_vols = np.array(monthly_vols, dtype=np.float64)
+        curr_m_bads = np.array(monthly_bads, dtype=np.float64)
+    else:
+        curr_m_vols = None
+        curr_m_bads = None
+    # group_ids[i] is the index of the surviving cluster that bin i belongs to.
+    group_ids = np.arange(n_bins)
+    n_active = n_bins
+    def calc_iv(bads, vols):
+        # IV contribution of one cluster; 0.0 whenever a share is not positive.
+        if total_goods <= 0 or total_bads <= 0:
+            return 0.0
+        goods = vols - bads
+        p_b = bads / total_bads
+        p_g = goods / total_goods
+        if p_b <= 0 or p_g <= 0:
+            return 0.0
+        return (p_g - p_b) * np.log(p_g / p_b)
+    while n_active > 1:
+        active_indices = np.where(active)[0]
+        n_curr = len(active_indices)
+        min_cost = np.inf
+        best_merge_idx = -1
+        for i in range(n_curr - 1):
+            idx1 = active_indices[i]
+            idx2 = active_indices[i + 1]
+            v1 = current_vol[idx1]
+            v2 = current_vol[idx2]
+            b1 = current_bads[idx1]
+            b2 = current_bads[idx2]
+            p1 = b1 / v1 if v1 > 0 else 0.0
+            p2 = b2 / v2 if v2 > 0 else 0.0
+            # Monotonicity violation: the left PD is not strictly below the right PD.
+            violation = (p1 >= p2) and (v1 > 0) and (v2 > 0)
+            if v1 == 0 or v2 == 0:
+                # Empty bins are always merged first.
+                cost = -1e9
+            elif violation:
+                # Fixing monotonicity takes priority over normal merges.
+                cost = -1e6
+            else:
+                # IV lost by replacing the two clusters with their union.
+                iv1 = calc_iv(b1, v1)
+                iv2 = calc_iv(b2, v2)
+                iv_merged = calc_iv(b1 + b2, v1 + v2)
+                iv_loss = iv1 + iv2 - iv_merged
+                cross_penalty = 0.0
+                volatility_penalty = 0.0
+                if curr_m_vols is not None and curr_m_bads is not None:
+                    # Volatility: std of the merged cluster's monthly PD.
+                    mv = curr_m_vols[idx1] + curr_m_vols[idx2]
+                    mb = curr_m_bads[idx1] + curr_m_bads[idx2]
+                    valid = mv > 0
+                    if valid.any():
+                        mp = mb[valid] / mv[valid]
+                        volatility_penalty = np.std(mp)
+                    # Crossings: months in which the left cluster's PD is not
+                    # below the right cluster's PD (both must have volume).
+                    mv1 = curr_m_vols[idx1]
+                    mv2 = curr_m_vols[idx2]
+                    mb1 = curr_m_bads[idx1]
+                    mb2 = curr_m_bads[idx2]
+                    v_valid = (mv1 > 0) & (mv2 > 0)
+                    if v_valid.any():
+                        mp1 = mb1[v_valid] / mv1[v_valid]
+                        mp2 = mb2[v_valid] / mv2[v_valid]
+                        crossings = np.sum(mp1 >= mp2)
+                        cross_penalty = crossings
+                cost = iv_loss + lambda_cross * cross_penalty + lambda_vol * volatility_penalty
+                # Force the merge when either side is below the volume floor;
+                # the offset keeps it below the monotonicity priority.
+                if (v1 / total_vol < min_vol_ratio) or (v2 / total_vol < min_vol_ratio):
+                    cost -= 1000.0
+            if cost < min_cost:
+                min_cost = cost
+                best_merge_idx = i
+        # Forced merges have cost < -100. Stop once none is pending and the
+        # cluster count is within max_groups.
+        if min_cost >= -100 and n_active <= max_groups:
+            break
+        if best_merge_idx < 0:
+            # No cost compared below +inf (all NaN, e.g. NaN PDs). Fall back to
+            # the first adjacent pair; index -1 would otherwise wrap around and
+            # merge the last bin with the first one.
+            best_merge_idx = 0
+        # Execute merge: fold the right cluster into the left one.
+        idx1 = active_indices[best_merge_idx]
+        idx2 = active_indices[best_merge_idx + 1]
+        current_vol[idx1] += current_vol[idx2]
+        current_bads[idx1] += current_bads[idx2]
+        if curr_m_vols is not None and curr_m_bads is not None:
+            curr_m_vols[idx1] += curr_m_vols[idx2]
+            curr_m_bads[idx1] += curr_m_bads[idx2]
+        active[idx2] = False
+        group_ids[group_ids == idx2] = idx1
+        n_active -= 1
+    # Remap surviving cluster indices to 1-based sequential integers.
+    active_indices = np.where(active)[0]
+    final_mapping = {old_idx: new_idx for new_idx, old_idx in enumerate(active_indices, 1)}
+    result = np.array([final_mapping[g] for g in group_ids], dtype=np.int64)
+    return result

pycreditools/_kernels/tier_metrics.py ADDED Viewed

@@ -0,0 +1,103 @@
+import numpy as np
+import pandas as pd
+def calculate_tier_metrics(
+    values: np.ndarray,
+    groups: np.ndarray,
+    defaults: np.ndarray,
+    n_bins: int,
+) -> pd.DataFrame:
+    """
+    Compute per-risk-group IV and PD spread of a candidate variable.
+    Rows with a NaN in any of the three inputs are dropped. Inside each group
+    the remaining rows are ordered by ``values`` and split by rank into
+    ``n_bins`` bins; the IV is summed over the non-empty bins with Laplace
+    smoothing, and the PD range is taken over the same bins.
+    Args:
+        values: float64[n_obs] - candidate variable values
+        groups: int64[n_obs] - risk group assignments
+        defaults: int64[n_obs] - default flags (0/1)
+        n_bins: number of rank-based bins per group
+    Returns:
+        DataFrame with columns: risk_group, iv, pd_min, pd_max, pd_spread, tier_vol
+    """
+    column_names = ["risk_group", "iv", "pd_min", "pd_max", "pd_spread", "tier_vol"]
+    # Labels come from the unfiltered input, so a group whose rows are all
+    # dropped below still produces a zero-volume output row.
+    group_labels = np.unique(groups)
+    keep = ~np.isnan(values) & ~np.isnan(defaults) & ~np.isnan(groups)
+    values = values[keep]
+    groups = groups[keep].astype(np.int64)
+    defaults = defaults[keep].astype(np.int64)
+    rows = []
+    for label in group_labels:
+        if np.isnan(label):
+            continue
+        member = groups == label
+        group_values = values[member]
+        group_defaults = defaults[member]
+        tier_vol = len(group_values)
+        if tier_vol == 0:
+            rows.append(
+                dict(zip(column_names, (label, 0.0, np.nan, np.nan, 0.0, 0)))
+            )
+            continue
+        total_bads = group_defaults.sum()
+        total_goods = tier_vol - total_bads
+        ranked_defaults = group_defaults[np.argsort(group_values)]
+        # Rank position -> bin number; the numbers are non-decreasing, so each
+        # bin is one contiguous slice of the ranked rows.
+        bin_of_rank = (np.arange(tier_vol) * n_bins) // tier_vol
+        edges = np.searchsorted(bin_of_rank, np.arange(n_bins + 1))
+        iv_sum = 0.0
+        bin_pds = []
+        for lo, hi in zip(edges[:-1], edges[1:]):
+            b_vol = hi - lo
+            if b_vol == 0:
+                continue
+            b_bads = ranked_defaults[lo:hi].sum()
+            b_goods = b_vol - b_bads
+            bin_pds.append(b_bads / b_vol)
+            # Laplace smoothing keeps both shares strictly positive.
+            p_b = (b_bads + 0.5) / (total_bads + 1.0)
+            p_g = (b_goods + 0.5) / (total_goods + 1.0)
+            iv_sum += (p_g - p_b) * np.log(p_g / p_b)
+        if bin_pds:
+            pd_min, pd_max = min(bin_pds), max(bin_pds)
+            pd_spread = pd_max - pd_min
+        else:
+            pd_min, pd_max, pd_spread = np.nan, np.nan, 0.0
+        rows.append(
+            dict(zip(column_names, (label, iv_sum, pd_min, pd_max, pd_spread, tier_vol)))
+        )
+    if rows:
+        return pd.DataFrame(rows)
+    return pd.DataFrame(columns=column_names)

pycreditools/_kernels/ward.py ADDED Viewed

@@ -0,0 +1,155 @@
+from __future__ import annotations
+import numpy as np
+def ward_cluster(
+    pd_values: np.ndarray,
+    volumes: np.ndarray,
+    max_groups: int,
+    min_vol_ratio: float,
+    max_crossings: int,
+    use_volume_weights: bool = True,
+    monthly_vols: np.ndarray | None = None,
+    monthly_bads: np.ndarray | None = None,
+) -> np.ndarray:
+    """
+    Ward agglomerative clustering of adjacent bins with credit-risk constraints.
+    Adjacent clusters are merged greedily. Each candidate pair gets a
+    constraint rank and a linkage distance; the pair with the lowest rank is
+    merged first and ties are broken by the smallest distance. Ranks, most
+    urgent first: zero volume, non-increasing PD, volume below
+    ``min_vol_ratio``, more than ``max_crossings`` vintage crossings, no
+    violation. Merging stops when the best pair violates nothing and at most
+    ``max_groups`` clusters remain.
+    Rank and distance are compared separately rather than folded into a single
+    number, so a large volume-weighted distance can never hide a violation.
+    Args:
+        pd_values: float64[n_bins] - mean PD per bin
+        volumes: int64[n_bins] - volume per bin
+        max_groups: max number of output clusters
+        min_vol_ratio: min fraction of total volume per cluster
+        max_crossings: max vintage inversions between adjacent groups
+        use_volume_weights: if False, performs pure distance-based linkage
+        monthly_vols: int64[n_bins, n_months]
+        monthly_bads: int64[n_bins, n_months]
+    Returns:
+        int64[n_bins] - 1-based group assignments
+    """
+    n_bins = len(pd_values)
+    if n_bins == 0:
+        return np.array([], dtype=np.int64)
+    # Constraint ranks, most urgent first.
+    zero_volume, non_monotonic, low_volume, excess_crossings, no_violation = range(5)
+    # State vectors. np.array copies, so the caller's arrays are never modified.
+    active = np.ones(n_bins, dtype=bool)
+    current_pd = np.array(pd_values, dtype=np.float64)
+    current_vol = np.array(volumes, dtype=np.float64)
+    total_vol = current_vol.sum()
+    # Crossings are only checked when both monthly matrices are given.
+    if monthly_vols is not None and monthly_bads is not None:
+        curr_m_vols = np.array(monthly_vols, dtype=np.float64)
+        curr_m_bads = np.array(monthly_bads, dtype=np.float64)
+    else:
+        curr_m_vols = None
+        curr_m_bads = None
+    # group_ids[i] is the index of the surviving cluster that bin i belongs to.
+    group_ids = np.arange(n_bins)
+    n_active = n_bins
+    while n_active > 1:
+        active_indices = np.where(active)[0]
+        best_merge_idx = -1  # position in active_indices of the left cluster
+        best_rank = no_violation
+        best_delta = np.inf
+        for i in range(len(active_indices) - 1):
+            idx1 = active_indices[i]
+            idx2 = active_indices[i + 1]
+            v1 = current_vol[idx1]
+            v2 = current_vol[idx2]
+            p1 = current_pd[idx1]
+            p2 = current_pd[idx2]
+            # Linkage distance
+            if use_volume_weights:
+                if v1 + v2 == 0:
+                    delta = 0.0
+                else:
+                    delta = (v1 * v2) / (v1 + v2) * (p1 - p2) ** 2
+            else:
+                delta = (p1 - p2) ** 2
+            if v1 == 0 or v2 == 0:
+                rank = zero_volume
+            elif p1 >= p2:
+                rank = non_monotonic
+            elif (v1 / total_vol) < min_vol_ratio or (v2 / total_vol) < min_vol_ratio:
+                rank = low_volume
+            else:
+                rank = no_violation
+                if curr_m_vols is not None and curr_m_bads is not None:
+                    mv1 = curr_m_vols[idx1]
+                    mv2 = curr_m_vols[idx2]
+                    mb1 = curr_m_bads[idx1]
+                    mb2 = curr_m_bads[idx2]
+                    # Monthly PDs, ignoring months with zero volume in either cluster.
+                    valid_months = (mv1 > 0) & (mv2 > 0)
+                    if valid_months.any():
+                        mp1 = mb1[valid_months] / mv1[valid_months]
+                        mp2 = mb2[valid_months] / mv2[valid_months]
+                        crossings = np.sum(mp1 >= mp2)
+                        if crossings > max_crossings:
+                            rank = excess_crossings
+            # The first pair is always accepted so that a NaN distance cannot
+            # leave best_merge_idx at -1 (which would wrap around on merge).
+            if (
+                best_merge_idx < 0
+                or rank < best_rank
+                or (rank == best_rank and delta < best_delta)
+            ):
+                best_merge_idx = i
+                best_rank = rank
+                best_delta = delta
+        # Stopping condition: no constraint violated AND n_active <= max_groups.
+        if best_rank == no_violation and n_active <= max_groups:
+            break
+        # Execute merge: fold the right cluster (idx2) into the left one (idx1).
+        idx1 = active_indices[best_merge_idx]
+        idx2 = active_indices[best_merge_idx + 1]
+        v1 = current_vol[idx1]
+        v2 = current_vol[idx2]
+        if v1 + v2 > 0:
+            # Volume-weighted mean PD of the merged cluster.
+            current_pd[idx1] = (current_pd[idx1] * v1 + current_pd[idx2] * v2) / (v1 + v2)
+        else:
+            current_pd[idx1] = 0.0
+        current_vol[idx1] = v1 + v2
+        if curr_m_vols is not None and curr_m_bads is not None:
+            curr_m_vols[idx1] += curr_m_vols[idx2]
+            curr_m_bads[idx1] += curr_m_bads[idx2]
+        active[idx2] = False
+        group_ids[group_ids == idx2] = idx1
+        n_active -= 1
+    # Remap surviving cluster indices to 1-based sequential integers.
+    active_indices = np.where(active)[0]
+    final_mapping = {old_idx: new_idx for new_idx, old_idx in enumerate(active_indices, 1)}
+    result = np.array([final_mapping[g] for g in group_ids], dtype=np.int64)
+    return result

pycreditools/_parallel.py ADDED Viewed

@@ -0,0 +1,32 @@
+import pandas as pd
+from typing import Callable, Iterable, Any, Optional
+def parallel_map(
+    fn: Callable[[Any], Any],
+    items: Iterable[Any],
+    parallel: bool = False,
+    n_workers: Optional[int] = None,
+    desc: Optional[str] = None
+) -> list[Any]:
+    """Apply fn to every item and return the results as a list, in order.
+    With ``parallel=True`` the calls are dispatched to a process pool of
+    ``n_workers`` workers; otherwise they run sequentially in this process.
+    ``desc`` is accepted but not used by this function.
+    """
+    if parallel:
+        from concurrent.futures import ProcessPoolExecutor
+        with ProcessPoolExecutor(max_workers=n_workers) as pool:
+            mapped = list(pool.map(fn, items))
+        return mapped
+    return list(map(fn, items))
+def parallel_map_df(
+    fn: Callable[[Any], pd.DataFrame],
+    items: Iterable[Any],
+    parallel: bool = False,
+    n_workers: Optional[int] = None,
+    desc: Optional[str] = None
+) -> pd.DataFrame:
+    """Run fn over items via parallel_map and stack the resulting frames.
+    The frames are concatenated with a fresh 0..n-1 index. An empty DataFrame
+    is returned when there are no results to concatenate.
+    """
+    frames = parallel_map(fn, items, parallel, n_workers, desc)
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

pycreditools/_types.py ADDED Viewed

@@ -0,0 +1,28 @@
+from enum import Enum
+from typing import TypedDict
+class SimulationMethod(str, Enum):
+    """String-valued enum naming the available simulation methods."""
+    ANALYTICAL = "analytical"
+    STOCHASTIC = "stochastic"
+class ClusteringMethod(str, Enum):
+    """String-valued enum naming the available clustering methods."""
+    # NOTE(review): presumably these select the ward_cluster / iv_cluster kernels - confirm in grouping.py.
+    WARD = "ward"
+    IV = "iv"
+class Quadrant(str, Enum):
+    """String-valued enum with four labels: keep_in, swap_in, swap_out, keep_out."""
+    # NOTE(review): presumably swap-set labels comparing a prior and a new approval decision - confirm against the simulation code.
+    KEEP_IN = "keep_in"
+    SWAP_IN = "swap_in"
+    SWAP_OUT = "swap_out"
+    KEEP_OUT = "keep_out"
+class StageDirection(str, Enum):
+    """String-valued enum with two direction labels: gte and lte."""
+    # NOTE(review): presumably ">=" / "<=" comparison direction for a stage threshold - confirm in stages.py.
+    GTE = "gte"
+    LTE = "lte"
+class PolicySummary(TypedDict):
+    """Schema for simulation summary outputs."""
+    # NOTE(review): units of approved / hired (counts vs. rates) are not visible here - confirm in performance.py.
+    scenario: str
+    applicants: int
+    approved: float
+    hired: float
+    bad_rate: float

pycreditools/analysis.py ADDED Viewed

@@ -0,0 +1,96 @@
+import pandas as pd
+import itertools
+from typing import Any
+import copy
+from .policy import CreditPolicy
+from .stages import CutoffStage, RateStage, FilterStage
+from .stress import AggravationStress
+from .simulation import run_simulation, SimulationMethod
+def _evaluate_tradeoff_point(
+    params: dict[str, Any],
+    data: pd.DataFrame,
+    base_policy: CreditPolicy,
+) -> dict[str, Any]:
+    """Simulate one parameter combination and return its result row.
+    Defined at module level (not as a closure) so that it can be pickled and
+    sent to worker processes.
+    Args:
+        params: One grid point; see run_tradeoff_analysis for the key conventions.
+        data: Applicant data.
+        base_policy: The template policy; it is deep-copied, never modified.
+    Returns:
+        ``params`` plus the keys ``approval_rate`` and ``default_rate``.
+    """
+    import dataclasses
+    temp_policy = copy.deepcopy(base_policy)
+    # 1. Cutoffs: "<column>_cutoff" keys whose column exists in the data are
+    #    collected into one additional CutoffStage.
+    actual_cutoffs = {}
+    for key, value in params.items():
+        if key.endswith("_cutoff"):
+            # Strip only the trailing suffix, so a column name that itself
+            # contains "_cutoff" is left intact.
+            col_name = key.removesuffix("_cutoff")
+            if col_name in data.columns:
+                actual_cutoffs[col_name] = value
+    if actual_cutoffs:
+        temp_policy = temp_policy.add_stage(
+            CutoffStage(name="dynamic_cutoffs", cutoffs=actual_cutoffs)
+        )
+    # 2. Aggravation factor: replaces all stress scenarios of the policy.
+    if "aggravation_factor" in params:
+        agg_stress = AggravationStress(factor=params["aggravation_factor"])
+        temp_policy = dataclasses.replace(temp_policy, stress_scenarios=(agg_stress,))
+    # 3. Base rates: "<stage name>_base_rate" keys rebuild the RateStage of
+    #    that name with the new base rate.
+    base_rate_params = {k: v for k, v in params.items() if k.endswith("_base_rate")}
+    if base_rate_params:
+        stages_list = list(temp_policy.stages)
+        for key, value in base_rate_params.items():
+            stage_name = key.removesuffix("_base_rate")
+            for i, stage in enumerate(stages_list):
+                if stage.name == stage_name and isinstance(stage, RateStage):
+                    stages_list[i] = RateStage(name=stage.name, base_rate=value, variable=stage.variable)
+        temp_policy = dataclasses.replace(temp_policy, stages=tuple(stages_list))
+    # Run simulation
+    sim_results = run_simulation(data, temp_policy, method=SimulationMethod.ANALYTICAL)
+    final_data = sim_results.data
+    app_sum = final_data["new_approval"].sum()
+    total = len(final_data)
+    approval_rate = app_sum / total if total > 0 else 0.0
+    if app_sum > 0:
+        # Default rate weighted by the approval column.
+        bad_rate = (final_data["simulated_default"] * final_data["new_approval"]).sum() / app_sum
+    else:
+        bad_rate = 0.0
+    result = dict(params)
+    result["approval_rate"] = approval_rate
+    result["default_rate"] = bad_rate
+    return result
+def run_tradeoff_analysis(
+    data: pd.DataFrame,
+    base_policy: CreditPolicy,
+    vary_params: dict[str, list[Any]],
+    parallel: bool = False,
+) -> pd.DataFrame:
+    """Run a trade-off analysis simulation over a grid of parameters.
+    Every combination of the values in ``vary_params`` is simulated once with
+    the analytical method. Recognised parameter names:
+    ``<column>_cutoff`` (cutoff on a data column), ``aggravation_factor``
+    (replaces the stress scenarios) and ``<stage name>_base_rate`` (base rate
+    of the RateStage with that name). Other names are only echoed in the output.
+    Args:
+        data: Applicant data.
+        base_policy: The template policy.
+        vary_params: Dictionary mapping parameter names to lists of values.
+        parallel: Whether to run in parallel using concurrent.futures. The
+            data and the policy must then be picklable.
+    Returns:
+        DataFrame with one row per grid point: the parameter values plus
+        ``approval_rate`` and ``default_rate``.
+    """
+    keys = list(vary_params.keys())
+    # Create parameter grid (cartesian product of all value lists).
+    grid = [dict(zip(keys, combo)) for combo in itertools.product(*vary_params.values())]
+    if parallel:
+        import concurrent.futures
+        import functools
+        # A partial of a module-level function is picklable; a nested closure
+        # is not and would make executor.map fail.
+        worker = functools.partial(_evaluate_tradeoff_point, data=data, base_policy=base_policy)
+        with concurrent.futures.ProcessPoolExecutor() as executor:
+            results = list(executor.map(worker, grid))
+    else:
+        results = [_evaluate_tradeoff_point(p, data, base_policy) for p in grid]
+    return pd.DataFrame(results)