cranalytics 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cranalytics/README.md +42 -0
- cranalytics/__init__.py +151 -0
- cranalytics/__main__.py +6 -0
- cranalytics/_contract_helpers.py +53 -0
- cranalytics/_contract_issues.py +112 -0
- cranalytics/_helpers.py +140 -0
- cranalytics/_loan_contract_base.py +393 -0
- cranalytics/_loan_states.py +171 -0
- cranalytics/_loan_tape_normalization.py +126 -0
- cranalytics/_quickstart_demos.py +479 -0
- cranalytics/_quickstart_ui.py +129 -0
- cranalytics/_rollforward_readiness_session.py +534 -0
- cranalytics/_rollforward_reporting.py +615 -0
- cranalytics/_rollforward_session.py +366 -0
- cranalytics/_rollforward_splitting.py +85 -0
- cranalytics/_rollforward_variants.py +28 -0
- cranalytics/_transition_estimation.py +120 -0
- cranalytics/_validation_core.py +70 -0
- cranalytics/_version.py +3 -0
- cranalytics/_workflow_results.py +68 -0
- cranalytics/cli.py +827 -0
- cranalytics/datasets.py +816 -0
- cranalytics/distributions.py +44 -0
- cranalytics/early_performance.py +676 -0
- cranalytics/examples/__init__.py +23 -0
- cranalytics/examples/core_feature_analytics.py +49 -0
- cranalytics/examples/core_lifetime_loss.py +45 -0
- cranalytics/examples/core_ml_modeling.py +56 -0
- cranalytics/examples/core_rollforward.py +55 -0
- cranalytics/examples/core_segmentation.py +22 -0
- cranalytics/examples/core_simulation.py +39 -0
- cranalytics/examples/core_survival.py +365 -0
- cranalytics/examples/core_vintage.py +44 -0
- cranalytics/examples/fpf_workflow.py +88 -0
- cranalytics/examples/loss_forecasting_demo.py +61 -0
- cranalytics/examples/segmentation_demo.py +105 -0
- cranalytics/finance.py +90 -0
- cranalytics/forecasting_bridge.py +251 -0
- cranalytics/fpf_workflow.py +484 -0
- cranalytics/governance_models.py +266 -0
- cranalytics/loan_history.py +144 -0
- cranalytics/loan_history_contract.py +142 -0
- cranalytics/loan_snapshot.py +210 -0
- cranalytics/loan_snapshot_contract.py +120 -0
- cranalytics/loss_forecasting.py +400 -0
- cranalytics/method_selection.py +474 -0
- cranalytics/metrics.py +130 -0
- cranalytics/model_development.py +225 -0
- cranalytics/onboarding.py +469 -0
- cranalytics/portfolio.py +295 -0
- cranalytics/predictive_backtest.py +183 -0
- cranalytics/predictive_contract.py +192 -0
- cranalytics/predictive_modeling.py +331 -0
- cranalytics/predictive_session.py +105 -0
- cranalytics/predictive_targets.py +279 -0
- cranalytics/py.typed +0 -0
- cranalytics/quickstart.py +119 -0
- cranalytics/rollforward.py +70 -0
- cranalytics/rollforward_backtest.py +268 -0
- cranalytics/rollforward_contract.py +389 -0
- cranalytics/rollforward_evaluation.py +344 -0
- cranalytics/rollforward_readiness.py +69 -0
- cranalytics/rollforward_results.py +587 -0
- cranalytics/rollforward_workflow.py +90 -0
- cranalytics/score_monitoring.py +784 -0
- cranalytics/simulation.py +474 -0
- cranalytics/skills/loss-forecasting/SKILL.md +500 -0
- cranalytics/skills/loss-forecasting/agents/openai.yaml +4 -0
- cranalytics/skills/loss-forecasting/references/flow-hazard-guide.md +146 -0
- cranalytics/skills/loss-forecasting/references/stress-scenario-guide.md +166 -0
- cranalytics/skills/loss-forecasting/references/transition-matrix-guide.md +146 -0
- cranalytics/skills/portfolio-diagnostics/SKILL.md +324 -0
- cranalytics/skills/portfolio-diagnostics/agents/openai.yaml +4 -0
- cranalytics/skills/portfolio-diagnostics/references/lgd-methodology.md +213 -0
- cranalytics/skills/predictive-credit-modeling/SKILL.md +448 -0
- cranalytics/skills/predictive-credit-modeling/agents/openai.yaml +4 -0
- cranalytics/skills/predictive-credit-modeling/references/early-performance-guide.md +119 -0
- cranalytics/skills/predictive-credit-modeling/references/model-training-guide.md +157 -0
- cranalytics/skills/predictive-credit-modeling/references/score-monitoring-guide.md +150 -0
- cranalytics/skills/vintage-loss-curves/SKILL.md +450 -0
- cranalytics/skills/vintage-loss-curves/agents/openai.yaml +4 -0
- cranalytics/skills/vintage-loss-curves/references/methodology.md +272 -0
- cranalytics/skills/vintage-loss-curves/references/smoother-guide.md +231 -0
- cranalytics/skills/vintage-loss-curves/references/validation-api.md +121 -0
- cranalytics/survival.py +349 -0
- cranalytics/survival_flows.py +464 -0
- cranalytics/transition/__init__.py +29 -0
- cranalytics/validation.py +150 -0
- cranalytics/validation_dispatch.py +155 -0
- cranalytics/validation_flow.py +428 -0
- cranalytics/validation_loan.py +727 -0
- cranalytics/validation_vintage.py +337 -0
- cranalytics/vintage.py +79 -0
- cranalytics/vintage_contract.py +191 -0
- cranalytics/vintage_fitting.py +191 -0
- cranalytics/vintage_session.py +175 -0
- cranalytics/vintage_smoothing.py +791 -0
- cranalytics/vintage_transforms.py +275 -0
- cranalytics/vintage_validation.py +489 -0
- cranalytics/vintage_wide.py +341 -0
- cranalytics/viz.py +36 -0
- cranalytics/viz_early_performance.py +343 -0
- cranalytics/viz_heatmaps.py +353 -0
- cranalytics/viz_smoothing.py +723 -0
- cranalytics-0.2.0.dist-info/METADATA +519 -0
- cranalytics-0.2.0.dist-info/RECORD +109 -0
- cranalytics-0.2.0.dist-info/WHEEL +4 -0
- cranalytics-0.2.0.dist-info/entry_points.txt +2 -0
- cranalytics-0.2.0.dist-info/licenses/LICENSE +21 -0
cranalytics/README.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Source Layout Guide
|
|
2
|
+
|
|
3
|
+
This package is organized around workflow boundaries rather than one deep class
|
|
4
|
+
hierarchy. Use this file when you know the business workflow and need to find
|
|
5
|
+
the right implementation file quickly.
|
|
6
|
+
|
|
7
|
+
## Start here first
|
|
8
|
+
|
|
9
|
+
- `__init__.py`: top-level re-export surface for common user-facing imports
|
|
10
|
+
- `cli.py`: CLI command registration and command-specific flow
|
|
11
|
+
- `validation.py`: shared reusable validation rules across workflows
|
|
12
|
+
- `governance_models.py`: internal Pydantic models for non-DataFrame governance artifacts and workflow metadata
|
|
13
|
+
- `docs/reference/module_map.md`: source-to-test-to-doc map by workflow
|
|
14
|
+
|
|
15
|
+
## Workflow boundaries
|
|
16
|
+
|
|
17
|
+
- Vintage: `vintage.py` facade over `vintage_fitting.py`, `vintage_smoothing.py`, `vintage_validation.py`, `vintage_transforms.py`, `vintage_session.py`, `vintage_wide.py`; prefer `run_vintage_analysis_session()` for multi-step comparison and validation
|
|
18
|
+
- Lifetime Loss Forecasting: `loss_forecasting.py`, with related cashflow logic in `simulation.py`
|
|
19
|
+
- Loan snapshot normalization: `loan_snapshot.py`, `loan_snapshot_contract.py`
|
|
20
|
+
- Loan history normalization: `loan_history.py`, `loan_history_contract.py`
|
|
21
|
+
- Transition estimation: `transition/__init__.py`, with internal `_transition_estimation.py`
|
|
22
|
+
- FICO / portfolio diagnostics: `portfolio.py`, `metrics.py`
|
|
23
|
+
- Feature analytics: `early_performance.py`, `model_development.py`, `score_monitoring.py`
|
|
24
|
+
- Predictive modeling: `predictive_targets.py`, `predictive_modeling.py`, `predictive_backtest.py`, `predictive_session.py`, `forecasting_bridge.py`; prefer `run_predictive_modeling_session()` for the end-to-end modeling path
|
|
25
|
+
- Rollforward workflow: `rollforward_workflow.py`, `rollforward_readiness.py`, `rollforward_backtest.py`, `rollforward_contract.py`, `rollforward_evaluation.py`, plus internal `_rollforward_*` coordination and reporting modules
|
|
26
|
+
- Survival: `survival.py`, `survival_flows.py`
|
|
27
|
+
- Skills bundle: `skills/`
|
|
28
|
+
- Packaged demos: `examples/`
|
|
29
|
+
|
|
30
|
+
## Naming conventions
|
|
31
|
+
|
|
32
|
+
- `*_contract.py`: workflow-specific reusable validation seam
|
|
33
|
+
- `*_workflow.py`: top-level orchestration entrypoint
|
|
34
|
+
- `*_session.py`: workflow-aligned coordination boundaries; some are public and return typed session result objects
|
|
35
|
+
- `*_report*.py`: artifact rendering or summary outputs
|
|
36
|
+
- underscore-prefixed modules: internal only
|
|
37
|
+
|
|
38
|
+
## Editing guidance
|
|
39
|
+
|
|
40
|
+
- Start at the public workflow seam before editing internal helpers.
|
|
41
|
+
- If a change affects accepted inputs, inspect the relevant `*_contract.py` file before broadening logic elsewhere.
|
|
42
|
+
- If a change affects docs, onboarding, or first-run discovery, edit both the command surface and the workflow routing docs in the same pass.
|
cranalytics/__init__.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cranalytics - Credit risk analytics library for vintage forecasting,
|
|
3
|
+
FICO segmentation, and portfolio modeling.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from importlib import import_module
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
__version__ = "0.2.0"
|
|
12
|
+
|
|
13
|
+
_PREFERRED_EXPORTS = {
|
|
14
|
+
"CurveFitter": "cranalytics.vintage",
|
|
15
|
+
"DynamicTransitionModel": "cranalytics.simulation",
|
|
16
|
+
"StaticMatrixTransitionModel": "cranalytics.simulation",
|
|
17
|
+
"calculate_classification_metrics": "cranalytics.metrics",
|
|
18
|
+
"calculate_fico_mix": "cranalytics.portfolio",
|
|
19
|
+
"calculate_gini": "cranalytics.metrics",
|
|
20
|
+
"calculate_ks": "cranalytics.metrics",
|
|
21
|
+
"calculate_lgd": "cranalytics.portfolio",
|
|
22
|
+
"calculate_wal": "cranalytics.metrics",
|
|
23
|
+
"create_vintage_triangle": "cranalytics.vintage",
|
|
24
|
+
"detect_incomplete_vintages": "cranalytics.vintage",
|
|
25
|
+
"estimate_recovery": "cranalytics.portfolio",
|
|
26
|
+
"forecast_lifetime_loss": "cranalytics.loss_forecasting",
|
|
27
|
+
"forecast_portfolio_states": "cranalytics.loss_forecasting",
|
|
28
|
+
"load_sample_transition_matrix": "cranalytics.datasets",
|
|
29
|
+
"make_mock_performance_data": "cranalytics.datasets",
|
|
30
|
+
"make_mock_portfolio": "cranalytics.datasets",
|
|
31
|
+
"normalize_vintage_data": "cranalytics.vintage",
|
|
32
|
+
"segment_fico": "cranalytics.portfolio",
|
|
33
|
+
"simulate_portfolio_cashflows": "cranalytics.simulation",
|
|
34
|
+
"smooth_curve": "cranalytics.vintage",
|
|
35
|
+
"summarize_lifetime_loss": "cranalytics.loss_forecasting",
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
_COMPAT_EXPORTS = {
|
|
39
|
+
"ReadinessConfig": "cranalytics.rollforward_readiness",
|
|
40
|
+
"Rollforward": "cranalytics.rollforward",
|
|
41
|
+
"RollforwardResult": "cranalytics.rollforward",
|
|
42
|
+
"aggregate_by_dollar_weights": "cranalytics.vintage",
|
|
43
|
+
"assemble_modeling_frame": "cranalytics.predictive_targets",
|
|
44
|
+
"build_targets": "cranalytics.predictive_targets",
|
|
45
|
+
"compute_woe_iv": "cranalytics.early_performance",
|
|
46
|
+
"engineer_loan_features": "cranalytics.model_development",
|
|
47
|
+
"fit_woe_binning": "cranalytics.model_development",
|
|
48
|
+
"forecast_calendar_chargeoff_from_predictions": "cranalytics.forecasting_bridge",
|
|
49
|
+
"generate_rollforward_readiness_report": "cranalytics.rollforward_readiness",
|
|
50
|
+
"get_best_method": "cranalytics.vintage",
|
|
51
|
+
"lift_gain_table": "cranalytics.model_development",
|
|
52
|
+
"make_mock_fpf_data": "cranalytics.datasets",
|
|
53
|
+
"make_performance_flag_schema": "cranalytics.validation",
|
|
54
|
+
"project_incomplete_vintage_tails": "cranalytics.vintage",
|
|
55
|
+
"rank_smoothing_methods": "cranalytics.vintage",
|
|
56
|
+
"run_predictive_backtest": "cranalytics.predictive_backtest",
|
|
57
|
+
"run_predictive_modeling_session": "cranalytics.predictive_session",
|
|
58
|
+
"run_rollforward": "cranalytics.rollforward",
|
|
59
|
+
"run_rollforward_workflow": "cranalytics.rollforward_workflow",
|
|
60
|
+
"run_validation_suite": "cranalytics.vintage",
|
|
61
|
+
"run_vintage_analysis_session": "cranalytics.vintage",
|
|
62
|
+
"score_model": "cranalytics.predictive_modeling",
|
|
63
|
+
"smooth_vintage": "cranalytics.vintage",
|
|
64
|
+
"summarize_predictive_backtest": "cranalytics.predictive_backtest",
|
|
65
|
+
"train_binary_model": "cranalytics.predictive_modeling",
|
|
66
|
+
"train_regression_model": "cranalytics.predictive_modeling",
|
|
67
|
+
"validate_rollforward_input_contract": "cranalytics.rollforward_contract",
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
# Tombstones for top-level aliases removed in 0.2.0. Accessing one raises an
|
|
71
|
+
# AttributeError that names the module to import from instead.
|
|
72
|
+
_DEPRECATED_TOP_LEVEL_EXPORT_GROUPS = {
|
|
73
|
+
"cranalytics.early_performance": (
|
|
74
|
+
"calculate_early_performance_rates",
|
|
75
|
+
"compute_conditional_loss_table",
|
|
76
|
+
"compute_marginal_impact",
|
|
77
|
+
"compute_segment_rates",
|
|
78
|
+
"estimate_vintage_lifetime_profit",
|
|
79
|
+
"rank_features_by_separation",
|
|
80
|
+
"validate_performance_flags",
|
|
81
|
+
),
|
|
82
|
+
"cranalytics.fpf_workflow": (
|
|
83
|
+
"FPFWorkflowReport",
|
|
84
|
+
"print_fpf_report",
|
|
85
|
+
"run_fpf_workflow",
|
|
86
|
+
"train_fpf_challenger",
|
|
87
|
+
),
|
|
88
|
+
"cranalytics.method_selection": (
|
|
89
|
+
"ModelSweepSpec",
|
|
90
|
+
"build_curve_fitter_sweep_spec",
|
|
91
|
+
"run_backtest_sweeps",
|
|
92
|
+
"select_champion_and_challengers",
|
|
93
|
+
"summarize_variant_performance",
|
|
94
|
+
),
|
|
95
|
+
"cranalytics.rollforward_backtest": (
|
|
96
|
+
"run_rollforward_backtest_sweeps",
|
|
97
|
+
"select_rollforward_champion_and_challengers",
|
|
98
|
+
"summarize_rollforward_variant_performance",
|
|
99
|
+
),
|
|
100
|
+
"cranalytics.score_monitoring": (
|
|
101
|
+
"calibrate_score_to_event_rate",
|
|
102
|
+
"compute_actual_vs_expected",
|
|
103
|
+
"compute_psi",
|
|
104
|
+
"score_performance_monitoring_report",
|
|
105
|
+
"simulate_policy_cutoff",
|
|
106
|
+
),
|
|
107
|
+
"cranalytics.survival_flows": (
|
|
108
|
+
"compare_known_actuals_to_curves",
|
|
109
|
+
"fit_flow_hazard_curves",
|
|
110
|
+
"forecast_balance_flows",
|
|
111
|
+
"validate_flow_data",
|
|
112
|
+
),
|
|
113
|
+
"cranalytics.vintage_wide": (
|
|
114
|
+
"compute_cgco_curve_wide",
|
|
115
|
+
"compute_final_cgco_wide",
|
|
116
|
+
"load_wide_vintage_data",
|
|
117
|
+
),
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
_REMOVED_TOP_LEVEL_EXPORTS = {
|
|
121
|
+
name: module_name
|
|
122
|
+
for module_name, names in _DEPRECATED_TOP_LEVEL_EXPORT_GROUPS.items()
|
|
123
|
+
for name in names
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
# Keep the top-level namespace compatibility-friendly, but treat `_PREFERRED_EXPORTS`
|
|
127
|
+
# as the small stable promise and `_COMPAT_EXPORTS` as a wider discovery surface.
|
|
128
|
+
_EXPORTS = {**_PREFERRED_EXPORTS, **_COMPAT_EXPORTS}
|
|
129
|
+
_PREFERRED_TOP_LEVEL_NAMES = tuple(sorted(_PREFERRED_EXPORTS))
|
|
130
|
+
|
|
131
|
+
__all__ = tuple(sorted(_EXPORTS))
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def __getattr__(name: str) -> Any:
    """Resolve a lazily exported top-level attribute on first access.

    Names listed in ``_EXPORTS`` are imported from their home module, cached
    in this module's globals so later lookups bypass this hook, and returned.

    Raises:
        AttributeError: If *name* is a removed 0.2.0 alias (the message names
            the module to import from instead) or is not known at all.
    """
    source_module = _EXPORTS.get(name)
    if source_module is not None:
        resolved = getattr(import_module(source_module), name)
        # Cache the object so the next access is a plain module attribute hit.
        globals()[name] = resolved
        return resolved

    relocated_to = _REMOVED_TOP_LEVEL_EXPORTS.get(name)
    if relocated_to is None:
        raise AttributeError(f"module 'cranalytics' has no attribute {name!r}")
    raise AttributeError(
        f"Top-level import {name!r} was removed in cranalytics 0.2.0; "
        f"use 'from {relocated_to} import {name}' instead."
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def __dir__() -> list[str]:
    """Return the sorted attribute names reported by ``dir()`` on this module.

    The result combines the names currently bound in the module namespace
    with every lazily exported name in ``__all__``.  The two sources are
    merged through a set because ``__getattr__`` stores each resolved export
    in ``globals()``; without deduplication such a name would appear twice.
    """
    return sorted({*globals(), *__all__})
|
cranalytics/_contract_helpers.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Domain-agnostic helpers shared by all workflow contract modules.
|
|
2
|
+
|
|
3
|
+
These are private utilities — not part of the public API.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
import pandas as pd # noqa: TC002
|
|
11
|
+
|
|
12
|
+
from ._contract_issues import _append_issue
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _norm(col: str) -> str:
    """Return a normalized key for a column label.

    The label is stringified, stripped of surrounding whitespace and
    lowercased; every run of characters outside ``a-z0-9`` is replaced by a
    single underscore, and leading/trailing underscores are removed.
    """
    lowered = str(col).strip().lower()
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    return collapsed.strip("_")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def resolve_alias_columns(
    work: pd.DataFrame,
    aliases: dict[str, tuple[str, ...]],
    issues: list[dict[str, str]],
) -> pd.DataFrame:
    """Rename DataFrame columns from any known alias to the canonical name.

    For each canonical name in *aliases*, the columns of *work* that appear
    in its alias tuple are collected in DataFrame column order.

    * If *work* already has a column with the canonical name, that column is
      kept and no alias is renamed onto it; renaming would leave two columns
      with the same label.
    * Otherwise the first matching column is renamed to the canonical name.

    Whenever more than one candidate column is present for a canonical name,
    a ``MULTIPLE_ALIASES`` warning naming the chosen column is appended to
    *issues*.

    Args:
        work: Frame whose columns are inspected; it is not modified.
        aliases: Mapping of canonical column name to accepted alias names.
        issues: Mutable issue list that receives warning records.

    Returns:
        The result of ``work.rename(columns=...)`` with the chosen renames.
    """
    rename_map: dict[str, str] = {}
    for canonical, alias_tuple in aliases.items():
        matches = [col for col in work.columns if col in alias_tuple]
        candidates = matches
        if canonical in work.columns:
            # The canonical column wins; never rename an alias on top of it.
            chosen = canonical
            if canonical not in matches:
                candidates = [canonical, *matches]
        elif matches:
            chosen = matches[0]
        else:
            continue

        if len(candidates) > 1:
            _append_issue(
                issues,
                severity="warning",
                issue_code="MULTIPLE_ALIASES",
                message=(
                    f"Multiple aliases found for '{canonical}' ({candidates}); "
                    f"using '{chosen}'."
                ),
            )
        if chosen != canonical:
            rename_map[chosen] = canonical
    return work.rename(columns=rename_map)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
__all__ = [
|
|
51
|
+
"_norm",
|
|
52
|
+
"resolve_alias_columns",
|
|
53
|
+
]
|
|
cranalytics/_contract_issues.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Shared issue-table helpers for workflow contract modules."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class IssueTableResultMixin:
    """Convenience queries for result objects that carry an ``issue_table``.

    Each method forwards ``self.issue_table`` to the module-level helper that
    implements the actual lookup.
    """

    # Issue rows; the helpers read its "severity" and "issue_code" columns.
    issue_table: pd.DataFrame

    def has_severity(self, severity: str) -> bool:
        """Return whether any issue row carries the given severity."""
        table = self.issue_table
        return issue_table_has_severity(table, severity)

    def has_issue_code(self, issue_code: str) -> bool:
        """Return whether any issue row carries the given issue code."""
        table = self.issue_table
        return issue_table_has_issue_code(table, issue_code)

    def failing_issue_table(
        self,
        *,
        severities: Iterable[str],
        ignored_issue_codes: Iterable[str] = (),
    ) -> pd.DataFrame:
        """Return the issue rows matching *severities*, minus ignored codes."""
        table = self.issue_table
        return select_issue_rows(
            table,
            severities=severities,
            ignored_issue_codes=ignored_issue_codes,
        )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _issue_frame(issues: list[dict[str, str]]) -> pd.DataFrame:
    """Build the issue table from a list of issue records.

    The result always has exactly the columns ``severity``, ``issue_code``
    and ``message`` in that order, including when *issues* is empty.
    """
    columns = ["severity", "issue_code", "message"]
    return pd.DataFrame.from_records(issues).reindex(columns=columns)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def issue_table_has_severity(issue_table: pd.DataFrame, severity: str) -> bool:
    """Return True when any row's ``severity`` equals ``str(severity)``.

    An empty table yields False without touching its columns.
    """
    if issue_table.empty:
        return False
    wanted = str(severity)
    hits = issue_table["severity"] == wanted
    return bool(hits.any())
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def issue_table_has_issue_code(issue_table: pd.DataFrame, issue_code: str) -> bool:
    """Return True when any row's ``issue_code`` equals ``str(issue_code)``.

    An empty table yields False without touching its columns.
    """
    if issue_table.empty:
        return False
    wanted = str(issue_code)
    hits = issue_table["issue_code"] == wanted
    return bool(hits.any())
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def select_issue_rows(
    issue_table: pd.DataFrame,
    *,
    severities: Iterable[str],
    ignored_issue_codes: Iterable[str] = (),
) -> pd.DataFrame:
    """Return a copy of the rows whose severity is selected and code is not ignored.

    Args:
        issue_table: Table with ``severity`` and ``issue_code`` columns.
        severities: Severities to keep; each is compared as ``str``.
        ignored_issue_codes: Issue codes to drop; each is compared as ``str``.

    Returns:
        A new DataFrame.  When the table is empty or no severities are given,
        a zero-row copy with the same columns is returned.
    """
    wanted_severities = tuple(map(str, severities))
    if issue_table.empty or not wanted_severities:
        return issue_table.iloc[0:0].copy()

    keep = issue_table["severity"].isin(wanted_severities)
    skipped_codes = tuple(map(str, ignored_issue_codes))
    if skipped_codes:
        keep = keep & ~issue_table["issue_code"].isin(skipped_codes)
    return issue_table.loc[keep].copy()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def raise_on_issue_rows(
    issue_table: pd.DataFrame,
    *,
    message_prefix: str,
    severities: Iterable[str],
    ignored_issue_codes: Iterable[str] = (),
) -> None:
    """Raise if any issue row matches the selected severities.

    Rows are chosen with ``select_issue_rows``.  When at least one row
    remains, their ``message`` values are joined with ``"; "`` and appended
    to *message_prefix*.

    Raises:
        ValueError: If one or more matching issue rows exist.
    """
    failing = select_issue_rows(
        issue_table,
        severities=severities,
        ignored_issue_codes=ignored_issue_codes,
    )
    if not failing.empty:
        joined_messages = "; ".join(failing["message"].tolist())
        raise ValueError(f"{message_prefix} - " + joined_messages)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _append_issue(
    issues: list[dict[str, str]],
    *,
    severity: str,
    issue_code: str,
    message: str,
) -> None:
    """Append one issue record to *issues* in place.

    The record is a dict with the keys ``severity``, ``issue_code`` and
    ``message``, in that order.
    """
    record = {"severity": severity, "issue_code": issue_code, "message": message}
    issues.append(record)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
__all__ = [
|
|
105
|
+
"IssueTableResultMixin",
|
|
106
|
+
"_append_issue",
|
|
107
|
+
"_issue_frame",
|
|
108
|
+
"issue_table_has_issue_code",
|
|
109
|
+
"issue_table_has_severity",
|
|
110
|
+
"raise_on_issue_rows",
|
|
111
|
+
"select_issue_rows",
|
|
112
|
+
]
|
cranalytics/_helpers.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Shared internal helpers for analytics modules.
|
|
2
|
+
|
|
3
|
+
These are private utilities — not part of the public API.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import warnings
|
|
9
|
+
from typing import TYPE_CHECKING, cast
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from numpy.typing import NDArray
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _check_maturity_coverage(
    flag_series: pd.Series,
    flag_col: str,
    threshold: float = 0.10,
) -> float:
    """Return the share of non-null entries in *flag_series*, warning if low.

    Non-null entries are counted as mature observations.  An empty series
    returns ``0.0`` without warning.  When the share is below *threshold* a
    ``UserWarning`` naming *flag_col* is issued.
    """
    total = len(flag_series)
    if total == 0:
        return 0.0

    mature = int(flag_series.notna().sum())
    share = mature / total
    if share < threshold:
        text = (
            f"'{flag_col}' has only {share:.1%} mature observations "
            f"({mature}/{total}). Results may be unreliable."
        )
        warnings.warn(text, UserWarning, stacklevel=3)
    return share
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _require_columns(df: pd.DataFrame, columns: list[str]) -> None:
    """Ensure every name in *columns* is a column of *df*.

    Raises:
        ValueError: Listing the absent column names in sorted order.
    """
    absent = [name for name in columns if name not in df.columns]
    if not absent:
        return
    raise ValueError(f"Missing required columns: {sorted(absent)}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _validate_confidence(confidence: float) -> None:
    """Ensure *confidence* lies strictly between 0 and 1.

    Raises:
        ValueError: If the value is not inside the open interval ``(0, 1)``.
    """
    if 0 < confidence < 1:
        return
    raise ValueError("confidence must be between 0 and 1.")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _coerce_weights(
    weight_series: pd.Series,
    weight_col: str,
) -> NDArray[np.float64]:
    """Convert a weight column to a validated ``float64`` array.

    Values are parsed with ``pd.to_numeric(..., errors="coerce")``, so
    anything unparseable becomes NaN and is rejected below.

    Raises:
        ValueError: If any value is NaN after coercion, any value is
            negative, or the weights do not sum to a positive total.
    """
    coerced = cast("pd.Series", pd.to_numeric(weight_series, errors="coerce"))
    values = np.asarray(coerced.to_numpy(dtype=np.float64), dtype=np.float64)

    if np.isnan(values).any():
        raise ValueError(f"{weight_col} contains non-numeric values.")
    if (values < 0).any():
        raise ValueError(f"{weight_col} must be non-negative.")
    total_weight = values.sum()
    if total_weight <= 0:
        raise ValueError(f"{weight_col} must have a positive total weight.")
    return values
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _effective_sample_size(weights: NDArray[np.float64]) -> float:
    """Return ``(sum of weights) ** 2 / (sum of squared weights)``.

    Returns ``0.0`` when either the sum or the sum of squares is not
    positive.
    """
    total = float(np.sum(weights))
    total_of_squares = float(np.sum(np.square(weights)))
    if total <= 0 or total_of_squares <= 0:
        return 0.0
    return (total**2) / total_of_squares
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _bin_series(series: pd.Series, n_bins: int) -> pd.Series:
    """Quantile-bin the non-null values of *series* with ``pd.qcut``.

    Returns an object-dtype Series on the index of *series*.  Non-null
    positions hold the bin assigned by ``pd.qcut(..., duplicates="drop")``;
    null positions hold ``pd.NA``.  If every value is null, the all-``pd.NA``
    Series is returned.

    Raises:
        ValueError: If ``n_bins < 2`` or the quantile bins cannot be built.
    """
    if n_bins < 2:
        raise ValueError("n_bins must be >= 2.")

    result = pd.Series(pd.NA, index=series.index, dtype="object")
    observed = cast("pd.Series", series.dropna())
    if observed.empty:
        return result

    try:
        intervals = pd.qcut(observed, q=n_bins, duplicates="drop")
        labelled = cast("pd.Series", pd.Series(intervals, index=observed.index))
    except ValueError as exc:
        raise ValueError(
            f"Unable to create quantile bins for '{series.name}'. "
            "Check cardinality and null coverage."
        ) from exc

    # Write raw objects so the result stays object-dtype rather than categorical.
    as_objects = cast("pd.Series", labelled.astype("object"))
    result.loc[observed.index] = as_objects.to_numpy()
    return result
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _months_elapsed(start_dates: pd.Series, as_of_date: pd.Timestamp) -> pd.Series:
    """Count whole months from each start date to *as_of_date*, floored at 0.

    The calendar-month difference is reduced by one when the day of
    *as_of_date* is earlier than the start date's day of month, so a month
    that has not yet reached that day is not counted.
    """
    year_gap = as_of_date.year - start_dates.dt.year
    month_gap = as_of_date.month - start_dates.dt.month
    whole_months = year_gap * 12 + month_gap

    # True where the as-of day has not yet reached the start day.
    incomplete = start_dates.dt.day > as_of_date.day
    whole_months = whole_months - incomplete.astype(int)
    return whole_months.clip(lower=0).astype(int)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _series(value: object) -> pd.Series:
    """Return *value* unchanged once confirmed to be a pandas Series.

    Raises:
        TypeError: If *value* is not a ``pd.Series``.
    """
    if isinstance(value, pd.Series):
        return value
    raise TypeError("Expected a pandas Series.")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _frame(value: object) -> pd.DataFrame:
    """Return *value* unchanged once confirmed to be a pandas DataFrame.

    Raises:
        TypeError: If *value* is not a ``pd.DataFrame``.
    """
    if isinstance(value, pd.DataFrame):
        return value
    raise TypeError("Expected a pandas DataFrame.")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _months_between(start_dates: pd.Series, end_dates: pd.Series) -> pd.Series:
    """Count whole months between paired start and end dates, floored at 0.

    Row-wise counterpart of ``_months_elapsed``: the calendar-month
    difference is reduced by one wherever the end date's day of month is
    earlier than the start date's day of month.
    """
    year_gap = end_dates.dt.year - start_dates.dt.year
    month_gap = end_dates.dt.month - start_dates.dt.month
    whole_months = year_gap * 12 + month_gap

    # True where the end day has not yet reached the start day.
    incomplete = end_dates.dt.day < start_dates.dt.day
    whole_months = whole_months - incomplete.astype(int)
    return whole_months.clip(lower=0).astype(int)
|