hmda-analyzer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hmda_analyzer-0.1.0.dist-info/METADATA +125 -0
- hmda_analyzer-0.1.0.dist-info/RECORD +21 -0
- hmda_analyzer-0.1.0.dist-info/WHEEL +5 -0
- hmda_analyzer-0.1.0.dist-info/top_level.txt +2 -0
- hmdaanalyzer/__init__.py +28 -0
- hmdaanalyzer/analysis/__init__.py +0 -0
- hmdaanalyzer/analysis/disparity.py +128 -0
- hmdaanalyzer/analysis/geographic.py +125 -0
- hmdaanalyzer/analysis/lender.py +109 -0
- hmdaanalyzer/data/__init__.py +0 -0
- hmdaanalyzer/data/loader.py +186 -0
- hmdaanalyzer/data/schema.py +124 -0
- hmdaanalyzer/report/__init__.py +0 -0
- hmdaanalyzer/report/generator.py +147 -0
- tests/__init__.py +0 -0
- tests/conftest.py +12 -0
- tests/test_disparity.py +58 -0
- tests/test_geographic.py +38 -0
- tests/test_lender.py +37 -0
- tests/test_loader.py +43 -0
- tests/test_report.py +27 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hmda-analyzer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HMDA mortgage lending disparity analyzer — denial rates, racial disparities, lending deserts, and lender benchmarking
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/Jaypatel1511/hmda-analyzer
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pandas>=1.4.0
|
|
10
|
+
Requires-Dist: numpy>=1.21.0
|
|
11
|
+
Requires-Dist: requests>=2.27.0
|
|
12
|
+
|
|
13
|
+
# hmda-analyzer 📊
|
|
14
|
+
|
|
15
|
+
**HMDA mortgage lending disparity analyzer.**
|
|
16
|
+
|
|
17
|
+
Compute denial rate disparities by race, identify lending deserts, benchmark lenders
|
|
18
|
+
against peers, and generate fair lending analysis reports — using CFPB HMDA LAR data.
|
|
19
|
+
Free public API, no authentication required.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Why hmda-analyzer?
|
|
24
|
+
|
|
25
|
+
HMDA data covers 10+ million mortgage applications per year with borrower demographics,
|
|
26
|
+
denial rates, loan amounts, and census tract locations. It is the most powerful public
|
|
27
|
+
dataset for analyzing mortgage lending disparities — but it requires significant
|
|
28
|
+
engineering to use. hmda-analyzer makes it accessible in Python.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
pip install hmda-analyzer
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Quickstart
|
|
39
|
+
|
|
40
|
+
from hmdaanalyzer import (
|
|
41
|
+
load_sample, denial_rate_by_race, disparity_ratio,
|
|
42
|
+
lending_by_tract, lender_summary, generate_disparity_report,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# Load sample data (no API required)
|
|
46
|
+
df = load_sample(n=5000)
|
|
47
|
+
|
|
48
|
+
# Or load from CFPB API (real data)
|
|
49
|
+
# df = load_from_api(year=2023, state="IL")
|
|
50
|
+
|
|
51
|
+
# Denial rates by race
|
|
52
|
+
rates = denial_rate_by_race(df)
|
|
53
|
+
print(rates)
|
|
54
|
+
|
|
55
|
+
# Disparity ratios vs White applicants
|
|
56
|
+
disparities = disparity_ratio(df)
|
|
57
|
+
print(disparities)
|
|
58
|
+
|
|
59
|
+
# Geographic analysis
|
|
60
|
+
tracts = lending_by_tract(df)
|
|
61
|
+
from hmdaanalyzer import lending_desert_score
deserts = lending_desert_score(df)
|
|
62
|
+
|
|
63
|
+
# Lender analysis
|
|
64
|
+
summary = lender_summary(df, lei="LEI000001")
|
|
65
|
+
|
|
66
|
+
# Full disparity report
|
|
67
|
+
report = generate_disparity_report(df, title="Illinois Mortgage Market 2023")
|
|
68
|
+
print(report)
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Analyses Supported
|
|
73
|
+
|
|
74
|
+
- Denial rate by race and ethnicity
|
|
75
|
+
- Disparity ratios vs reference group (default: White applicants)
|
|
76
|
+
- Denial rate by income band
|
|
77
|
+
- Denial reasons by race
|
|
78
|
+
- Lending activity by census tract, county, and state
|
|
79
|
+
- Lending desert identification (low application volume tracts)
|
|
80
|
+
- Lender vs market comparison
|
|
81
|
+
- Top lenders by origination volume
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Disparity Ratio Thresholds
|
|
86
|
+
|
|
87
|
+
Based on CFPB fair lending examination standards:
|
|
88
|
+
|
|
89
|
+
- >= 2.0x — HIGH disparity (triggers regulatory scrutiny)
|
|
90
|
+
- >= 1.5x — MODERATE disparity
|
|
91
|
+
- 1.0x to < 1.5x — LOW disparity
|
|
92
|
+
- < 1.0x — FAVORABLE (group has lower denial rate than reference)
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Data Sources
|
|
97
|
+
|
|
98
|
+
CFPB HMDA Data Browser API — free, no API key required.
|
|
99
|
+
2024 data covers 4,908 institutions and millions of loan applications.
|
|
100
|
+
|
|
101
|
+
https://ffiec.cfpb.gov/data-browser/
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Running Tests
|
|
106
|
+
|
|
107
|
+
PYTHONPATH=. pytest tests/ -v
|
|
108
|
+
|
|
109
|
+
28 tests across all modules.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Who This Is For
|
|
114
|
+
|
|
115
|
+
- Fair lending analysts and compliance teams at banks and CDFIs
|
|
116
|
+
- Community reinvestment researchers studying mortgage disparities
|
|
117
|
+
- Journalists covering housing discrimination and redlining
|
|
118
|
+
- Regulators and examiners analyzing lender performance
|
|
119
|
+
- Academics studying racial wealth gaps and homeownership barriers
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
MIT 2026 Jaypatel1511
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
hmdaanalyzer/__init__.py,sha256=TJuAUuk9uX7UgW8pwqvTMF27bFbjqQL5WJCkRoWqrhs,1034
|
|
2
|
+
hmdaanalyzer/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
hmdaanalyzer/analysis/disparity.py,sha256=0kxcvw3SIWP_SAnPGGD0bC7tb3aQTkJFNVHZO3byWW8,4165
|
|
4
|
+
hmdaanalyzer/analysis/geographic.py,sha256=J5Mfd6mnAtO-wUCmLbFpyJqecZEuYC2aJ7g7YdmcKhY,4140
|
|
5
|
+
hmdaanalyzer/analysis/lender.py,sha256=k5DYRIwLuCdn0TmhrahTWWIM898uCpWN99yNA5CJZ68,3336
|
|
6
|
+
hmdaanalyzer/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
hmdaanalyzer/data/loader.py,sha256=XFx3_VeGfczfTCit1rkz2dVkfvGZ5ArVsCFM8bWpT_E,6219
|
|
8
|
+
hmdaanalyzer/data/schema.py,sha256=3Voi5ZRk5LtZiyJcn4J_Q9u4s-QB1fxL863PXMYzMIw,4991
|
|
9
|
+
hmdaanalyzer/report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
hmdaanalyzer/report/generator.py,sha256=ogefRB-5FgMUDyIYcjZIEkZEZspbWGXzfPAwJx2T7YI,4633
|
|
11
|
+
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
tests/conftest.py,sha256=PksNnhkWPOyqsMP7Au4HVa7LAQsAS5g0Etq7BjuIeOE,211
|
|
13
|
+
tests/test_disparity.py,sha256=OUUmGwtsSB7H4YBKgNGHjfgecIUZaV7KTSZO1NGw2Q8,1901
|
|
14
|
+
tests/test_geographic.py,sha256=_Pj5LB2PRTwVbfX6t-xxotNzS-m5eF2CEpWW8ZQ3Iow,1157
|
|
15
|
+
tests/test_lender.py,sha256=BStl8s1un1rENvZKzXFxebtJSTLBxrxnapeDCSYzRNg,1146
|
|
16
|
+
tests/test_loader.py,sha256=LoBfXJmh64LFSc3qb6QFOKhckk4EW0R0qMSujQfaW2U,1329
|
|
17
|
+
tests/test_report.py,sha256=HMbj707Jb7wfVdjhyZU9gtKJlmhLDiZ-yIMQ5gf0XAU,800
|
|
18
|
+
hmda_analyzer-0.1.0.dist-info/METADATA,sha256=fdFw3qoUWOW6s4EbvQI8kBL-J8XzxXmJNR3MHgqjZic,3251
|
|
19
|
+
hmda_analyzer-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
20
|
+
hmda_analyzer-0.1.0.dist-info/top_level.txt,sha256=co7d3qOb1t3FdKP69KBrETmXneiJ3zDhY9WijVeQ0cs,19
|
|
21
|
+
hmda_analyzer-0.1.0.dist-info/RECORD,,
|
hmdaanalyzer/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from hmdaanalyzer.data.loader import (
|
|
2
|
+
load_from_api, load_from_file, load_sample,
|
|
3
|
+
)
|
|
4
|
+
from hmdaanalyzer.analysis.disparity import (
|
|
5
|
+
denial_rate_by_race, disparity_ratio,
|
|
6
|
+
denial_rate_by_income_band, denial_reasons_by_race,
|
|
7
|
+
)
|
|
8
|
+
from hmdaanalyzer.analysis.geographic import (
|
|
9
|
+
lending_by_tract, lending_by_county, lending_by_state,
|
|
10
|
+
lending_desert_score, racial_composition_by_tract,
|
|
11
|
+
)
|
|
12
|
+
from hmdaanalyzer.analysis.lender import (
|
|
13
|
+
lender_summary, lender_vs_market, top_lenders_by_volume,
|
|
14
|
+
)
|
|
15
|
+
from hmdaanalyzer.report.generator import (
|
|
16
|
+
generate_disparity_report, summary_table,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
20
|
+
__all__ = [
|
|
21
|
+
"load_from_api", "load_from_file", "load_sample",
|
|
22
|
+
"denial_rate_by_race", "disparity_ratio",
|
|
23
|
+
"denial_rate_by_income_band", "denial_reasons_by_race",
|
|
24
|
+
"lending_by_tract", "lending_by_county", "lending_by_state",
|
|
25
|
+
"lending_desert_score", "racial_composition_by_tract",
|
|
26
|
+
"lender_summary", "lender_vs_market", "top_lenders_by_volume",
|
|
27
|
+
"generate_disparity_report", "summary_table",
|
|
28
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Denial rate disparity analysis.
|
|
3
|
+
Computes disparate impact ratios between racial/ethnic groups.
|
|
4
|
+
"""
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
from hmdaanalyzer.data.schema import DISPARITY_THRESHOLDS, REFERENCE_RACE
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def denial_rate_by_race(df: pd.DataFrame, min_applications: int = 5) -> pd.DataFrame:
    """
    Compute denial rates by race for a HMDA LAR DataFrame.

    Only rows whose ``action_taken`` is 1, 2 or 3 are counted as applications.
    Race groups with fewer than ``min_applications`` such rows are dropped
    from the result.

    Args:
        df: Cleaned HMDA LAR DataFrame with ``derived_race``, ``is_denied``
            and ``action_taken`` columns
        min_applications: Minimum number of counted applications a race group
            needs in order to be reported (default: 5)

    Returns:
        DataFrame with columns ``derived_race``, ``applications``, ``denials``
        and ``denial_rate``, sorted by ``denial_rate`` descending

    Raises:
        ValueError: If any of the required columns is missing
    """
    required = ("derived_race", "is_denied", "action_taken")
    # action_taken is validated too, so a missing column surfaces as the same
    # ValueError as the other required columns instead of a bare KeyError.
    if any(col not in df.columns for col in required):
        raise ValueError(
            "DataFrame must have 'derived_race', 'is_denied' and 'action_taken' columns"
        )

    actionable = df[df["action_taken"].isin([1, 2, 3])]

    result = actionable.groupby("derived_race").agg(
        applications=("is_denied", "count"),
        denials=("is_denied", "sum"),
    ).reset_index()

    result["denial_rate"] = result["denials"] / result["applications"]
    # Drop small groups: their rates are too noisy to compare.
    result = result[result["applications"] >= min_applications]
    result = result.sort_values("denial_rate", ascending=False)

    return result
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def disparity_ratio(df: pd.DataFrame, reference: str = None) -> pd.DataFrame:
    """
    Compute disparity ratios relative to a reference group.

    Disparity ratio = group denial rate / reference group denial rate

    Each ratio is labelled using DISPARITY_THRESHOLDS: "HIGH" at or above the
    "high" threshold, "MODERATE" at or above the "moderate" threshold,
    "FAVORABLE" below 1.0, and "LOW" otherwise. When the reference group's
    denial rate is zero the ratio is undefined; it is reported as NaN with
    level "N/A".

    Args:
        df: Cleaned HMDA LAR DataFrame
        reference: Reference race group; falls back to REFERENCE_RACE when
            omitted or empty

    Returns:
        DataFrame of per-race denial rates with ``reference_group``,
        ``reference_denial_rate``, ``disparity_ratio`` and ``disparity_level``
        columns, sorted by ``disparity_ratio`` descending

    Raises:
        ValueError: If the reference group is absent from the denial-rate
            table returned by denial_rate_by_race (which omits very small
            groups)
    """
    reference = reference or REFERENCE_RACE
    denial_rates = denial_rate_by_race(df)

    ref_row = denial_rates[denial_rates["derived_race"] == reference]
    if ref_row.empty:
        raise ValueError(f"Reference group '{reference}' not found in data.")

    ref_rate = ref_row["denial_rate"].iloc[0]

    result = denial_rates.copy()
    result["reference_group"] = reference
    result["reference_denial_rate"] = ref_rate
    if ref_rate > 0:
        result["disparity_ratio"] = result["denial_rate"] / ref_rate
    else:
        # NaN (not None) keeps the column float-typed, so it sorts and
        # serializes the same way as in the normal case.
        result["disparity_ratio"] = np.nan

    def classify(ratio):
        """Map one disparity ratio to its severity label."""
        if ratio is None or pd.isna(ratio):
            return "N/A"
        if ratio >= DISPARITY_THRESHOLDS["high"]:
            return "HIGH"
        if ratio >= DISPARITY_THRESHOLDS["moderate"]:
            return "MODERATE"
        if ratio < 1.0:
            return "FAVORABLE"
        return "LOW"

    result["disparity_level"] = result["disparity_ratio"].apply(classify)
    result = result.sort_values("disparity_ratio", ascending=False)

    return result
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def denial_rate_by_income_band(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute denial rates by income band to identify income-based disparities.

    Only rows whose ``action_taken`` is 1, 2 or 3 are counted. ``income`` is
    compared against the band edges 50, 80, 120 and 200; each band includes
    its upper edge. Rows with missing income fall into no band and are
    excluded. Bands with no rows are omitted.

    Args:
        df: Cleaned HMDA LAR DataFrame with ``income``, ``action_taken`` and
            ``is_denied`` columns

    Returns:
        DataFrame with columns ``income_band``, ``applications``, ``denials``
        and ``denial_rate``

    Raises:
        ValueError: If any of the required columns is missing
    """
    required = ("income", "action_taken", "is_denied")
    if any(col not in df.columns for col in required):
        raise ValueError(
            "DataFrame must have 'income', 'action_taken' and 'is_denied' columns"
        )

    # Filter first so only the rows that are actually counted get copied.
    actionable = df[df["action_taken"].isin([1, 2, 3])].copy()

    # The lowest band is open-ended downwards: with a lower edge of 0 (and
    # right-closed intervals) applicants reporting zero or negative income
    # were silently left out of "<$50k".
    actionable["income_band"] = pd.cut(
        actionable["income"],
        bins=[float("-inf"), 50, 80, 120, 200, float("inf")],
        labels=["<$50k", "$50-80k", "$80-120k", "$120-200k", "$200k+"],
    )

    result = actionable.groupby("income_band", observed=True).agg(
        applications=("is_denied", "count"),
        denials=("is_denied", "sum"),
    ).reset_index()

    result["denial_rate"] = result["denials"] / result["applications"]
    return result
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def denial_reasons_by_race(df: pd.DataFrame) -> pd.DataFrame:
    """
    Break down primary denial reasons within each race group.

    Considers only rows where ``is_denied`` is true. For every
    (race, reason) pair the result carries the number of denials and that
    count as a percentage of all denials for the race.

    Returns:
        DataFrame with ``derived_race``, ``denial_reason_label``, ``count``,
        ``total`` and ``pct`` columns, ordered by race and then by descending
        share. An empty DataFrame is returned when ``denial_reason_1`` is not
        a column.
    """
    from hmdaanalyzer.data.schema import DENIAL_REASONS

    denied = df[df["is_denied"].eq(True)].copy()

    if "denial_reason_1" not in denied.columns:
        return pd.DataFrame()

    def _label(code):
        # Missing codes and codes absent from DENIAL_REASONS both read "Unknown".
        if pd.notna(code):
            return DENIAL_REASONS.get(int(code), "Unknown")
        return "Unknown"

    denied["denial_reason_label"] = denied["denial_reason_1"].map(_label)

    pair_counts = denied.groupby(["derived_race", "denial_reason_label"]).size()
    breakdown = pair_counts.reset_index(name="count")

    race_totals = denied.groupby("derived_race").size().reset_index(name="total")
    breakdown = breakdown.merge(race_totals, on="derived_race")
    breakdown["pct"] = breakdown["count"] / breakdown["total"] * 100

    return breakdown.sort_values(
        ["derived_race", "pct"], ascending=[True, False]
    )
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Geographic analysis of HMDA lending patterns.
|
|
3
|
+
Identifies lending deserts and maps activity by census tract.
|
|
4
|
+
"""
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def lending_by_tract(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize lending activity per census tract.

    Rows are restricted to ``action_taken`` values 1, 2 and 3 before
    aggregating.

    Returns:
        One row per tract with application, denial and origination counts,
        mean loan amount, median income, and the derived denial and
        origination rates, ordered by application count (largest first)

    Raises:
        ValueError: If ``census_tract`` is not a column of ``df``
    """
    if "census_tract" not in df.columns:
        raise ValueError("DataFrame must have 'census_tract' column")

    decided = df.loc[df["action_taken"].isin([1, 2, 3])].copy()

    tract_aggs = {
        "applications": ("is_denied", "count"),
        "denials": ("is_denied", "sum"),
        "originations": ("is_approved", "sum"),
        "avg_loan_amount": ("loan_amount", "mean"),
        "median_income": ("income", "median"),
    }
    summary = decided.groupby("census_tract").agg(**tract_aggs).reset_index()

    app_counts = summary["applications"]
    summary["denial_rate"] = summary["denials"] / app_counts
    summary["origination_rate"] = summary["originations"] / app_counts

    return summary.sort_values("applications", ascending=False)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def lending_by_county(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize lending activity per county.

    Rows are restricted to ``action_taken`` values 1, 2 and 3 before
    aggregating. ``state_code`` in the result is the first two characters of
    ``county_code``.

    Returns:
        One row per county with application, denial and origination counts,
        total and mean loan amount, denial rate and state prefix, ordered by
        application count (largest first)

    Raises:
        ValueError: If ``county_code`` is not a column of ``df``
    """
    if "county_code" not in df.columns:
        raise ValueError("DataFrame must have 'county_code' column")

    decided = df.loc[df["action_taken"].isin([1, 2, 3])].copy()

    county_aggs = {
        "applications": ("is_denied", "count"),
        "denials": ("is_denied", "sum"),
        "originations": ("is_approved", "sum"),
        "total_loan_volume": ("loan_amount", "sum"),
        "avg_loan_amount": ("loan_amount", "mean"),
    }
    summary = decided.groupby("county_code").agg(**county_aggs).reset_index()

    summary["denial_rate"] = summary["denials"] / summary["applications"]
    summary["state_code"] = summary["county_code"].str.slice(stop=2)

    return summary.sort_values("applications", ascending=False)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def lending_desert_score(
    df: pd.DataFrame,
    volume_percentile_cutoff: float = 25,
    denial_rate_cutoff: float = 0.15,
) -> pd.DataFrame:
    """
    Score census tracts on how much they resemble a 'lending desert'.

    Tracts are ranked against each other by application volume within the
    supplied data; no external baseline (such as housing-unit counts) is
    used. The score combines low relative volume (60%) with a high denial
    rate (40%), so it ranges from 0 to 100.

    Args:
        df: Cleaned HMDA LAR DataFrame accepted by lending_by_tract
        volume_percentile_cutoff: A tract is flagged only if its application
            volume percentile is below this value (default: 25)
        denial_rate_cutoff: A tract is flagged only if its denial rate is
            above this fraction (default: 0.15)

    Returns:
        The lending_by_tract table with ``app_percentile``, ``desert_score``
        and ``is_lending_desert`` columns added, sorted by ``desert_score``
        descending
    """
    tract_df = lending_by_tract(df)

    # Percentile rank by application volume (0-100, higher = more applications)
    tract_df["app_percentile"] = (
        tract_df["applications"].rank(pct=True) * 100
    ).round(1)

    # High denial rate + low application volume = potential lending desert:
    # few people apply, and those who do are often turned down.
    tract_df["desert_score"] = (
        (100 - tract_df["app_percentile"]) * 0.6 +
        tract_df["denial_rate"] * 100 * 0.4
    ).round(1)

    tract_df["is_lending_desert"] = (
        (tract_df["app_percentile"] < volume_percentile_cutoff) &
        (tract_df["denial_rate"] > denial_rate_cutoff)
    )

    return tract_df.sort_values("desert_score", ascending=False)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def racial_composition_by_tract(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count applicants and compute the denial rate per (tract, race) pair.

    Unlike the other tract-level summaries, every row of ``df`` is counted;
    no ``action_taken`` filter is applied here.

    Returns:
        DataFrame with ``census_tract``, ``derived_race``, ``applications``
        and ``denial_rate`` columns, ordered by tract and then by descending
        application count. An empty DataFrame is returned when either
        ``derived_race`` or ``census_tract`` is missing.
    """
    has_required = "derived_race" in df.columns and "census_tract" in df.columns
    if not has_required:
        return pd.DataFrame()

    grouped = df.groupby(["census_tract", "derived_race"])
    composition = grouped.agg(
        applications=("is_denied", "count"),
        denial_rate=("is_denied", "mean"),
    ).reset_index()

    return composition.sort_values(
        ["census_tract", "applications"], ascending=[True, False]
    )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def lending_by_state(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize lending activity per state.

    Rows are restricted to ``action_taken`` values 1, 2 and 3 before
    aggregating on ``state_code``.

    Returns:
        One row per state with application, denial and origination counts,
        total loan volume and denial rate, ordered by application count
        (largest first). An empty DataFrame is returned when ``state_code``
        is not a column.
    """
    if "state_code" not in df.columns:
        return pd.DataFrame()

    decided = df.loc[df["action_taken"].isin([1, 2, 3])].copy()

    state_aggs = {
        "applications": ("is_denied", "count"),
        "denials": ("is_denied", "sum"),
        "originations": ("is_approved", "sum"),
        "total_volume": ("loan_amount", "sum"),
    }
    summary = decided.groupby("state_code").agg(**state_aggs).reset_index()

    summary["denial_rate"] = summary["denials"] / summary["applications"]
    return summary.sort_values("applications", ascending=False)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lender-level HMDA analysis.
|
|
3
|
+
Compare a lender's performance against market peers.
|
|
4
|
+
"""
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from hmdaanalyzer.analysis.disparity import denial_rate_by_race, disparity_ratio
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def lender_summary(df: pd.DataFrame, lei: str = None) -> dict:
    """
    Build headline statistics for one lender (or for everything in ``df``).

    Args:
        df: HMDA LAR DataFrame (filtered to lender or full market)
        lei: Lender LEI to narrow to; applied only when given and when
            ``df`` has a ``lei`` column

    Returns:
        Dict of counts, percentage rates, loan-amount and income averages,
        and distinct tract/county counts, computed over rows whose
        ``action_taken`` is 1, 2 or 3. An empty dict is returned when no
        rows remain.
    """
    lender_rows = df
    if lei and "lei" in lender_rows.columns:
        lender_rows = lender_rows[lender_rows["lei"] == lei]

    if lender_rows.empty:
        return {}

    decided = lender_rows[lender_rows["action_taken"].isin([1, 2, 3])]
    total = len(decided)
    if not total:
        return {}

    approved = decided["is_approved"]
    denied = decided["is_denied"]
    loan_amounts = decided["loan_amount"]

    summary = {}
    summary["total_applications"] = total
    summary["originations"] = int(approved.sum())
    summary["denials"] = int(denied.sum())
    # Rates are expressed as percentages, rounded to two decimals.
    summary["approval_rate"] = round(approved.mean() * 100, 2)
    summary["denial_rate"] = round(denied.mean() * 100, 2)
    summary["avg_loan_amount"] = round(loan_amounts.mean(), 0)
    summary["median_loan_amount"] = round(loan_amounts.median(), 0)
    summary["avg_applicant_income"] = round(decided["income"].mean(), 0)
    summary["unique_tracts"] = decided["census_tract"].nunique()
    summary["unique_counties"] = decided["county_code"].nunique()
    return summary
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def lender_vs_market(
    df: pd.DataFrame,
    lei: str,
) -> pd.DataFrame:
    """
    Put a lender's per-race denial rates next to the market's.

    Args:
        df: Full market HMDA LAR DataFrame
        lei: Lender LEI to compare; when ``df`` has no ``lei`` column the
            whole frame is treated as the lender

    Returns:
        The lender's denial-rate table (columns prefixed ``lender_``) with
        ``market_denial_rate``, ``vs_market`` (lender rate minus market rate)
        and ``vs_market_pct`` (the same gap in percentage points, rounded to
        two decimals), sorted by the lender's denial rate descending
    """
    if "lei" in df.columns:
        lender_df = df[df["lei"] == lei]
    else:
        lender_df = df

    lender_names = {
        "denial_rate": "lender_denial_rate",
        "applications": "lender_applications",
        "denials": "lender_denials",
    }
    market_names = {
        "denial_rate": "market_denial_rate",
        "applications": "market_applications",
        "denials": "market_denials",
    }
    lender_rates = denial_rate_by_race(lender_df).rename(columns=lender_names)
    market_rates = denial_rate_by_race(df).rename(columns=market_names)

    # Left join: keep every race the lender reports, even when the market
    # table has no matching row.
    market_slice = market_rates[["derived_race", "market_denial_rate"]]
    comparison = lender_rates.merge(market_slice, on="derived_race", how="left")

    gap = comparison["lender_denial_rate"] - comparison["market_denial_rate"]
    comparison["vs_market"] = gap
    comparison["vs_market_pct"] = (comparison["vs_market"] * 100).round(2)

    return comparison.sort_values("lender_denial_rate", ascending=False)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def top_lenders_by_volume(
    df: pd.DataFrame,
    n: int = 10,
    state: str = None,
) -> pd.DataFrame:
    """
    List the lenders with the most originated loans.

    Args:
        df: HMDA LAR DataFrame
        n: Number of lenders to return
        state: Optional ``state_code`` value to restrict to; ignored when
            ``df`` has no ``state_code`` column

    Returns:
        Up to ``n`` rows with ``lei``, ``originations``, ``total_volume`` and
        ``avg_loan``, ordered by origination count (largest first). An empty
        DataFrame is returned when there is no ``lei`` column.
    """
    scoped = df
    if state and "state_code" in scoped.columns:
        scoped = scoped[scoped["state_code"] == state]

    # action_taken == 1 marks an originated loan.
    originated = scoped[scoped["action_taken"] == 1]

    if "lei" not in originated.columns:
        return pd.DataFrame()

    lender_aggs = {
        "originations": ("loan_amount", "count"),
        "total_volume": ("loan_amount", "sum"),
        "avg_loan": ("loan_amount", "mean"),
    }
    ranking = originated.groupby("lei").agg(**lender_aggs).reset_index()
    ranking = ranking.sort_values("originations", ascending=False)
    return ranking.head(n)
|
|
File without changes
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Load HMDA LAR data from CFPB Data Browser API or local CSV.
|
|
3
|
+
Free public API — no authentication required.
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
import requests
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from hmdaanalyzer.data.schema import (
|
|
10
|
+
HMDA_API_BASE, CACHE_DIR, ACTION_TAKEN,
|
|
11
|
+
APPROVED_ACTIONS, DENIED_ACTIONS,
|
|
12
|
+
RACE_CODES, ETHNICITY_CODES, LOAN_PURPOSE, LOAN_TYPE,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_cache_dir() -> Path:
    """Return the CACHE_DIR path, creating it (and any parents) if absent."""
    cache_path = Path(CACHE_DIR)
    cache_path.mkdir(parents=True, exist_ok=True)
    return cache_path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_from_api(
    year: int = 2023,
    state: str = None,
    lei: str = None,
    county: str = None,
    limit: int = 10_000,
) -> pd.DataFrame:
    """
    Load HMDA LAR data from CFPB Data Browser API.

    This is a best-effort loader: any failure (network, HTTP status, decoding,
    parsing or cleaning) is printed and an empty DataFrame is returned instead
    of raising.

    Args:
        year: Data year e.g. 2023
        state: Two-letter state code e.g. "IL" (upper-cased before sending)
        lei: Lender LEI identifier
        county: County FIPS code e.g. "17031"
        limit: Max records to request; capped at 1,000,000

    Returns:
        Clean pandas DataFrame with standardized columns, or an empty
        DataFrame if the request or parsing failed
    """
    from io import StringIO

    params = {
        "years": year,
        "actions_taken": "1,2,3,4,5",
        "limit": min(limit, 1_000_000),
    }
    if state:
        params["states"] = state.upper()
    if lei:
        params["leis"] = lei
    if county:
        params["counties"] = county

    url = f"{HMDA_API_BASE}/csv"

    try:
        print(f"Fetching HMDA data from CFPB API (year={year})...")
        # A streamed response holds its connection until the body is consumed
        # or the response is closed. The context manager guarantees release
        # even when raise_for_status() fails before the body is read.
        with requests.get(url, params=params, timeout=120, stream=True) as r:
            r.raise_for_status()
            content = r.content.decode("utf-8")

        df = pd.read_csv(StringIO(content), dtype=str, low_memory=False)
        print(f"Loaded {len(df):,} LAR records")
        return _clean(df)

    except Exception as e:
        # Deliberately broad: callers get an empty frame rather than a crash.
        print(f"API error: {e}. Use load_sample() for testing.")
        return pd.DataFrame()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_from_file(path: str) -> pd.DataFrame:
    """
    Read a local HMDA LAR CSV and return it cleaned.

    Every column is read as text; type conversion is left to ``_clean``.
    Progress is reported on stdout.
    """
    print(f"Loading HMDA data from {path}...")
    raw = pd.read_csv(path, dtype=str, low_memory=False)
    record_count = len(raw)
    print(f"Loaded {record_count:,} LAR records")
    return _clean(raw)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_sample(n: int = 5000, seed: int = 42) -> pd.DataFrame:
    """
    Build a reproducible synthetic HMDA LAR table for tests and demos.

    ``n`` records are drawn from a NumPy generator seeded with ``seed``, so
    the same arguments always give the same data. Per the original author's
    note the per-race denial rates are modelled on 2023 national HMDA
    figures. No network access is needed.

    Returns:
        The synthetic records after passing through ``_clean``
    """
    import numpy as np

    rng = np.random.default_rng(seed)

    # NOTE: the generated state_code column holds the FIPS prefix below
    # (e.g. "17"), not the two-letter abbreviation.
    fips_by_state = {
        "IL": "17", "NY": "36", "CA": "06", "TX": "48",
        "GA": "13", "NC": "37", "OH": "39", "PA": "42",
        "FL": "12", "MI": "26",
    }
    states = list(fips_by_state)
    leis = [f"LEI{num:06d}" for num in range(1, 11)]

    # Realistic denial rates by race (based on 2023 HMDA national data)
    race_denial_rates = {
        "White": 0.095,
        "Black or African American": 0.195,
        "Asian": 0.090,
        "Hispanic or Latino": 0.145,
        "American Indian or Alaska Native": 0.175,
        "Native Hawaiian or Other Pacific Islander": 0.160,
    }
    races = list(race_denial_rates)
    race_weights = [0.65, 0.13, 0.07, 0.10, 0.02, 0.03]

    def _pick_action(draw, denial_prob):
        # Denied with the race's probability, withdrawn for the next 5%,
        # originated otherwise.
        if draw < denial_prob:
            return 3
        if draw < denial_prob + 0.05:
            return 4
        return 1

    records = []
    # The order of rng calls below fixes the generated data for a given
    # seed; keep it unchanged.
    for _ in range(n):
        race = rng.choice(races, p=race_weights)
        denial_prob = race_denial_rates[race]

        # Income and loan amount correlated
        income = max(20, rng.normal(85, 45))
        loan_amount = max(50, income * rng.uniform(2.5, 5.5))

        action = _pick_action(rng.random(), denial_prob)

        state = rng.choice(states)
        county_num = rng.integers(1, 200)
        state_fips = fips_by_state[state]
        county_code = f"{state_fips}{county_num:03d}"
        tract_suffix = rng.integers(100000, 999999)
        tract = f"{county_code}{tract_suffix}"

        loan_type = str(rng.choice([1, 1, 1, 2, 3], p=[0.7, 0.1, 0.1, 0.05, 0.05]))
        loan_purpose = str(rng.choice([1, 31, 32, 2], p=[0.5, 0.3, 0.15, 0.05]))
        sex = rng.choice(["Male", "Female", "Joint"], p=[0.45, 0.3, 0.25])

        if action == 3:
            denial_reason = str(rng.choice([1, 3, 4, 9, 10], p=[0.3, 0.25, 0.2, 0.15, 0.1]))
        else:
            denial_reason = "10"

        # Pricing fields are drawn only for originated loans.
        if action == 1:
            interest_rate = str(round(rng.uniform(5.5, 8.5), 2))
            rate_spread = str(round(rng.uniform(-0.5, 2.0), 2))
        else:
            interest_rate = ""
            rate_spread = ""

        lender = rng.choice(leis)

        if race == "Hispanic or Latino":
            ethnicity = "Hispanic or Latino"
        else:
            ethnicity = "Not Hispanic or Latino"

        records.append({
            "action_taken": str(action),
            "loan_type": loan_type,
            "loan_purpose": loan_purpose,
            "loan_amount": str(round(loan_amount)),
            "income": str(round(income)),
            "derived_race": race,
            "derived_ethnicity": ethnicity,
            "derived_sex": sex,
            "census_tract": tract,
            "county_code": county_code,
            "state_code": state_fips,
            "denial_reason_1": denial_reason,
            "interest_rate": interest_rate,
            "rate_spread": rate_spread,
            "lei": lender,
            "activity_year": "2023",
        })

    sample = pd.DataFrame(records)
    return _clean(sample)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize a raw HMDA LAR frame into the shape the analyses expect.

    Column names are lower-cased and stripped, known numeric columns are
    coerced (unparseable values become missing), ``is_approved`` and
    ``is_denied`` flags are derived from ``action_taken``, and
    ``derived_race`` is filled in from ``applicant_race_1`` when it is
    absent. The input frame's columns are modified in place; the returned
    frame has a fresh 0..n-1 index.
    """
    df.columns = df.columns.str.lower().str.strip()

    float_like = ("loan_amount", "income", "interest_rate", "rate_spread")
    for name in [c for c in float_like if c in df.columns]:
        df[name] = pd.to_numeric(df[name], errors="coerce")

    code_like = ("action_taken", "loan_type", "loan_purpose",
                 "denial_reason_1", "denial_reason_2")
    for name in [c for c in code_like if c in df.columns]:
        # Nullable Int64 keeps missing codes as <NA> instead of forcing floats.
        as_number = pd.to_numeric(df[name], errors="coerce")
        df[name] = as_number.astype("Int64")

    if "action_taken" in df.columns:
        actions = df["action_taken"]
        df["is_approved"] = actions.isin(APPROVED_ACTIONS)
        df["is_denied"] = actions.isin(DENIED_ACTIONS)

    def _race_label(code):
        # Missing codes and codes absent from RACE_CODES both read "Unknown".
        if pd.notna(code):
            return RACE_CODES.get(int(code), "Unknown")
        return "Unknown"

    needs_race = "derived_race" not in df.columns
    if needs_race and "applicant_race_1" in df.columns:
        df["derived_race"] = df["applicant_race_1"].map(_race_label)

    return df.reset_index(drop=True)
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants and field mappings for HMDA LAR data analysis.
|
|
3
|
+
Based on 2024 HMDA Filing Instruction Guide (FIG) and CFPB Data Browser API.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os

# ── CFPB HMDA Data Browser API ────────────────────────────────────────────────
HMDA_API_BASE = "https://ffiec.cfpb.gov/v2/data-browser-api/view"
HMDA_AGG_BASE = "https://ffiec.cfpb.gov/v2/data-browser-api/view/aggregations"

# ── Action Taken Codes ────────────────────────────────────────────────────────
ACTION_TAKEN = {
    1: "Loan originated",
    2: "Application approved but not accepted",
    3: "Application denied",
    4: "Application withdrawn by applicant",
    5: "File closed for incompleteness",
    6: "Purchased loan",
    7: "Preapproval request denied",
    8: "Preapproval request approved but not accepted",
}

# Groupings of the action codes above used to flag each record's outcome.
APPROVED_ACTIONS = {1, 2, 8}
DENIED_ACTIONS = {3, 7}
WITHDRAWN_ACTIONS = {4, 5}

# ── Race Codes ────────────────────────────────────────────────────────────────
# Top-level codes only; the detailed sub-codes (21-27, 41-44) are not listed.
RACE_CODES = {
    1: "American Indian or Alaska Native",
    2: "Asian",
    3: "Black or African American",
    4: "Native Hawaiian or Other Pacific Islander",
    5: "White",
    # Per the HMDA FIG, 6 is "information not provided" and 7 is
    # "not applicable" (these two were previously swapped).
    6: "Information not provided",
    7: "Not applicable",
}

# ── Ethnicity Codes ───────────────────────────────────────────────────────────
ETHNICITY_CODES = {
    1: "Hispanic or Latino",
    2: "Not Hispanic or Latino",
    3: "Information not provided",
    4: "Not applicable",
}

# ── Sex Codes ─────────────────────────────────────────────────────────────────
SEX_CODES = {
    1: "Male",
    2: "Female",
    3: "Information not provided",
    4: "Not applicable",
    6: "Both male and female",
}

# ── Loan Type Codes ───────────────────────────────────────────────────────────
LOAN_TYPE = {
    1: "Conventional",
    2: "FHA",
    3: "VA",
    4: "RHS/FSA",
}

# ── Loan Purpose Codes ────────────────────────────────────────────────────────
LOAN_PURPOSE = {
    1: "Home purchase",
    2: "Home improvement",
    31: "Refinancing",
    32: "Cash-out refinancing",
    4: "Other purpose",
    5: "Not applicable",
}

# ── Denial Reason Codes ───────────────────────────────────────────────────────
DENIAL_REASONS = {
    1: "Debt-to-income ratio",
    2: "Employment history",
    3: "Credit history",
    4: "Collateral",
    5: "Insufficient cash (downpayment, closing costs)",
    6: "Unverifiable information",
    7: "Credit application incomplete",
    8: "Mortgage insurance denied",
    9: "Other",
    10: "Not applicable",
}

# ── Key LAR Fields We Use ─────────────────────────────────────────────────────
LAR_FIELDS = [
    "action_taken",
    "loan_type",
    "loan_purpose",
    "loan_amount",
    "income",
    "applicant_race_1",
    "applicant_ethnicity_1",
    "applicant_sex",
    "derived_race",
    "derived_ethnicity",
    "derived_sex",
    "census_tract",
    "county_code",
    "state_code",
    "denial_reason_1",
    "denial_reason_2",
    "interest_rate",
    "rate_spread",
    "hoepa_status",
    "lien_status",
    "lei",
    "activity_year",
]

# ── Cache Directory ───────────────────────────────────────────────────────────
# Per-user cache location under the home directory.
CACHE_DIR = os.path.join(os.path.expanduser("~"), ".hmdaanalyzer", "cache")

# ── Disparity Thresholds ──────────────────────────────────────────────────────
# Lower bounds on the denial-rate ratio (group rate / reference-group rate).
DISPARITY_THRESHOLDS = {
    "high": 2.0,      # Denial rate ratio >= 2.0x = high disparity
    "moderate": 1.5,  # Denial rate ratio >= 1.5x = moderate disparity
    "low": 1.0,       # Denial rate ratio >= 1.0x = low disparity; below 1.0x = no disparity
}

# ── Reference Group for Disparity ────────────────────────────────────────────
REFERENCE_RACE = "White"
|
|
File without changes
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generate HMDA analysis reports.
|
|
3
|
+
"""
|
|
4
|
+
from typing import Optional

import pandas as pd

from hmdaanalyzer.analysis.disparity import (
    denial_rate_by_race, disparity_ratio, denial_rate_by_income_band
)
from hmdaanalyzer.analysis.geographic import (
    lending_by_state, lending_by_county, lending_desert_score
)
from hmdaanalyzer.analysis.lender import lender_summary, lender_vs_market
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def generate_disparity_report(
    df: pd.DataFrame,
    title: str = "HMDA Disparity Analysis",
    lei: Optional[str] = None,
) -> str:
    """
    Generate a full HMDA disparity analysis report as Markdown.

    The report contains a summary header followed by four sections: denial
    rate by race, disparity ratios against White applicants, denial rate by
    income band, and key findings (groups whose disparity level is HIGH).

    Every section is best-effort: if its computation raises, an error line is
    written in place of the rows and the remaining sections are still rendered.

    Args:
        df: LAR records to analyze.
        title: Sub-heading printed under the report title.
        lei: When given and ``df`` has a ``lei`` column, restrict the analysis
            to rows with this LEI; otherwise all rows are used.

    Returns:
        The Markdown document as one newline-joined string.
    """
    if lei and "lei" in df.columns:
        analysis_df = df[df["lei"] == lei]
        scope = f"Lender: {lei}"
    else:
        analysis_df = df
        scope = "All Lenders"

    total = len(analysis_df)

    # Only the count of originated/approved/denied records is reported.
    if "action_taken" in analysis_df.columns:
        actionable_count = int(analysis_df["action_taken"].isin([1, 2, 3]).sum())
    else:
        actionable_count = 0

    # An unknown LEI leaves an empty frame, where .iloc[0] would raise.
    if total and "activity_year" in analysis_df.columns:
        year = analysis_df["activity_year"].iloc[0]
    else:
        year = "N/A"

    lines = [
        "# HMDA Lending Disparity Analysis Report",
        f"## {title}",
        "",
        f"**Scope:** {scope}",
        f"**Year:** {year}",
        f"**Total Records:** {total:,}",
        f"**Actionable Applications:** {actionable_count:,}",
        "",
        "---",
        "",
        "## Denial Rate by Race",
        "",
        "| Race/Ethnicity | Applications | Denials | Denial Rate |",
        "|----------------|-------------|---------|-------------|",
    ]

    try:
        rates = denial_rate_by_race(analysis_df)
        for _, row in rates.iterrows():
            lines.append(
                f"| {row['derived_race']} | {row['applications']:,} | "
                f"{int(row['denials']):,} | {row['denial_rate']*100:.1f}% |"
            )
    except Exception as e:
        lines.append(f"| Error computing denial rates: {e} |")

    lines += [
        "",
        "---",
        "",
        "## Disparity Ratios (vs White Applicants)",
        "",
        "A disparity ratio >= 2.0 indicates HIGH disparity (CFPB threshold).",
        "A disparity ratio >= 1.5 indicates MODERATE disparity.",
        "",
        "| Race/Ethnicity | Denial Rate | Reference Rate | Disparity Ratio | Level |",
        "|----------------|-------------|----------------|-----------------|-------|",
    ]

    level_labels = {
        "HIGH": "🔴 HIGH",
        "MODERATE": "🟡 MODERATE",
        "LOW": "🟢 LOW",
        "FAVORABLE": "✅ FAVORABLE",
        "N/A": "—",
    }

    # Computed once and reused by the Key Findings section below.
    disp = None
    try:
        disp = disparity_ratio(analysis_df)
        for _, row in disp.iterrows():
            if row["derived_race"] == "White":
                continue
            if pd.notna(row.get("disparity_ratio")):
                ratio = f"{row['disparity_ratio']:.2f}x"
            else:
                ratio = "N/A"
            level_emoji = level_labels.get(row.get("disparity_level", "N/A"), "—")

            lines.append(
                f"| {row['derived_race']} | "
                f"{row['denial_rate']*100:.1f}% | "
                f"{row['reference_denial_rate']*100:.1f}% | "
                f"{ratio} | {level_emoji} |"
            )
    except Exception as e:
        lines.append(f"| Error: {e} |")

    lines += [
        "",
        "---",
        "",
        "## Denial Rate by Income Band",
        "",
        "| Income Band | Applications | Denial Rate |",
        "|-------------|-------------|-------------|",
    ]

    try:
        income_df = denial_rate_by_income_band(analysis_df)
        for _, row in income_df.iterrows():
            lines.append(
                f"| {row['income_band']} | {row['applications']:,} | "
                f"{row['denial_rate']*100:.1f}% |"
            )
    except Exception as e:
        lines.append(f"| Error: {e} |")

    lines += [
        "",
        "---",
        "",
        "## Key Findings",
        "",
    ]

    # If the ratios could not be computed, the failure is already reported in
    # the Disparity Ratios table, so this section is simply left empty.
    if disp is not None and "disparity_level" in disp.columns:
        try:
            # Drop the reference group up front so the heading is never
            # printed without at least one bullet under it.
            high = disp[
                (disp["disparity_level"] == "HIGH")
                & (disp["derived_race"] != "White")
            ]
            if not high.empty:
                lines.append("**High Disparity Groups:**")
                for _, row in high.iterrows():
                    lines.append(
                        f"- {row['derived_race']}: "
                        f"{row['disparity_ratio']:.1f}x denial rate vs White applicants"
                    )
                lines.append("")
        except Exception as e:
            lines.append(f"Error computing key findings: {e}")

    return "\n".join(lines)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def summary_table(df: pd.DataFrame) -> pd.DataFrame:
    """Return the disparity-ratio table, or plain denial rates if that fails.

    Any exception raised by ``disparity_ratio`` triggers the fallback to
    ``denial_rate_by_race``; an exception from the fallback propagates.
    """
    try:
        table = disparity_ratio(df)
    except Exception:
        table = denial_rate_by_race(df)
    return table
|
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
tests/test_disparity.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from hmdaanalyzer.analysis.disparity import (
|
|
4
|
+
denial_rate_by_race, disparity_ratio,
|
|
5
|
+
denial_rate_by_income_band, denial_reasons_by_race,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_denial_rate_by_race_returns_df(sample_df):
    """denial_rate_by_race returns a DataFrame with rate and race columns."""
    rates = denial_rate_by_race(sample_df)
    assert isinstance(rates, pd.DataFrame)
    for column in ("denial_rate", "derived_race"):
        assert column in rates.columns
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_denial_rate_by_race_values_valid(sample_df):
    """Every per-race denial rate lies in the closed interval [0, 1]."""
    denial_rates = denial_rate_by_race(sample_df)["denial_rate"]
    assert (denial_rates >= 0).all()
    assert (denial_rates <= 1).all()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_disparity_ratio_returns_df(sample_df):
    """disparity_ratio returns a DataFrame with ratio and level columns."""
    table = disparity_ratio(sample_df)
    assert isinstance(table, pd.DataFrame)
    for column in ("disparity_ratio", "disparity_level"):
        assert column in table.columns
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_disparity_ratio_white_is_reference(sample_df):
    """With White as reference there is one White row, with a ratio of ~1.0."""
    table = disparity_ratio(sample_df, reference="White")
    reference_rows = table[table["derived_race"] == "White"]
    assert len(reference_rows) == 1
    reference_ratio = reference_rows["disparity_ratio"].iloc[0]
    assert abs(reference_ratio - 1.0) < 0.01
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_disparity_levels_valid(sample_df):
    """Only the five known disparity level labels appear in the output."""
    allowed_levels = {"HIGH", "MODERATE", "LOW", "FAVORABLE", "N/A"}
    observed_levels = set(disparity_ratio(sample_df)["disparity_level"].unique())
    assert observed_levels <= allowed_levels
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_black_disparity_high(sample_df):
    """If a Black or African American row exists, its ratio exceeds 1.0."""
    table = disparity_ratio(sample_df)
    black_rows = table[table["derived_race"] == "Black or African American"]
    # Nothing to check when the group is absent from the sample.
    if len(black_rows) == 0:
        return
    assert black_rows["disparity_ratio"].iloc[0] > 1.0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_denial_rate_by_income_band(sample_df):
    """The income-band table is a non-empty DataFrame with a denial_rate column."""
    bands = denial_rate_by_income_band(sample_df)
    assert isinstance(bands, pd.DataFrame)
    assert "denial_rate" in bands.columns
    assert len(bands) > 0
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_denial_reasons_by_race(sample_df):
    """denial_reasons_by_race returns a DataFrame."""
    reasons = denial_reasons_by_race(sample_df)
    assert isinstance(reasons, pd.DataFrame)
|
tests/test_geographic.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from hmdaanalyzer.analysis.geographic import (
|
|
4
|
+
lending_by_tract, lending_by_county, lending_by_state,
|
|
5
|
+
lending_desert_score,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_lending_by_tract_returns_df(sample_df):
    """lending_by_tract returns a DataFrame with rate and tract columns."""
    by_tract = lending_by_tract(sample_df)
    assert isinstance(by_tract, pd.DataFrame)
    for column in ("denial_rate", "census_tract"):
        assert column in by_tract.columns
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_lending_by_county_returns_df(sample_df):
    """lending_by_county returns a DataFrame that includes denial_rate."""
    by_county = lending_by_county(sample_df)
    assert isinstance(by_county, pd.DataFrame)
    assert "denial_rate" in by_county.columns
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_lending_by_state_returns_df(sample_df):
    """lending_by_state returns a non-empty DataFrame."""
    by_state = lending_by_state(sample_df)
    assert isinstance(by_state, pd.DataFrame)
    assert len(by_state) > 0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_lending_desert_score_returns_df(sample_df):
    """lending_desert_score returns a DataFrame with score and flag columns."""
    scores = lending_desert_score(sample_df)
    assert isinstance(scores, pd.DataFrame)
    for column in ("desert_score", "is_lending_desert"):
        assert column in scores.columns
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_denial_rates_between_0_and_1(sample_df):
    """Every tract-level denial rate lies in the closed interval [0, 1]."""
    tract_rates = lending_by_tract(sample_df)["denial_rate"]
    assert (tract_rates >= 0).all()
    assert (tract_rates <= 1).all()
|
tests/test_lender.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from hmdaanalyzer.analysis.lender import (
|
|
4
|
+
lender_summary, lender_vs_market, top_lenders_by_volume,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_lender_summary_returns_dict(sample_df):
    """lender_summary returns a dict with application count and denial rate."""
    summary = lender_summary(sample_df)
    assert isinstance(summary, dict)
    for key in ("total_applications", "denial_rate"):
        assert key in summary
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_lender_summary_denial_rate_valid(sample_df):
    """The summary's denial_rate lies between 0 and 100 inclusive."""
    rate = lender_summary(sample_df)["denial_rate"]
    assert 0 <= rate <= 100
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_lender_vs_market_returns_df(sample_df):
    """lender_vs_market for the first row's LEI has lender and market rate columns."""
    first_lei = sample_df["lei"].iloc[0]
    comparison = lender_vs_market(sample_df, first_lei)
    assert isinstance(comparison, pd.DataFrame)
    for column in ("lender_denial_rate", "market_denial_rate"):
        assert column in comparison.columns
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_top_lenders_by_volume_returns_df(sample_df):
    """top_lenders_by_volume(n=5) returns a DataFrame of at most five rows."""
    limit = 5
    top = top_lenders_by_volume(sample_df, n=limit)
    assert isinstance(top, pd.DataFrame)
    assert len(top) <= limit
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_top_lenders_sorted_by_volume(sample_df):
    """The first lender has at least as many originations as the second."""
    top = top_lenders_by_volume(sample_df, n=10)
    # The ordering check needs at least two rows.
    if len(top) <= 1:
        return
    originations = top["originations"]
    assert originations.iloc[0] >= originations.iloc[1]
|
tests/test_loader.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from hmdaanalyzer.data.loader import load_sample
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_load_sample_returns_dataframe():
    """load_sample(n=100) returns a DataFrame with exactly 100 rows."""
    sample = load_sample(n=100)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == 100
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_load_sample_has_required_columns():
    """The sample frame carries the raw and derived columns the analyses need."""
    sample = load_sample(n=100)
    expected_columns = (
        "action_taken", "derived_race", "loan_amount",
        "income", "is_denied", "is_approved",
    )
    for name in expected_columns:
        assert name in sample.columns
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_load_sample_action_taken_valid():
    """Every non-null action_taken value in the sample is one of codes 1-5."""
    allowed_codes = {1, 2, 3, 4, 5}
    observed = load_sample(n=500)["action_taken"].dropna()
    assert observed.isin(allowed_codes).all()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_load_sample_is_denied_bool():
    """is_denied has a boolean dtype, either NumPy bool or pandas nullable boolean."""
    sample = load_sample(n=500)
    dtype_name = str(sample["is_denied"].dtype)
    assert dtype_name in ("bool", "boolean")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_load_sample_denial_rate_realistic():
    """Among action codes 1-3, the denial share falls strictly between 5% and 35%."""
    sample = load_sample(n=2000)
    decided = sample[sample["action_taken"].isin([1, 2, 3])]
    denial_share = decided["is_denied"].mean()
    assert 0.05 < denial_share < 0.35
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_load_sample_race_disparity():
    """Among action codes 1-3, Black applicants are denied more often than White."""
    sample = load_sample(n=3000)
    decided = sample[sample["action_taken"].isin([1, 2, 3])]

    def denial_share(race):
        return decided[decided["derived_race"] == race]["is_denied"].mean()

    assert denial_share("Black or African American") > denial_share("White")
|
tests/test_report.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from hmdaanalyzer.report.generator import generate_disparity_report, summary_table
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_generate_report_returns_string(sample_df):
    """The generated report is a string longer than 100 characters."""
    markdown = generate_disparity_report(sample_df)
    assert isinstance(markdown, str)
    assert len(markdown) > 100
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_report_contains_sections(sample_df):
    """The report mentions the race, disparity-ratio and income-band sections."""
    markdown = generate_disparity_report(sample_df)
    for heading in ("Denial Rate by Race", "Disparity Ratios", "Income Band"):
        assert heading in markdown
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_report_contains_black_disparity(sample_df):
    """The report text names the Black or African American group."""
    markdown = generate_disparity_report(sample_df)
    assert "Black or African American" in markdown
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_summary_table_returns_df(sample_df):
    """summary_table returns a non-empty DataFrame."""
    table = summary_table(sample_df)
    assert isinstance(table, pd.DataFrame)
    assert len(table) > 0
|