hmda-analyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.4
2
+ Name: hmda-analyzer
3
+ Version: 0.1.0
4
+ Summary: HMDA mortgage lending disparity analyzer — denial rates, racial disparities, lending deserts, and lender benchmarking
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/Jaypatel1511/hmda-analyzer
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pandas>=1.4.0
10
+ Requires-Dist: numpy>=1.21.0
11
+ Requires-Dist: requests>=2.27.0
12
+
13
+ # hmda-analyzer 📊
14
+
15
+ **HMDA mortgage lending disparity analyzer.**
16
+
17
+ Compute denial rate disparities by race, identify lending deserts, benchmark lenders
18
+ against peers, and generate fair lending analysis reports — using CFPB HMDA LAR data.
19
+ Free public API, no authentication required.
20
+
21
+ ---
22
+
23
+ ## Why hmda-analyzer?
24
+
25
+ HMDA data covers 10+ million mortgage applications per year with borrower demographics,
26
+ denial rates, loan amounts, and census tract locations. It is the most powerful public
27
+ dataset for analyzing mortgage lending disparities — but it requires significant
28
+ engineering to use. hmda-analyzer makes it accessible in Python.
29
+
30
+ ---
31
+
32
+ ## Installation
33
+
34
+ pip install hmda-analyzer
35
+
36
+ ---
37
+
38
+ ## Quickstart
39
+
40
+ from hmdaanalyzer import (
41
+ load_sample, denial_rate_by_race, disparity_ratio,
42
+ lending_by_tract, lender_summary, generate_disparity_report,
43
+ )
44
+
45
+ # Load sample data (no API required)
46
+ df = load_sample(n=5000)
47
+
48
+ # Or load from CFPB API (real data)
49
+ # df = load_from_api(year=2023, state="IL")
50
+
51
+ # Denial rates by race
52
+ rates = denial_rate_by_race(df)
53
+ print(rates)
54
+
55
+ # Disparity ratios vs White applicants
56
+ disparities = disparity_ratio(df)
57
+ print(disparities)
58
+
59
+ # Geographic analysis
60
+ tracts = lending_by_tract(df)
61
+ deserts = lending_desert_score(df)  # from hmdaanalyzer import lending_desert_score
62
+
63
+ # Lender analysis
64
+ summary = lender_summary(df, lei="LEI000001")
65
+
66
+ # Full disparity report
67
+ report = generate_disparity_report(df, title="Illinois Mortgage Market 2023")
68
+ print(report)
69
+
70
+ ---
71
+
72
+ ## Analyses Supported
73
+
74
+ - Denial rate by race and ethnicity
75
+ - Disparity ratios vs reference group (default: White applicants)
76
+ - Denial rate by income band
77
+ - Denial reasons by race
78
+ - Lending activity by census tract, county, and state
79
+ - Lending desert identification (low application volume tracts)
80
+ - Lender vs market comparison
81
+ - Top lenders by origination volume
82
+
83
+ ---
84
+
85
+ ## Disparity Ratio Thresholds
86
+
87
+ Based on CFPB fair lending examination standards:
88
+
89
+ - >= 2.0x — HIGH disparity (triggers regulatory scrutiny)
90
+ - >= 1.5x — MODERATE disparity
91
+ - 1.0x to < 1.5x — LOW disparity
92
+ - < 1.0x — FAVORABLE (group has lower denial rate than reference)
93
+
94
+ ---
95
+
96
+ ## Data Sources
97
+
98
+ CFPB HMDA Data Browser API — free, no API key required.
99
+ 2024 data covers 4,908 institutions and millions of loan applications.
100
+
101
+ https://ffiec.cfpb.gov/data-browser/
102
+
103
+ ---
104
+
105
+ ## Running Tests
106
+
107
+ PYTHONPATH=. pytest tests/ -v
108
+
109
+ 28 tests across all modules.
110
+
111
+ ---
112
+
113
+ ## Who This Is For
114
+
115
+ - Fair lending analysts and compliance teams at banks and CDFIs
116
+ - Community reinvestment researchers studying mortgage disparities
117
+ - Journalists covering housing discrimination and redlining
118
+ - Regulators and examiners analyzing lender performance
119
+ - Academics studying racial wealth gaps and homeownership barriers
120
+
121
+ ---
122
+
123
+ ## License
124
+
125
+ MIT 2026 Jaypatel1511
@@ -0,0 +1,21 @@
1
+ hmdaanalyzer/__init__.py,sha256=TJuAUuk9uX7UgW8pwqvTMF27bFbjqQL5WJCkRoWqrhs,1034
2
+ hmdaanalyzer/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ hmdaanalyzer/analysis/disparity.py,sha256=0kxcvw3SIWP_SAnPGGD0bC7tb3aQTkJFNVHZO3byWW8,4165
4
+ hmdaanalyzer/analysis/geographic.py,sha256=J5Mfd6mnAtO-wUCmLbFpyJqecZEuYC2aJ7g7YdmcKhY,4140
5
+ hmdaanalyzer/analysis/lender.py,sha256=k5DYRIwLuCdn0TmhrahTWWIM898uCpWN99yNA5CJZ68,3336
6
+ hmdaanalyzer/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ hmdaanalyzer/data/loader.py,sha256=XFx3_VeGfczfTCit1rkz2dVkfvGZ5ArVsCFM8bWpT_E,6219
8
+ hmdaanalyzer/data/schema.py,sha256=3Voi5ZRk5LtZiyJcn4J_Q9u4s-QB1fxL863PXMYzMIw,4991
9
+ hmdaanalyzer/report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ hmdaanalyzer/report/generator.py,sha256=ogefRB-5FgMUDyIYcjZIEkZEZspbWGXzfPAwJx2T7YI,4633
11
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ tests/conftest.py,sha256=PksNnhkWPOyqsMP7Au4HVa7LAQsAS5g0Etq7BjuIeOE,211
13
+ tests/test_disparity.py,sha256=OUUmGwtsSB7H4YBKgNGHjfgecIUZaV7KTSZO1NGw2Q8,1901
14
+ tests/test_geographic.py,sha256=_Pj5LB2PRTwVbfX6t-xxotNzS-m5eF2CEpWW8ZQ3Iow,1157
15
+ tests/test_lender.py,sha256=BStl8s1un1rENvZKzXFxebtJSTLBxrxnapeDCSYzRNg,1146
16
+ tests/test_loader.py,sha256=LoBfXJmh64LFSc3qb6QFOKhckk4EW0R0qMSujQfaW2U,1329
17
+ tests/test_report.py,sha256=HMbj707Jb7wfVdjhyZU9gtKJlmhLDiZ-yIMQ5gf0XAU,800
18
+ hmda_analyzer-0.1.0.dist-info/METADATA,sha256=fdFw3qoUWOW6s4EbvQI8kBL-J8XzxXmJNR3MHgqjZic,3251
19
+ hmda_analyzer-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
+ hmda_analyzer-0.1.0.dist-info/top_level.txt,sha256=co7d3qOb1t3FdKP69KBrETmXneiJ3zDhY9WijVeQ0cs,19
21
+ hmda_analyzer-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ hmdaanalyzer
2
+ tests
@@ -0,0 +1,28 @@
1
+ from hmdaanalyzer.data.loader import (
2
+ load_from_api, load_from_file, load_sample,
3
+ )
4
+ from hmdaanalyzer.analysis.disparity import (
5
+ denial_rate_by_race, disparity_ratio,
6
+ denial_rate_by_income_band, denial_reasons_by_race,
7
+ )
8
+ from hmdaanalyzer.analysis.geographic import (
9
+ lending_by_tract, lending_by_county, lending_by_state,
10
+ lending_desert_score, racial_composition_by_tract,
11
+ )
12
+ from hmdaanalyzer.analysis.lender import (
13
+ lender_summary, lender_vs_market, top_lenders_by_volume,
14
+ )
15
+ from hmdaanalyzer.report.generator import (
16
+ generate_disparity_report, summary_table,
17
+ )
18
+
19
+ __version__ = "0.1.0"
20
+ __all__ = [
21
+ "load_from_api", "load_from_file", "load_sample",
22
+ "denial_rate_by_race", "disparity_ratio",
23
+ "denial_rate_by_income_band", "denial_reasons_by_race",
24
+ "lending_by_tract", "lending_by_county", "lending_by_state",
25
+ "lending_desert_score", "racial_composition_by_tract",
26
+ "lender_summary", "lender_vs_market", "top_lenders_by_volume",
27
+ "generate_disparity_report", "summary_table",
28
+ ]
File without changes
@@ -0,0 +1,128 @@
1
+ """
2
+ Denial rate disparity analysis.
3
+ Computes disparate impact ratios between racial/ethnic groups.
4
+ """
5
+ import pandas as pd
6
+ import numpy as np
7
+ from hmdaanalyzer.data.schema import DISPARITY_THRESHOLDS, REFERENCE_RACE
8
+
9
+
10
def denial_rate_by_race(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute denial rates by race for a HMDA LAR DataFrame.

    Only records whose action_taken is 1, 2 or 3 are counted. Race groups
    with fewer than 5 such records are dropped from the result.

    Args:
        df: Cleaned HMDA LAR DataFrame with derived_race, is_denied and
            action_taken columns

    Returns:
        DataFrame with columns derived_race, applications, denials and
        denial_rate, sorted by denial_rate in descending order

    Raises:
        ValueError: If a required column is missing
    """
    if "derived_race" not in df.columns or "is_denied" not in df.columns:
        raise ValueError("DataFrame must have 'derived_race' and 'is_denied' columns")
    # action_taken is indexed below; report its absence the same way as the
    # other required columns instead of leaking a bare KeyError.
    if "action_taken" not in df.columns:
        raise ValueError("DataFrame must have 'action_taken' column")

    actionable = df[df["action_taken"].isin([1, 2, 3])]

    result = actionable.groupby("derived_race").agg(
        applications=("is_denied", "count"),
        denials=("is_denied", "sum"),
    ).reset_index()

    result["denial_rate"] = result["denials"] / result["applications"]
    # Suppress very small groups, whose rates are too noisy to compare.
    result = result[result["applications"] >= 5]
    result = result.sort_values("denial_rate", ascending=False)

    return result
35
+
36
+
37
def disparity_ratio(df: pd.DataFrame, reference: str = None) -> pd.DataFrame:
    """
    Compute disparity ratios relative to a reference group.

    Disparity ratio = group denial rate / reference group denial rate.
    Each ratio is labelled HIGH (>= the "high" threshold), MODERATE
    (>= the "moderate" threshold), FAVORABLE (< 1.0), LOW (otherwise) or
    N/A (ratio undefined), using DISPARITY_THRESHOLDS.

    Args:
        df: Cleaned HMDA LAR DataFrame
        reference: Reference race group; falls back to REFERENCE_RACE when
            omitted or empty

    Returns:
        The denial_rate_by_race table with reference_group,
        reference_denial_rate, disparity_ratio and disparity_level columns
        added, sorted by disparity_ratio in descending order

    Raises:
        ValueError: If the reference group is absent from the denial-rate
            table (which also happens when it was dropped for having too
            few applications)
    """
    reference = reference or REFERENCE_RACE
    denial_rates = denial_rate_by_race(df)

    ref_row = denial_rates[denial_rates["derived_race"] == reference]
    if ref_row.empty:
        raise ValueError(f"Reference group '{reference}' not found in data.")

    ref_rate = ref_row["denial_rate"].iloc[0]

    result = denial_rates.copy()
    result["reference_group"] = reference
    result["reference_denial_rate"] = ref_rate
    if ref_rate > 0:
        result["disparity_ratio"] = result["denial_rate"] / ref_rate
    else:
        # A zero reference rate makes the ratio undefined. NaN keeps the
        # column numeric, whereas None would turn it into object dtype.
        result["disparity_ratio"] = float("nan")

    def classify(ratio):
        if ratio is None or pd.isna(ratio):
            return "N/A"
        if ratio >= DISPARITY_THRESHOLDS["high"]:
            return "HIGH"
        if ratio >= DISPARITY_THRESHOLDS["moderate"]:
            return "MODERATE"
        if ratio < 1.0:
            return "FAVORABLE"
        return "LOW"

    result["disparity_level"] = result["disparity_ratio"].apply(classify)
    result = result.sort_values("disparity_ratio", ascending=False)

    return result
80
+
81
+
82
def denial_rate_by_income_band(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute denial rates by income band to identify income-based disparities.

    Bands are left-closed so that each label matches its contents: an income
    of exactly 50 falls in "$50-80k", and zero or negative incomes fall in
    "<$50k" instead of being dropped. Rows with a missing income are excluded.
    Only records whose action_taken is 1, 2 or 3 are counted.

    Args:
        df: Cleaned HMDA LAR DataFrame with income, action_taken and
            is_denied columns

    Returns:
        DataFrame with columns income_band, applications, denials and
        denial_rate; bands with no records are omitted
    """
    income_band = pd.cut(
        df["income"],
        bins=[float("-inf"), 50, 80, 120, 200, float("inf")],
        labels=["<$50k", "$50-80k", "$80-120k", "$120-200k", "$200k+"],
        right=False,  # [low, high): the lower edge belongs to the band
    )
    # assign() returns a new frame, so the caller's DataFrame is untouched.
    banded = df.assign(income_band=income_band)

    actionable = banded[banded["action_taken"].isin([1, 2, 3])]

    result = actionable.groupby("income_band", observed=True).agg(
        applications=("is_denied", "count"),
        denials=("is_denied", "sum"),
    ).reset_index()

    result["denial_rate"] = result["denials"] / result["applications"]
    return result
102
+
103
+
104
def denial_reasons_by_race(df: pd.DataFrame) -> pd.DataFrame:
    """
    Break down the primary denial reason of denied applications by race.

    Args:
        df: HMDA LAR DataFrame with is_denied and derived_race columns

    Returns:
        DataFrame with derived_race, denial_reason_label, count, total
        (denied records for that race) and pct (count / total * 100),
        sorted by race and then by descending pct. An empty DataFrame is
        returned when there is no denial_reason_1 column.
    """
    from hmdaanalyzer.data.schema import DENIAL_REASONS

    def _reason_label(code):
        # Missing codes and codes outside the lookup both read "Unknown".
        if pd.notna(code):
            return DENIAL_REASONS.get(int(code), "Unknown")
        return "Unknown"

    denied_rows = df[df["is_denied"] == True].copy()
    if "denial_reason_1" not in denied_rows.columns:
        return pd.DataFrame()

    denied_rows["denial_reason_label"] = denied_rows["denial_reason_1"].map(_reason_label)

    per_reason = (
        denied_rows
        .groupby(["derived_race", "denial_reason_label"])
        .size()
        .reset_index(name="count")
    )
    per_race = denied_rows.groupby("derived_race").size().reset_index(name="total")

    breakdown = per_reason.merge(per_race, on="derived_race")
    breakdown["pct"] = breakdown["count"] / breakdown["total"] * 100

    return breakdown.sort_values(["derived_race", "pct"], ascending=[True, False])
@@ -0,0 +1,125 @@
1
+ """
2
+ Geographic analysis of HMDA lending patterns.
3
+ Identifies lending deserts and maps activity by census tract.
4
+ """
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+
9
def lending_by_tract(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise lending activity per census tract.

    Only records whose action_taken is 1, 2 or 3 are counted.

    Returns:
        DataFrame with one row per census_tract holding applications,
        denials, originations, avg_loan_amount, median_income, denial_rate
        and origination_rate, sorted by applications in descending order

    Raises:
        ValueError: If the census_tract column is missing
    """
    if "census_tract" not in df.columns:
        raise ValueError("DataFrame must have 'census_tract' column")

    is_actionable = df["action_taken"].isin([1, 2, 3])
    scoped = df[is_actionable].copy()

    metrics = {
        "applications": ("is_denied", "count"),
        "denials": ("is_denied", "sum"),
        "originations": ("is_approved", "sum"),
        "avg_loan_amount": ("loan_amount", "mean"),
        "median_income": ("income", "median"),
    }
    by_tract = scoped.groupby("census_tract").agg(**metrics).reset_index()

    by_tract = by_tract.assign(
        denial_rate=lambda t: t["denials"] / t["applications"],
        origination_rate=lambda t: t["originations"] / t["applications"],
    )

    return by_tract.sort_values("applications", ascending=False)
33
+
34
+
35
def lending_by_county(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise lending activity per county.

    Only records whose action_taken is 1, 2 or 3 are counted. state_code
    is taken as the first two characters of county_code, so county_code
    must hold strings.

    Returns:
        DataFrame with one row per county_code holding applications,
        denials, originations, total_loan_volume, avg_loan_amount,
        denial_rate and state_code, sorted by applications descending

    Raises:
        ValueError: If the county_code column is missing
    """
    if "county_code" not in df.columns:
        raise ValueError("DataFrame must have 'county_code' column")

    is_actionable = df["action_taken"].isin([1, 2, 3])
    scoped = df[is_actionable].copy()

    metrics = {
        "applications": ("is_denied", "count"),
        "denials": ("is_denied", "sum"),
        "originations": ("is_approved", "sum"),
        "total_loan_volume": ("loan_amount", "sum"),
        "avg_loan_amount": ("loan_amount", "mean"),
    }
    by_county = scoped.groupby("county_code").agg(**metrics).reset_index()

    by_county["denial_rate"] = by_county["denials"] / by_county["applications"]
    by_county["state_code"] = by_county["county_code"].str.slice(stop=2)

    return by_county.sort_values("applications", ascending=False)
56
+
57
+
58
def lending_desert_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Score census tracts by how much they look like a lending desert.

    The score is computed from the lending_by_tract table alone: tracts
    are ranked by application volume, and a tract scores higher the lower
    its volume percentile and the higher its denial rate.

    Returns:
        The lending_by_tract table with app_percentile (0-100),
        desert_score and is_lending_desert columns added, sorted by
        desert_score in descending order
    """
    tracts = lending_by_tract(df)

    # Percentile rank of each tract by application volume, on a 0-100 scale.
    volume_pct = (tracts["applications"].rank(pct=True) * 100).round(1)
    tracts["app_percentile"] = volume_pct

    # Weighted blend: 60% scarcity of applications, 40% denial rate.
    raw_score = (
        (100 - tracts["app_percentile"]) * 0.6 +
        tracts["denial_rate"] * 100 * 0.4
    )
    tracts["desert_score"] = raw_score.round(1)

    # Flag tracts in the bottom volume quartile whose denial rate exceeds 15%.
    low_volume = tracts["app_percentile"] < 25
    high_denial = tracts["denial_rate"] > 0.15
    tracts["is_lending_desert"] = low_volume & high_denial

    return tracts.sort_values("desert_score", ascending=False)
87
+
88
+
89
def racial_composition_by_tract(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count records and average is_denied per (census tract, race) pair.

    Unlike the other tract aggregations, no action_taken filter is applied
    here, so every record in df is counted.

    Returns:
        DataFrame with census_tract, derived_race, applications and
        denial_rate, sorted by tract and then by descending applications.
        An empty DataFrame is returned when derived_race or census_tract
        is missing.
    """
    required = ("derived_race", "census_tract")
    if any(column not in df.columns for column in required):
        return pd.DataFrame()

    grouped = df.groupby(["census_tract", "derived_race"])
    composition = grouped.agg(
        applications=("is_denied", "count"),
        denial_rate=("is_denied", "mean"),
    ).reset_index()

    return composition.sort_values(
        ["census_tract", "applications"], ascending=[True, False]
    )
105
+
106
+
107
def lending_by_state(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise lending activity per state.

    Only records whose action_taken is 1, 2 or 3 are counted.

    Returns:
        DataFrame with one row per state_code holding applications,
        denials, originations, total_volume and denial_rate, sorted by
        applications in descending order. An empty DataFrame is returned
        when there is no state_code column.
    """
    if "state_code" not in df.columns:
        return pd.DataFrame()

    is_actionable = df["action_taken"].isin([1, 2, 3])
    scoped = df[is_actionable].copy()

    metrics = {
        "applications": ("is_denied", "count"),
        "denials": ("is_denied", "sum"),
        "originations": ("is_approved", "sum"),
        "total_volume": ("loan_amount", "sum"),
    }
    by_state = scoped.groupby("state_code").agg(**metrics).reset_index()
    by_state["denial_rate"] = by_state["denials"] / by_state["applications"]

    return by_state.sort_values("applications", ascending=False)
@@ -0,0 +1,109 @@
1
+ """
2
+ Lender-level HMDA analysis.
3
+ Compare a lender's performance against market peers.
4
+ """
5
+ import pandas as pd
6
+ from hmdaanalyzer.analysis.disparity import denial_rate_by_race, disparity_ratio
7
+
8
+
9
def lender_summary(df: pd.DataFrame, lei: str = None) -> dict:
    """
    Build headline statistics for one lender (or for the whole frame).

    Only records whose action_taken is 1, 2 or 3 are counted.

    Args:
        df: HMDA LAR DataFrame (already filtered to a lender, or full market)
        lei: Lender LEI to restrict to; ignored when empty or when df has
            no lei column

    Returns:
        Dict of counts, approval/denial rates (percent, 2 decimals),
        loan-amount and income averages, and distinct tract/county counts.
        An empty dict is returned when no records remain.
    """
    scoped = df
    if lei and "lei" in scoped.columns:
        scoped = scoped[scoped["lei"] == lei]
    if scoped.empty:
        return {}

    actionable = scoped[scoped["action_taken"].isin([1, 2, 3])]
    total = len(actionable)
    if total == 0:
        return {}

    approved = actionable["is_approved"]
    denied = actionable["is_denied"]
    amounts = actionable["loan_amount"]

    summary = {}
    summary["total_applications"] = total
    summary["originations"] = int(approved.sum())
    summary["denials"] = int(denied.sum())
    summary["approval_rate"] = round(approved.mean() * 100, 2)
    summary["denial_rate"] = round(denied.mean() * 100, 2)
    summary["avg_loan_amount"] = round(amounts.mean(), 0)
    summary["median_loan_amount"] = round(amounts.median(), 0)
    summary["avg_applicant_income"] = round(actionable["income"].mean(), 0)
    summary["unique_tracts"] = actionable["census_tract"].nunique()
    summary["unique_counties"] = actionable["county_code"].nunique()
    return summary
43
+
44
+
45
def lender_vs_market(
    df: pd.DataFrame,
    lei: str,
) -> pd.DataFrame:
    """
    Set one lender's per-race denial rates against the whole market's.

    Args:
        df: Full market HMDA LAR DataFrame
        lei: Lender LEI to compare; when df has no lei column the whole
            frame is treated as the lender

    Returns:
        DataFrame with the lender's applications, denials and denial rate
        per race, the market denial rate, the difference vs_market
        (lender minus market) and vs_market_pct (that difference in
        percentage points, 2 decimals), sorted by lender denial rate
        in descending order
    """
    lender_rows = df[df["lei"] == lei] if "lei" in df.columns else df

    lender_columns = {
        "denial_rate": "lender_denial_rate",
        "applications": "lender_applications",
        "denials": "lender_denials",
    }
    market_columns = {
        "denial_rate": "market_denial_rate",
        "applications": "market_applications",
        "denials": "market_denials",
    }
    lender_side = denial_rate_by_race(lender_rows).rename(columns=lender_columns)
    market_side = denial_rate_by_race(df).rename(columns=market_columns)

    # Left join keeps every race the lender has, even if the market table
    # lacks it.
    market_slim = market_side[["derived_race", "market_denial_rate"]]
    comparison = lender_side.merge(market_slim, on="derived_race", how="left")

    gap = comparison["lender_denial_rate"] - comparison["market_denial_rate"]
    comparison["vs_market"] = gap
    comparison["vs_market_pct"] = (comparison["vs_market"] * 100).round(2)

    return comparison.sort_values("lender_denial_rate", ascending=False)
85
+
86
+
87
def top_lenders_by_volume(
    df: pd.DataFrame,
    n: int = 10,
    state: str = None,
) -> pd.DataFrame:
    """
    Rank lenders by number of originated loans (action_taken == 1).

    Args:
        df: HMDA LAR DataFrame
        n: Number of lenders to return
        state: Optional value to match against state_code; ignored when
            empty or when df has no state_code column

    Returns:
        DataFrame with lei, originations, total_volume and avg_loan for
        the top n lenders, or an empty DataFrame when there is no lei column
    """
    scoped = df
    if state and "state_code" in scoped.columns:
        scoped = scoped[scoped["state_code"] == state]

    originated = scoped[scoped["action_taken"] == 1]
    if "lei" not in originated.columns:
        return pd.DataFrame()

    metrics = {
        "originations": ("loan_amount", "count"),
        "total_volume": ("loan_amount", "sum"),
        "avg_loan": ("loan_amount", "mean"),
    }
    ranking = originated.groupby("lei").agg(**metrics).reset_index()
    ranking = ranking.sort_values("originations", ascending=False)

    return ranking.head(n)
File without changes
@@ -0,0 +1,186 @@
1
+ """
2
+ Load HMDA LAR data from CFPB Data Browser API or local CSV.
3
+ Free public API — no authentication required.
4
+ """
5
+ import os
6
+ import requests
7
+ import pandas as pd
8
+ from pathlib import Path
9
+ from hmdaanalyzer.data.schema import (
10
+ HMDA_API_BASE, CACHE_DIR, ACTION_TAKEN,
11
+ APPROVED_ACTIONS, DENIED_ACTIONS,
12
+ RACE_CODES, ETHNICITY_CODES, LOAN_PURPOSE, LOAN_TYPE,
13
+ )
14
+
15
+
16
def get_cache_dir() -> Path:
    """Return the CACHE_DIR path, creating it (and any parents) if needed."""
    cache_root = Path(CACHE_DIR)
    cache_root.mkdir(parents=True, exist_ok=True)
    return cache_root
20
+
21
+
22
def load_from_api(
    year: int = 2023,
    state: str = None,
    lei: str = None,
    county: str = None,
    limit: int = 10_000,
) -> pd.DataFrame:
    """
    Load HMDA LAR data from CFPB Data Browser API.

    This is a best-effort loader: any failure (network, HTTP status,
    decoding, CSV parsing or cleaning) is printed and an empty DataFrame
    is returned instead of raising.

    Args:
        year: Data year e.g. 2023
        state: Two-letter state code e.g. "IL" (upper-cased before sending)
        lei: Lender LEI identifier
        county: County FIPS code e.g. "17031"
        limit: Max records to request; capped at 1,000,000

    Returns:
        Clean pandas DataFrame with standardized columns, or an empty
        DataFrame on error
    """
    params = {
        "years": year,
        "actions_taken": "1,2,3,4,5",
        "limit": min(limit, 1_000_000),
    }
    if state:
        params["states"] = state.upper()
    if lei:
        params["leis"] = lei
    if county:
        params["counties"] = county

    url = f"{HMDA_API_BASE}/csv"

    try:
        print(f"Fetching HMDA data from CFPB API (year={year})...")
        from io import StringIO

        # The whole body is read at once, so streaming buys nothing. The
        # context manager closes the response even when raise_for_status()
        # raises before the body has been consumed.
        with requests.get(url, params=params, timeout=120) as r:
            r.raise_for_status()
            content = r.content.decode("utf-8")

        df = pd.read_csv(StringIO(content), dtype=str, low_memory=False)
        print(f"Loaded {len(df):,} LAR records")
        return _clean(df)

    except Exception as e:
        # Deliberately broad: callers rely on getting an empty frame back.
        print(f"API error: {e}. Use load_sample() for testing.")
        return pd.DataFrame()
70
+
71
+
72
def load_from_file(path: str) -> pd.DataFrame:
    """
    Read a local HMDA LAR CSV and return it in cleaned form.

    Every column is read as a string first and then passed through
    _clean for type conversion.

    Args:
        path: Location of the CSV file

    Returns:
        Cleaned pandas DataFrame
    """
    print(f"Loading HMDA data from {path}...")
    raw = pd.read_csv(path, dtype=str, low_memory=False)
    record_count = len(raw)
    print(f"Loaded {record_count:,} LAR records")
    return _clean(raw)
81
+
82
+
83
def load_sample(n: int = 5000, seed: int = 42) -> pd.DataFrame:
    """
    Generate synthetic HMDA LAR data for testing and demos.

    Records are drawn from a seeded NumPy generator, so the same (n, seed)
    always yields the same data. Race is sampled with fixed weights and each
    race has a hard-coded denial probability. No internet connection
    required.

    Args:
        n: Number of records to generate
        seed: Seed for the random generator

    Returns:
        Cleaned pandas DataFrame of synthetic LAR records
    """
    import numpy as np
    rng = np.random.default_rng(seed)

    states = ["IL", "NY", "CA", "TX", "GA", "NC", "OH", "PA", "FL", "MI"]
    leis = [f"LEI{i:06d}" for i in range(1, 11)]

    # Two-digit state FIPS prefixes; built once rather than per record.
    state_fips_by_abbr = {
        "IL": "17", "NY": "36", "CA": "06", "TX": "48",
        "GA": "13", "NC": "37", "OH": "39", "PA": "42",
        "FL": "12", "MI": "26",
    }

    # Per-race denial probabilities used to pick the action taken.
    race_denial_rates = {
        "White": 0.095,
        "Black or African American": 0.195,
        "Asian": 0.090,
        "Hispanic or Latino": 0.145,
        "American Indian or Alaska Native": 0.175,
        "Native Hawaiian or Other Pacific Islander": 0.160,
    }

    races = list(race_denial_rates.keys())
    race_weights = [0.65, 0.13, 0.07, 0.10, 0.02, 0.03]

    # NOTE: the order of rng calls below determines the generated data for a
    # given seed; do not reorder them.
    records = []
    for _ in range(n):
        race = rng.choice(races, p=race_weights)
        denial_prob = race_denial_rates[race]

        # Income and loan amount correlated
        income = max(20, rng.normal(85, 45))
        loan_amount = max(50, income * rng.uniform(2.5, 5.5))

        # Denied with the race's probability, withdrawn with a further 5%,
        # otherwise originated.
        r = rng.random()
        if r < denial_prob:
            action = 3
        elif r < denial_prob + 0.05:
            action = 4
        else:
            action = 1

        state = rng.choice(states)
        county_num = rng.integers(1, 200)
        state_fips = state_fips_by_abbr[state]
        county_code = f"{state_fips}{county_num:03d}"
        tract = f"{county_code}{rng.integers(100000, 999999)}"

        records.append({
            "action_taken": str(action),
            "loan_type": str(rng.choice([1, 1, 1, 2, 3], p=[0.7, 0.1, 0.1, 0.05, 0.05])),
            "loan_purpose": str(rng.choice([1, 31, 32, 2], p=[0.5, 0.3, 0.15, 0.05])),
            "loan_amount": str(round(loan_amount)),
            "income": str(round(income)),
            "derived_race": race,
            "derived_ethnicity": (
                "Hispanic or Latino" if race == "Hispanic or Latino"
                else "Not Hispanic or Latino"
            ),
            "derived_sex": rng.choice(["Male", "Female", "Joint"], p=[0.45, 0.3, 0.25]),
            "census_tract": tract,
            "county_code": county_code,
            "state_code": state_fips,
            "denial_reason_1": str(rng.choice([1, 3, 4, 9, 10], p=[0.3, 0.25, 0.2, 0.15, 0.1])) if action == 3 else "10",
            "interest_rate": str(round(rng.uniform(5.5, 8.5), 2)) if action == 1 else "",
            "rate_spread": str(round(rng.uniform(-0.5, 2.0), 2)) if action == 1 else "",
            "lei": rng.choice(leis),
            "activity_year": "2023",
        })

    df = pd.DataFrame(records)
    return _clean(df)
160
+
161
+
162
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize and clean a raw HMDA LAR DataFrame.

    Column names are lower-cased and stripped, measure columns are coerced
    to numbers, code columns to nullable integers, and is_approved /
    is_denied flags are derived from action_taken. derived_race is filled
    in from applicant_race_1 when it is absent. The input frame is modified
    in place; the returned frame has a fresh 0..n-1 index.
    """
    df.columns = df.columns.str.lower().str.strip()

    measure_columns = ("loan_amount", "income", "interest_rate", "rate_spread")
    code_columns = (
        "action_taken", "loan_type", "loan_purpose",
        "denial_reason_1", "denial_reason_2",
    )

    # Unparseable values become NaN / <NA> rather than raising.
    for name in measure_columns:
        if name not in df.columns:
            continue
        df[name] = pd.to_numeric(df[name], errors="coerce")

    for name in code_columns:
        if name not in df.columns:
            continue
        df[name] = pd.to_numeric(df[name], errors="coerce").astype("Int64")

    if "action_taken" in df.columns:
        action = df["action_taken"]
        df["is_approved"] = action.isin(APPROVED_ACTIONS)
        df["is_denied"] = action.isin(DENIED_ACTIONS)

    has_raw_race = "applicant_race_1" in df.columns
    if has_raw_race and "derived_race" not in df.columns:

        def _race_label(code):
            if pd.notna(code):
                return RACE_CODES.get(int(code), "Unknown")
            return "Unknown"

        df["derived_race"] = df["applicant_race_1"].map(_race_label)

    return df.reset_index(drop=True)
@@ -0,0 +1,124 @@
1
+ """
2
+ Constants, field mappings, and dataclasses for HMDA LAR data analysis.
3
+ Based on 2024 HMDA Filing Instruction Guide (FIG) and CFPB Data Browser API.
4
+ """
5
+
6
+ # ── CFPB HMDA Data Browser API ────────────────────────────────────────────────
7
+ HMDA_API_BASE = "https://ffiec.cfpb.gov/v2/data-browser-api/view"
8
+ HMDA_AGG_BASE = "https://ffiec.cfpb.gov/v2/data-browser-api/view/aggregations"
9
+
10
+ # ── Action Taken Codes ────────────────────────────────────────────────────────
11
+ ACTION_TAKEN = {
12
+ 1: "Loan originated",
13
+ 2: "Application approved but not accepted",
14
+ 3: "Application denied",
15
+ 4: "Application withdrawn by applicant",
16
+ 5: "File closed for incompleteness",
17
+ 6: "Purchased loan",
18
+ 7: "Preapproval request denied",
19
+ 8: "Preapproval request approved but not accepted",
20
+ }
21
+
22
+ APPROVED_ACTIONS = {1, 2, 8}
23
+ DENIED_ACTIONS = {3, 7}
24
+ WITHDRAWN_ACTIONS = {4, 5}
25
+
26
+ # ── Race Codes ────────────────────────────────────────────────────────────────
27
+ RACE_CODES = {
28
+ 1: "American Indian or Alaska Native",
29
+ 2: "Asian",
30
+ 3: "Black or African American",
31
+ 4: "Native Hawaiian or Other Pacific Islander",
32
+ 5: "White",
33
+ 6: "Not applicable",
34
+ 7: "Information not provided",
35
+ }
36
+
37
+ # ── Ethnicity Codes ───────────────────────────────────────────────────────────
38
+ ETHNICITY_CODES = {
39
+ 1: "Hispanic or Latino",
40
+ 2: "Not Hispanic or Latino",
41
+ 3: "Information not provided",
42
+ 4: "Not applicable",
43
+ }
44
+
45
+ # ── Sex Codes ─────────────────────────────────────────────────────────────────
46
+ SEX_CODES = {
47
+ 1: "Male",
48
+ 2: "Female",
49
+ 3: "Information not provided",
50
+ 4: "Not applicable",
51
+ 6: "Both male and female",
52
+ }
53
+
54
+ # ── Loan Type Codes ───────────────────────────────────────────────────────────
55
+ LOAN_TYPE = {
56
+ 1: "Conventional",
57
+ 2: "FHA",
58
+ 3: "VA",
59
+ 4: "RHS/FSA",
60
+ }
61
+
62
+ # ── Loan Purpose Codes ────────────────────────────────────────────────────────
63
+ LOAN_PURPOSE = {
64
+ 1: "Home purchase",
65
+ 2: "Home improvement",
66
+ 31: "Refinancing",
67
+ 32: "Cash-out refinancing",
68
+ 4: "Other purpose",
69
+ 5: "Not applicable",
70
+ }
71
+
72
+ # ── Denial Reason Codes ───────────────────────────────────────────────────────
73
+ DENIAL_REASONS = {
74
+ 1: "Debt-to-income ratio",
75
+ 2: "Employment history",
76
+ 3: "Credit history",
77
+ 4: "Collateral",
78
+ 5: "Insufficient cash (downpayment, closing costs)",
79
+ 6: "Unverifiable information",
80
+ 7: "Credit application incomplete",
81
+ 8: "Mortgage insurance denied",
82
+ 9: "Other",
83
+ 10: "Not applicable",
84
+ }
85
+
86
+ # ── Key LAR Fields We Use ─────────────────────────────────────────────────────
87
+ LAR_FIELDS = [
88
+ "action_taken",
89
+ "loan_type",
90
+ "loan_purpose",
91
+ "loan_amount",
92
+ "income",
93
+ "applicant_race_1",
94
+ "applicant_ethnicity_1",
95
+ "applicant_sex",
96
+ "derived_race",
97
+ "derived_ethnicity",
98
+ "derived_sex",
99
+ "census_tract",
100
+ "county_code",
101
+ "state_code",
102
+ "denial_reason_1",
103
+ "denial_reason_2",
104
+ "interest_rate",
105
+ "rate_spread",
106
+ "hoepa_status",
107
+ "lien_status",
108
+ "lei",
109
+ "activity_year",
110
+ ]
111
+
112
+ # ── Cache Directory ───────────────────────────────────────────────────────────
113
+ import os
114
+ CACHE_DIR = os.path.join(os.path.expanduser("~"), ".hmdaanalyzer", "cache")
115
+
116
+ # ── Disparity Thresholds ──────────────────────────────────────────────────────
117
+ DISPARITY_THRESHOLDS = {
118
+ "high": 2.0, # Denial rate ratio >= 2.0x = high disparity
119
+ "moderate": 1.5, # Denial rate ratio >= 1.5x = moderate disparity
120
+ "low": 1.0, # Below 1.0x = no disparity
121
+ }
122
+
123
+ # ── Reference Group for Disparity ────────────────────────────────────────────
124
+ REFERENCE_RACE = "White"
File without changes
@@ -0,0 +1,147 @@
1
+ """
2
+ Generate HMDA analysis reports.
3
+ """
4
+ import pandas as pd
5
+ from hmdaanalyzer.analysis.disparity import (
6
+ denial_rate_by_race, disparity_ratio, denial_rate_by_income_band
7
+ )
8
+ from hmdaanalyzer.analysis.geographic import (
9
+ lending_by_state, lending_by_county, lending_desert_score
10
+ )
11
+ from hmdaanalyzer.analysis.lender import lender_summary, lender_vs_market
12
+
13
+
14
def generate_disparity_report(
    df: pd.DataFrame,
    title: str = "HMDA Disparity Analysis",
    lei: str = None,
) -> str:
    """
    Generate a full HMDA disparity analysis report as Markdown.

    The report has a header, a denial-rate-by-race table, a disparity-ratio
    table, a denial-rate-by-income-band table and a key-findings list. A
    failure in any one section is written into the report as an error line
    so the remaining sections are still produced.

    Args:
        df: HMDA LAR DataFrame with at least an action_taken column
        title: Sub-heading shown under the report title
        lei: Optional lender LEI to restrict the report to; ignored when
            empty or when df has no lei column

    Returns:
        The report as a single Markdown string
    """
    if lei and "lei" in df.columns:
        analysis_df = df[df["lei"] == lei]
        scope = f"Lender: {lei}"
    else:
        analysis_df = df
        scope = "All Lenders"

    total = len(analysis_df)
    actionable = analysis_df[analysis_df["action_taken"].isin([1, 2, 3])]
    # An unknown LEI or an empty input leaves no first row to read.
    if "activity_year" in analysis_df.columns and total > 0:
        year = analysis_df["activity_year"].iloc[0]
    else:
        year = "N/A"

    lines = [
        "# HMDA Lending Disparity Analysis Report",
        f"## {title}",
        "",
        f"**Scope:** {scope}",
        f"**Year:** {year}",
        f"**Total Records:** {total:,}",
        f"**Actionable Applications:** {len(actionable):,}",
        "",
        "---",
        "",
        "## Denial Rate by Race",
        "",
        "| Race/Ethnicity | Applications | Denials | Denial Rate |",
        "|----------------|-------------|---------|-------------|",
    ]

    try:
        rates = denial_rate_by_race(analysis_df)
        for _, row in rates.iterrows():
            lines.append(
                f"| {row['derived_race']} | {row['applications']:,} | "
                f"{int(row['denials']):,} | {row['denial_rate']*100:.1f}% |"
            )
    except Exception as e:
        lines.append(f"| Error computing denial rates: {e} |")

    lines += [
        "",
        "---",
        "",
        "## Disparity Ratios (vs White Applicants)",
        "",
        "A disparity ratio >= 2.0 indicates HIGH disparity (CFPB threshold).",
        "A disparity ratio >= 1.5 indicates MODERATE disparity.",
        "",
        "| Race/Ethnicity | Denial Rate | Reference Rate | Disparity Ratio | Level |",
        "|----------------|-------------|----------------|-----------------|-------|",
    ]

    level_labels = {
        "HIGH": "🔴 HIGH",
        "MODERATE": "🟡 MODERATE",
        "LOW": "🟢 LOW",
        "FAVORABLE": "✅ FAVORABLE",
        "N/A": "—",
    }

    # Computed once and reused by the key-findings section below.
    disp = None
    try:
        disp = disparity_ratio(analysis_df)
        for _, row in disp.iterrows():
            if row["derived_race"] == "White":
                continue
            ratio = f"{row['disparity_ratio']:.2f}x" if pd.notna(row.get("disparity_ratio")) else "N/A"
            level_emoji = level_labels.get(row.get("disparity_level", "N/A"), "—")

            lines.append(
                f"| {row['derived_race']} | "
                f"{row['denial_rate']*100:.1f}% | "
                f"{row['reference_denial_rate']*100:.1f}% | "
                f"{ratio} | {level_emoji} |"
            )
    except Exception as e:
        lines.append(f"| Error: {e} |")

    lines += [
        "",
        "---",
        "",
        "## Denial Rate by Income Band",
        "",
        "| Income Band | Applications | Denial Rate |",
        "|-------------|-------------|-------------|",
    ]

    try:
        income_df = denial_rate_by_income_band(analysis_df)
        for _, row in income_df.iterrows():
            lines.append(
                f"| {row['income_band']} | {row['applications']:,} | "
                f"{row['denial_rate']*100:.1f}% |"
            )
    except Exception as e:
        lines.append(f"| Error: {e} |")

    lines += [
        "",
        "---",
        "",
        "## Key Findings",
        "",
    ]

    # Skipped when the disparity table could not be built; that failure is
    # already reported in the disparity section above.
    if disp is not None:
        try:
            high = disp[disp["disparity_level"] == "HIGH"]
            if not high.empty:
                lines.append("**High Disparity Groups:**")
                for _, row in high.iterrows():
                    if row["derived_race"] != "White":
                        lines.append(
                            f"- {row['derived_race']}: "
                            f"{row['disparity_ratio']:.1f}x denial rate vs White applicants"
                        )
                lines.append("")
        except Exception as e:
            # Report the problem instead of silently dropping the section.
            lines.append(f"Error computing key findings: {e}")

    return "\n".join(lines)
140
+
141
+
142
def summary_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the disparity-ratio table for df.

    If disparity_ratio fails for any reason, the plain denial-rate-by-race
    table is returned instead.
    """
    try:
        table = disparity_ratio(df)
    except Exception:
        # Fall back to the table that needs no reference group.
        table = denial_rate_by_race(df)
    return table
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,12 @@
1
+ import pytest
2
+ from hmdaanalyzer.data.loader import load_sample
3
+
4
+
5
@pytest.fixture
def sample_df():
    """Provide a 2000-row sample frame generated with seed 42."""
    row_count, seed = 2000, 42
    return load_sample(n=row_count, seed=seed)
8
+
9
+
10
@pytest.fixture
def small_df():
    """Provide a 500-row sample frame generated with seed 42."""
    row_count, seed = 500, 42
    return load_sample(n=row_count, seed=seed)
@@ -0,0 +1,58 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from hmdaanalyzer.analysis.disparity import (
4
+ denial_rate_by_race, disparity_ratio,
5
+ denial_rate_by_income_band, denial_reasons_by_race,
6
+ )
7
+
8
+
9
def test_denial_rate_by_race_returns_df(sample_df):
    """denial_rate_by_race returns a DataFrame with the expected columns."""
    rates = denial_rate_by_race(sample_df)
    assert isinstance(rates, pd.DataFrame)
    columns = rates.columns
    assert "denial_rate" in columns
    assert "derived_race" in columns
14
+
15
+
16
def test_denial_rate_by_race_values_valid(sample_df):
    """Every per-race denial rate lies in the closed interval [0, 1]."""
    rates = denial_rate_by_race(sample_df)["denial_rate"]
    assert (rates >= 0).all()
    assert (rates <= 1).all()
20
+
21
+
22
def test_disparity_ratio_returns_df(sample_df):
    """disparity_ratio returns a DataFrame with ratio and level columns."""
    table = disparity_ratio(sample_df)
    assert isinstance(table, pd.DataFrame)
    columns = table.columns
    assert "disparity_ratio" in columns
    assert "disparity_level" in columns
27
+
28
+
29
def test_disparity_ratio_white_is_reference(sample_df):
    """With reference="White", the single White row has a ratio of ~1.0."""
    table = disparity_ratio(sample_df, reference="White")
    is_white = table["derived_race"] == "White"
    reference_rows = table[is_white]
    assert len(reference_rows) == 1
    ratio = reference_rows["disparity_ratio"].iloc[0]
    assert abs(ratio - 1.0) < 0.01
34
+
35
+
36
def test_disparity_levels_valid(sample_df):
    """Only the known disparity level labels appear in the output."""
    allowed = {"HIGH", "MODERATE", "LOW", "FAVORABLE", "N/A"}
    levels = disparity_ratio(sample_df)["disparity_level"]
    observed = set(levels.unique())
    assert observed <= allowed
40
+
41
+
42
def test_black_disparity_high(sample_df):
    """Black applicants show a disparity ratio above 1.0 in the sample.

    The group is asserted to be present first: silently skipping the check
    when the row is missing would let the test pass without verifying
    anything.
    """
    table = disparity_ratio(sample_df)
    black = table[table["derived_race"] == "Black or African American"]
    assert not black.empty, (
        "expected a 'Black or African American' row in disparity_ratio output"
    )
    assert black["disparity_ratio"].iloc[0] > 1.0
47
+
48
+
49
def test_denial_rate_by_income_band(sample_df):
    """The income-band breakdown is a non-empty DataFrame with denial rates."""
    by_band = denial_rate_by_income_band(sample_df)
    assert isinstance(by_band, pd.DataFrame)
    assert "denial_rate" in by_band.columns
    row_count = len(by_band)
    assert row_count > 0
54
+
55
+
56
def test_denial_reasons_by_race(sample_df):
    """denial_reasons_by_race returns a DataFrame."""
    reasons = denial_reasons_by_race(sample_df)
    assert isinstance(reasons, pd.DataFrame)
@@ -0,0 +1,38 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from hmdaanalyzer.analysis.geographic import (
4
+ lending_by_tract, lending_by_county, lending_by_state,
5
+ lending_desert_score,
6
+ )
7
+
8
+
9
def test_lending_by_tract_returns_df(sample_df):
    """lending_by_tract returns a DataFrame keyed by census tract."""
    by_tract = lending_by_tract(sample_df)
    assert isinstance(by_tract, pd.DataFrame)
    columns = by_tract.columns
    assert "denial_rate" in columns
    assert "census_tract" in columns
14
+
15
+
16
def test_lending_by_county_returns_df(sample_df):
    """lending_by_county returns a DataFrame with a denial_rate column."""
    by_county = lending_by_county(sample_df)
    assert isinstance(by_county, pd.DataFrame)
    assert "denial_rate" in by_county.columns
20
+
21
+
22
def test_lending_by_state_returns_df(sample_df):
    """lending_by_state returns a non-empty DataFrame."""
    by_state = lending_by_state(sample_df)
    assert isinstance(by_state, pd.DataFrame)
    row_count = len(by_state)
    assert row_count > 0
26
+
27
+
28
def test_lending_desert_score_returns_df(sample_df):
    """lending_desert_score returns a DataFrame with score and flag columns."""
    scores = lending_desert_score(sample_df)
    assert isinstance(scores, pd.DataFrame)
    columns = scores.columns
    assert "desert_score" in columns
    assert "is_lending_desert" in columns
33
+
34
+
35
def test_denial_rates_between_0_and_1(sample_df):
    """Every tract-level denial rate lies in the closed interval [0, 1]."""
    rates = lending_by_tract(sample_df)["denial_rate"]
    assert (rates >= 0).all()
    assert (rates <= 1).all()
tests/test_lender.py ADDED
@@ -0,0 +1,37 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from hmdaanalyzer.analysis.lender import (
4
+ lender_summary, lender_vs_market, top_lenders_by_volume,
5
+ )
6
+
7
+
8
def test_lender_summary_returns_dict(sample_df):
    """lender_summary returns a dict containing the headline keys."""
    summary = lender_summary(sample_df)
    assert isinstance(summary, dict)
    for key in ("total_applications", "denial_rate"):
        assert key in summary
13
+
14
+
15
def test_lender_summary_denial_rate_valid(sample_df):
    """The summary denial rate falls within 0 to 100 inclusive."""
    rate = lender_summary(sample_df)["denial_rate"]
    assert 0 <= rate <= 100
18
+
19
+
20
def test_lender_vs_market_returns_df(sample_df):
    """lender_vs_market returns a DataFrame with lender and market rates."""
    first_lei = sample_df["lei"].iloc[0]
    comparison = lender_vs_market(sample_df, first_lei)
    assert isinstance(comparison, pd.DataFrame)
    columns = comparison.columns
    assert "lender_denial_rate" in columns
    assert "market_denial_rate" in columns
26
+
27
+
28
def test_top_lenders_by_volume_returns_df(sample_df):
    """top_lenders_by_volume returns a DataFrame of at most n rows."""
    limit = 5
    ranked = top_lenders_by_volume(sample_df, n=limit)
    assert isinstance(ranked, pd.DataFrame)
    assert len(ranked) <= limit
32
+
33
+
34
def test_top_lenders_sorted_by_volume(sample_df):
    """The first lender has at least as many originations as the second."""
    ranked = top_lenders_by_volume(sample_df, n=10)
    if len(ranked) <= 1:
        # Nothing to compare with fewer than two lenders.
        return
    originations = ranked["originations"]
    assert originations.iloc[0] >= originations.iloc[1]
tests/test_loader.py ADDED
@@ -0,0 +1,43 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from hmdaanalyzer.data.loader import load_sample
4
+
5
+
6
def test_load_sample_returns_dataframe():
    """load_sample(n=100) returns a DataFrame with exactly 100 rows."""
    requested = 100
    sample = load_sample(n=requested)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == requested
10
+
11
+
12
def test_load_sample_has_required_columns():
    """The sample frame exposes every column the analyses rely on."""
    required = (
        "action_taken",
        "derived_race",
        "loan_amount",
        "income",
        "is_denied",
        "is_approved",
    )
    sample = load_sample(n=100)
    missing = [name for name in required if name not in sample.columns]
    assert not missing
18
+
19
+
20
def test_load_sample_action_taken_valid():
    """Non-null action_taken values are all within the codes 1 through 5."""
    valid_codes = {1, 2, 3, 4, 5}
    sample = load_sample(n=500)
    action_taken = sample["action_taken"].dropna()
    assert action_taken.isin(valid_codes).all()
24
+
25
+
26
def test_load_sample_is_denied_bool():
    """The is_denied column has a boolean dtype (numpy or nullable)."""
    sample = load_sample(n=500)
    dtype_name = str(sample["is_denied"].dtype)
    assert dtype_name in ("bool", "boolean")
29
+
30
+
31
def test_load_sample_denial_rate_realistic():
    """Overall denial rate among actioned applications is between 5% and 35%.

    The seed is pinned explicitly (the same ``seed`` keyword the shared
    fixtures use) so this statistical assertion does not depend on whatever
    default seeding ``load_sample`` applies.
    """
    df = load_sample(n=2000, seed=42)
    # Only action_taken codes 1, 2 and 3 are counted towards the rate.
    actionable = df[df["action_taken"].isin([1, 2, 3])]
    overall_denial_rate = actionable["is_denied"].mean()
    assert 0.05 < overall_denial_rate < 0.35
36
+
37
+
38
def test_load_sample_race_disparity():
    """Black applicants are denied at a higher rate than White applicants.

    The seed is pinned explicitly (the same ``seed`` keyword the shared
    fixtures use) so this statistical comparison does not depend on whatever
    default seeding ``load_sample`` applies.
    """
    df = load_sample(n=3000, seed=42)
    # Only action_taken codes 1, 2 and 3 are counted towards the rates.
    actionable = df[df["action_taken"].isin([1, 2, 3])]
    black = actionable[actionable["derived_race"] == "Black or African American"]
    white = actionable[actionable["derived_race"] == "White"]
    assert black["is_denied"].mean() > white["is_denied"].mean()
tests/test_report.py ADDED
@@ -0,0 +1,27 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from hmdaanalyzer.report.generator import generate_disparity_report, summary_table
4
+
5
+
6
def test_generate_report_returns_string(sample_df):
    """The generated report is a string longer than 100 characters."""
    text = generate_disparity_report(sample_df)
    assert isinstance(text, str)
    length = len(text)
    assert length > 100
10
+
11
+
12
def test_report_contains_sections(sample_df):
    """The report text includes each of the expected section headings."""
    text = generate_disparity_report(sample_df)
    expected_headings = (
        "Denial Rate by Race",
        "Disparity Ratios",
        "Income Band",
    )
    for heading in expected_headings:
        assert heading in text
17
+
18
+
19
def test_report_contains_black_disparity(sample_df):
    """The report mentions the "Black or African American" group."""
    text = generate_disparity_report(sample_df)
    group_label = "Black or African American"
    assert group_label in text
22
+
23
+
24
def test_summary_table_returns_df(sample_df):
    """summary_table returns a non-empty DataFrame."""
    table = summary_table(sample_df)
    assert isinstance(table, pd.DataFrame)
    row_count = len(table)
    assert row_count > 0