cdfidata 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdfidata/__init__.py +12 -0
- cdfidata/analysis/__init__.py +0 -0
- cdfidata/pipeline/__init__.py +0 -0
- cdfidata/pipeline/cleaner.py +115 -0
- cdfidata/pipeline/downloader.py +134 -0
- cdfidata/pipeline/exporter.py +65 -0
- cdfidata/sources/__init__.py +0 -0
- cdfidata/sources/awards.py +99 -0
- cdfidata/sources/clr.py +107 -0
- cdfidata/sources/tlr.py +172 -0
- cdfidata/utils/__init__.py +0 -0
- cdfidata/utils/schema.py +174 -0
- cdfidata-0.1.0.dist-info/METADATA +11 -0
- cdfidata-0.1.0.dist-info/RECORD +22 -0
- cdfidata-0.1.0.dist-info/WHEEL +5 -0
- cdfidata-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +31 -0
- tests/test_awards.py +50 -0
- tests/test_cleaner.py +56 -0
- tests/test_clr.py +33 -0
- tests/test_tlr.py +71 -0
cdfidata/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from cdfidata.sources.tlr import TLRLoader
|
|
2
|
+
from cdfidata.sources.clr import CLRLoader
|
|
3
|
+
from cdfidata.sources.awards import AwardsLoader
|
|
4
|
+
from cdfidata.pipeline.downloader import download_tlr, list_cached, clear_cache
|
|
5
|
+
from cdfidata.pipeline.exporter import to_csv, to_sqlite, to_parquet
|
|
6
|
+
|
|
7
|
+
# Version string of the cdfidata package.
__version__ = "0.1.0"
# Public API: the loader classes plus the download/cache and export helpers
# imported at the top of this module.
__all__ = [
    "TLRLoader", "CLRLoader", "AwardsLoader",
    "download_tlr", "list_cached", "clear_cache",
    "to_csv", "to_sqlite", "to_parquet",
]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Standardize and clean raw CDFI Fund DataFrames.
|
|
3
|
+
Handles column renaming, type casting, null handling, and boolean normalization.
|
|
4
|
+
"""
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from cdfidata.utils.schema import BOOL_TRUE_VALUES, BOOL_FALSE_VALUES
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def normalize_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    """
    Rename columns using a mapping dict.

    Every column label is stripped of surrounding whitespace and upper-cased
    before it is looked up in ``column_map``, so the mapping keys must be
    upper-case. Labels without a mapping keep their normalized (stripped,
    upper-cased) form.

    The input DataFrame is not modified; a renamed copy is returned.

    Args:
        df: Raw DataFrame
        column_map: Dict of {RAW_NAME: clean_name}

    Returns:
        DataFrame with renamed columns
    """
    def _target_name(label):
        # Normalize first so " state " and "State" both hit the "STATE" key.
        key = str(label).strip().upper()
        return column_map.get(key, key)

    # rename() builds a new frame, so the caller's column labels stay intact
    # (the previous implementation assigned df.columns in place).
    return df.rename(columns=_target_name)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def cast_types(df: pd.DataFrame, dtype_map: dict) -> pd.DataFrame:
    """
    Cast columns to specified types.

    Columns named in ``dtype_map`` that are absent from ``df`` are skipped.
    A column whose cast fails is left unchanged and a ``RuntimeWarning`` is
    emitted, so the failure is visible instead of being silently swallowed.
    ``df`` is modified in place and also returned.

    Supported type strings:
        "bool":  each value is parsed with ``_parse_bool``
        "int":   numeric coercion, then pandas nullable ``Int64``
        "float": numeric coercion (unparseable values become NaN)
        other:   passed straight to ``Series.astype``

    Args:
        df: DataFrame with clean column names
        dtype_map: Dict of {column_name: type_string}

    Returns:
        DataFrame with cast columns
    """
    import warnings

    for col, dtype in dtype_map.items():
        if col not in df.columns:
            continue
        try:
            if dtype == "bool":
                converted = df[col].apply(_parse_bool)
            elif dtype == "int":
                converted = pd.to_numeric(df[col], errors="coerce").astype("Int64")
            elif dtype == "float":
                converted = pd.to_numeric(df[col], errors="coerce")
            else:
                converted = df[col].astype(dtype)
        except (ValueError, TypeError, OverflowError) as exc:
            # Best effort: keep the original column, but tell the caller why.
            warnings.warn(
                f"Could not cast column {col!r} to {dtype!r}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        df[col] = converted
    return df
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _parse_bool(val) -> "bool | None":
    """
    Parse a value to boolean using CDFI Fund conventions.

    The value is converted to text, stripped and upper-cased before being
    compared with ``BOOL_TRUE_VALUES`` / ``BOOL_FALSE_VALUES``, so matching is
    case-insensitive: "Yes", "TRUE" and the Python bool ``True`` all parse as
    True.

    Returns:
        True or False for a recognized token, None for null input or for any
        value that matches neither set.
    """
    if pd.isna(val):
        return None
    # Upper-casing makes mixed-case spellings ("Yes", "True") hit the
    # upper-case entries that both value sets contain.
    token = str(val).strip().upper()
    if token in BOOL_TRUE_VALUES:
        return True
    if token in BOOL_FALSE_VALUES:
        return False
    return None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def clean_strings(df: pd.DataFrame, cols: list = None) -> pd.DataFrame:
    """
    Strip leading/trailing whitespace from string values.

    Only values that are actual ``str`` instances are touched; nulls, booleans
    and numbers stored in the same column pass through unchanged. Letter case
    is not modified. ``df`` is modified in place and also returned.

    Args:
        df: DataFrame
        cols: List of columns to clean (default: all object columns).
            Names that are not present in ``df`` are ignored.

    Returns:
        Cleaned DataFrame
    """
    if cols is None:
        cols = df.select_dtypes(include="object").columns.tolist()
    for col in cols:
        if col in df.columns:
            # The .str accessor raises on object columns that hold non-string
            # values (e.g. parsed True/False/None flags) and turns non-strings
            # into NaN in mixed columns, so strip value by value instead.
            df[col] = df[col].map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
    return df
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def drop_empty_rows(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
    """Return ``df`` without the rows in which every required column is null."""
    surviving = df.dropna(how="all", subset=required_cols)
    return surviving
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def standardize(
    df: pd.DataFrame,
    column_map: dict,
    dtype_map: dict,
    required_cols: list = None,
) -> pd.DataFrame:
    """
    Run the full cleaning pipeline on a raw DataFrame.

    Steps, in order: rename columns, cast types, strip strings, then
    (only when ``required_cols`` is given) drop rows in which all of those
    columns are null. The result always gets a fresh 0..n-1 index.

    Args:
        df: Raw DataFrame
        column_map: Column rename mapping
        dtype_map: Type casting mapping
        required_cols: Columns required to be non-null

    Returns:
        Clean, standardized DataFrame
    """
    renamed = normalize_columns(df, column_map)
    typed = cast_types(renamed, dtype_map)
    cleaned = clean_strings(typed)
    if required_cols:
        cleaned = drop_empty_rows(cleaned, required_cols)
    return cleaned.reset_index(drop=True)
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Download and cache CDFI Fund public datasets locally.
|
|
3
|
+
Files are cached to avoid repeated downloads.
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
import zipfile
|
|
7
|
+
import requests
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from cdfidata.utils.schema import CACHE_DIR, TLR_URLS
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_cache_dir() -> Path:
    """Ensure the cache directory (``CACHE_DIR``) exists and return it as a Path."""
    cache_root = Path(CACHE_DIR)
    cache_root.mkdir(exist_ok=True, parents=True)
    return cache_root
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cache_path(filename: str) -> Path:
    """Build the path of ``filename`` inside the cache directory (created if missing)."""
    cache_root = get_cache_dir()
    return cache_root / filename
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def is_cached(filename: str) -> bool:
    """Report whether ``filename`` already exists in the local cache."""
    candidate = cache_path(filename)
    return candidate.exists()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def download_file(url: str, filename: str, force: bool = False) -> Path:
    """
    Download a file from a URL and cache it locally.

    The body is streamed into a temporary ``<filename>.part`` file and only
    moved to its final name once the transfer finished, so an interrupted
    download can never be mistaken for a valid cached file on the next call.

    Args:
        url: Full URL to download from
        filename: Local filename to save as
        force: Re-download even if cached

    Returns:
        Path to the cached file

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    path = cache_path(filename)

    if path.exists() and not force:
        print(f"Using cached file: {path}")
        return path

    print(f"Downloading {filename}...")
    print(f"URL: {url}")

    partial = path.with_name(path.name + ".part")
    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(url, stream=True, timeout=120) as response:
            response.raise_for_status()

            # 0 means the server did not announce a size: no progress output.
            total = int(response.headers.get("content-length", 0))
            downloaded = 0

            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue  # keep-alive chunk, nothing to write
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        pct = downloaded / total * 100
                        print(f"\r {pct:.1f}% ({downloaded/1e6:.1f}MB)", end="")

        # Atomic rename: the final name only ever points at a complete file.
        os.replace(partial, path)
    finally:
        # No-op after a successful rename; removes the fragment on failure.
        partial.unlink(missing_ok=True)

    print(f"\nSaved to {path}")
    return path
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def extract_zip(zip_path: Path, extract_to: Path = None) -> list:
    """
    Extract a zip file to the cache directory.

    Args:
        zip_path: Path to the zip file
        extract_to: Directory to extract to, as a Path or a string
            (default: cache dir)

    Returns:
        List of Paths, one per archive member in archive order. Directory
        entries stored in the archive are included.
    """
    # Coerce to Path so a plain string works with the "/" joins below.
    target_dir = Path(extract_to) if extract_to else get_cache_dir()

    with zipfile.ZipFile(zip_path, "r") as zf:
        names = zf.namelist()
        zf.extractall(target_dir)
    print(f"Extracted {len(names)} files to {target_dir}")

    return [target_dir / name for name in names]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def download_tlr(year: int, force: bool = False) -> Path:
    """
    Fetch the combined TLR/CLR zip archive of one fiscal year.

    Args:
        year: Fiscal year e.g. 2022; must be a key of ``TLR_URLS``
        force: Re-download even if cached

    Returns:
        Path to the downloaded zip file

    Raises:
        ValueError: If no download URL is registered for ``year``.
    """
    if year in TLR_URLS:
        archive_name = f"TLR_CLR_FY{year}.zip"
        return download_file(TLR_URLS[year], archive_name, force=force)

    available = list(TLR_URLS.keys())
    raise ValueError(
        f"No TLR URL available for FY{year}. "
        f"Available years: {available}"
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def list_cached() -> list:
    """
    Print the contents of the local cache and return them.

    Returns:
        List of Paths in the cache directory, in directory-listing order
        (the printed listing is sorted, the returned list is not).
    """
    cache = get_cache_dir()
    entries = list(cache.iterdir())
    if entries:
        print(f"Cached files in {cache}:")
        for entry in sorted(entries):
            megabytes = entry.stat().st_size / 1e6
            print(f" {entry.name} ({megabytes:.1f}MB)")
    else:
        print("Cache is empty.")
    return entries
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def clear_cache() -> None:
    """
    Delete everything in the cache directory.

    Sub-directories (for example folders created when an archive is
    extracted into the cache) are removed recursively; the cache directory
    itself is kept. Prints the number of top-level entries removed.
    """
    import shutil

    cache = get_cache_dir()
    count = 0
    for entry in cache.iterdir():
        if entry.is_dir() and not entry.is_symlink():
            # unlink() cannot remove directories.
            shutil.rmtree(entry)
        else:
            entry.unlink()
        count += 1
    print(f"Cleared {count} cached files from {cache}")
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Export cleaned CDFI Fund DataFrames to CSV, SQLite, and Parquet formats.
|
|
3
|
+
"""
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def to_csv(df: pd.DataFrame, path: str, **kwargs) -> None:
    """
    Write ``df`` to a CSV file without the index column.

    Missing parent directories are created. Extra keyword arguments are
    forwarded to ``DataFrame.to_csv``.
    """
    destination = Path(path)
    destination.parent.mkdir(exist_ok=True, parents=True)
    df.to_csv(destination, index=False, **kwargs)
    row_count = len(df)
    print(f"Exported {row_count:,} rows to {destination}")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def to_sqlite(df: pd.DataFrame, db_path: str, table_name: str,
              if_exists: str = "replace") -> None:
    """
    Export DataFrame to a SQLite database table.

    Missing parent directories of ``db_path`` are created. The connection is
    committed and closed before the function returns.

    Args:
        df: DataFrame to export
        db_path: Path to SQLite database file
        table_name: Table name to write to
        if_exists: 'replace', 'append', or 'fail'
    """
    import sqlite3
    from contextlib import closing

    path = Path(db_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it does not close. closing() releases the file handle as well.
    with closing(sqlite3.connect(path)) as conn:
        df.to_sql(table_name, conn, if_exists=if_exists, index=False)
        conn.commit()
    print(f"Exported {len(df):,} rows to {path} (table: {table_name})")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def to_parquet(df: pd.DataFrame, path: str, **kwargs) -> None:
    """
    Write ``df`` to a Parquet file without the index.

    Missing parent directories are created. Extra keyword arguments are
    forwarded to ``DataFrame.to_parquet``.

    Raises:
        ImportError: If pyarrow is not installed.
    """
    try:
        import pyarrow  # imported only to check that it is available
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet export. "
            "Install it with: pip install pyarrow"
        )
    destination = Path(path)
    destination.parent.mkdir(exist_ok=True, parents=True)
    df.to_parquet(destination, index=False, **kwargs)
    row_count = len(df)
    print(f"Exported {row_count:,} rows to {destination}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def export_all(df: pd.DataFrame, base_path: str, table_name: str) -> None:
    """
    Write ``df`` as CSV, SQLite and Parquet next to each other.

    The three outputs are ``<base_path>.csv``, ``<base_path>.db`` and
    ``<base_path>.parquet``. The Parquet step is skipped with a message when
    it raises ImportError (pyarrow missing).

    Args:
        df: DataFrame to export
        base_path: Base path without extension
        table_name: SQLite table name
    """
    csv_target = f"{base_path}.csv"
    db_target = f"{base_path}.db"
    parquet_target = f"{base_path}.parquet"

    to_csv(df, csv_target)
    to_sqlite(df, db_target, table_name)
    try:
        to_parquet(df, parquet_target)
    except ImportError:
        print("Skipping parquet export — pyarrow not installed")
|
|
File without changes
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CDFI Fund Awards Database loader.
|
|
3
|
+
Covers all CDFI Fund program awardees across all years.
|
|
4
|
+
"""
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import random
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from cdfidata.pipeline.cleaner import standardize
|
|
11
|
+
from cdfidata.pipeline.exporter import to_csv, to_sqlite
|
|
12
|
+
from cdfidata.utils.schema import AWARDS_COLUMNS, AWARDS_DTYPES
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AwardsLoader:
    """
    Loader for CDFI Fund Awards Database.

    The most recently loaded DataFrame is kept on the instance; the filter,
    summary and export methods operate on it and raise RuntimeError when
    nothing has been loaded yet.

    Usage:
        loader = AwardsLoader()
        df = loader.load_sample()
        df_il = loader.filter_state("IL")
    """

    def __init__(self):
        # None until load_from_file() or load_sample() has run.
        self._df: Optional[pd.DataFrame] = None

    def load_from_file(self, path: str) -> pd.DataFrame:
        """
        Load awards data from a local CSV file.

        All columns are read as strings and then passed through
        ``standardize`` with the awards column/dtype maps; rows without an
        ``award_amount`` are dropped.
        """
        print(f"Loading awards data from {path}...")
        df = pd.read_csv(path, dtype=str, low_memory=False)
        df = standardize(df, AWARDS_COLUMNS, AWARDS_DTYPES,
                         required_cols=["award_amount"])
        self._df = df
        return df

    def load_sample(self, n: int = 500) -> pd.DataFrame:
        """
        Generate synthetic awards sample data for testing.

        The data is deterministic (fixed seed 42). Private generator
        instances are used so the caller's global ``random`` and
        ``numpy.random`` state is left untouched.

        Args:
            n: Number of sample records

        Returns:
            Synthetic DataFrame with the cleaned awards column names
        """
        py_rng = random.Random(42)
        np_rng = np.random.RandomState(42)

        states = ["IL", "NY", "CA", "TX", "GA", "NC", "OH", "PA", "FL", "MI"]
        programs = ["FA", "NACA", "NMTC", "BEA", "CMF", "BOND"]
        inst_types = ["Loan Fund", "Bank", "Credit Union", "Venture Fund",
                      "Depository Institution Holding Company"]

        records = []
        for i in range(n):
            # Draw order matters: it keeps the output identical to seeding
            # the global generators with 42.
            records.append({
                "awardee_name": f"Community Development Fund {i+1}",
                "state": py_rng.choice(states),
                "award_year": py_rng.randint(2010, 2023),
                "award_amount": round(np_rng.lognormal(14, 1)),
                "program": py_rng.choice(programs),
                "award_type": "Financial Assistance",
                "institution_type": py_rng.choice(inst_types),
            })

        self._df = pd.DataFrame(records)
        return self._df

    def filter_state(self, state: str) -> pd.DataFrame:
        """Return a copy of the rows whose state equals ``state`` upper-cased."""
        self._check_loaded()
        return self._df[self._df["state"] == state.upper()].copy()

    def filter_program(self, program: str) -> pd.DataFrame:
        """Return a copy of the rows whose program contains ``program`` (case-insensitive)."""
        self._check_loaded()
        return self._df[
            self._df["program"].str.contains(program, case=False, na=False)
        ].copy()

    def filter_year(self, year: int) -> pd.DataFrame:
        """Return a copy of the rows awarded in ``year``."""
        self._check_loaded()
        return self._df[self._df["award_year"] == year].copy()

    def summary(self) -> None:
        """Print headline statistics of the loaded awards data."""
        self._check_loaded()
        df = self._df
        print("\nAwards Database Summary")
        print(f" Total awards: {len(df):,}")
        print(f" Total amount: ${df['award_amount'].sum()/1e9:.2f}B")
        print(f" States covered: {df['state'].nunique()}")
        print(f" Programs: {df['program'].nunique()}")
        print(f" Year range: {df['award_year'].min()}–{df['award_year'].max()}")
        print()

    def to_csv(self, path: str) -> None:
        """Export the loaded data to a CSV file."""
        self._check_loaded()
        to_csv(self._df, path)

    def to_sqlite(self, db_path: str, table: str = "awards") -> None:
        """Export the loaded data to a SQLite table."""
        self._check_loaded()
        to_sqlite(self._df, db_path, table)

    def _check_loaded(self) -> None:
        """Raise RuntimeError when no data has been loaded yet."""
        if self._df is None:
            raise RuntimeError(
                "No data loaded. Call .load_from_file() or .load_sample() first."
            )
|
cdfidata/sources/clr.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLR (Consumer Loan Report) data loader.
|
|
3
|
+
Aggregated to census tract level, 12 variables.
|
|
4
|
+
"""
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import random
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from cdfidata.pipeline.cleaner import standardize
|
|
11
|
+
from cdfidata.pipeline.exporter import to_csv, to_sqlite, to_parquet
|
|
12
|
+
from cdfidata.utils.schema import CLR_COLUMNS, CLR_DTYPES
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CLRLoader:
    """
    Loader for CDFI Fund Consumer Loan Report (CLR) data.

    The most recently loaded DataFrame is kept on the instance; the filter,
    summary and export methods operate on it and raise RuntimeError when
    nothing has been loaded yet.

    Usage:
        loader = CLRLoader()
        df = loader.load_sample()
        df_il = loader.filter_state("IL")
    """

    def __init__(self):
        # None until load_from_file() or load_sample() has run.
        self._df: Optional[pd.DataFrame] = None
        # Fiscal year passed to load_from_file(); kept for reference only.
        self._year: Optional[int] = None

    def load_from_file(self, path: str, year: int = 2022) -> pd.DataFrame:
        """
        Load CLR data from a local CSV file.

        All columns are read as strings and then passed through
        ``standardize`` with the CLR column/dtype maps; rows without a
        ``total_amount`` are dropped.

        Args:
            path: Path to the CLR CSV file
            year: Fiscal year for reference

        Returns:
            Clean pandas DataFrame
        """
        self._year = year
        print(f"Loading CLR data from {path}...")
        df = pd.read_csv(path, dtype=str, low_memory=False)
        print(f"Raw records: {len(df):,}")
        df = standardize(df, CLR_COLUMNS, CLR_DTYPES,
                         required_cols=["total_amount"])
        print(f"Clean records: {len(df):,}")
        self._df = df
        return df

    def load_sample(self, n: int = 1000) -> pd.DataFrame:
        """
        Generate synthetic CLR sample data for testing.

        The data is deterministic (fixed seed 42). Private generator
        instances are used so the caller's global ``random`` and
        ``numpy.random`` state is left untouched.

        Args:
            n: Number of sample records

        Returns:
            Synthetic DataFrame with the cleaned CLR column names
        """
        py_rng = random.Random(42)
        np_rng = np.random.RandomState(42)

        states = ["IL", "NY", "CA", "TX", "GA", "NC", "OH", "PA", "FL", "MI"]
        loan_types = ["Auto Loan", "Personal Loan", "Credit Card",
                      "Student Loan", "Home Improvement"]

        records = []
        for i in range(n):
            # Draw order matters: it keeps the output identical to seeding
            # the global generators with 42.
            n_loans = py_rng.randint(1, 500)
            avg = round(np_rng.lognormal(8, 1))
            records.append({
                "fiscal_year": py_rng.choice([2020, 2021, 2022]),
                "state": py_rng.choice(states),
                "census_tract": f"{py_rng.randint(1000, 9999):04d}",
                "loan_type": py_rng.choice(loan_types),
                "number_of_loans": n_loans,
                "total_amount": n_loans * avg,
                "average_amount": avg,
                "low_income_area": py_rng.choice([True, False]),
                "minority_area": py_rng.choice([True, False]),
                "rural_area": py_rng.choice([True, False]),
                "program": py_rng.choice(["FA", "NACA", "RRP"]),
                "award_type": "Financial Assistance",
            })

        self._df = pd.DataFrame(records)
        return self._df

    def filter_state(self, state: str) -> pd.DataFrame:
        """Return a copy of the rows whose state equals ``state`` upper-cased."""
        self._check_loaded()
        return self._df[self._df["state"] == state.upper()].copy()

    def summary(self) -> None:
        """Print headline statistics of the loaded CLR data."""
        self._check_loaded()
        df = self._df
        print("\nCLR Data Summary")
        print(f" Total records: {len(df):,}")
        print(f" Total loans: {df['number_of_loans'].sum():,.0f}")
        print(f" Total amount: ${df['total_amount'].sum()/1e9:.2f}B")
        print(f" States covered: {df['state'].nunique()}")
        print()

    def to_csv(self, path: str) -> None:
        """Export the loaded data to a CSV file."""
        self._check_loaded()
        to_csv(self._df, path)

    def to_sqlite(self, db_path: str, table: str = "clr") -> None:
        """Export the loaded data to a SQLite table."""
        self._check_loaded()
        to_sqlite(self._df, db_path, table)

    def _check_loaded(self) -> None:
        """Raise RuntimeError when no data has been loaded yet."""
        if self._df is None:
            raise RuntimeError(
                "No data loaded. Call .load_from_file() or .load_sample() first."
            )
|
cdfidata/sources/tlr.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TLR (Transaction Level Report) data loader.
|
|
3
|
+
Downloads, cleans, and returns CDFI Fund transaction-level loan data.
|
|
4
|
+
"""
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from cdfidata.pipeline.downloader import download_tlr, extract_zip, cache_path
|
|
10
|
+
from cdfidata.pipeline.cleaner import standardize
|
|
11
|
+
from cdfidata.pipeline.exporter import to_csv, to_sqlite, to_parquet
|
|
12
|
+
from cdfidata.utils.schema import TLR_COLUMNS, TLR_DTYPES
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TLRLoader:
    """
    Loader for CDFI Fund Transaction Level Report (TLR) data.

    The most recently loaded DataFrame is kept on the instance; the filter,
    summary and export methods operate on it and raise RuntimeError when
    nothing has been loaded yet.

    Usage:
        loader = TLRLoader()
        df = loader.load(year=2022)
        df_il = loader.filter_state("IL")
    """

    def __init__(self):
        # None until load() or load_sample() has run.
        self._df: Optional[pd.DataFrame] = None
        # Fiscal year of the last load() call; shown in summary().
        self._year: Optional[int] = None

    def load(self, year: int = 2022, force: bool = False) -> pd.DataFrame:
        """
        Download and load TLR data for a given fiscal year.

        The yearly archive is downloaded (or taken from the cache), extracted,
        and the first CSV that looks like a TLR file is read and standardized.
        Rows without an ``amount`` are dropped.

        Args:
            year: Fiscal year e.g. 2022
            force: Re-download even if cached

        Returns:
            Clean pandas DataFrame with standardized columns

        Raises:
            ValueError: If no download URL is known for ``year``.
            FileNotFoundError: If no TLR CSV can be located.
        """
        self._year = year

        # Download zip
        zip_path = download_tlr(year, force=force)

        # Extract
        extracted = extract_zip(zip_path)
        # Suffix match is case-insensitive so "*.CSV" members are found too.
        csv_files = [Path(f) for f in extracted
                     if str(f).lower().endswith(".csv")]
        # Prefer files whose own name mentions TLR: the archive holds TLR and
        # CLR data together, so a folder called "..._TLR_CLR_..." would make
        # every member match on its full path. Fall back to the path match
        # only when no file name matches.
        tlr_files = (
            [f for f in csv_files if "TLR" in f.name.upper()]
            or [f for f in csv_files if "TLR" in str(f).upper()]
        )

        if not tlr_files:
            # Try finding CSV directly in cache. cache_path("") is the cache
            # directory itself; its parent would be the wrong place to look.
            tlr_files = sorted(cache_path("").glob(f"*TLR*{year}*.csv"))

        if not tlr_files:
            raise FileNotFoundError(
                f"Could not find TLR CSV file in extracted archive for FY{year}. "
                f"Extracted files: {extracted}"
            )

        csv_path = tlr_files[0]
        print(f"Loading TLR data from {csv_path}...")

        df = pd.read_csv(csv_path, dtype=str, low_memory=False)
        print(f"Raw records: {len(df):,}")

        df = standardize(df, TLR_COLUMNS, TLR_DTYPES,
                         required_cols=["amount"])

        print(f"Clean records: {len(df):,}")
        self._df = df
        return df

    def load_sample(self, n: int = 1000) -> pd.DataFrame:
        """
        Generate synthetic TLR sample data for testing and demos.
        Does not require downloading real data.

        The data is deterministic (fixed seed 42). Private generator
        instances are used so the caller's global ``random`` and
        ``numpy.random`` state is left untouched.

        Args:
            n: Number of sample records

        Returns:
            Synthetic DataFrame with TLR schema
        """
        import numpy as np
        import random

        py_rng = random.Random(42)
        np_rng = np.random.RandomState(42)

        states = ["IL", "NY", "CA", "TX", "GA", "NC", "OH", "PA", "FL", "MI"]
        loan_types = ["Business Loan", "Microenterprise Loan", "Home Mortgage",
                      "Home Improvement", "Consumer Loan", "Commercial RE"]
        purposes = ["Job Creation", "Affordable Housing", "Small Business",
                    "Community Facility", "Microenterprise"]
        programs = ["FA", "NACA", "RRP"]

        records = []
        for i in range(n):
            # Draw order matters: it keeps the output identical to seeding
            # the global generators with 42.
            records.append({
                "fiscal_year": py_rng.choice([2020, 2021, 2022]),
                "award_number": f"FA-{i:06d}",
                "financing_type": "Loan",
                "loan_type": py_rng.choice(loan_types),
                "purpose": py_rng.choice(purposes),
                "amount": round(np_rng.lognormal(10, 1.5)),
                "term_months": py_rng.choice([12, 24, 36, 60, 84, 120]),
                "interest_rate": round(np_rng.uniform(0.02, 0.08), 4),
                "state": py_rng.choice(states),
                "census_tract": f"{py_rng.randint(1000, 9999):04d}",
                "low_income_area": py_rng.choice([True, False]),
                "distressed_area": py_rng.choice([True, False]),
                "minority_borrower": py_rng.choice([True, False]),
                "women_borrower": py_rng.choice([True, False]),
                "jobs_created": py_rng.randint(0, 50),
                "jobs_retained": py_rng.randint(0, 100),
                "program": py_rng.choice(programs),
            })

        self._df = pd.DataFrame(records)
        return self._df

    def filter_state(self, state: str) -> pd.DataFrame:
        """Filter loaded data to a specific state."""
        self._check_loaded()
        return self._df[self._df["state"] == state.upper()].copy()

    def filter_loan_type(self, loan_type: str) -> pd.DataFrame:
        """Filter by loan type (partial match)."""
        self._check_loaded()
        return self._df[
            self._df["loan_type"].str.contains(loan_type, case=False, na=False)
        ].copy()

    def filter_amount(self, min_amount: float = 0,
                      max_amount: float = float("inf")) -> pd.DataFrame:
        """Filter by loan amount range."""
        self._check_loaded()
        return self._df[
            (self._df["amount"] >= min_amount) &
            (self._df["amount"] <= max_amount)
        ].copy()

    def summary(self) -> pd.DataFrame:
        """Print headline statistics and return ``DataFrame.describe()`` of the loaded data."""
        self._check_loaded()
        df = self._df
        print(f"\nTLR Data Summary — FY{self._year}")
        print(f" Total records: {len(df):,}")
        print(f" Total amount: ${df['amount'].sum()/1e9:.2f}B")
        print(f" Median loan size: ${df['amount'].median():,.0f}")
        print(f" States covered: {df['state'].nunique()}")
        if "jobs_created" in df.columns:
            print(f" Total jobs created: {df['jobs_created'].sum():,.0f}")
        print()
        return df.describe()

    def to_csv(self, path: str) -> None:
        """Export the loaded data to a CSV file."""
        self._check_loaded()
        to_csv(self._df, path)

    def to_sqlite(self, db_path: str, table: str = "tlr") -> None:
        """Export the loaded data to a SQLite table."""
        self._check_loaded()
        to_sqlite(self._df, db_path, table)

    def to_parquet(self, path: str) -> None:
        """Export the loaded data to a Parquet file."""
        self._check_loaded()
        to_parquet(self._df, path)

    def _check_loaded(self) -> None:
        """Raise RuntimeError when no data has been loaded yet."""
        if self._df is None:
            raise RuntimeError(
                "No data loaded. Call .load(year=2022) or .load_sample() first."
            )
|
|
File without changes
|
cdfidata/utils/schema.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Column mappings, data dictionaries, and constants for CDFI Fund public datasets.
|
|
3
|
+
All source URLs and field definitions are based on official CDFI Fund documentation.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# ── TLR (Transaction Level Report) ───────────────────────────────────────────
|
|
7
|
+
# Source: cdfifund.gov — annual release, 61 variables, 1M+ loan observations
|
|
8
|
+
# Masked to protect individual CDFI identity
|
|
9
|
+
|
|
10
|
+
# Rename map for TLR files: upper-case raw column header -> snake_case name.
# Keys are upper-case because normalize_columns() upper-cases headers before
# looking them up.
TLR_COLUMNS = {
    "FISCAL_YEAR": "fiscal_year",
    "AWARD_NUMBER": "award_number",
    "AWARDEE_TYPE": "awardee_type",
    "FINANCING_TYPE": "financing_type",
    "LOAN_TYPE": "loan_type",
    "ACTIVITY_TYPE": "activity_type",
    "AMOUNT": "amount",
    "TERM": "term_months",
    "INTEREST_RATE": "interest_rate",
    "PURPOSE": "purpose",
    "STATE": "state",
    "CENSUS_TRACT": "census_tract",
    "COUNTY": "county",
    "METROPOLITAN_AREA": "metropolitan_area",
    "LOW_INCOME_AREA": "low_income_area",
    "DISTRESSED_AREA": "distressed_area",
    "MINORITY_BORROWER": "minority_borrower",
    "WOMEN_BORROWER": "women_borrower",
    "LOW_INCOME_BORROWER": "low_income_borrower",
    "FIRST_TIME_BORROWER": "first_time_borrower",
    "JOBS_CREATED": "jobs_created",
    "JOBS_RETAINED": "jobs_retained",
    "AFFORDABLE_UNITS": "affordable_units",
    "PROGRAM": "program",
}

# Target type per cleaned TLR column, using the type strings that
# cast_types() understands ("int", "float", "bool"). Columns not listed here
# are left as read.
TLR_DTYPES = {
    "fiscal_year": "int",
    "amount": "float",
    "term_months": "float",
    "interest_rate": "float",
    "jobs_created": "float",
    "jobs_retained": "float",
    "affordable_units": "float",
    "low_income_area": "bool",
    "distressed_area": "bool",
    "minority_borrower": "bool",
    "women_borrower": "bool",
    "low_income_borrower": "bool",
    "first_time_borrower": "bool",
}
|
|
52
|
+
|
|
53
|
+
# ── CLR (Consumer Loan Report) ────────────────────────────────────────────────
|
|
54
|
+
# Source: cdfifund.gov — aggregated to census tract, 12 variables
|
|
55
|
+
|
|
56
|
+
# Rename map for CLR files: upper-case raw column header -> snake_case name.
CLR_COLUMNS = {
    "FISCAL_YEAR": "fiscal_year",
    "STATE": "state",
    "CENSUS_TRACT": "census_tract",
    "LOAN_TYPE": "loan_type",
    "NUMBER_OF_LOANS": "number_of_loans",
    "TOTAL_AMOUNT": "total_amount",
    "AVERAGE_AMOUNT": "average_amount",
    "LOW_INCOME_AREA": "low_income_area",
    "MINORITY_AREA": "minority_area",
    "RURAL_AREA": "rural_area",
    "PROGRAM": "program",
    "AWARD_TYPE": "award_type",
}

# Target type per cleaned CLR column (type strings as used by cast_types()).
CLR_DTYPES = {
    "fiscal_year": "int",
    "number_of_loans": "int",
    "total_amount": "float",
    "average_amount": "float",
    "low_income_area": "bool",
    "minority_area": "bool",
    "rural_area": "bool",
}
|
|
80
|
+
|
|
81
|
+
# ── ILR (Institution Level Report) ───────────────────────────────────────────
|
|
82
|
+
# Source: data.gov CIIS — 11 years (FY2003-2013), 728 CDFIs
|
|
83
|
+
|
|
84
|
+
# Rename map for ILR files: upper-case raw column header -> snake_case name.
ILR_COLUMNS = {
    "FISCAL_YEAR": "fiscal_year",
    "CDFI_NAME": "cdfi_name",
    "STATE": "state",
    "INSTITUTION_TYPE": "institution_type",
    "TOTAL_ASSETS": "total_assets",
    "TOTAL_NET_ASSETS": "total_net_assets",
    "TOTAL_LOANS": "total_loans",
    "TOTAL_DEPOSITS": "total_deposits",
    "NET_INCOME": "net_income",
    "TOTAL_FINANCING": "total_financing",
    "TARGET_MARKET": "target_market",
    "CERTIFICATION_STATUS": "certification_status",
}

# Target type per cleaned ILR column (type strings as used by cast_types()).
ILR_DTYPES = {
    "fiscal_year": "int",
    "total_assets": "float",
    "total_net_assets": "float",
    "total_loans": "float",
    "total_deposits": "float",
    "net_income": "float",
    "total_financing": "float",
}
|
|
108
|
+
|
|
109
|
+
# ── NMTC Allocatee Data ───────────────────────────────────────────────────────
|
|
110
|
+
# Source: cdfifund.gov NMTC program allocatee database
|
|
111
|
+
|
|
112
|
+
# Rename map for NMTC allocatee data: upper-case raw column header ->
# snake_case name.
NMTC_COLUMNS = {
    "CDE_NAME": "cde_name",
    "STATE": "state",
    "ALLOCATION_YEAR": "allocation_year",
    "ALLOCATION_AMOUNT": "allocation_amount",
    "CUMULATIVE_ALLOCATION": "cumulative_allocation",
    "SERVICE_AREA": "service_area",
    "MISSION_FOCUS": "mission_focus",
}

# Target type per cleaned NMTC column (type strings as used by cast_types()).
NMTC_DTYPES = {
    "allocation_year": "int",
    "allocation_amount": "float",
    "cumulative_allocation": "float",
}
|
|
127
|
+
|
|
128
|
+
# ── Awards Database ───────────────────────────────────────────────────────────
|
|
129
|
+
# Source: cdfifund.gov Awards Database
|
|
130
|
+
|
|
131
|
+
# Rename map for the Awards Database: upper-case raw column header ->
# snake_case name.
AWARDS_COLUMNS = {
    "AWARDEE_NAME": "awardee_name",
    "STATE": "state",
    "AWARD_YEAR": "award_year",
    "AWARD_AMOUNT": "award_amount",
    "PROGRAM": "program",
    "AWARD_TYPE": "award_type",
    "INSTITUTION_TYPE": "institution_type",
}

# Target type per cleaned awards column (type strings as used by cast_types()).
AWARDS_DTYPES = {
    "award_year": "int",
    "award_amount": "float",
}
|
|
145
|
+
|
|
146
|
+
# ── Download URLs ─────────────────────────────────────────────────────────────
|
|
147
|
+
# Root of the CDFI Fund website.
CDFI_FUND_BASE = "https://www.cdfifund.gov"

# Fiscal year -> URL of the combined TLR/CLR public data zip. download_tlr()
# raises ValueError for any year that is not a key here.
TLR_URLS = {
    2022: "https://www.cdfifund.gov/sites/cdfi/files/2024-12/FY2022_TLR_CLR_Public_Data_File.zip",
    2021: "https://www.cdfifund.gov/sites/cdfi/files/2023-09/FY2021_TLR_CLR_Public_Data_File.zip",
}

# Address associated with the ILR data.
# NOTE(review): this looks like a dataset landing page rather than a direct file download — confirm.
ILR_URL = "https://data.gov/dataset/data-on-cdfi-program-awardees"

# Address associated with the NMTC allocatee data.
# NOTE(review): likewise appears to be a web page, not a data file — confirm.
NMTC_URL = "https://www.cdfifund.gov/programs-training/programs/new-markets-tax-credit/allocatees"
|
|
157
|
+
|
|
158
|
+
# ── Cache directory ───────────────────────────────────────────────────────────
|
|
159
|
+
import os
|
|
160
|
+
# Per-user cache location: <home>/.cdfidata/cache. The path is computed once at
# import time; this statement does not create the directory.
CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cdfidata", "cache")
|
|
161
|
+
|
|
162
|
+
# ── Boolean value mappings ────────────────────────────────────────────────────
|
|
163
|
+
# String tokens treated as boolean True.
# Each canonical token is accepted in upper, lower and capitalized form, so
# "TRUE", "true" and "True" all match. This is a strict superset of the
# previous hand-written set, which listed "X" without "x" and had no
# capitalized "Yes"/"True".
BOOL_TRUE_VALUES = {
    variant
    for token in ("Y", "YES", "1", "TRUE", "X")
    for variant in (token, token.lower(), token.capitalize())
}
|
|
164
|
+
# String tokens treated as boolean False; the empty string counts as False.
# Each canonical token is accepted in upper, lower and capitalized form, so
# "FALSE", "false" and "False" all match. This is a strict superset of the
# previous hand-written set, which had no capitalized "No"/"False".
BOOL_FALSE_VALUES = {
    variant
    for token in ("N", "NO", "0", "FALSE", "")
    for variant in (token, token.lower(), token.capitalize())
}
|
|
165
|
+
|
|
166
|
+
# ── US state, DC, and territory abbreviations ───────────────────────────────
|
|
167
|
+
# Two-letter USPS abbreviations: the 50 states, the District of Columbia and
# the five inhabited territories. AS and MP were previously missing even
# though GU, PR and VI were listed.
US_STATES = {
    # 50 states
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
    # federal district
    "DC",
    # inhabited territories
    "AS", "GU", "MP", "PR", "VI",
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cdfidata
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: ETL pipeline for US Treasury CDFI Fund public datasets — TLR, CLR, ILR, NMTC, and Awards data
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/Jaypatel1511/cdfi-data
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pandas>=1.4.0
|
|
10
|
+
Requires-Dist: numpy>=1.21.0
|
|
11
|
+
Requires-Dist: requests>=2.27.0
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
cdfidata/__init__.py,sha256=nUODBEA_UCmmvQGlWpQmd4Idr4AlViEEJZzoeeEkeuI,458
|
|
2
|
+
cdfidata/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
cdfidata/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
cdfidata/pipeline/cleaner.py,sha256=FWvZ7-71R57oryeg8qkvrdJz6DQSPCpmzwawdASXocE,3272
|
|
5
|
+
cdfidata/pipeline/downloader.py,sha256=LvHyxBmYRHeT7JjqH7ucFbQmladnAKTR0-i-dZluAls,3519
|
|
6
|
+
cdfidata/pipeline/exporter.py,sha256=XNKvE6ZahnCkUOYU2NLB-I-9LF7AkP3leMkEJPXqsvw,2023
|
|
7
|
+
cdfidata/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
cdfidata/sources/awards.py,sha256=So3KrPgDhFRSFTcDB-0vFOLSS1uCytWPvTz-jMNFiYw,3465
|
|
9
|
+
cdfidata/sources/clr.py,sha256=ThEUKyHbpzij-lQE8rTWBkkedi0DEENqQC71dOFDDm8,3648
|
|
10
|
+
cdfidata/sources/tlr.py,sha256=YiAAgc1V1RD9EemHefmnTFgRYstx0U1ccNcePJPWQYw,6135
|
|
11
|
+
cdfidata/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
cdfidata/utils/schema.py,sha256=suAMxlvemblZClOWS2TCKCpdezGqWCUEWBwIfUqW9lo,7385
|
|
13
|
+
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
tests/conftest.py,sha256=mjjNNhFkzjLPGaOSmlvK-bS00PjXQsStr75jOWD1mBE,580
|
|
15
|
+
tests/test_awards.py,sha256=bghSiCPy1tJmGGYxFs9uhijpRWtFiBtxIuHxq5k2-t8,1284
|
|
16
|
+
tests/test_cleaner.py,sha256=SXFsRUvVpc5gOiV1rOZoEcfw-MiKGS8pGNJNDUc1fvU,1672
|
|
17
|
+
tests/test_clr.py,sha256=gyECfdpjXgl3bQzR4_zS28GSAAd1jxR5fPr_sEmWQNo,814
|
|
18
|
+
tests/test_tlr.py,sha256=YSDLjgJcj3a24_wgerhHNDTxhEa1tjbldxwSMgNwsT4,1827
|
|
19
|
+
cdfidata-0.1.0.dist-info/METADATA,sha256=2hgGvrWdU71W98bvVyYj7T3P3uVnTe9oty8Uy16Ow8U,388
|
|
20
|
+
cdfidata-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
21
|
+
cdfidata-0.1.0.dist-info/top_level.txt,sha256=q-ti-k5gZ_-15nSh0kiVoq5UMK6uajqbb09uSXpHCag,15
|
|
22
|
+
cdfidata-0.1.0.dist-info/RECORD,,
|
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from cdfidata.sources.tlr import TLRLoader
|
|
3
|
+
from cdfidata.sources.clr import CLRLoader
|
|
4
|
+
from cdfidata.sources.awards import AwardsLoader
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@pytest.fixture
def tlr_sample():
    """Return the result of ``load_sample(n=500)`` from a fresh ``TLRLoader``."""
    return TLRLoader().load_sample(n=500)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.fixture
def tlr_loader():
    """Return a ``TLRLoader`` on which ``load_sample(n=500)`` has already been called."""
    populated = TLRLoader()
    populated.load_sample(n=500)
    return populated
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.fixture
def clr_loader():
    """Return a ``CLRLoader`` on which ``load_sample(n=500)`` has already been called."""
    populated = CLRLoader()
    populated.load_sample(n=500)
    return populated
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@pytest.fixture
def awards_loader():
    """Return an ``AwardsLoader`` on which ``load_sample(n=200)`` has already been called."""
    populated = AwardsLoader()
    populated.load_sample(n=200)
    return populated
|
tests/test_awards.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from cdfidata.sources.awards import AwardsLoader
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_load_sample_returns_dataframe():
    """``load_sample(n=100)`` returns a DataFrame with exactly 100 rows."""
    sample = AwardsLoader().load_sample(n=100)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == 100
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_load_sample_has_required_columns():
    """The awards sample exposes every column in the required list."""
    sample = AwardsLoader().load_sample(n=100)
    expected = ("awardee_name", "state", "award_year", "award_amount", "program")
    assert all(name in sample.columns for name in expected)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_filter_state(awards_loader):
    """Every row returned by ``filter_state("IL")`` has state "IL"."""
    states = awards_loader.filter_state("IL")["state"]
    assert all(state == "IL" for state in states)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_filter_program(awards_loader):
    """Every row returned by ``filter_program("FA")`` has "FA" in its program, ignoring case."""
    programs = awards_loader.filter_program("FA")["program"]
    matches = programs.str.contains("FA", case=False)
    assert all(matches)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_filter_year(awards_loader):
    """Every row returned by ``filter_year(2020)`` has award_year 2020."""
    years = awards_loader.filter_year(2020)["award_year"]
    assert all(year == 2020 for year in years)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_summary_runs(awards_loader):
    """Smoke test: ``summary()`` on a loaded AwardsLoader completes without raising."""
    awards_loader.summary()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_not_loaded_raises():
    """Filtering before any load raises RuntimeError matching "No data loaded"."""
    unloaded = AwardsLoader()
    with pytest.raises(RuntimeError, match="No data loaded"):
        unloaded.filter_state("IL")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_to_csv(awards_loader, tmp_path):
    """``to_csv`` writes a file that reads back with all 200 fixture rows."""
    target = str(tmp_path / "awards.csv")
    awards_loader.to_csv(target)
    assert len(pd.read_csv(target)) == 200
|
tests/test_cleaner.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from cdfidata.pipeline.cleaner import (
|
|
4
|
+
normalize_columns, cast_types, clean_strings,
|
|
5
|
+
drop_empty_rows, standardize
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_normalize_columns():
    """``normalize_columns`` renames headers according to the supplied map."""
    raw = pd.DataFrame({"AMOUNT": [1, 2], "STATE": ["IL", "NY"]})
    renamed = normalize_columns(raw, {"AMOUNT": "amount", "STATE": "state"})
    for name in ("amount", "state"):
        assert name in renamed.columns
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_cast_types_float():
    """Casting to "float" yields a float column, with the unparseable "abc" turned into NaN."""
    raw = pd.DataFrame({"amount": ["1000.50", "2000.00", "abc"]})
    amounts = cast_types(raw, {"amount": "float"})["amount"]
    assert amounts.dtype == float
    assert pd.isna(amounts.iloc[2])
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_cast_types_bool():
    """Casting to "bool" maps each supported token to the right truth value.

    The original test fed six tokens in but asserted only the first two,
    using ``== True`` / ``== False`` comparisons. All six are now checked.
    The expected values for "YES"/"NO"/"1"/"0" assume ``cast_types`` treats
    them like "Y"/"N".
    """
    tokens = ["Y", "N", "YES", "NO", "1", "0"]
    expected = [True, False, True, False, True, False]
    result = cast_types(pd.DataFrame({"flag": tokens}), {"flag": "bool"})
    for token, actual, want in zip(tokens, result["flag"], expected):
        # bool() normalizes numpy/pandas boolean scalars before the identity check.
        assert bool(actual) is want, f"{token!r} was cast to {actual!r}"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_clean_strings():
    """``clean_strings`` strips surrounding whitespace from string cells."""
    padded = pd.DataFrame({"name": [" IL ", " NY "]})
    cleaned = clean_strings(padded)
    assert cleaned["name"].iloc[0] == "IL"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_drop_empty_rows():
    """Only rows missing a required column are dropped.

    The row with a null ``state`` but a valid ``amount`` survives, so two of
    the three rows remain.
    """
    frame = pd.DataFrame({"amount": [100, None, 200], "state": ["IL", "NY", None]})
    kept = drop_empty_rows(frame, required_cols=["amount"])
    assert len(kept) == 2
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_standardize_pipeline():
    """End-to-end ``standardize``: columns are renamed and the null-amount row is dropped."""
    raw = pd.DataFrame(
        {
            "AMOUNT": ["1000", "2000", None],
            "STATE": [" IL ", " NY ", " CA "],
        }
    )
    options = {
        "column_map": {"AMOUNT": "amount", "STATE": "state"},
        "dtype_map": {"amount": "float"},
        "required_cols": ["amount"],
    }
    cleaned = standardize(raw, **options)
    assert {"amount", "state"} <= set(cleaned.columns)
    assert len(cleaned) == 2
|
tests/test_clr.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from cdfidata.sources.clr import CLRLoader
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_load_sample_returns_dataframe():
    """``load_sample(n=100)`` returns a DataFrame with exactly 100 rows."""
    sample = CLRLoader().load_sample(n=100)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == 100
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_load_sample_has_required_columns():
    """The CLR sample exposes every column in the required list."""
    sample = CLRLoader().load_sample(n=100)
    expected = ("fiscal_year", "state", "number_of_loans", "total_amount")
    assert all(name in sample.columns for name in expected)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_filter_state(clr_loader):
    """Every row returned by ``filter_state("IL")`` has state "IL"."""
    states = clr_loader.filter_state("IL")["state"]
    assert all(state == "IL" for state in states)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_summary_runs(clr_loader):
    """Smoke test: ``summary()`` on a loaded CLRLoader completes without raising."""
    clr_loader.summary()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_not_loaded_raises():
    """Filtering before any load raises RuntimeError matching "No data loaded"."""
    unloaded = CLRLoader()
    with pytest.raises(RuntimeError, match="No data loaded"):
        unloaded.filter_state("IL")
|
tests/test_tlr.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from cdfidata.sources.tlr import TLRLoader
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_load_sample_returns_dataframe():
    """``load_sample(n=100)`` returns a DataFrame with exactly 100 rows."""
    sample = TLRLoader().load_sample(n=100)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == 100
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_load_sample_has_required_columns():
    """The TLR sample exposes every column in the required list."""
    sample = TLRLoader().load_sample(n=100)
    expected = ("fiscal_year", "amount", "state", "loan_type", "purpose")
    assert all(name in sample.columns for name in expected)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_load_sample_amounts_positive():
    """Every amount in the TLR sample is strictly greater than zero."""
    amounts = TLRLoader().load_sample(n=100)["amount"]
    assert amounts.gt(0).all()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_filter_state(tlr_loader):
    """Every row returned by ``filter_state("IL")`` has state "IL"."""
    states = tlr_loader.filter_state("IL")["state"]
    assert all(state == "IL" for state in states)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_filter_state_empty(tlr_loader):
    """Filtering on the unknown state code "ZZ" yields zero rows."""
    no_rows = tlr_loader.filter_state("ZZ")
    assert len(no_rows) == 0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_filter_loan_type(tlr_loader):
    """Every row returned by ``filter_loan_type("Business")`` has "Business" in loan_type, ignoring case."""
    loan_types = tlr_loader.filter_loan_type("Business")["loan_type"]
    matches = loan_types.str.contains("Business", case=False)
    assert all(matches)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_filter_amount(tlr_loader):
    """``filter_amount`` keeps only rows inside the inclusive [min_amount, max_amount] range."""
    low, high = 10_000, 100_000
    amounts = tlr_loader.filter_amount(min_amount=low, max_amount=high)["amount"]
    assert all(amounts >= low)
    assert all(amounts <= high)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_not_loaded_raises():
    """Filtering before any load raises RuntimeError matching "No data loaded"."""
    unloaded = TLRLoader()
    with pytest.raises(RuntimeError, match="No data loaded"):
        unloaded.filter_state("IL")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_summary_runs(tlr_loader):
    """Smoke test: ``summary()`` on a loaded TLRLoader completes without raising."""
    tlr_loader.summary()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_to_csv(tlr_loader, tmp_path):
    """``to_csv`` writes a file that reads back with all 500 fixture rows."""
    target = str(tmp_path / "tlr.csv")
    tlr_loader.to_csv(target)
    assert len(pd.read_csv(target)) == 500
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_to_sqlite(tlr_loader, tmp_path):
    """``to_sqlite`` writes a ``tlr`` table that reads back with all 500 fixture rows."""
    import sqlite3
    from contextlib import closing

    path = str(tmp_path / "tlr.db")
    tlr_loader.to_sqlite(path, table="tlr")
    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it does not close. closing() releases the handle when the block exits.
    with closing(sqlite3.connect(path)) as conn:
        df = pd.read_sql("SELECT * FROM tlr", conn)
    assert len(df) == 500
|