cdfidata 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cdfidata/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ from cdfidata.sources.tlr import TLRLoader
2
+ from cdfidata.sources.clr import CLRLoader
3
+ from cdfidata.sources.awards import AwardsLoader
4
+ from cdfidata.pipeline.downloader import download_tlr, list_cached, clear_cache
5
+ from cdfidata.pipeline.exporter import to_csv, to_sqlite, to_parquet
6
+
7
# Package version string.
__version__ = "0.1.0"
# Public API: the three dataset loaders, the download/cache helpers, and the
# export helpers imported at the top of this module.
__all__ = [
    "TLRLoader", "CLRLoader", "AwardsLoader",
    "download_tlr", "list_cached", "clear_cache",
    "to_csv", "to_sqlite", "to_parquet",
]
File without changes
File without changes
@@ -0,0 +1,115 @@
1
+ """
2
+ Standardize and clean raw CDFI Fund DataFrames.
3
+ Handles column renaming, type casting, null handling, and boolean normalization.
4
+ """
5
+ import pandas as pd
6
+ from cdfidata.utils.schema import BOOL_TRUE_VALUES, BOOL_FALSE_VALUES
7
+
8
+
9
def normalize_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    """
    Rename columns using a mapping dict, leaving the input untouched.

    Column labels are stripped of surrounding whitespace and upper-cased
    before being looked up in ``column_map``. Labels without an entry keep
    their normalized (stripped, upper-case) form.

    Args:
        df: Raw DataFrame (not modified)
        column_map: Dict of {RAW_NAME: clean_name}, keyed by upper-case names

    Returns:
        New DataFrame with renamed columns
    """
    # Build the normalized labels separately instead of assigning to
    # df.columns, which would relabel the caller's DataFrame as a side effect.
    normalized = df.columns.str.strip().str.upper()
    renamed = df.copy()
    # Positional assignment keeps duplicate labels aligned with their columns.
    renamed.columns = [column_map.get(label, label) for label in normalized]
    return renamed
24
+
25
+
26
def cast_types(df: pd.DataFrame, dtype_map: dict) -> pd.DataFrame:
    """
    Convert the columns named in ``dtype_map`` to the requested types.

    The DataFrame is updated in place and also returned. Columns that are
    absent, or whose conversion raises, are left as they were.

    Args:
        df: DataFrame with clean column names
        dtype_map: Dict of {column_name: type_string}; "bool", "int" and
            "float" get dedicated handling, anything else goes to ``astype``

    Returns:
        The same DataFrame with the successfully cast columns replaced
    """
    targets = [(name, kind) for name, kind in dtype_map.items()
               if name in df.columns]
    for name, kind in targets:
        try:
            column = df[name]
            if kind == "bool":
                converted = column.apply(_parse_bool)
            elif kind == "int":
                # Nullable Int64 so coerced (unparseable) values can stay missing.
                converted = pd.to_numeric(column, errors="coerce").astype("Int64")
            elif kind == "float":
                converted = pd.to_numeric(column, errors="coerce")
            else:
                converted = column.astype(kind)
            df[name] = converted
        except Exception:
            # Best-effort by design: a failed cast keeps the original column.
            continue
    return df
53
+
54
+
55
def _parse_bool(val) -> "bool | None":
    """
    Parse a value to boolean using CDFI Fund conventions.

    Matching against BOOL_TRUE_VALUES / BOOL_FALSE_VALUES is done on the
    stripped text, first as written and then upper-cased, so mixed-case
    spellings such as "True", "False", "Yes" or "No" are recognized.

    Returns:
        True or False for recognized values; None for missing or
        unrecognized values.
    """
    if pd.isna(val):
        return None
    token = str(val).strip()
    # str(True) is "True", which the exact-match tables do not contain; the
    # upper-cased fallback covers that and other mixed-case spellings.
    candidates = (token, token.upper())
    if any(c in BOOL_TRUE_VALUES for c in candidates):
        return True
    if any(c in BOOL_FALSE_VALUES for c in candidates):
        return False
    return None
65
+
66
+
67
def clean_strings(df: pd.DataFrame, cols: list = None) -> pd.DataFrame:
    """
    Strip leading/trailing whitespace from string values.

    Only ``str`` values are touched; missing values and non-string objects
    (for example the True/False/None values produced by bool casting) pass
    through unchanged. Case is not modified. The DataFrame is updated in
    place and also returned.

    Args:
        df: DataFrame
        cols: List of columns to clean (default: all object columns)

    Returns:
        Cleaned DataFrame
    """
    if cols is None:
        cols = df.select_dtypes(include="object").columns.tolist()
    for col in cols:
        if col not in df.columns:
            continue
        # Element-wise instead of `.str.strip()`: the .str accessor rejects
        # object columns holding only booleans/None and blanks out
        # non-string values in mixed columns.
        df[col] = df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    return df
84
+
85
+
86
def drop_empty_rows(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
    """
    Remove rows in which every one of ``required_cols`` is null.

    A row is kept as soon as at least one required column has a value.
    """
    surviving = df.dropna(how="all", subset=required_cols)
    return surviving
89
+
90
+
91
def standardize(
    df: pd.DataFrame,
    column_map: dict,
    dtype_map: dict,
    required_cols: list = None,
) -> pd.DataFrame:
    """
    Run the whole cleaning pipeline: rename → cast → clean → drop empty.

    Args:
        df: Raw DataFrame
        column_map: Column rename mapping
        dtype_map: Type casting mapping
        required_cols: Columns of which at least one must be non-null for a
            row to be kept; when empty or None no rows are dropped

    Returns:
        Clean, standardized DataFrame with a fresh 0..n-1 index
    """
    renamed = normalize_columns(df, column_map)
    typed = cast_types(renamed, dtype_map)
    result = clean_strings(typed)
    if required_cols:
        result = drop_empty_rows(result, required_cols)
    # Renumber the rows so gaps left by dropped rows disappear.
    return result.reset_index(drop=True)
@@ -0,0 +1,134 @@
1
+ """
2
+ Download and cache CDFI Fund public datasets locally.
3
+ Files are cached to avoid repeated downloads.
4
+ """
5
+ import os
6
+ import zipfile
7
+ import requests
8
+ from pathlib import Path
9
+
10
+ from cdfidata.utils.schema import CACHE_DIR, TLR_URLS
11
+
12
+
13
def get_cache_dir() -> Path:
    """Return the local cache directory, creating it (and parents) if needed."""
    cache_root = Path(CACHE_DIR)
    cache_root.mkdir(exist_ok=True, parents=True)
    return cache_root
18
+
19
+
20
def cache_path(filename: str) -> Path:
    """Build the path of ``filename`` inside the cache directory."""
    cache_root = get_cache_dir()
    return cache_root / filename
23
+
24
+
25
def is_cached(filename: str) -> bool:
    """Report whether ``filename`` already exists in the local cache."""
    candidate = cache_path(filename)
    return candidate.exists()
28
+
29
+
30
def download_file(url: str, filename: str, force: bool = False) -> Path:
    """
    Download a file from a URL and cache it locally.

    The body is streamed into a temporary ``<filename>.part`` file and only
    moved to its final name once the transfer has completed, so an
    interrupted download never leaves a truncated file that later calls
    would mistake for a valid cache entry.

    Args:
        url: Full URL to download from
        filename: Local filename to save as
        force: Re-download even if cached

    Returns:
        Path to the cached file

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    path = cache_path(filename)

    if path.exists() and not force:
        print(f"Using cached file: {path}")
        return path

    print(f"Downloading {filename}...")
    print(f"URL: {url}")

    partial = path.with_name(path.name + ".part")
    try:
        # Context manager releases the streamed connection even on errors.
        with requests.get(url, stream=True, timeout=120) as response:
            response.raise_for_status()

            # content-length may be absent; 0 disables the progress display.
            total = int(response.headers.get("content-length", 0))
            downloaded = 0

            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        pct = downloaded / total * 100
                        print(f"\r {pct:.1f}% ({downloaded/1e6:.1f}MB)", end="")

        # Publish the finished download under its final name.
        partial.replace(path)
    finally:
        # Only present here if the download failed part-way through.
        partial.unlink(missing_ok=True)

    print(f"\nSaved to {path}")
    return path
67
+
68
+
69
def extract_zip(zip_path: Path, extract_to: Path = None) -> list:
    """
    Unpack every member of a zip archive.

    Args:
        zip_path: Path to the zip file
        extract_to: Destination directory (default: the cache directory)

    Returns:
        One path per archive member, formed by joining the destination
        directory with the member name
    """
    destination = extract_to or get_cache_dir()

    with zipfile.ZipFile(zip_path, "r") as archive:
        members = archive.namelist()
        archive.extractall(destination)
        print(f"Extracted {len(members)} files to {destination}")

    extracted = []
    for member in members:
        extracted.append(destination / member)
    return extracted
88
+
89
+
90
def download_tlr(year: int, force: bool = False) -> Path:
    """
    Fetch the TLR/CLR zip archive for one fiscal year.

    Args:
        year: Fiscal year e.g. 2022
        force: Re-download even if cached

    Returns:
        Path to the downloaded zip file

    Raises:
        ValueError: If no download URL is registered for ``year``.
    """
    if year in TLR_URLS:
        return download_file(
            TLR_URLS[year], f"TLR_CLR_FY{year}.zip", force=force
        )

    available = list(TLR_URLS.keys())
    raise ValueError(
        f"No TLR URL available for FY{year}. "
        f"Available years: {available}"
    )
111
+
112
+
113
def list_cached() -> list:
    """
    Print and return the entries currently in the local cache.

    Returns:
        The cache directory entries, in directory-listing order (the
        printed listing is sorted, the returned list is not).
    """
    cache = get_cache_dir()
    entries = list(cache.iterdir())
    if entries:
        print(f"Cached files in {cache}:")
        for entry in sorted(entries):
            megabytes = entry.stat().st_size / 1e6
            print(f" {entry.name} ({megabytes:.1f}MB)")
    else:
        print("Cache is empty.")
    return entries
125
+
126
+
127
def clear_cache() -> None:
    """
    Delete everything in the cache directory.

    Regular files and symlinks are unlinked; directories (which extracted
    archives can create inside the cache) are removed recursively. The
    printed count is the number of top-level entries removed.
    """
    import shutil

    cache = get_cache_dir()
    count = 0
    # Snapshot the listing first so deletions cannot disturb the iteration.
    for entry in list(cache.iterdir()):
        if entry.is_dir() and not entry.is_symlink():
            # Path.unlink() cannot remove directories.
            shutil.rmtree(entry)
        else:
            entry.unlink()
        count += 1
    print(f"Cleared {count} cached files from {cache}")
@@ -0,0 +1,65 @@
1
+ """
2
+ Export cleaned CDFI Fund DataFrames to CSV, SQLite, and Parquet formats.
3
+ """
4
+ import pandas as pd
5
+ from pathlib import Path
6
+
7
+
8
def to_csv(df: pd.DataFrame, path: str, **kwargs) -> None:
    """
    Write a DataFrame to a CSV file without the index column.

    Missing parent directories are created. Extra keyword arguments are
    forwarded to ``DataFrame.to_csv``.
    """
    target = Path(path)
    target.parent.mkdir(exist_ok=True, parents=True)
    df.to_csv(target, index=False, **kwargs)
    row_count = len(df)
    print(f"Exported {row_count:,} rows to {target}")
14
+
15
+
16
def to_sqlite(df: pd.DataFrame, db_path: str, table_name: str,
              if_exists: str = "replace") -> None:
    """
    Export DataFrame to a SQLite database table.

    Missing parent directories of the database file are created. The
    connection is committed on success, rolled back on error, and always
    closed before returning.

    Args:
        df: DataFrame to export
        db_path: Path to SQLite database file
        table_name: Table name to write to
        if_exists: 'replace', 'append', or 'fail'
    """
    import sqlite3
    from contextlib import closing

    path = Path(db_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # `with conn:` only manages the transaction; closing() is what releases
    # the database handle.
    with closing(sqlite3.connect(path)) as conn:
        with conn:
            df.to_sql(table_name, conn, if_exists=if_exists, index=False)
    print(f"Exported {len(df):,} rows to {path} (table: {table_name})")
34
+
35
+
36
def to_parquet(df: pd.DataFrame, path: str, **kwargs) -> None:
    """
    Write a DataFrame to a Parquet file without the index.

    Missing parent directories are created. Extra keyword arguments are
    forwarded to ``DataFrame.to_parquet``.

    Raises:
        ImportError: If pyarrow is not installed.
    """
    try:
        import pyarrow  # noqa: F401 -- imported only to check availability
    except ImportError:
        message = (
            "pyarrow is required for parquet export. "
            "Install it with: pip install pyarrow"
        )
        raise ImportError(message)

    target = Path(path)
    target.parent.mkdir(exist_ok=True, parents=True)
    df.to_parquet(target, index=False, **kwargs)
    row_count = len(df)
    print(f"Exported {row_count:,} rows to {target}")
49
+
50
+
51
def export_all(df: pd.DataFrame, base_path: str, table_name: str) -> None:
    """
    Write the DataFrame as CSV, SQLite and Parquet in one call.

    The three outputs share ``base_path`` and differ only by extension
    (.csv, .db, .parquet). Parquet is skipped with a message when the
    export raises ImportError.

    Args:
        df: DataFrame to export
        base_path: Base path without extension
        table_name: SQLite table name
    """
    csv_target = f"{base_path}.csv"
    sqlite_target = f"{base_path}.db"
    parquet_target = f"{base_path}.parquet"

    to_csv(df, csv_target)
    to_sqlite(df, sqlite_target, table_name)
    try:
        to_parquet(df, parquet_target)
    except ImportError:
        print("Skipping parquet export — pyarrow not installed")
File without changes
@@ -0,0 +1,99 @@
1
+ """
2
+ CDFI Fund Awards Database loader.
3
+ Covers all CDFI Fund program awardees across all years.
4
+ """
5
+ import pandas as pd
6
+ import numpy as np
7
+ import random
8
+ from typing import Optional
9
+
10
+ from cdfidata.pipeline.cleaner import standardize
11
+ from cdfidata.pipeline.exporter import to_csv, to_sqlite
12
+ from cdfidata.utils.schema import AWARDS_COLUMNS, AWARDS_DTYPES
13
+
14
+
15
class AwardsLoader:
    """
    Loader for CDFI Fund Awards Database.

    Data is held on the instance after a load call; the filter, summary
    and export methods operate on that loaded DataFrame and raise
    RuntimeError when nothing has been loaded yet.

    Usage:
        loader = AwardsLoader()
        df = loader.load_sample()
        df_il = loader.filter_state("IL")
    """

    def __init__(self):
        # DataFrame set by load_from_file() / load_sample(); None until then.
        self._df: Optional[pd.DataFrame] = None

    def load_from_file(self, path: str) -> pd.DataFrame:
        """Load awards data from a local CSV file and standardize it."""
        print(f"Loading awards data from {path}...")
        # Read everything as text; typing is applied by standardize().
        df = pd.read_csv(path, dtype=str, low_memory=False)
        df = standardize(df, AWARDS_COLUMNS, AWARDS_DTYPES,
                         required_cols=["award_amount"])
        self._df = df
        return df

    def load_sample(self, n: int = 500) -> pd.DataFrame:
        """
        Generate synthetic awards sample data for testing.

        The output is deterministic for a given ``n``. Private seeded
        generators are used so the process-wide ``random`` / ``numpy.random``
        state of the caller is left untouched.

        Args:
            n: Number of sample records

        Returns:
            Synthetic DataFrame with the awards columns
        """
        # Same seed and call order as seeding the global generators, so the
        # generated values are unchanged.
        py_rng = random.Random(42)
        np_rng = np.random.RandomState(42)

        states = ["IL", "NY", "CA", "TX", "GA", "NC", "OH", "PA", "FL", "MI"]
        programs = ["FA", "NACA", "NMTC", "BEA", "CMF", "BOND"]
        inst_types = ["Loan Fund", "Bank", "Credit Union", "Venture Fund",
                      "Depository Institution Holding Company"]

        records = []
        for i in range(n):
            records.append({
                "awardee_name": f"Community Development Fund {i+1}",
                "state": py_rng.choice(states),
                "award_year": py_rng.randint(2010, 2023),
                "award_amount": round(np_rng.lognormal(14, 1)),
                "program": py_rng.choice(programs),
                "award_type": "Financial Assistance",
                "institution_type": py_rng.choice(inst_types),
            })

        self._df = pd.DataFrame(records)
        return self._df

    def filter_state(self, state: str) -> pd.DataFrame:
        """Return a copy of the rows whose state equals ``state`` upper-cased."""
        self._check_loaded()
        return self._df[self._df["state"] == state.upper()].copy()

    def filter_program(self, program: str) -> pd.DataFrame:
        """Return a copy of the rows whose program contains ``program`` (case-insensitive)."""
        self._check_loaded()
        return self._df[
            self._df["program"].str.contains(program, case=False, na=False)
        ].copy()

    def filter_year(self, year: int) -> pd.DataFrame:
        """Return a copy of the rows whose award_year equals ``year``."""
        self._check_loaded()
        return self._df[self._df["award_year"] == year].copy()

    def summary(self) -> None:
        """Print headline totals for the loaded awards data."""
        self._check_loaded()
        df = self._df
        print("\nAwards Database Summary")
        print(f" Total awards: {len(df):,}")
        print(f" Total amount: ${df['award_amount'].sum()/1e9:.2f}B")
        print(f" States covered: {df['state'].nunique()}")
        print(f" Programs: {df['program'].nunique()}")
        print(f" Year range: {df['award_year'].min()}–{df['award_year'].max()}")
        print()

    def to_csv(self, path: str) -> None:
        """Export the loaded data to a CSV file."""
        self._check_loaded()
        # Resolves to the module-level exporter, not this method.
        to_csv(self._df, path)

    def to_sqlite(self, db_path: str, table: str = "awards") -> None:
        """Export the loaded data to a SQLite table."""
        self._check_loaded()
        to_sqlite(self._df, db_path, table)

    def _check_loaded(self) -> None:
        """Raise RuntimeError unless data has been loaded."""
        if self._df is None:
            raise RuntimeError(
                "No data loaded. Call .load_from_file() or .load_sample() first."
            )
@@ -0,0 +1,107 @@
1
+ """
2
+ CLR (Consumer Loan Report) data loader.
3
+ Aggregated to census tract level, 12 variables.
4
+ """
5
+ import pandas as pd
6
+ import numpy as np
7
+ import random
8
+ from typing import Optional
9
+
10
+ from cdfidata.pipeline.cleaner import standardize
11
+ from cdfidata.pipeline.exporter import to_csv, to_sqlite, to_parquet
12
+ from cdfidata.utils.schema import CLR_COLUMNS, CLR_DTYPES
13
+
14
+
15
class CLRLoader:
    """
    Loader for CDFI Fund Consumer Loan Report (CLR) data.

    Data is held on the instance after a load call; the filter, summary
    and export methods operate on that loaded DataFrame and raise
    RuntimeError when nothing has been loaded yet.

    Usage:
        loader = CLRLoader()
        df = loader.load_sample()
        df_il = loader.filter_state("IL")
    """

    def __init__(self):
        # DataFrame set by load_from_file() / load_sample(); None until then.
        self._df: Optional[pd.DataFrame] = None
        # Fiscal year passed to load_from_file(); kept for reference only.
        self._year: Optional[int] = None

    def load_from_file(self, path: str, year: int = 2022) -> pd.DataFrame:
        """
        Load CLR data from a local CSV file.

        Args:
            path: Path to the CLR CSV file
            year: Fiscal year for reference

        Returns:
            Clean pandas DataFrame
        """
        self._year = year
        print(f"Loading CLR data from {path}...")
        # Read everything as text; typing is applied by standardize().
        df = pd.read_csv(path, dtype=str, low_memory=False)
        print(f"Raw records: {len(df):,}")
        df = standardize(df, CLR_COLUMNS, CLR_DTYPES,
                         required_cols=["total_amount"])
        print(f"Clean records: {len(df):,}")
        self._df = df
        return df

    def load_sample(self, n: int = 1000) -> pd.DataFrame:
        """
        Generate synthetic CLR sample data for testing.

        The output is deterministic for a given ``n``. Private seeded
        generators are used so the process-wide ``random`` / ``numpy.random``
        state of the caller is left untouched.

        Args:
            n: Number of sample records

        Returns:
            Synthetic DataFrame with the CLR columns
        """
        # Same seed and call order as seeding the global generators, so the
        # generated values are unchanged.
        py_rng = random.Random(42)
        np_rng = np.random.RandomState(42)

        states = ["IL", "NY", "CA", "TX", "GA", "NC", "OH", "PA", "FL", "MI"]
        loan_types = ["Auto Loan", "Personal Loan", "Credit Card",
                      "Student Loan", "Home Improvement"]

        records = []
        for i in range(n):
            n_loans = py_rng.randint(1, 500)
            avg = round(np_rng.lognormal(8, 1))
            records.append({
                "fiscal_year": py_rng.choice([2020, 2021, 2022]),
                "state": py_rng.choice(states),
                "census_tract": f"{py_rng.randint(1000, 9999):04d}",
                "loan_type": py_rng.choice(loan_types),
                "number_of_loans": n_loans,
                # Keeps total consistent with count x average.
                "total_amount": n_loans * avg,
                "average_amount": avg,
                "low_income_area": py_rng.choice([True, False]),
                "minority_area": py_rng.choice([True, False]),
                "rural_area": py_rng.choice([True, False]),
                "program": py_rng.choice(["FA", "NACA", "RRP"]),
                "award_type": "Financial Assistance",
            })

        self._df = pd.DataFrame(records)
        return self._df

    def filter_state(self, state: str) -> pd.DataFrame:
        """Return a copy of the rows whose state equals ``state`` upper-cased."""
        self._check_loaded()
        return self._df[self._df["state"] == state.upper()].copy()

    def summary(self) -> None:
        """Print headline totals for the loaded CLR data."""
        self._check_loaded()
        df = self._df
        print("\nCLR Data Summary")
        print(f" Total records: {len(df):,}")
        print(f" Total loans: {df['number_of_loans'].sum():,.0f}")
        print(f" Total amount: ${df['total_amount'].sum()/1e9:.2f}B")
        print(f" States covered: {df['state'].nunique()}")
        print()

    def to_csv(self, path: str) -> None:
        """Export the loaded data to a CSV file."""
        self._check_loaded()
        # Resolves to the module-level exporter, not this method.
        to_csv(self._df, path)

    def to_sqlite(self, db_path: str, table: str = "clr") -> None:
        """Export the loaded data to a SQLite table."""
        self._check_loaded()
        to_sqlite(self._df, db_path, table)

    def _check_loaded(self) -> None:
        """Raise RuntimeError unless data has been loaded."""
        if self._df is None:
            raise RuntimeError(
                "No data loaded. Call .load_from_file() or .load_sample() first."
            )
@@ -0,0 +1,172 @@
1
+ """
2
+ TLR (Transaction Level Report) data loader.
3
+ Downloads, cleans, and returns CDFI Fund transaction-level loan data.
4
+ """
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from cdfidata.pipeline.downloader import download_tlr, extract_zip, cache_path
10
+ from cdfidata.pipeline.cleaner import standardize
11
+ from cdfidata.pipeline.exporter import to_csv, to_sqlite, to_parquet
12
+ from cdfidata.utils.schema import TLR_COLUMNS, TLR_DTYPES
13
+
14
+
15
class TLRLoader:
    """
    Loader for CDFI Fund Transaction Level Report (TLR) data.

    Data is held on the instance after a load call; the filter, summary
    and export methods operate on that loaded DataFrame and raise
    RuntimeError when nothing has been loaded yet.

    Usage:
        loader = TLRLoader()
        df = loader.load(year=2022)
        df_il = loader.filter_state("IL")
    """

    def __init__(self):
        # DataFrame set by load() / load_sample(); None until then.
        self._df: Optional[pd.DataFrame] = None
        # Fiscal year passed to load(); shown in the summary header.
        self._year: Optional[int] = None

    def load(self, year: int = 2022, force: bool = False) -> pd.DataFrame:
        """
        Download and load TLR data for a given fiscal year.

        Args:
            year: Fiscal year e.g. 2022
            force: Re-download even if cached

        Returns:
            Clean pandas DataFrame with standardized columns

        Raises:
            FileNotFoundError: If no TLR CSV can be located after extraction.
        """
        self._year = year

        # Download zip
        zip_path = download_tlr(year, force=force)

        # Extract, then keep CSV members whose path mentions TLR. The suffix
        # check is case-insensitive so ".CSV" members are not missed.
        extracted = extract_zip(zip_path)
        tlr_files = [
            f for f in extracted
            if "TLR" in str(f).upper() and str(f).lower().endswith(".csv")
        ]

        if not tlr_files:
            # Fall back to CSVs already sitting in the cache directory.
            # cache_path("") is the cache directory itself; sorted() makes
            # the pick deterministic when several files match.
            tlr_files = sorted(cache_path("").glob(f"*TLR*{year}*.csv"))

        if not tlr_files:
            raise FileNotFoundError(
                f"Could not find TLR CSV file in extracted archive for FY{year}. "
                f"Extracted files: {extracted}"
            )

        csv_path = tlr_files[0]
        print(f"Loading TLR data from {csv_path}...")

        # Read everything as text; typing is applied by standardize().
        df = pd.read_csv(csv_path, dtype=str, low_memory=False)
        print(f"Raw records: {len(df):,}")

        df = standardize(df, TLR_COLUMNS, TLR_DTYPES,
                         required_cols=["amount"])

        print(f"Clean records: {len(df):,}")
        self._df = df
        return df

    def load_sample(self, n: int = 1000) -> pd.DataFrame:
        """
        Generate synthetic TLR sample data for testing and demos.
        Does not require downloading real data.

        The output is deterministic for a given ``n``. Private seeded
        generators are used so the process-wide ``random`` / ``numpy.random``
        state of the caller is left untouched.

        Args:
            n: Number of sample records

        Returns:
            Synthetic DataFrame with TLR schema
        """
        import numpy as np
        import random

        # Same seed and call order as seeding the global generators, so the
        # generated values are unchanged.
        py_rng = random.Random(42)
        np_rng = np.random.RandomState(42)

        states = ["IL", "NY", "CA", "TX", "GA", "NC", "OH", "PA", "FL", "MI"]
        loan_types = ["Business Loan", "Microenterprise Loan", "Home Mortgage",
                      "Home Improvement", "Consumer Loan", "Commercial RE"]
        purposes = ["Job Creation", "Affordable Housing", "Small Business",
                    "Community Facility", "Microenterprise"]
        programs = ["FA", "NACA", "RRP"]

        records = []
        for i in range(n):
            records.append({
                "fiscal_year": py_rng.choice([2020, 2021, 2022]),
                "award_number": f"FA-{i:06d}",
                "financing_type": "Loan",
                "loan_type": py_rng.choice(loan_types),
                "purpose": py_rng.choice(purposes),
                "amount": round(np_rng.lognormal(10, 1.5)),
                "term_months": py_rng.choice([12, 24, 36, 60, 84, 120]),
                "interest_rate": round(np_rng.uniform(0.02, 0.08), 4),
                "state": py_rng.choice(states),
                "census_tract": f"{py_rng.randint(1000, 9999):04d}",
                "low_income_area": py_rng.choice([True, False]),
                "distressed_area": py_rng.choice([True, False]),
                "minority_borrower": py_rng.choice([True, False]),
                "women_borrower": py_rng.choice([True, False]),
                "jobs_created": py_rng.randint(0, 50),
                "jobs_retained": py_rng.randint(0, 100),
                "program": py_rng.choice(programs),
            })

        self._df = pd.DataFrame(records)
        return self._df

    def filter_state(self, state: str) -> pd.DataFrame:
        """Filter loaded data to a specific state."""
        self._check_loaded()
        return self._df[self._df["state"] == state.upper()].copy()

    def filter_loan_type(self, loan_type: str) -> pd.DataFrame:
        """Filter by loan type (partial, case-insensitive match)."""
        self._check_loaded()
        return self._df[
            self._df["loan_type"].str.contains(loan_type, case=False, na=False)
        ].copy()

    def filter_amount(self, min_amount: float = 0,
                      max_amount: float = float("inf")) -> pd.DataFrame:
        """Filter by loan amount range (both bounds inclusive)."""
        self._check_loaded()
        return self._df[
            (self._df["amount"] >= min_amount) &
            (self._df["amount"] <= max_amount)
        ].copy()

    def summary(self) -> pd.DataFrame:
        """Print headline totals and return ``describe()`` of the loaded data."""
        self._check_loaded()
        df = self._df
        print(f"\nTLR Data Summary — FY{self._year}")
        print(f" Total records: {len(df):,}")
        print(f" Total amount: ${df['amount'].sum()/1e9:.2f}B")
        print(f" Median loan size: ${df['amount'].median():,.0f}")
        print(f" States covered: {df['state'].nunique()}")
        if "jobs_created" in df.columns:
            print(f" Total jobs created: {df['jobs_created'].sum():,.0f}")
        print()
        return df.describe()

    def to_csv(self, path: str) -> None:
        """Export the loaded data to a CSV file."""
        self._check_loaded()
        # Resolves to the module-level exporter, not this method.
        to_csv(self._df, path)

    def to_sqlite(self, db_path: str, table: str = "tlr") -> None:
        """Export the loaded data to a SQLite table."""
        self._check_loaded()
        to_sqlite(self._df, db_path, table)

    def to_parquet(self, path: str) -> None:
        """Export the loaded data to a Parquet file."""
        self._check_loaded()
        to_parquet(self._df, path)

    def _check_loaded(self) -> None:
        """Raise RuntimeError unless data has been loaded."""
        if self._df is None:
            raise RuntimeError(
                "No data loaded. Call .load(year=2022) or .load_sample() first."
            )
File without changes
@@ -0,0 +1,174 @@
1
+ """
2
+ Column mappings, data dictionaries, and constants for CDFI Fund public datasets.
3
+ All source URLs and field definitions are based on official CDFI Fund documentation.
4
+ """
5
+
6
# ── TLR (Transaction Level Report) ───────────────────────────────────────────
# Source: cdfifund.gov — annual release, 61 variables, 1M+ loan observations
# Masked to protect individual CDFI identity

# Raw (upper-case) column header -> clean snake_case column name.
TLR_COLUMNS = {
    "FISCAL_YEAR": "fiscal_year",
    "AWARD_NUMBER": "award_number",
    "AWARDEE_TYPE": "awardee_type",
    "FINANCING_TYPE": "financing_type",
    "LOAN_TYPE": "loan_type",
    "ACTIVITY_TYPE": "activity_type",
    "AMOUNT": "amount",
    "TERM": "term_months",
    "INTEREST_RATE": "interest_rate",
    "PURPOSE": "purpose",
    "STATE": "state",
    "CENSUS_TRACT": "census_tract",
    "COUNTY": "county",
    "METROPOLITAN_AREA": "metropolitan_area",
    "LOW_INCOME_AREA": "low_income_area",
    "DISTRESSED_AREA": "distressed_area",
    "MINORITY_BORROWER": "minority_borrower",
    "WOMEN_BORROWER": "women_borrower",
    "LOW_INCOME_BORROWER": "low_income_borrower",
    "FIRST_TIME_BORROWER": "first_time_borrower",
    "JOBS_CREATED": "jobs_created",
    "JOBS_RETAINED": "jobs_retained",
    "AFFORDABLE_UNITS": "affordable_units",
    "PROGRAM": "program",
}

# Clean column name -> type string ("int", "float" or "bool").
# Columns not listed here are not cast.
TLR_DTYPES = {
    "fiscal_year": "int",
    "amount": "float",
    "term_months": "float",
    "interest_rate": "float",
    "jobs_created": "float",
    "jobs_retained": "float",
    "affordable_units": "float",
    "low_income_area": "bool",
    "distressed_area": "bool",
    "minority_borrower": "bool",
    "women_borrower": "bool",
    "low_income_borrower": "bool",
    "first_time_borrower": "bool",
}

# ── CLR (Consumer Loan Report) ────────────────────────────────────────────────
# Source: cdfifund.gov — aggregated to census tract, 12 variables

# Raw (upper-case) column header -> clean snake_case column name.
CLR_COLUMNS = {
    "FISCAL_YEAR": "fiscal_year",
    "STATE": "state",
    "CENSUS_TRACT": "census_tract",
    "LOAN_TYPE": "loan_type",
    "NUMBER_OF_LOANS": "number_of_loans",
    "TOTAL_AMOUNT": "total_amount",
    "AVERAGE_AMOUNT": "average_amount",
    "LOW_INCOME_AREA": "low_income_area",
    "MINORITY_AREA": "minority_area",
    "RURAL_AREA": "rural_area",
    "PROGRAM": "program",
    "AWARD_TYPE": "award_type",
}

# Clean column name -> type string for the CLR columns that are cast.
CLR_DTYPES = {
    "fiscal_year": "int",
    "number_of_loans": "int",
    "total_amount": "float",
    "average_amount": "float",
    "low_income_area": "bool",
    "minority_area": "bool",
    "rural_area": "bool",
}

# ── ILR (Institution Level Report) ───────────────────────────────────────────
# Source: data.gov CIIS — 11 years (FY2003-2013), 728 CDFIs

# Raw (upper-case) column header -> clean snake_case column name.
ILR_COLUMNS = {
    "FISCAL_YEAR": "fiscal_year",
    "CDFI_NAME": "cdfi_name",
    "STATE": "state",
    "INSTITUTION_TYPE": "institution_type",
    "TOTAL_ASSETS": "total_assets",
    "TOTAL_NET_ASSETS": "total_net_assets",
    "TOTAL_LOANS": "total_loans",
    "TOTAL_DEPOSITS": "total_deposits",
    "NET_INCOME": "net_income",
    "TOTAL_FINANCING": "total_financing",
    "TARGET_MARKET": "target_market",
    "CERTIFICATION_STATUS": "certification_status",
}

# Clean column name -> type string for the ILR columns that are cast.
ILR_DTYPES = {
    "fiscal_year": "int",
    "total_assets": "float",
    "total_net_assets": "float",
    "total_loans": "float",
    "total_deposits": "float",
    "net_income": "float",
    "total_financing": "float",
}

# ── NMTC Allocatee Data ───────────────────────────────────────────────────────
# Source: cdfifund.gov NMTC program allocatee database

# Raw (upper-case) column header -> clean snake_case column name.
NMTC_COLUMNS = {
    "CDE_NAME": "cde_name",
    "STATE": "state",
    "ALLOCATION_YEAR": "allocation_year",
    "ALLOCATION_AMOUNT": "allocation_amount",
    "CUMULATIVE_ALLOCATION": "cumulative_allocation",
    "SERVICE_AREA": "service_area",
    "MISSION_FOCUS": "mission_focus",
}

# Clean column name -> type string for the NMTC columns that are cast.
NMTC_DTYPES = {
    "allocation_year": "int",
    "allocation_amount": "float",
    "cumulative_allocation": "float",
}

# ── Awards Database ───────────────────────────────────────────────────────────
# Source: cdfifund.gov Awards Database

# Raw (upper-case) column header -> clean snake_case column name.
AWARDS_COLUMNS = {
    "AWARDEE_NAME": "awardee_name",
    "STATE": "state",
    "AWARD_YEAR": "award_year",
    "AWARD_AMOUNT": "award_amount",
    "PROGRAM": "program",
    "AWARD_TYPE": "award_type",
    "INSTITUTION_TYPE": "institution_type",
}

# Clean column name -> type string for the awards columns that are cast.
AWARDS_DTYPES = {
    "award_year": "int",
    "award_amount": "float",
}

# ── Download URLs ─────────────────────────────────────────────────────────────
CDFI_FUND_BASE = "https://www.cdfifund.gov"

# Fiscal year -> URL of the combined TLR/CLR public data zip for that year.
# Only the years listed here can be downloaded.
TLR_URLS = {
    2022: "https://www.cdfifund.gov/sites/cdfi/files/2024-12/FY2022_TLR_CLR_Public_Data_File.zip",
    2021: "https://www.cdfifund.gov/sites/cdfi/files/2023-09/FY2021_TLR_CLR_Public_Data_File.zip",
}

ILR_URL = "https://data.gov/dataset/data-on-cdfi-program-awardees"

NMTC_URL = "https://www.cdfifund.gov/programs-training/programs/new-markets-tax-credit/allocatees"

# ── Cache directory ───────────────────────────────────────────────────────────
import os
# Per-user cache location: <home>/.cdfidata/cache
CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cdfidata", "cache")

# ── Boolean value mappings ────────────────────────────────────────────────────
# Text values treated as True / False. Only upper-case and all-lower-case
# spellings are listed; the empty string counts as False.
BOOL_TRUE_VALUES = {"Y", "YES", "1", "TRUE", "X", "yes", "true", "y"}
BOOL_FALSE_VALUES = {"N", "NO", "0", "FALSE", "", "no", "false", "n"}

# ── US State abbreviations ────────────────────────────────────────────────────
# The 50 states plus DC, GU, PR and VI.
US_STATES = {
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
    "DC", "GU", "PR", "VI",
}
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: cdfidata
3
+ Version: 0.1.0
4
+ Summary: ETL pipeline for US Treasury CDFI Fund public datasets — TLR, CLR, ILR, NMTC, and Awards data
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/Jaypatel1511/cdfi-data
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pandas>=1.4.0
10
+ Requires-Dist: numpy>=1.21.0
11
+ Requires-Dist: requests>=2.27.0
@@ -0,0 +1,22 @@
1
+ cdfidata/__init__.py,sha256=nUODBEA_UCmmvQGlWpQmd4Idr4AlViEEJZzoeeEkeuI,458
2
+ cdfidata/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ cdfidata/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ cdfidata/pipeline/cleaner.py,sha256=FWvZ7-71R57oryeg8qkvrdJz6DQSPCpmzwawdASXocE,3272
5
+ cdfidata/pipeline/downloader.py,sha256=LvHyxBmYRHeT7JjqH7ucFbQmladnAKTR0-i-dZluAls,3519
6
+ cdfidata/pipeline/exporter.py,sha256=XNKvE6ZahnCkUOYU2NLB-I-9LF7AkP3leMkEJPXqsvw,2023
7
+ cdfidata/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ cdfidata/sources/awards.py,sha256=So3KrPgDhFRSFTcDB-0vFOLSS1uCytWPvTz-jMNFiYw,3465
9
+ cdfidata/sources/clr.py,sha256=ThEUKyHbpzij-lQE8rTWBkkedi0DEENqQC71dOFDDm8,3648
10
+ cdfidata/sources/tlr.py,sha256=YiAAgc1V1RD9EemHefmnTFgRYstx0U1ccNcePJPWQYw,6135
11
+ cdfidata/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ cdfidata/utils/schema.py,sha256=suAMxlvemblZClOWS2TCKCpdezGqWCUEWBwIfUqW9lo,7385
13
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ tests/conftest.py,sha256=mjjNNhFkzjLPGaOSmlvK-bS00PjXQsStr75jOWD1mBE,580
15
+ tests/test_awards.py,sha256=bghSiCPy1tJmGGYxFs9uhijpRWtFiBtxIuHxq5k2-t8,1284
16
+ tests/test_cleaner.py,sha256=SXFsRUvVpc5gOiV1rOZoEcfw-MiKGS8pGNJNDUc1fvU,1672
17
+ tests/test_clr.py,sha256=gyECfdpjXgl3bQzR4_zS28GSAAd1jxR5fPr_sEmWQNo,814
18
+ tests/test_tlr.py,sha256=YSDLjgJcj3a24_wgerhHNDTxhEa1tjbldxwSMgNwsT4,1827
19
+ cdfidata-0.1.0.dist-info/METADATA,sha256=2hgGvrWdU71W98bvVyYj7T3P3uVnTe9oty8Uy16Ow8U,388
20
+ cdfidata-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
21
+ cdfidata-0.1.0.dist-info/top_level.txt,sha256=q-ti-k5gZ_-15nSh0kiVoq5UMK6uajqbb09uSXpHCag,15
22
+ cdfidata-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ cdfidata
2
+ tests
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,31 @@
1
+ import pytest
2
+ from cdfidata.sources.tlr import TLRLoader
3
+ from cdfidata.sources.clr import CLRLoader
4
+ from cdfidata.sources.awards import AwardsLoader
5
+
6
+
7
@pytest.fixture
def tlr_sample():
    """Provide a 500-row synthetic TLR DataFrame."""
    return TLRLoader().load_sample(n=500)
11
+
12
+
13
@pytest.fixture
def tlr_loader():
    """Provide a TLRLoader already populated with 500 synthetic rows."""
    populated = TLRLoader()
    populated.load_sample(n=500)
    return populated
18
+
19
+
20
@pytest.fixture
def clr_loader():
    """Provide a CLRLoader already populated with 500 synthetic rows."""
    populated = CLRLoader()
    populated.load_sample(n=500)
    return populated
25
+
26
+
27
@pytest.fixture
def awards_loader():
    """Provide an AwardsLoader already populated with 200 synthetic rows."""
    populated = AwardsLoader()
    populated.load_sample(n=200)
    return populated
tests/test_awards.py ADDED
@@ -0,0 +1,50 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from cdfidata.sources.awards import AwardsLoader
4
+
5
+
6
def test_load_sample_returns_dataframe():
    """``AwardsLoader.load_sample(n=100)`` yields a 100-row DataFrame."""
    sample = AwardsLoader().load_sample(n=100)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == 100
11
+
12
+
13
def test_load_sample_has_required_columns():
    """The awards sample exposes every column the tests rely on."""
    sample = AwardsLoader().load_sample(n=100)
    expected = ("awardee_name", "state", "award_year", "award_amount", "program")
    # Collect all absent columns so a failure reports every one at once.
    absent = [name for name in expected if name not in sample.columns]
    assert not absent
19
+
20
+
21
def test_filter_state(awards_loader):
    """Every row returned by ``filter_state("IL")`` has state ``IL``."""
    selected = awards_loader.filter_state("IL")
    # NOTE: this holds vacuously when the filter returns no rows.
    is_illinois = selected["state"] == "IL"
    assert all(is_illinois)
24
+
25
+
26
def test_filter_program(awards_loader):
    """Every row returned by ``filter_program("FA")`` mentions ``FA``, ignoring case."""
    selected = awards_loader.filter_program("FA")
    # NOTE: this holds vacuously when the filter returns no rows.
    mentions_fa = selected["program"].str.contains("FA", case=False)
    assert all(mentions_fa)
29
+
30
+
31
def test_filter_year(awards_loader):
    """Every row returned by ``filter_year(2020)`` has award year 2020."""
    selected = awards_loader.filter_year(2020)
    # NOTE: this holds vacuously when the filter returns no rows.
    in_2020 = selected["award_year"] == 2020
    assert all(in_2020)
34
+
35
+
36
def test_summary_runs(awards_loader):
    """Smoke test: ``summary()`` completes without raising on a populated loader."""
    # No assertion on the output; only that the call does not raise.
    awards_loader.summary()
38
+
39
+
40
def test_not_loaded_raises():
    """Filtering before any data is loaded raises ``RuntimeError``."""
    unloaded = AwardsLoader()
    with pytest.raises(RuntimeError, match="No data loaded"):
        unloaded.filter_state("IL")
44
+
45
+
46
def test_to_csv(awards_loader, tmp_path):
    """``to_csv`` writes a file that reads back with all 200 sample rows."""
    csv_path = str(tmp_path / "awards.csv")
    awards_loader.to_csv(csv_path)
    round_tripped = pd.read_csv(csv_path)
    assert len(round_tripped) == 200
tests/test_cleaner.py ADDED
@@ -0,0 +1,56 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from cdfidata.pipeline.cleaner import (
4
+ normalize_columns, cast_types, clean_strings,
5
+ drop_empty_rows, standardize
6
+ )
7
+
8
+
9
def test_normalize_columns():
    """``normalize_columns`` renames upper-case raw headers via the mapping."""
    raw = pd.DataFrame({"AMOUNT": [1, 2], "STATE": ["IL", "NY"]})
    renamed = normalize_columns(raw, {"AMOUNT": "amount", "STATE": "state"})
    assert "amount" in renamed.columns
    assert "state" in renamed.columns
15
+
16
+
17
def test_cast_types_float():
    """Casting to float parses numeric strings and turns bad values into NaN."""
    raw = pd.DataFrame({"amount": ["1000.50", "2000.00", "abc"]})
    amounts = cast_types(raw, {"amount": "float"})["amount"]
    assert amounts.dtype == float
    # "abc" is not numeric, so it is expected to come back as a missing value.
    assert pd.isna(amounts.iloc[2])
22
+
23
+
24
def test_cast_types_bool():
    """Casting to bool maps ``"Y"`` to True and ``"N"`` to False."""
    raw = pd.DataFrame({"flag": ["Y", "N", "YES", "NO", "1", "0"]})
    flags = cast_types(raw, {"flag": "bool"})["flag"]
    # Equality (not identity) on purpose: the values may be numpy bools,
    # for which ``is True`` would fail.
    assert flags.iloc[0] == True  # noqa: E712
    assert flags.iloc[1] == False  # noqa: E712
29
+
30
+
31
def test_clean_strings():
    """``clean_strings`` trims surrounding whitespace from string cells."""
    padded = pd.DataFrame({"name": [" IL ", " NY "]})
    cleaned = clean_strings(padded)
    first_name = cleaned["name"].iloc[0]
    assert first_name == "IL"
35
+
36
+
37
def test_drop_empty_rows():
    """Only rows missing a *required* column are dropped."""
    raw = pd.DataFrame({"amount": [100, None, 200], "state": ["IL", "NY", None]})
    kept = drop_empty_rows(raw, required_cols=["amount"])
    # The row with a missing ``state`` is expected to survive: only
    # ``amount`` is listed as required.
    assert len(kept) == 2
41
+
42
+
43
def test_standardize_pipeline():
    """``standardize`` renames, casts, and drops rows lacking required values."""
    raw = pd.DataFrame(
        {
            "AMOUNT": ["1000", "2000", None],
            "STATE": [" IL ", " NY ", " CA "],
        }
    )
    options = {
        "column_map": {"AMOUNT": "amount", "STATE": "state"},
        "dtype_map": {"amount": "float"},
        "required_cols": ["amount"],
    }
    result = standardize(raw, **options)
    assert "amount" in result.columns
    assert "state" in result.columns
    # The third input row has no amount, so two rows are expected to remain.
    assert len(result) == 2
tests/test_clr.py ADDED
@@ -0,0 +1,33 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from cdfidata.sources.clr import CLRLoader
4
+
5
+
6
def test_load_sample_returns_dataframe():
    """``CLRLoader.load_sample(n=100)`` yields a 100-row DataFrame."""
    sample = CLRLoader().load_sample(n=100)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == 100
11
+
12
+
13
def test_load_sample_has_required_columns():
    """The CLR sample exposes every column the tests rely on."""
    sample = CLRLoader().load_sample(n=100)
    expected = ("fiscal_year", "state", "number_of_loans", "total_amount")
    # Collect all absent columns so a failure reports every one at once.
    absent = [name for name in expected if name not in sample.columns]
    assert not absent
19
+
20
+
21
def test_filter_state(clr_loader):
    """Every row returned by ``filter_state("IL")`` has state ``IL``."""
    selected = clr_loader.filter_state("IL")
    # NOTE: this holds vacuously when the filter returns no rows.
    is_illinois = selected["state"] == "IL"
    assert all(is_illinois)
24
+
25
+
26
def test_summary_runs(clr_loader):
    """Smoke test: ``summary()`` completes without raising on a populated loader."""
    # No assertion on the output; only that the call does not raise.
    clr_loader.summary()
28
+
29
+
30
def test_not_loaded_raises():
    """Filtering before any data is loaded raises ``RuntimeError``."""
    unloaded = CLRLoader()
    with pytest.raises(RuntimeError, match="No data loaded"):
        unloaded.filter_state("IL")
tests/test_tlr.py ADDED
@@ -0,0 +1,71 @@
1
+ import pytest
2
+ import pandas as pd
3
+ from cdfidata.sources.tlr import TLRLoader
4
+
5
+
6
def test_load_sample_returns_dataframe():
    """``TLRLoader.load_sample(n=100)`` yields a 100-row DataFrame."""
    sample = TLRLoader().load_sample(n=100)
    assert isinstance(sample, pd.DataFrame)
    assert len(sample) == 100
11
+
12
+
13
def test_load_sample_has_required_columns():
    """The TLR sample exposes every column the tests rely on."""
    sample = TLRLoader().load_sample(n=100)
    expected = ("fiscal_year", "amount", "state", "loan_type", "purpose")
    # Collect all absent columns so a failure reports every one at once.
    absent = [name for name in expected if name not in sample.columns]
    assert not absent
19
+
20
+
21
def test_load_sample_amounts_positive():
    """Every amount in the TLR sample is strictly greater than zero."""
    amounts = TLRLoader().load_sample(n=100)["amount"]
    is_positive = amounts > 0
    assert is_positive.all()
25
+
26
+
27
def test_filter_state(tlr_loader):
    """Every row returned by ``filter_state("IL")`` has state ``IL``."""
    selected = tlr_loader.filter_state("IL")
    # NOTE: this holds vacuously when the filter returns no rows.
    is_illinois = selected["state"] == "IL"
    assert all(is_illinois)
30
+
31
+
32
def test_filter_state_empty(tlr_loader):
    """Filtering on the state code ``ZZ`` returns no rows."""
    selected = tlr_loader.filter_state("ZZ")
    row_count = len(selected)
    assert row_count == 0
35
+
36
+
37
def test_filter_loan_type(tlr_loader):
    """Every row from ``filter_loan_type("Business")`` mentions it, ignoring case."""
    selected = tlr_loader.filter_loan_type("Business")
    # NOTE: this holds vacuously when the filter returns no rows.
    mentions_business = selected["loan_type"].str.contains("Business", case=False)
    assert all(mentions_business)
40
+
41
+
42
def test_filter_amount(tlr_loader):
    """``filter_amount`` keeps only rows within the inclusive bounds given."""
    lower, upper = 10_000, 100_000
    amounts = tlr_loader.filter_amount(min_amount=lower, max_amount=upper)["amount"]
    # Both bounds are checked as inclusive.
    assert all(amounts >= lower)
    assert all(amounts <= upper)
46
+
47
+
48
def test_not_loaded_raises():
    """Filtering before any data is loaded raises ``RuntimeError``."""
    unloaded = TLRLoader()
    with pytest.raises(RuntimeError, match="No data loaded"):
        unloaded.filter_state("IL")
52
+
53
+
54
def test_summary_runs(tlr_loader):
    """Smoke test: ``summary()`` completes without raising on a populated loader."""
    # No assertion on the output; only that the call does not raise.
    tlr_loader.summary()
56
+
57
+
58
def test_to_csv(tlr_loader, tmp_path):
    """``to_csv`` writes a file that reads back with all 500 sample rows."""
    csv_path = str(tmp_path / "tlr.csv")
    tlr_loader.to_csv(csv_path)
    round_tripped = pd.read_csv(csv_path)
    assert len(round_tripped) == 500
63
+
64
+
65
def test_to_sqlite(tlr_loader, tmp_path):
    """``to_sqlite`` writes a table that reads back with all 500 sample rows.

    Args:
        tlr_loader: Fixture providing a loader populated with 500 sample rows.
        tmp_path: pytest-provided temporary directory for the database file.
    """
    import sqlite3
    from contextlib import closing

    path = str(tmp_path / "tlr.db")
    tlr_loader.to_sqlite(path, table="tlr")
    # ``with sqlite3.connect(...)`` alone only commits or rolls back the
    # transaction; it never closes the connection. ``closing`` releases the
    # handle so the temporary database file is not left open.
    with closing(sqlite3.connect(path)) as conn:
        df = pd.read_sql("SELECT * FROM tlr", conn)
    assert len(df) == 500