zipcode-features 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipcode_features/__init__.py +44 -3
- zipcode_features-0.0.5.dist-info/METADATA +203 -0
- zipcode_features-0.0.5.dist-info/RECORD +6 -0
- {zipcode_features-0.0.2.dist-info → zipcode_features-0.0.5.dist-info}/WHEEL +1 -1
- zipcode_features-0.0.2.dist-info/METADATA +0 -34
- zipcode_features-0.0.2.dist-info/RECORD +0 -6
- {zipcode_features-0.0.2.dist-info → zipcode_features-0.0.5.dist-info}/licenses/LICENSE +0 -0
- {zipcode_features-0.0.2.dist-info → zipcode_features-0.0.5.dist-info}/top_level.txt +0 -0
zipcode_features/__init__.py
CHANGED
|
@@ -1,9 +1,44 @@
|
|
|
1
|
-
__version__ = '0.0.
|
|
1
|
+
__version__ = '0.0.5'
|
|
2
2
|
|
|
3
3
|
import zipcodes
|
|
4
|
-
from
|
|
4
|
+
from zipcode3.search import SearchEngine
|
|
5
5
|
import pandas as pd
|
|
6
|
+
import json
|
|
6
7
|
|
|
8
|
+
def zipcode_mapper(x):
    """
    Left-pad the ZIP code of a single row with zeros to five characters.

    ``x`` is a row-like mapping (for example the Series handed over by
    ``df.apply(zipcode_mapper, axis=1)``) holding the ZIP code as a string
    under the key ``"ZIP"``.  A ``"ZIP_len"`` entry may be present but is
    not needed: the length is taken from the string itself.

    Returns the ZIP string padded to five characters ("501" -> "00501",
    "1001" -> "01001"); strings of five or more characters are returned
    unchanged.
    """
    # str.zfill covers every short length in one call instead of one
    # hard-coded branch per length.
    return x["ZIP"].zfill(5)
|
|
15
|
+
|
|
16
|
+
def _get_zip_to_cbsa_code(path: str = None) -> dict:
    """
    Build a mapping from zip code to cbsa code.

    The mapping is of the form:
    {"zip code": "cbsa code"}

    Parameters:
    * path - optional location of the CSV file (columns "ZIP" and "CBSA").
      When omitted, "CBSA_ZIP_122025.csv" is looked up next to this module
      first and, if it is not there, relative to the current working
      directory.

    Zip codes are left-padded with zeros to five characters so they can be
    matched against zero-padded zip code strings.
    """
    import os  # local import: only needed to locate the bundled data file

    if path is None:
        file_name = "CBSA_ZIP_122025.csv"
        module_dir = os.path.dirname(os.path.abspath(__file__))
        bundled = os.path.join(module_dir, file_name)
        # Prefer the copy next to this module; otherwise keep the previous
        # behaviour of resolving the name against the working directory.
        path = bundled if os.path.exists(bundled) else file_name

    df = pd.read_csv(path, dtype={"ZIP": str, "CBSA": str})
    # Rows without a ZIP cannot be used as a key.
    df = df.dropna(subset=["ZIP"])
    df["ZIP"] = df["ZIP"].str.zfill(5)
    # DataFrame.to_dict() would return {"ZIP": {row: zip}, "CBSA": {row: cbsa}},
    # which cannot be used with Series.map; pair the two columns directly.
    # If a ZIP occurs on several rows, the last row wins.
    return dict(zip(df["ZIP"], df["CBSA"]))
|
|
27
|
+
|
|
28
|
+
def _get_cbsa_code_to_cbsa_name(path: str = None) -> dict:
    """
    Build a mapping from cbsa code to cbsa name.

    The mapping is of the form:
    {cbsa code: cbsa name}

    Parameters:
    * path - optional location of the JSON file holding a
      {code: name} object.  When omitted, "cbsa_codes.json" is looked up
      next to this module first and, if it is not there, relative to the
      current working directory.

    Names are cleaned up: a space in front of a hyphen is removed and runs
    of whitespace are collapsed to a single space.
    """
    import os  # local import: only needed to locate the bundled data file

    if path is None:
        file_name = "cbsa_codes.json"
        module_dir = os.path.dirname(os.path.abspath(__file__))
        bundled = os.path.join(module_dir, file_name)
        # Prefer the copy next to this module; otherwise keep the previous
        # behaviour of resolving the name against the working directory.
        path = bundled if os.path.exists(bundled) else file_name

    # The context manager closes the file handle as soon as it is parsed.
    with open(path, encoding="utf-8") as fh:
        code_to_name = json.load(fh)

    # Return a plain {code: name} dict so it can be passed to Series.map;
    # a DataFrame.to_dict() result is keyed by column name, not by code.
    return {
        code: " ".join(name.replace(" -", "-").split())
        for code, name in code_to_name.items()
    }
|
|
40
|
+
|
|
41
|
+
|
|
7
42
|
def us_get_demographics(state: str, city: str = None, zip_list: list = None) -> pd.DataFrame:
|
|
8
43
|
"""
|
|
9
44
|
This gets demographic information associated with zipcodes in the United States of America.
|
|
@@ -43,4 +78,10 @@ def us_get_demographics(state: str, city: str = None, zip_list: list = None) ->
|
|
|
43
78
|
tmp_dict = zipcode_and_demo[index][1].to_dict()
|
|
44
79
|
tmp_dict["zip_code"] = zipcode_and_demo[index][0]
|
|
45
80
|
demographics.append(tmp_dict)
|
|
46
|
-
|
|
81
|
+
df = pd.DataFrame(demographics)
|
|
82
|
+
zip_to_cbsa = _get_zip_to_cbsa_code()
|
|
83
|
+
df["cbsa"] = df["zip_code"].map(zip_to_cbsa)
|
|
84
|
+
cbsa_code_to_name = _get_cbsa_code_to_cbsa_name()
|
|
85
|
+
df["cbsa_name"] = df["cbsa"].map(cbsa_code_to_name)
|
|
86
|
+
return df
|
|
87
|
+
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zipcode_features
|
|
3
|
+
Version: 0.0.5
|
|
4
|
+
Summary: A tool to get features based on census data from zipcodes
|
|
5
|
+
Home-page: https://github.com/EricSchles/zipcode_features
|
|
6
|
+
Author: Eric Schles
|
|
7
|
+
Author-email: ericschles@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: zipcodes
|
|
20
|
+
Requires-Dist: pandas
|
|
21
|
+
Requires-Dist: zipcode3
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: summary
|
|
32
|
+
|
|
33
|
+
# zipcode features
|
|
34
|
+
|
|
35
|
+
similar to [uszipcode-project](https://github.com/EricSchles/uszipcode-project)
|
|
36
|
+
|
|
37
|
+
## Getting CBSA mapping
|
|
38
|
+
|
|
39
|
+
If you need CBSA data you can append it to the dataframe with the following example:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from zipcode_features import us_get_demographics
|
|
43
|
+
import pandas as pd
|
|
44
|
+
|
|
45
|
+
def _get_cbsa_data():
|
|
46
|
+
return pd.read_excel(
|
|
47
|
+
"https://github.com/EricSchles/zipcode_features/raw/refs/heads/main/zipcode_features/CBSA_ZIP_122025.xlsx",
|
|
48
|
+
sheet_name='Export Worksheet'
|
|
49
|
+
)[["CBSA", "ZIP"]]
|
|
50
|
+
|
|
51
|
+
demo = us_get_demographics(state="NY")
|
|
52
|
+
cbsa_zip_map = _get_cbsa_data()
|
|
53
|
+
df = pd.merge(demo, cbsa_zip_map, how="left", left_on="zipcode", right_on="ZIP")
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
You can get the semantic (human-readable) CBSA names [here](https://www2.census.gov/programs-surveys/cps/methodology/2015%20Geography%20Cover.pdf).
|
|
57
|
+
|
|
58
|
+
Here's a Python script to parse them:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
import urllib.request
|
|
62
|
+
import PyPDF2
|
|
63
|
+
import json
|
|
64
|
+
import re
|
|
65
|
+
import io
|
|
66
|
+
|
|
67
|
+
def fetch_cbsa_to_json():
|
|
68
|
+
url = "https://www2.census.gov/programs-surveys/cps/methodology/2015%20Geography%20Cover.pdf"
|
|
69
|
+
|
|
70
|
+
print("Downloading Census PDF...")
|
|
71
|
+
# Using a User-Agent to ensure the request isn't blocked by the server
|
|
72
|
+
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
|
|
73
|
+
|
|
74
|
+
try:
|
|
75
|
+
response = urllib.request.urlopen(req)
|
|
76
|
+
pdf_bytes = io.BytesIO(response.read())
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print(f"Failed to download PDF: {e}")
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
print("Parsing PDF...")
|
|
82
|
+
reader = PyPDF2.PdfReader(pdf_bytes)
|
|
83
|
+
|
|
84
|
+
cbsa_mapping = {}
|
|
85
|
+
|
|
86
|
+
# Regular expression to match a 5-digit FIPS/CBSA code followed by the area name
|
|
87
|
+
# Example match: "11460 Ann Arbor, MI"
|
|
88
|
+
pattern = re.compile(r'\b(\d{5})\s+(.+?)(?=\s+\d{5}|\n|$)')
|
|
89
|
+
|
|
90
|
+
for page in reader.pages:
|
|
91
|
+
text = page.extract_text()
|
|
92
|
+
if text:
|
|
93
|
+
matches = pattern.findall(text)
|
|
94
|
+
for code, name in matches:
|
|
95
|
+
# Clean up any trailing spaces or artifacts
|
|
96
|
+
clean_name = name.strip()
|
|
97
|
+
# Exclude standalone numbers or random headers that might get caught
|
|
98
|
+
if len(clean_name) > 2 and not clean_name.isdigit():
|
|
99
|
+
cbsa_mapping[code] = clean_name
|
|
100
|
+
|
|
101
|
+
print(f"Extracted {len(cbsa_mapping)} CBSA codes.")
|
|
102
|
+
|
|
103
|
+
# Save the mapping to a JSON file
|
|
104
|
+
output_file = 'cbsa_codes.json'
|
|
105
|
+
with open(output_file, 'w', encoding='utf-8') as f:
|
|
106
|
+
json.dump(cbsa_mapping, f, indent=4)
|
|
107
|
+
|
|
108
|
+
print(f"Successfully saved to {output_file}")
|
|
109
|
+
|
|
110
|
+
if __name__ == "__main__":
|
|
111
|
+
fetch_cbsa_to_json()
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Here's a working example for using this with the above:
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
import requests
|
|
118
|
+
from zipcode_features import us_get_demographics
|
|
119
|
+
import pandas as pd
|
|
120
|
+
|
|
121
|
+
def _get_cbsa_data():
|
|
122
|
+
return pd.read_excel(
|
|
123
|
+
"https://github.com/EricSchles/zipcode_features/raw/refs/heads/main/zipcode_features/CBSA_ZIP_122025.xlsx",
|
|
124
|
+
sheet_name='Export Worksheet'
|
|
125
|
+
)[["CBSA", "ZIP"]]
|
|
126
|
+
|
|
127
|
+
demo = us_get_demographics(state="NY")
|
|
128
|
+
cbsa_zip_map = _get_cbsa_data()
|
|
129
|
+
df = pd.merge(demo, cbsa_zip_map, how="left", left_on="zipcode", right_on="ZIP")
|
|
130
|
+
df = df.drop("ZIP", axis=1)
|
|
131
|
+
mapping = requests.get("https://raw.githubusercontent.com/EricSchles/zipcode_features/refs/heads/main/zipcode_features/cbsa_codes.json").json()
|
|
132
|
+
df["cbsa_name"] = df["CBSA"].map(mapping)
|
|
133
|
+
df = df.drop("CBSA", axis=1)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Adding County
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from zipcode_features import us_get_demographics
|
|
141
|
+
import pandas as pd
|
|
142
|
+
|
|
143
|
+
def _get_fips_data():
|
|
144
|
+
df = pd.read_excel(
|
|
145
|
+
"https://github.com/EricSchles/zipcode_features/raw/refs/heads/main/zipcode_features/ZIP_COUNTY_122025.xlsx",
|
|
146
|
+
dtype={'ZIP': 'str'},
|
|
147
|
+
sheet_name='Export Worksheet'
|
|
148
|
+
)[["COUNTY", "ZIP"]]
|
|
149
|
+
df["COUNTY"] = df['COUNTY'].astype(str)
|
|
150
|
+
return df.dropna()
|
|
151
|
+
|
|
152
|
+
demo = us_get_demographics(state="NY")
|
|
153
|
+
fips_zip_map = _get_fips_data()
|
|
154
|
+
df = pd.merge(demo, fips_zip_map, how="left", left_on="zipcode", right_on="ZIP")
|
|
155
|
+
df = df.drop("ZIP", axis=1)
|
|
156
|
+
df = df.dropna()
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Adding Regional Prices
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
python -m pip install beaapi us
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
from zipcode_features import us_get_demographics
|
|
167
|
+
import pandas as pd
|
|
168
|
+
import beaapi
|
|
169
|
+
import us
|
|
170
|
+
|
|
171
|
+
df = us_get_demographics(state="NY")
|
|
172
|
+
|
|
173
|
+
# get your key here: https://apps.bea.gov/API/signup/
|
|
174
|
+
beakey = ""
|
|
175
|
+
|
|
176
|
+
dataset="Regional"
|
|
177
|
+
table = "SARPP"
|
|
178
|
+
regional_cpi = beaapi.get_data(
|
|
179
|
+
userid=beakey,
|
|
180
|
+
method='GetData',
|
|
181
|
+
datasetname=dataset, # the "Regional" dataset selected above
|
|
182
|
+
tablename=table, # the "SARPP" table selected above
|
|
183
|
+
GeoFips="STATE",
|
|
184
|
+
LineCode="1",
|
|
185
|
+
ResultFormat="json"
|
|
186
|
+
#Frequency='A', # Annual data
|
|
187
|
+
)[["GeoName", "DataValue"]]
|
|
188
|
+
regional_cpi = regional_cpi[regional_cpi["GeoName"] != "United States"]
|
|
189
|
+
regional_cpi["year"] = ["2020", "2021", "2022", "2023", "2024"] * 51
|
|
190
|
+
abbreviations_map = us.states.mapping('name', 'abbr')
|
|
191
|
+
regional_cpi["state"] = regional_cpi["GeoName"].map(abbreviations_map)
|
|
192
|
+
regional_cpi["cpi"] = regional_cpi["DataValue"]
|
|
193
|
+
regional_cpi = regional_cpi.drop("DataValue", axis=1)
|
|
194
|
+
regional_cpi = regional_cpi[regional_cpi["year"] == "2024"]
|
|
195
|
+
regional_cpi["cpi_year"] = regional_cpi["year"]
|
|
196
|
+
regional_cpi.drop("year", axis=1)
|
|
197
|
+
df = pd.merge(df, regional_cpi, how='left', on="state")
|
|
198
|
+
df["regional_cpi"] = df["cpi"]
|
|
199
|
+
df = df.drop("cpi", axis=1)
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
zipcode_features/__init__.py,sha256=H4M7B3fzFk-FEgIbfZMdhNP9w4yMQZrAy7ZG0z3sHMs,2727
|
|
2
|
+
zipcode_features-0.0.5.dist-info/licenses/LICENSE,sha256=HDbMJ7oItmxTn3jVtZFi6jUFAHovset5jzAPUderjOc,1073
|
|
3
|
+
zipcode_features-0.0.5.dist-info/METADATA,sha256=beUdSoQfPbegVeQFypOIpsKhHqTTSDCipCK0JdOdTvM,6238
|
|
4
|
+
zipcode_features-0.0.5.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
zipcode_features-0.0.5.dist-info/top_level.txt,sha256=ijGAxdXHaO43tVlCj3Kn05dj-hkXAv1pXBEq1Yj8mt0,17
|
|
6
|
+
zipcode_features-0.0.5.dist-info/RECORD,,
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: zipcode_features
|
|
3
|
-
Version: 0.0.2
|
|
4
|
-
Summary: A tool to get features based on census data from zipcodes
|
|
5
|
-
Home-page: https://github.com/EricSchles/zipcode_features
|
|
6
|
-
Author: Eric Schles
|
|
7
|
-
Author-email: ericschles@gmail.com
|
|
8
|
-
License: MIT
|
|
9
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
-
Description-Content-Type: text/markdown
|
|
18
|
-
License-File: LICENSE
|
|
19
|
-
Requires-Dist: zipcodes
|
|
20
|
-
Requires-Dist: pandas
|
|
21
|
-
Dynamic: author
|
|
22
|
-
Dynamic: author-email
|
|
23
|
-
Dynamic: classifier
|
|
24
|
-
Dynamic: description
|
|
25
|
-
Dynamic: description-content-type
|
|
26
|
-
Dynamic: home-page
|
|
27
|
-
Dynamic: license
|
|
28
|
-
Dynamic: license-file
|
|
29
|
-
Dynamic: requires-dist
|
|
30
|
-
Dynamic: summary
|
|
31
|
-
|
|
32
|
-
# zipcode features
|
|
33
|
-
|
|
34
|
-
similar to [uszipcode-project](https://github.com/EricSchles/uszipcode-project)
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
zipcode_features/__init__.py,sha256=TCSZkr4RP81Nl3lGTfoRMwJa_ASA4pOGfpOPwd-eRZE,1475
|
|
2
|
-
zipcode_features-0.0.2.dist-info/licenses/LICENSE,sha256=HDbMJ7oItmxTn3jVtZFi6jUFAHovset5jzAPUderjOc,1073
|
|
3
|
-
zipcode_features-0.0.2.dist-info/METADATA,sha256=RK72-UXOg_G_v44_BOwALk8TZSVpsGtGlLi-KtI4BZw,1074
|
|
4
|
-
zipcode_features-0.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
5
|
-
zipcode_features-0.0.2.dist-info/top_level.txt,sha256=ijGAxdXHaO43tVlCj3Kn05dj-hkXAv1pXBEq1Yj8mt0,17
|
|
6
|
-
zipcode_features-0.0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|