morpc 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morpc/__init__.py +5 -0
- morpc/census/__init__.py +1 -0
- morpc/census/census.py +344 -0
- morpc/frictionless/__init__.py +1 -0
- morpc/frictionless/frictionless.py +621 -0
- morpc/morpc.py +2168 -0
- morpc-0.2.0.dist-info/METADATA +28 -0
- morpc-0.2.0.dist-info/RECORD +10 -0
- morpc-0.2.0.dist-info/WHEEL +5 -0
- morpc-0.2.0.dist-info/top_level.txt +1 -0
morpc/__init__.py
ADDED
morpc/census/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .census import *
|
morpc/census/census.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
# Raw cell values that are to be treated as missing when reading ACS data.
ACS_MISSING_VALUES = [
    "",
    "-222222222",
    "-333333333",
    "-555555555",
    "-666666666",
    "-888888888",
    "-999999999",
]

# Name of the field that uniquely identifies each record of ACS data.
ACS_PRIMARY_KEY = "GEO_ID"

# Description of every identifier field, keyed by field name. Each identifier
# field is a string field.
_ACS_ID_FIELD_DESCRIPTIONS = {
    "GEO_ID": "Unique identifier for geography",
    "SUMLEVEL": "Code representing the geographic summary level for the data",
    "NAME": "Name by which geography is known",
    "STATE": "Unique identifier for state in which geography is located",
    "COUNTY": "Unique identifier for county in which geography is located",
    "TRACT": "Unique identifier for tract in which geography is located",
}

# Identifier fields (in order) that accompany the data for each geography level.
_ACS_ID_FIELD_NAMES = {
    "blockgroup": ("GEO_ID", "SUMLEVEL", "STATE", "COUNTY", "TRACT"),
    "tract": ("GEO_ID", "SUMLEVEL", "STATE", "COUNTY"),
    "county": ("GEO_ID", "SUMLEVEL", "NAME", "STATE", "COUNTY"),
    "state": ("GEO_ID", "SUMLEVEL", "NAME", "STATE"),
    "msa": ("GEO_ID", "SUMLEVEL", "NAME"),
    "division": ("GEO_ID", "SUMLEVEL", "NAME"),
    "us": ("GEO_ID", "SUMLEVEL", "NAME"),
}

# Field descriptors ("name", "type", "description") of the identifier fields for
# each geography level. A fresh dict is built for every entry so that no
# descriptor object is shared between geography levels.
ACS_ID_FIELDS = {
    level: [
        {
            "name": fieldName,
            "type": "string",
            "description": _ACS_ID_FIELD_DESCRIPTIONS[fieldName],
        }
        for fieldName in fieldNames
    ]
    for level, fieldNames in _ACS_ID_FIELD_NAMES.items()
}

# Standard age groups, in sort order, each paired with the age group labels
# that are folded into it.
_ACS_AGEGROUP_MEMBERS = (
    ("Under 5 years", ("Under 5 years",)),
    ("5 to 9 years", ("5 to 9 years",)),
    ("10 to 14 years", ("10 to 14 years",)),
    ("15 to 19 years", ("15 to 17 years", "18 and 19 years")),
    ("20 to 24 years", ("20 years", "21 years", "22 to 24 years")),
    ("25 to 29 years", ("25 to 29 years",)),
    ("30 to 34 years", ("30 to 34 years",)),
    ("35 to 39 years", ("35 to 39 years",)),
    ("40 to 44 years", ("40 to 44 years",)),
    ("45 to 49 years", ("45 to 49 years",)),
    ("50 to 54 years", ("50 to 54 years",)),
    ("55 to 59 years", ("55 to 59 years",)),
    ("60 to 64 years", ("60 and 61 years", "62 to 64 years")),
    ("65 to 69 years", ("65 and 66 years", "67 to 69 years")),
    ("70 to 74 years", ("70 to 74 years",)),
    ("75 to 79 years", ("75 to 79 years",)),
    ("80 to 84 years", ("80 to 84 years",)),
    ("85 years and over", ("85 years and over",)),
)

# Maps each age group label to the standard age group that contains it.
ACS_STANDARD_AGEGROUP_MAP = {
    member: standardGroup
    for standardGroup, members in _ACS_AGEGROUP_MEMBERS
    for member in members
}

# Maps each standard age group to its 1-based position when sorted by age.
ACS_AGEGROUP_SORT_ORDER = {
    standardGroup: position
    for position, (standardGroup, _members) in enumerate(_ACS_AGEGROUP_MEMBERS, start=1)
}
|
|
95
|
+
|
|
96
|
+
def acs_get(url, params, varBatchSize=20, verbose=True, session=None):
    """Fetch variables from a Census API endpoint as a pandas dataframe.

    acs_get() is a low-level wrapper for Census API requests. If necessary, it
    splits the request into several smaller requests to stay within the
    50-variable limit imposed by the API, then joins the partial results on
    GEO_ID. The resulting dataframe is indexed by GEO_ID (regardless of whether
    it was requested) and omits other fields that are not requested but which
    are returned automatically with each API request (e.g. "state", "county").

    Args:
        url: Base URL of the desired Census API endpoint. For example:

                https://api.census.gov/data/2022/acs/acs1

        params: Dict (in requests format) of the parameters for the query
            string to be sent to the Census API. For example:

                {
                    "get": "GEO_ID,NAME,B01001_001E",
                    "for": "county:049,041",
                    "in": "state:39"
                }

            The dict is not modified. Surrounding whitespace, empty entries
            and repeated names in the "get" list are ignored.
        varBatchSize: Maximum number of variables (not counting GEO_ID, which
            is added to every request) to ask for in each request. Defaults to
            20 and is limited to 49 to respect the API limit while allowing
            for the inclusion of GEO_ID.
        verbose: If true, print text updates of the function's status,
            otherwise stay silent (apart from warnings and errors).
        session: Optional object used to send the requests; it must expose a
            requests-compatible ``get(url, params=...)`` (for example a
            ``requests.Session``, which allows connections to be reused across
            batches). Defaults to the ``requests`` module itself.

    Returns:
        A pandas dataframe indexed by GEO_ID and having one column for each
        requested variable, in the order requested.

    Raises:
        ValueError: If varBatchSize is smaller than 1.
        RuntimeError: If a request finishes with a status other than 200.
    """
    import pandas as pd

    if session is None:
        # Imported lazily so that requests is only needed when the caller does
        # not supply an object to send the requests with.
        import requests
        session = requests

    # One variable in each request is reserved for GEO_ID, so at most 49 other
    # variables fit within the API limit.
    if varBatchSize > 49:
        print("WARNING: Requested variable batch size exceeds API limit. Reducing batch size to 50 (including GEO_ID).")
        varBatchSize = 49
    if varBatchSize < 1:
        # A batch that holds no variables would never make progress.
        raise ValueError("varBatchSize must be at least 1 (got {})".format(varBatchSize))

    # Extract the requested variables, dropping blanks and repeats while
    # preserving the order in which they were requested.
    allVars = list(dict.fromkeys(
        name.strip() for name in params["get"].split(",") if name.strip()
    ))
    if verbose:
        print("Total variables requested: {}".format(len(allVars)))

    # GEO_ID accompanies every request because it is the join key, so only the
    # other variables need to be divided into batches. A single empty batch is
    # used when nothing but GEO_ID was requested so one request is still sent.
    dataVars = [name for name in allVars if name != "GEO_ID"]
    batches = [
        dataVars[start:start + varBatchSize]
        for start in range(0, len(dataVars), varBatchSize)
    ] or [[]]

    acsData = None
    remainingCount = len(dataVars)
    for requestCount, batch in enumerate(batches, start=1):
        if verbose:
            print("Starting request #{0}. {1} variables remain.".format(requestCount, remainingCount))

        shortList = batch + ["GEO_ID"]

        # Query parameters for this request: the caller's parameters with the
        # variable list replaced by the short list. A shallow copy suffices
        # because only the "get" entry is replaced.
        shortListParams = dict(params)
        shortListParams["get"] = ",".join(shortList)

        # Send the request and fail loudly on any non-success status.
        r = session.get(url, params=shortListParams)
        if r.status_code != 200:
            print("ERROR: Request finished with status {}.".format(r.status_code))
            print("Request URL: " + r.url)
            print("Response text: " + r.text)
            raise RuntimeError(
                "Census API request finished with status {}".format(r.status_code)
            )

        # The first record of the response holds the column headers.
        records = r.json()
        columns, rows = records[0], records[1:]
        df = pd.DataFrame.from_records(rows, columns=columns)

        # Keep only the requested columns (plus GEO_ID). This removes
        # unrequested fields like "state" and "county".
        df = df.filter(items=shortList, axis="columns").set_index("GEO_ID")

        # The first batch seeds the output; later batches are joined on GEO_ID.
        acsData = df if acsData is None else acsData.join(df)
        remainingCount -= len(batch)

    return acsData
|
|
195
|
+
|
|
196
|
+
def acs_label_to_dimensions(labelSeries, dimensionNames=None):
    """Decompose ACS variable labels into one column per data dimension.

    There is a label associated with each variable provided by the Census API
    (see the "Label" column of the variable list, e.g.
    https://api.census.gov/data/2022/acs/acs5/variables.html). For example, the
    label for B25127_004E looks like this:

        Estimate!!Total:!!Owner occupied:!!Built 2020 or later:!!1, detached or attached

    The dimensions for the variable are simply the collections of words that
    are separated by ":!!". For example, "Owner occupied" refers to tenure,
    "Built 2020 or later" refers to the structure age, and "1, detached or
    attached" refers to the structure configuration or class. Thus, the
    dimensions might be described as follows:

        dimensionNames = ["Tenure", "Structure age", "Structure class"]

    Each label is processed as follows: anything from the first "|" onward is
    discarded, surrounding whitespace is stripped, every "!!" is removed and
    the remainder is split on ":". The first piece (e.g. "EstimateTotal") is
    dropped and the remaining pieces become the dimension values.

    Args:
        labelSeries: pandas Series of label strings, one for each ACS variable
            of interest. The index of this series typically should match the
            dataframe that you want to join the dimension values to.
        dimensionNames: List (or tuple) of descriptions of the dimensions
            represented by each element in the label. These are used as the
            column headers of the output dataframe and must match the number
            of dimension columns. If not provided, the columns are numbered
            1, 2, 3, ...

    Returns:
        A dataframe with the same index as labelSeries where each record
        represents the set of dimensions for an ACS variable and each column
        represents the value of one dimension for that variable. Dimensions
        that a label does not have, or that are empty, are NaN. Continuing
        with the example above, a truncated output may look like this:

            |              | Tenure         | Structure age       | Structure class         |
            |--------------|----------------|---------------------|-------------------------|
            | B25127_004E  | Owner occupied | Built 2020 or later | 1, detached or attached |

    Raises:
        ValueError: If the number of names in dimensionNames differs from the
            number of dimension columns found in the labels (raised by pandas).
    """
    import numpy as np
    import pandas as pd

    # isinstance (rather than an exact type comparison) so that tuples and
    # list subclasses are honored instead of being silently ignored.
    names = list(dimensionNames) if isinstance(dimensionNames, (list, tuple)) else None

    # Plain string methods are used so that "|" and "!!" are always treated
    # literally, independent of the regex defaults of the installed pandas.
    parts = [
        label.split("|")[0].strip().replace("!!", "").split(":")
        for label in labelSeries
    ]

    if not parts:
        # No labels at all: return an empty table that still has the
        # requested headers.
        return pd.DataFrame(index=labelSeries.index, columns=names)

    width = max(len(pieces) for pieces in parts)
    if width <= 1:
        # None of the labels contains a dimension beyond the leading piece.
        df = pd.DataFrame(index=labelSeries.index)
    else:
        # Drop the leading piece, blank out empty pieces and pad short labels
        # so that every row has one value per dimension column.
        rows = [
            [piece if piece != "" else np.nan for piece in pieces[1:]]
            + [np.nan] * (width - len(pieces))
            for pieces in parts
        ]
        df = pd.DataFrame(rows, index=labelSeries.index, columns=range(1, width))

    if names is not None:
        df.columns = names
    return df
|
|
235
|
+
|
|
236
|
+
def acs_generate_universe_table(acsDataRaw, universeVar):
    """Produce a table of the universe (total) estimate and MOE.

    From a raw ACS data extract produced by morpc-census-acs-fetch, produce a
    table that includes the universe (total) estimate and margin of error
    (MOE) for the indicated variable.

    Args:
        acsDataRaw: pandas dataframe resulting from reading an output of
            morpc-census-acs-fetch as follows:

                resource = frictionless.Resource(ACS_COUNTY_RESOURCE_SOURCE_PATH)
                acsDataRaw = resource.to_pandas()

            "GEO_ID" must be available as a column once the index is reset
            (i.e. it is normally the index).
        universeVar: The ACS variable included in acsDataRaw that represents
            the universe/total. Omit the "E" or "M" suffix. For example:
            universeVar = "B25003_001"

    Returns:
        A dataframe indexed by "GEOID" (the part of GEO_ID that follows "US")
        with the columns "Universe" (from the column whose name contains
        universeVar and ends in "E") and "Universe MOE" (likewise, ending in
        "M"), when present. Other columns whose names contain universeVar are
        left out rather than being labelled as MOE.

        NOTE(review): "NAME" is retained only when it is still present after
        the variable columns are selected and the index is reset, which in
        practice means only when it is an index level of acsDataRaw; confirm
        whether a regular "NAME" column was meant to be carried through.
    """
    # Map the estimate and MOE columns of the universe variable to their
    # output names. Matching is by substring, as callers may rely on that.
    renameMap = {}
    for column in acsDataRaw.columns:
        columnName = str(column)
        if universeVar not in columnName:
            continue
        if columnName.endswith("E"):
            renameMap[column] = "Universe"
        elif columnName.endswith("M"):
            renameMap[column] = "Universe MOE"

    # Selecting with a list returns a new dataframe, so the caller's data is
    # never modified.
    acsUniverse = acsDataRaw[list(renameMap)] \
        .rename(columns=renameMap) \
        .reset_index()

    # Convert the GEOID to short form, e.g. "0500000US39049" -> "39049".
    acsUniverse["GEOID"] = acsUniverse["GEO_ID"].map(lambda geoId: geoId.split("US")[1])

    return acsUniverse \
        .set_index("GEOID") \
        .filter(items=["NAME", "Universe", "Universe MOE"], axis="columns")
|
|
259
|
+
|
|
260
|
+
def acs_generate_dimension_table(acsDataRaw, schema, idFields, dimensionNames):
    """Produce a long-form table of ACS values annotated with their dimensions.

    From a raw ACS data extract produced by morpc-census-acs-fetch, produce a
    table in which each record holds the value of one ACS variable for one
    geography, together with the dimensions parsed from the label of that
    variable (see acs_label_to_dimensions).

    Args:
        acsDataRaw: pandas dataframe resulting from reading an output of
            morpc-census-acs-fetch as follows:

                resource = frictionless.Resource(ACS_COUNTY_RESOURCE_SOURCE_PATH)
                acsDataRaw = resource.to_pandas()

            "GEO_ID" must be available as a column once the index is reset
            (i.e. it is normally the index).
        schema: Schema describing acsDataRaw. It is passed to
            morpc.frictionless_name_to_desc_map() to obtain the description
            (the variable label provided by the Census API) of each variable.
        idFields: Names of the columns, as they are after the index is reset,
            that identify the geography rather than hold a variable. These
            are dropped before the data is melted.
        dimensionNames: Names of the dimensions encoded in the variable
            labels; used as the headers of the dimension columns.

    Returns:
        A dataframe with the columns "GEOID" (the part of GEO_ID that follows
        "US"), "Variable", "Value", one column per dimension, and "Variable
        type", which is "Estimate" for variables whose name ends in "E" and
        "MOE" for all others.
    """
    import morpc

    # Convert the GEOID to short form. reset_index() returns a new dataframe,
    # so the caller's data is never modified.
    dimensionTable = acsDataRaw.reset_index()
    dimensionTable["GEOID"] = dimensionTable["GEO_ID"].map(lambda geoId: geoId.split("US")[1])

    # Melt the data from wide to long form: one record per geography and variable.
    dimensionTable = dimensionTable \
        .drop(columns=idFields) \
        .melt(id_vars=["GEOID"], var_name="Variable", value_name="Value")

    # Look up the variable label provided by the Census API for each record and
    # split it into dimensions. The labels share the index of the melted table,
    # so the dimensions can be joined back on the index.
    descriptions = dimensionTable["Variable"].map(morpc.frictionless_name_to_desc_map(schema))
    dimensions = acs_label_to_dimensions(descriptions, dimensionNames=dimensionNames)
    dimensionTable = dimensionTable.join(dimensions, how="left")

    # Annotate whether the variable is an estimate or a margin of error.
    dimensionTable["Variable type"] = dimensionTable["Variable"].map(
        lambda variable: "Estimate" if variable[-1] == "E" else "MOE"
    )

    return dimensionTable
|
|
291
|
+
|
|
292
|
+
def acs_flatten_category(inDf, categoryField, subclassField):
    """Flatten a category dimension into its subclass dimension.

    Sometimes ACS data has one dimension that represents subclasses of
    another. For example, in C24030 (Sex by Industry) the top-level category
    "Agriculture, forestry, fishing and hunting, and mining" has subclasses,
    whereas other top-level categories - such as "Construction" - do not.
    acs_flatten_category identifies the top-level categories that have no
    subclasses and copies their names into the subclass field, so that they
    sit alongside the subclasses of the other categories. This allows for more
    convenient comparison and summarizing across industries. The field for the
    top-level category is then dropped entirely.

    For example, with categoryField="Category" and subclassField="Subclass":

        | Category               | Subclass                  |    | Subclass                  |
        |------------------------|---------------------------|    |---------------------------|
        | (missing)              | (missing)                 | -> (record dropped)
        | Agriculture and mining | (missing)                 | -> | (missing)                 |
        | Agriculture and mining | Agriculture               | -> | Agriculture               |
        | Agriculture and mining | Mining                    | -> | Mining                    |
        | Construction           | (missing)                 | -> | Construction              |

    Note that the subtotal record of a category that does have subclasses is
    retained, with a missing subclass.

    Args:
        inDf: pandas dataframe that was created using
            acs_generate_dimension_table(). It is not modified.
        categoryField: Name of the field that holds the top-level categories.
        subclassField: Name of the field that holds the subclasses.

    Returns:
        A copy of inDf without the records whose category is missing, with the
        subclass of every record of a subclass-free category set to the name
        of that category, and without the category field.
    """
    # Records without a category never take part in the flattening. A list is
    # passed as the subset because older pandas releases reject a bare label.
    df = inDf.dropna(subset=[categoryField]).copy()

    # A category has subclasses if any of its records has a subclass value.
    categoriesWithSubclasses = df.loc[df[subclassField].notna(), categoryField].unique()
    lacksSubclasses = ~df[categoryField].isin(categoriesWithSubclasses)

    # Promote the name of each subclass-free category to the subclass field.
    df[subclassField] = df[subclassField].mask(lacksSubclasses, df[categoryField])

    return df.drop(columns=[categoryField])
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .frictionless import *
|