morpc 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morpc/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ __version__ = "0.2.0"
2
+
3
+ from .morpc import *
4
+ from .frictionless import *
5
+ from .census import *
@@ -0,0 +1 @@
1
+ from .census import *
morpc/census/census.py ADDED
@@ -0,0 +1,344 @@
1
# Strings that are treated as missing values in ACS data: the empty string plus
# the nine-digit negative codes built from the digits 2, 3, 5, 6, 8 and 9.
# NOTE(review): presumably these are the Census "jam values"; confirm against
# the Census documentation.
ACS_MISSING_VALUES = [""] + ["-" + digit * 9 for digit in "235689"]

# Name of the field that uniquely identifies each ACS record.
ACS_PRIMARY_KEY = "GEO_ID"

# Description text for every identifier field that can appear in ACS_ID_FIELDS.
_ACS_ID_FIELD_DESCRIPTIONS = {
    "GEO_ID": "Unique identifier for geography",
    "SUMLEVEL": "Code representing the geographic summary level for the data",
    "NAME": "Name by which geography is known",
    "STATE": "Unique identifier for state in which geography is located",
    "COUNTY": "Unique identifier for county in which geography is located",
    "TRACT": "Unique identifier for tract in which geography is located",
}

# Identifier fields, in order, for each supported geography level.
_ACS_ID_FIELD_NAMES = {
    "blockgroup": ("GEO_ID", "SUMLEVEL", "STATE", "COUNTY", "TRACT"),
    "tract": ("GEO_ID", "SUMLEVEL", "STATE", "COUNTY"),
    "county": ("GEO_ID", "SUMLEVEL", "NAME", "STATE", "COUNTY"),
    "state": ("GEO_ID", "SUMLEVEL", "NAME", "STATE"),
    "msa": ("GEO_ID", "SUMLEVEL", "NAME"),
    "division": ("GEO_ID", "SUMLEVEL", "NAME"),
    "us": ("GEO_ID", "SUMLEVEL", "NAME"),
}

# Field descriptors (name/type/description) of the identifier columns for each
# geography level. Every descriptor is a separate dict object, so modifying the
# entry for one level never affects another level.
ACS_ID_FIELDS = {
    level: [
        {
            "name": fieldName,
            "type": "string",
            "description": _ACS_ID_FIELD_DESCRIPTIONS[fieldName],
        }
        for fieldName in fieldNames
    ]
    for level, fieldNames in _ACS_ID_FIELD_NAMES.items()
}
48
+
49
# The standard age groups, listed in their sort order.
_ACS_STANDARD_AGEGROUPS = (
    'Under 5 years',
    '5 to 9 years',
    '10 to 14 years',
    '15 to 19 years',
    '20 to 24 years',
    '25 to 29 years',
    '30 to 34 years',
    '35 to 39 years',
    '40 to 44 years',
    '45 to 49 years',
    '50 to 54 years',
    '55 to 59 years',
    '60 to 64 years',
    '65 to 69 years',
    '70 to 74 years',
    '75 to 79 years',
    '80 to 84 years',
    '85 years and over',
)

# Standard age groups that are combined from several finer-grained age labels.
# A standard group that is not listed here maps only from its own label.
_ACS_SPLIT_AGEGROUPS = {
    '15 to 19 years': ('15 to 17 years', '18 and 19 years'),
    '20 to 24 years': ('20 years', '21 years', '22 to 24 years'),
    '60 to 64 years': ('60 and 61 years', '62 to 64 years'),
    '65 to 69 years': ('65 and 66 years', '67 to 69 years'),
}

# Maps each detailed age label to the standard age group that contains it.
ACS_STANDARD_AGEGROUP_MAP = {
    detailedGroup: standardGroup
    for standardGroup in _ACS_STANDARD_AGEGROUPS
    for detailedGroup in _ACS_SPLIT_AGEGROUPS.get(standardGroup, (standardGroup,))
}

# Maps each standard age group to its 1-based position, youngest group first.
ACS_AGEGROUP_SORT_ORDER = {
    standardGroup: position
    for position, standardGroup in enumerate(_ACS_STANDARD_AGEGROUPS, start=1)
}
95
+
96
def acs_get(url, params, varBatchSize=20, verbose=True):
    """Fetch variables from a Census API endpoint as a pandas dataframe.

    This is a low-level wrapper for Census API requests. The API limits the
    number of variables that may be requested at once (50), so the requested
    variables are split into batches, one request is sent per batch, and the
    results are joined on GEO_ID. GEO_ID is added to every request whether or
    not the caller asked for it, and it becomes the index of the result.
    Columns that the API returns without being asked (e.g. "state", "county")
    are omitted.

    Parameters:
      - url is the base URL of the desired Census API endpoint. For example:

            https://api.census.gov/data/2022/acs/acs1

      - params is a dict (in requests format) of the parameters for the query
        string to be sent to the Census API. It is not modified. For example:

            {
                "get": "GEO_ID,NAME,B01001_001E",
                "for": "county:049,041",
                "in": "state:39"
            }

      - varBatchSize is an integer representing the number of variables to
        request in each batch, not counting GEO_ID. If unspecified it defaults
        to 20. Values above 49 are reduced to 49 so that each request still has
        room for GEO_ID.

      - verbose is a boolean. If True, the function prints text updates of its
        status. The batch-size warning is printed regardless.

    Returns:
      - pandas dataframe indexed by GEO_ID and having a column for each
        requested variable, in the order requested. Later batches are joined
        to the first batch's rows.

    Raises:
      - ValueError if varBatchSize is less than 1.
      - RuntimeError if any request finishes with a status other than 200. The
        message includes the status, the request URL and the response text.
    """
    import requests
    import pandas as pd

    # A batch must hold at least one variable, otherwise the list of variables
    # still to be fetched would never shrink.
    if varBatchSize < 1:
        raise ValueError("varBatchSize must be at least 1 (got {}).".format(varBatchSize))

    # We need to reserve one variable in each batch for GEO_ID. If the user requests more than 49 variables per
    # batch, reduce the batch size to 49 to respect the API limit
    if varBatchSize > 49:
        print("WARNING: Requested variable batch size exceeds API limit. Reducing batch size to 50 (including GEO_ID).")
        varBatchSize = 49

    # Extract a list of all of the requested variables from the request parameters
    allVars = [name.strip() for name in params["get"].split(",") if name.strip()]
    if verbose:
        print("Total variables requested: {}".format(len(allVars)))

    # GEO_ID is sent with every request, so leave it out of the batches themselves.
    dataVars = [name for name in allVars if name != "GEO_ID"]
    batches = [dataVars[start:start + varBatchSize] for start in range(0, len(dataVars), varBatchSize)]
    if not batches:
        # Only GEO_ID (or nothing) was requested; one request is still needed to get the index.
        batches = [[]]

    acsData = None
    remainingCount = len(dataVars)
    for requestCount, batch in enumerate(batches, start=1):
        if verbose:
            print("Starting request #{0}. {1} variables remain.".format(requestCount, remainingCount))

        shortList = ["GEO_ID"] + batch

        # Create a set of API query parameters for this request. It is a copy of the original parameters,
        # but with the list of variables replaced by the short list
        shortListParams = dict(params)
        shortListParams["get"] = ",".join(shortList)

        # Send the API request. Throw an error if the resulting status code indicates a failure condition.
        r = requests.get(url, params=shortListParams)
        if r.status_code != 200:
            raise RuntimeError(
                "Request finished with status {}. Request URL: {} Response text: {}".format(
                    r.status_code, r.url, r.text
                )
            )

        # The first record holds the column headers; the rest are the data.
        records = r.json()
        columns = records[0]
        df = pd.DataFrame.from_records(records[1:], columns=columns)

        # Keep only the requested columns (plus GEO_ID). This has the effect of removing
        # unrequested variables like "state" and "county"
        df = df.filter(items=shortList, axis="columns").set_index("GEO_ID")

        # The first batch starts the output dataframe; later batches are joined to it using the GEO_ID.
        acsData = df if acsData is None else acsData.join(df)

        remainingCount -= len(batch)

    return acsData
195
+
196
def acs_label_to_dimensions(labelSeries, dimensionNames=None):
    """Split Census API variable labels into one column per data dimension.

    Each variable provided by the Census API has a label, as described in the
    API variable list, e.g. https://api.census.gov/data/2022/acs/acs5/variables.html.
    For example, the label for B25127_004E looks like this:

        Estimate!!Total:!!Owner occupied:!!Built 2020 or later:!!1, detached or attached

    The dimensions for the variable are the groups of words separated by ":!!".
    Here "Owner occupied" refers to tenure, "Built 2020 or later" refers to the
    structure age, and "1, detached or attached" refers to the structure
    configuration or class, so the dimensions might be described as follows:

        dimensionNames = ["Tenure", "Structure age", "Structure class"]

    The leading element of each label ("Estimate!!Total" in the example) is
    discarded. Only the text before the first "|" in a label is used.
    NOTE(review): presumably anything after "|" is extra annotation text;
    confirm against the label source.

    Inputs:
      - labelSeries is a pandas Series object that contains a set of labels,
        one for each ACS variable of interest. The index of this series
        typically should match the dataframe that you want to join the
        dimension values to.
      - dimensionNames is a list (or tuple) that contains descriptions of the
        dimensions represented by each element in the label. These are used as
        column headers in the output dataframe and their number must match the
        number of dimension columns. If dimensionNames is not provided, the
        columns keep their integer positions (1, 2, ...).

    Outputs:
      - df is a dataframe where each record represents the set of dimensions
        for an ACS variable and each column represents the value of one
        dimension for that variable. A dimension that a label does not reach is
        left missing. Continuing with the example above, a truncated output may
        look like this:

        |              | Tenure         | Structure age       | Structure class         |
        |--------------|----------------|---------------------|-------------------------|
        | B25127_004E  | Owner occupied | Built 2020 or later | 1, detached or attached |
    """
    import numpy as np
    import pandas as pd

    # Removing "!!" first leaves ":" as the only separator between elements.
    # regex=False makes the literal replacement explicit rather than relying on
    # the pandas default, which has changed between versions.
    labelParts = (
        labelSeries
        .apply(lambda label: label.split("|")[0])
        .str.strip()
        .str.replace("!!", "", regex=False)
        .apply(lambda label: label.split(":"))
    )

    # Expand each list into columns 0..n. Column 0 is the leading
    # "Estimate!!Total"-style element, which is not a dimension. A label that
    # ends in ":" produces a trailing empty string, which is treated as missing.
    df = (
        labelParts
        .apply(pd.Series)
        .drop(columns=0)
        .replace("", np.nan)
    )

    if isinstance(dimensionNames, (list, tuple)):
        df.columns = list(dimensionNames)
    return df
235
+
236
def acs_generate_universe_table(acsDataRaw, universeVar):
    """Produce a table of the universe (total) estimate and MOE for a variable.

    Inputs:
      - acsDataRaw is a pandas dataframe holding a raw ACS data extract, for
        example one produced by reading an output of morpc-census-acs-fetch as
        follows:

            resource = frictionless.Resource(ACS_COUNTY_RESOURCE_SOURCE_PATH)
            acsDataRaw = resource.to_pandas()

        GEO_ID may be either the index or an ordinary column. The dataframe is
        not modified.
      - universeVar is the ACS variable included in acsDataRaw that represents
        the universe/total. Omit the "E" or "M" suffix.
        For example: universeVar = "B25003_001"

    Outputs:
      - dataframe indexed by GEOID (the part of GEO_ID that follows "US") with
        the columns "NAME", "Universe" (from universeVar + "E") and
        "Universe MOE" (from universeVar + "M"). Any of these columns that
        cannot be found in acsDataRaw is omitted.
    """
    # Bring GEO_ID into the columns if it is the index.
    if "GEO_ID" in acsDataRaw.columns:
        acsUniverse = acsDataRaw.copy()
    else:
        acsUniverse = acsDataRaw.reset_index()

    # Match the estimate and MOE columns by exact name. Matching on a substring
    # would discard NAME and could pick up other columns that merely contain
    # universeVar.
    acsUniverse = acsUniverse.rename(columns={
        universeVar + "E": "Universe",
        universeVar + "M": "Universe MOE",
    })

    # Convert the GEOID to short form, e.g. "0500000US39049" becomes "39049".
    acsUniverse["GEOID"] = acsUniverse["GEO_ID"].apply(lambda x: x.split("US")[1])
    acsUniverse = acsUniverse \
        .set_index("GEOID") \
        .filter(items=["NAME", "Universe", "Universe MOE"], axis="columns")

    return acsUniverse
259
+
260
def acs_generate_dimension_table(acsDataRaw, schema, idFields, dimensionNames):
    """Produce a long-form table of ACS values annotated with their dimensions.

    Inputs:
      - acsDataRaw is a pandas dataframe holding a raw ACS data extract, for
        example one produced by reading an output of morpc-census-acs-fetch as
        follows:

            resource = frictionless.Resource(ACS_COUNTY_RESOURCE_SOURCE_PATH)
            acsDataRaw = resource.to_pandas()

        After its index is reset it must contain a GEO_ID column. The
        dataframe is not modified.
      - schema is passed to morpc.frictionless_name_to_desc_map(), whose result
        is used to look up the description (Census API label) of each variable.
        NOTE(review): presumably this is the frictionless schema that describes
        acsDataRaw; confirm against the caller.
      - idFields is a list of the columns to drop before the data is melted.
        Every remaining column other than GEOID is treated as a variable.
      - dimensionNames is the list of dimension names that is handed to
        acs_label_to_dimensions() and used for the dimension columns.

    Outputs:
      - dataframe with one record per geography and variable, having the
        columns "GEOID" (the part of GEO_ID that follows "US"), "Variable",
        "Value", one column per dimension, and "Variable type", which is
        "Estimate" when the variable name ends in "E" and "MOE" otherwise.
    """
    import morpc

    # Convert the GEOID to short form. Melt the data from wide to long form. Create a description field containing
    # the variable label provided by the Census API.
    dimensionTable = acsDataRaw.copy().reset_index()
    dimensionTable["GEOID"] = dimensionTable["GEO_ID"].apply(lambda x: x.split("US")[1])
    dimensionTable = dimensionTable \
        .drop(columns=idFields) \
        .melt(id_vars=["GEOID"], var_name="Variable", value_name='Value')
    dimensionTable["description"] = dimensionTable["Variable"].map(morpc.frictionless_name_to_desc_map(schema))

    # Split the description string into dimensions and drop the description. The dimension columns are joined on
    # the row index, which both dataframes share.
    dimensionTable = dimensionTable \
        .join(acs_label_to_dimensions(dimensionTable['description'], dimensionNames=dimensionNames), how="left") \
        .drop(columns=["description"])

    # Add a field annotating whether the variable is a margin of error or an estimate.
    dimensionTable["Variable type"] = dimensionTable["Variable"].apply(lambda x: ("Estimate" if x[-1] == "E" else "MOE"))

    return dimensionTable
291
+
292
def acs_flatten_category(inDf, categoryField, subclassField):
    """Merge top-level categories that have no subclasses into the subclass field.

    Sometimes ACS data has one dimension that represents subclasses of another.
    For example, see this excerpt from C24030 (Sex by Industry) which shows
    subclasses for agriculture, forestry, etc. However some top level
    categories - such as construction - do not have subclasses. This function
    identifies the top level categories that have no subclasses and flattens
    those categories with the subclasses. This allows for more convenient
    comparison and summarizing across industries.

    For example, this is what C24030 (partial) looks like before flattening:

        Label                                                          United States!!Estimate
        Total:                                                         162590221
            Male:                                                      85740285
                Agriculture, forestry, fishing and hunting, and mining: 1984422
                    Agriculture, forestry, fishing and hunting         1453344
                    Mining, quarrying, and oil and gas extraction      531078
                Construction                                           9968254
                Manufacturing                                          11394524
                Wholesale trade                                        2467558
                Retail trade                                           9453931

    This is what it looks like after flattening. Note that the entire field
    for the top-level category is dropped.

        Label                                                          United States!!Estimate
        Total:                                                         162590221
            Male:                                                      85740285
              Agriculture, forestry, fishing and hunting               1453344
              Mining, quarrying, and oil and gas extraction            531078
              Construction                                             9968254
              Manufacturing                                            11394524
              Wholesale trade                                          2467558
              Retail trade                                             9453931

    Inputs:
      - inDf is a pandas dataframe that was created using
        acs_generate_dimension_table(). It is not modified.
      - categoryField is a string representing the field name of the field
        that holds top-level categories.
      - subclassField is a string representing the field name of the field
        that holds the sub-classes.

    Outputs:
      - copy of inDf without the categoryField column and without the records
        whose category is missing. For records whose category has no subclass
        anywhere in inDf, the subclass field holds the category. All other
        records keep their subclass value, including a missing one.
    """
    # Records without a category cannot be flattened and are discarded.
    df = inDf.dropna(subset=[categoryField]).copy()

    # A category has subclasses if any of its records has a subclass value.
    categoriesWithSubclasses = set(df.loc[df[subclassField].notna(), categoryField])
    noSubclasses = ~df[categoryField].isin(categoriesWithSubclasses)

    # Assign by position rather than by index label so that the result is
    # correct even when the index contains duplicate labels.
    rowMask = noSubclasses.to_numpy()
    df.loc[rowMask, subclassField] = df.loc[rowMask, categoryField].to_numpy()

    return df.drop(columns=categoryField)
@@ -0,0 +1 @@
1
+ from .frictionless import *