morpc 0.3.3__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {morpc-0.3.3 → morpc-0.3.4}/PKG-INFO +1 -1
  2. {morpc-0.3.3 → morpc-0.3.4}/morpc/__init__.py +1 -1
  3. {morpc-0.3.3 → morpc-0.3.4}/morpc/census/census.py +41 -2
  4. {morpc-0.3.3 → morpc-0.3.4}/morpc/frictionless/frictionless.py +19 -5
  5. {morpc-0.3.3 → morpc-0.3.4}/morpc/morpc.py +208 -40
  6. {morpc-0.3.3 → morpc-0.3.4}/morpc.egg-info/PKG-INFO +1 -1
  7. {morpc-0.3.3 → morpc-0.3.4}/.gitattributes +0 -0
  8. {morpc-0.3.3 → morpc-0.3.4}/.github/workflows/deploy.yml +0 -0
  9. {morpc-0.3.3 → morpc-0.3.4}/.github/workflows/python-publish.yml +0 -0
  10. {morpc-0.3.3 → morpc-0.3.4}/.gitignore +0 -0
  11. {morpc-0.3.3 → morpc-0.3.4}/README.md +0 -0
  12. {morpc-0.3.3 → morpc-0.3.4}/docs/.gitignore +0 -0
  13. {morpc-0.3.3 → morpc-0.3.4}/docs/.ipynb_checkpoints/index-checkpoint.md +0 -0
  14. {morpc-0.3.3 → morpc-0.3.4}/docs/.ipynb_checkpoints/morpc-color-demo-checkpoint.ipynb +0 -0
  15. {morpc-0.3.3 → morpc-0.3.4}/docs/.ipynb_checkpoints/myst-checkpoint.yml +0 -0
  16. {morpc-0.3.3 → morpc-0.3.4}/docs/01-morpc-py-demos.ipynb +0 -0
  17. {morpc-0.3.3 → morpc-0.3.4}/docs/02-morpc-countylookup-demo.ipynb +0 -0
  18. {morpc-0.3.3 → morpc-0.3.4}/docs/03-morpc-varlookup-demo.ipynb +0 -0
  19. {morpc-0.3.3 → morpc-0.3.4}/docs/04-morpc-restapi-demo.ipynb +0 -0
  20. {morpc-0.3.3 → morpc-0.3.4}/docs/05-morpc-geos-demo.ipynb +0 -0
  21. {morpc-0.3.3 → morpc-0.3.4}/docs/06-morpc-frictionless-demo.ipynb +0 -0
  22. {morpc-0.3.3 → morpc-0.3.4}/docs/07-morpc-census-demo.ipynb +0 -0
  23. {morpc-0.3.3 → morpc-0.3.4}/docs/08-morpc-plot-demo.ipynb +0 -0
  24. {morpc-0.3.3 → morpc-0.3.4}/docs/09-morpc-color-demo.ipynb +0 -0
  25. {morpc-0.3.3 → morpc-0.3.4}/docs/assets/HORIZONTAL_LOGOS_PRIMARY_COLOR_V2.png +0 -0
  26. {morpc-0.3.3 → morpc-0.3.4}/docs/index.md +0 -0
  27. {morpc-0.3.3 → morpc-0.3.4}/docs/myst.yml +0 -0
  28. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/MORPC MPO Boundary.gpkg +0 -0
  29. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/Screenshot 2025-06-03 080403.png +0 -0
  30. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/dataChartToExcelOutput.xlsx +0 -0
  31. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/plot_df.csv +0 -0
  32. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/plot_df.resource.yaml +0 -0
  33. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/plot_df.schema.yaml +0 -0
  34. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/rest_resource.json +0 -0
  35. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/temp_df.csv +0 -0
  36. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/temp_df.resource.yaml +0 -0
  37. {morpc-0.3.3 → morpc-0.3.4}/docs/temp_data/temp_df.schema.yaml +0 -0
  38. {morpc-0.3.3 → morpc-0.3.4}/morpc/census/__init__.py +0 -0
  39. {morpc-0.3.3 → morpc-0.3.4}/morpc/color/.ipynb_checkpoints/color-checkpoint.py +0 -0
  40. {morpc-0.3.3 → morpc-0.3.4}/morpc/color/.ipynb_checkpoints/morpc_colors-checkpoint.json +0 -0
  41. {morpc-0.3.3 → morpc-0.3.4}/morpc/color/__init__.py +0 -0
  42. {morpc-0.3.3 → morpc-0.3.4}/morpc/color/color.py +0 -0
  43. {morpc-0.3.3 → morpc-0.3.4}/morpc/color/morpc_colors.json +0 -0
  44. {morpc-0.3.3 → morpc-0.3.4}/morpc/color/palette.py +0 -0
  45. {morpc-0.3.3 → morpc-0.3.4}/morpc/frictionless/__init__.py +0 -0
  46. {morpc-0.3.3 → morpc-0.3.4}/morpc/plot/__init__.py +0 -0
  47. {morpc-0.3.3 → morpc-0.3.4}/morpc/plot/plot.py +0 -0
  48. {morpc-0.3.3 → morpc-0.3.4}/morpc/rest_api/__init__.py +0 -0
  49. {morpc-0.3.3 → morpc-0.3.4}/morpc/rest_api/rest_api.py +0 -0
  50. {morpc-0.3.3 → morpc-0.3.4}/morpc.egg-info/SOURCES.txt +0 -0
  51. {morpc-0.3.3 → morpc-0.3.4}/morpc.egg-info/dependency_links.txt +0 -0
  52. {morpc-0.3.3 → morpc-0.3.4}/morpc.egg-info/requires.txt +0 -0
  53. {morpc-0.3.3 → morpc-0.3.4}/morpc.egg-info/top_level.txt +0 -0
  54. {morpc-0.3.3 → morpc-0.3.4}/pyproject.toml +0 -0
  55. {morpc-0.3.3 → morpc-0.3.4}/release_new_package.md +0 -0
  56. {morpc-0.3.3 → morpc-0.3.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: morpc
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: Data management tools used by MORPC
5
5
  Author-email: MORPC data team <dataandmaps@morpc.org>
6
6
  License-Expression: MIT
@@ -1,4 +1,4 @@
1
- __version__ = "0.3.3"
1
+ __version__ = "0.3.4"
2
2
 
3
3
  from .morpc import *
4
4
  import morpc.frictionless
@@ -10,14 +10,20 @@ ACS_ID_FIELDS = {
10
10
  {"name":"SUMLEVEL", "type":"string", "description":"Code representing the geographic summary level for the data"},
11
11
  {"name":"STATE","type":"string","description":"Unique identifier for state in which geography is located"},
12
12
  {"name":"COUNTY","type":"string","description":"Unique identifier for county in which geography is located"},
13
- {"name":"TRACT","type":"string","description":"Unique identifier for tract in which geography is located"}
13
+ {"name":"TRACT","type":"string","description":"Unique identifier for tract in which geography is located"}
14
14
  ],
15
15
  "tract": [
16
16
  {"name":"GEO_ID", "type":"string", "description":"Unique identifier for geography"},
17
17
  {"name":"SUMLEVEL", "type":"string", "description":"Code representing the geographic summary level for the data"},
18
18
  {"name":"STATE","type":"string","description":"Unique identifier for state in which geography is located"},
19
19
  {"name":"COUNTY","type":"string","description":"Unique identifier for county in which geography is located"}
20
- ],
20
+ ],
21
+ "county subdivision": [
22
+ {"name":"GEO_ID", "type":"string", "description":"Unique identifier for geography"},
23
+ {"name":"SUMLEVEL", "type":"string", "description":"Code representing the geographic summary level for the data"},
24
+ {"name":"STATE","type":"string","description":"Unique identifier for state in which geography is located"},
25
+ {"name":"COUNTY","type":"string","description":"Unique identifier for county in which geography is located"}
26
+ ],
21
27
  "county": [
22
28
  {"name":"GEO_ID", "type":"string", "description":"Unique identifier for geography"},
23
29
  {"name":"SUMLEVEL", "type":"string", "description":"Code representing the geographic summary level for the data"},
@@ -278,6 +284,39 @@ def api_get(url, params, varBatchSize=20, verbose=True):
278
284
  # |--------------|----------------|---------------------|-------------------------|
279
285
  # | B25127_004E | Owner occupied | Built 2020 or later | 1, detached or attached |
280
286
  #
287
+
288
def acs_variables_by_group(groupNumber, acsYear, acsSurvey):
    """
    Get all variables that belong to a Census ACS variable group.

    Downloads the variables list published by the Census API for the requested
    ACS year and survey, then keeps only the entries whose "group" attribute is
    equal to groupNumber.

    Parameters
    ----------
    groupNumber : str
        The group number to search for within the variables table, e.g. "B11001".

    acsYear : str
        The year of the survey, e.g. "2023".

    acsSurvey : str
        The ACS survey to get variables for, e.g. "1" or "5".

    Returns
    -------
    dict
        A dict keyed by variable name. Each value is the record that the API
        returned for that variable. The dict is empty if no variable belongs
        to the requested group.

    Raises
    ------
    requests.HTTPError
        If the Census API responds with an error status code.
    """
    import requests

    url = f'https://api.census.gov/data/{acsYear}/acs/acs{acsSurvey}/variables.json'
    response = requests.get(url)
    # Surface a bad year/survey combination as an explicit HTTP error instead of
    # letting it show up later as a confusing JSON decoding failure.
    response.raise_for_status()
    # Named "payload" rather than "json" so that no module name is shadowed.
    payload = response.json()

    # Records that carry no "group" attribute at all are skipped rather than
    # raising KeyError.
    return {
        name: attributes
        for name, attributes in payload['variables'].items()
        if 'group' in attributes and attributes['group'] == groupNumber
    }
319
+
281
320
  def acs_label_to_dimensions(labelSeries, dimensionNames=None):
282
321
  """
283
322
  acs_label_to_dimensions(labelSeries, dimensionNames=None)
@@ -36,7 +36,7 @@ def name_to_desc_map(schema):
36
36
 
37
37
  # Given a dataframe and the Frictionless Schema object (see load_schema), recast each of the fields in the
38
38
  # dataframe to the data type specified in the schema.
39
- def cast_field_types(df, schema, forceInteger=False, handleMissingFields="error", verbose=True):
39
+ def cast_field_types(df, schema, forceInteger=False, forceInt64=False, handleMissingFields="error", verbose=True):
40
40
  import frictionless
41
41
  import pandas as pd
42
42
  import shapely
@@ -64,8 +64,13 @@ def cast_field_types(df, schema, forceInteger=False, handleMissingFields="error"
64
64
  # the field must be cast as "Int64" instead.
65
65
  if((fieldType == "int") or (fieldType == "integer")):
66
66
  try:
67
- # Try to cast the field as an "int". This will fail if nulls are present.
68
- outDF[fieldName] = outDF[fieldName].astype("int")
67
+ if(forceInt64 == True):
68
+ # Cast all integer fields as Int64 whether this is necessary or not. This is useful when trying to merge
69
+ # dataframes with mixed int32 and Int64 values.
70
+ outDF[fieldName] = outDF[fieldName].astype("Int64")
71
+ else:
72
+ # Try to cast the field as an "int". This will fail if nulls are present.
73
+ outDF[fieldName] = outDF[fieldName].astype("int")
69
74
  except:
70
75
  try:
71
76
  # Try to cast as "Int64", which supports nulls. This will fail if the fractional part is non-zero.
@@ -472,7 +477,7 @@ def validate_resource(resourcePath, verbose=True):
472
477
  print(results)
473
478
  return False
474
479
 
475
- def load_data(resourcePath, archiveDir=None, validate=False, verbose=True):
480
+ def load_data(resourcePath, archiveDir=None, validate=False, forceInteger=False, forceInt64=False, verbose=True):
476
481
  """Often we want to make a copy of some input data and work with the copy, for example to protect
477
482
  the original data or to create an archival copy of it so that we can replicate the process later.
478
483
  The `load_data()` function simplifies the process of reading the data and
@@ -488,6 +493,15 @@ def load_data(resourcePath, archiveDir=None, validate=False, verbose=True):
488
493
  validate : bool
489
494
  Optional. If True, the resource file, schema file, and data file will be validated. If archiveDir is
490
495
  specified, the copies of the files will be validated. If not, the original files will be validated.
496
+ Defaults to False.
497
+ forceInteger : bool
498
+ Optional. If True, then try harder to cast integer fields. This may involve rounding the values to the ones place.
499
+ Defaults to False.
500
+ forceInt64 : bool
501
+ Optional. If True, then cast all integer fields as Int64 regardless of whether this is necessary. This is useful
502
+ when trying to merge dataframes which would otherwise have mixed int32 and Int64 fields. Defaults to False.
503
+ verbose : bool
504
+ Optional. If False, then most output will be suppressed. Defaults to True.
491
505
 
492
506
  Returns
493
507
  -------
@@ -559,7 +573,7 @@ def load_data(resourcePath, archiveDir=None, validate=False, verbose=True):
559
573
  print("morpc.load_data | ERROR | Unknown data file extension: {}".format(dataFileExtension))
560
574
  raise RuntimeError
561
575
 
562
- df = cast_field_types(df, resource.schema, verbose=verbose)
576
+ df = cast_field_types(df, resource.schema, forceInteger=forceInteger, forceInt64=forceInt64, verbose=verbose)
563
577
 
564
578
  return df, resource, resource.schema
565
579
 
@@ -196,7 +196,7 @@ SUMLEVEL_DESCRIPTIONS = {
196
196
  "plural":"states",
197
197
  "hierarchy_string":"STATE",
198
198
  "authority":"census",
199
- "idField":"STATEID",
199
+ "idField":"STATEFP",
200
200
  "nameField":"STATE"
201
201
  },
202
202
  '050': {
@@ -204,7 +204,7 @@ SUMLEVEL_DESCRIPTIONS = {
204
204
  "plural":"counties",
205
205
  "hierarchy_string":"COUNTY",
206
206
  "authority":"census",
207
- "idField":"COUNTYID",
207
+ "idField":"COUNTYFP",
208
208
  "nameField":"COUNTY"
209
209
  },
210
210
  '060': {
@@ -212,7 +212,7 @@ SUMLEVEL_DESCRIPTIONS = {
212
212
  "plural":"county subdivisions",
213
213
  "hierarchy_string":"COUNTY-COUSUB",
214
214
  "authority":"census",
215
- "idField":"COUSUBID",
215
+ "idField":"COUSUBFP",
216
216
  "nameField":"COUSUB"
217
217
  },
218
218
  '070': {
@@ -230,24 +230,24 @@ SUMLEVEL_DESCRIPTIONS = {
230
230
  "plural":"census blocks",
231
231
  "hierarchy_string":"COUNTY-TRACT-BG-BLOCK",
232
232
  "authority":"census",
233
- "idField":"BLOCKID",
234
- "nameField":"BLOCK"
235
- },
233
+ "idField":"BLOCKCE",
234
+ "nameField":None
235
+ },
236
236
  '140': {
237
237
  "singular":"tract",
238
238
  "plural":"tracts",
239
239
  "hierarchy_string":"COUNTY-TRACT",
240
240
  "authority":"census",
241
- "idField":"TRACTID",
242
- "nameField":"TRACT"
241
+ "idField":"TRACTCE",
242
+ "nameField":None
243
243
  },
244
244
  '150': {
245
245
  "singular":"block group",
246
246
  "plural":"block groups",
247
247
  "hierarchy_string":"COUNTY-TRACT-BG",
248
248
  "authority":"census",
249
- "idField":"BLOCKGROUPID",
250
- "nameField":"BLOCKGROUP"
249
+ "idField":"BLKGRPCE",
250
+ "nameField":None
251
251
  },
252
252
  '155': {
253
253
  "singular":"place county part",
@@ -262,7 +262,7 @@ SUMLEVEL_DESCRIPTIONS = {
262
262
  "plural":"places",
263
263
  "hierarchy_string":"PLACE",
264
264
  "authority":"census",
265
- "idField":"PLACEID",
265
+ "idField":"PLACEFP",
266
266
  "nameField":"PLACE"
267
267
  },
268
268
  '310': {
@@ -270,7 +270,7 @@ SUMLEVEL_DESCRIPTIONS = {
270
270
  "plural":"metro areas",
271
271
  "hierarchy_string":"CBSA",
272
272
  "authority":"census",
273
- "idField":"CBSAID",
273
+ "idField":"CBSAFP",
274
274
  "nameField":"CBSA"
275
275
  },
276
276
  '400': {
@@ -278,7 +278,7 @@ SUMLEVEL_DESCRIPTIONS = {
278
278
  "plural":"urban areas",
279
279
  "hierarchy_string":"URBANAREA",
280
280
  "authority":"census",
281
- "idField":"URBANAREAID",
281
+ "idField":"UACE",
282
282
  "nameField":"URBANAREA"
283
283
  },
284
284
  '500': {
@@ -286,7 +286,7 @@ SUMLEVEL_DESCRIPTIONS = {
286
286
  "plural":"congressional districts",
287
287
  "hierarchy_string":"CONGRESS",
288
288
  "authority":"census",
289
- "idField":"CONGRESSID",
289
+ "idField":"CDFP", # Census uses CDNNNFP where NNN is the congressional session number
290
290
  "nameField":"CONGRESS"
291
291
  },
292
292
  '610': {
@@ -294,23 +294,23 @@ SUMLEVEL_DESCRIPTIONS = {
294
294
  "plural":"state senate districts",
295
295
  "hierarchy_string":"STATESENATE",
296
296
  "authority":"census",
297
- "idField":"STATESENATEID",
298
- "nameField":"STATESENATE"
297
+ "idField":"SLDUST",
298
+ "nameField":None
299
299
  },
300
300
  '620': {
301
301
  "singular":"state house district",
302
302
  "plural":"state house districts",
303
303
  "hierarchy_string":"STATEHOUSE",
304
304
  "authority":"census",
305
- "idField":"STATEHOUSEID",
306
- "nameField":"STATEHOUSE"
305
+ "idField":"SLDLST",
306
+ "nameField":None
307
307
  },
308
308
  '795': {
309
309
  "singular":"public use microdata area",
310
310
  "plural":"public use microdata areas",
311
311
  "hierarchy_string":"PUMA",
312
312
  "authority":"census",
313
- "idField":"PUMAID",
313
+ "idField":"PUMACE",
314
314
  "nameField":"PUMA"
315
315
  },
316
316
  '850': {
@@ -318,7 +318,7 @@ SUMLEVEL_DESCRIPTIONS = {
318
318
  "plural":"zip code tabulation areas",
319
319
  "hierarchy_string":"ZCTA3",
320
320
  "authority":"census",
321
- "idField":"ZCTA3",
321
+ "idField":"ZCTA3CE",
322
322
  "nameField":None
323
323
  },
324
324
  '860': {
@@ -326,7 +326,7 @@ SUMLEVEL_DESCRIPTIONS = {
326
326
  "plural":"zip code tabulation area",
327
327
  "hierarchy_string":"ZCTA5",
328
328
  "authority":"census",
329
- "idField":"ZCTA5",
329
+ "idField":"ZCTA5CE",
330
330
  "nameField":None
331
331
  },
332
332
  '930': {
@@ -340,25 +340,25 @@ SUMLEVEL_DESCRIPTIONS = {
340
340
  '950': {
341
341
  "singular":"elementary school district",
342
342
  "plural":"elementary school districts",
343
- "hierarchy_string":"SDELEM",
343
+ "hierarchy_string":"ELSD",
344
344
  "authority":"census",
345
- "idField":"SCHOOLDELEMID",
345
+ "idField":"ELSDLEA",
346
346
  "nameField":"SCHOOLDELEM"
347
347
  },
348
348
  '960': {
349
349
  "singular":"high school district",
350
350
  "plural":"high school districts",
351
- "hierarchy_string":"SDHIGH",
351
+ "hierarchy_string":"SCSD",
352
352
  "authority":"census",
353
- "idField":"SCHOOLDHIGHID",
353
+ "idField":"SCSDLEA",
354
354
  "nameField":"SCHOOLDHIGH"
355
355
  },
356
356
  '970': {
357
357
  "singular":"unified school district",
358
358
  "plural":"unified school districts",
359
- "hierarchy_string":"SDUNIFIED",
359
+ "hierarchy_string":"UNSD",
360
360
  "authority":"census",
361
- "idField":"SCHOOLDID",
361
+ "idField":"UNSDLEA",
362
362
  "nameField":"SCHOOLD"
363
363
  },
364
364
  'M01': {
@@ -446,7 +446,7 @@ SUMLEVEL_DESCRIPTIONS = {
446
446
  "plural":"Traffic analysis zones",
447
447
  "hierarchy_string":"COUNTY-TAZ",
448
448
  "authority":"morpc",
449
- "idField":"TAZ",
449
+ "idField":"TAZ2020",
450
450
  "nameField":None
451
451
  },
452
452
  'M21': {
@@ -454,7 +454,7 @@ SUMLEVEL_DESCRIPTIONS = {
454
454
  "plural":"Micro analysis zones",
455
455
  "hierarchy_string":"COUNTY-TAZ-MAZ",
456
456
  "authority":"morpc",
457
- "idField":"MAZ",
457
+ "idField":"MAZ2020",
458
458
  "nameField":None
459
459
  },
460
460
  'M22': {
@@ -462,7 +462,7 @@ SUMLEVEL_DESCRIPTIONS = {
462
462
  "plural":"GridMAZ zones",
463
463
  "hierarchy_string":"COUNTY-TAZ-MAZ-GRIDMAZ",
464
464
  "authority":"morpc",
465
- "idField":"GridMAZ",
465
+ "idField":"GridMAZ20",
466
466
  "nameField":None
467
467
  },
468
468
  }
@@ -473,7 +473,6 @@ SUMLEVEL_DESCRIPTIONS = {
473
473
  # GRID1MILE
474
474
  # GRIDQUARTERMILE
475
475
  # COUNTY-COUSUB-SCD
476
- # COUNTY-TRACT-BG-BLOCK
477
476
  # RESBLOB
478
477
  # EMPBLOB
479
478
  # GQBLOB
@@ -817,7 +816,7 @@ def avro_map_from_first_alias(schema):
817
816
  return fieldMap
818
817
 
819
818
  # Wrapper for backward compatibility
820
- def cast_field_types(df, schema, forceInteger=False, handleMissingFields='error', verbose=True):
819
+ def cast_field_types(df, schema, forceInteger=False, forceInt64=False, handleMissingFields='error', verbose=True):
821
820
  """
822
821
  Wrapper for backward compatibility with AVRO Schema
823
822
 
@@ -825,15 +824,15 @@ def cast_field_types(df, schema, forceInteger=False, handleMissingFields='error'
825
824
  import morpc
826
825
  # If schema is a dict object, assume it is in Avro format
827
826
  if(type(schema) == dict):
828
- outDF = avro_cast_field_types(df, schema, forceInteger=forceInteger, verbose=verbose)
827
+ outDF = avro_cast_field_types(df, schema, forceInteger=forceInteger, forceInt64=forceInt64, verbose=verbose)
829
828
  # Otherwise, assume it is in Frictionless format
830
829
  else:
831
- outDF = morpc.frictionless.cast_field_types(df, schema, forceInteger=forceInteger, handleMissingFields=handleMissingFields, verbose=verbose)
830
+ outDF = morpc.frictionless.cast_field_types(df, schema, forceInteger=forceInteger, forceInt64=forceInt64, handleMissingFields=handleMissingFields, verbose=verbose)
832
831
  return outDF
833
832
 
834
833
  # Given a dataframe and the Avro dictionary object that describes its schema (see load_avro_schema), recast each of the fields in the dataframe
835
834
  # to the data type specified in the schema.
836
- def avro_cast_field_types(df, schema, forceInteger=False, verbose=True):
835
+ def avro_cast_field_types(df, schema, forceInteger=False, forceInt64=False, verbose=True):
837
836
  outDF = df.copy()
838
837
  for field in schema["fields"]:
839
838
  fieldName = field["name"]
@@ -844,8 +843,13 @@ def avro_cast_field_types(df, schema, forceInteger=False, verbose=True):
844
843
  # the field must be cast as "Int64" instead.
845
844
  if((fieldType == "int") or (fieldType == "integer")):
846
845
  try:
847
- # Try to cast the field as an "int". This will fail if nulls are present.
848
- outDF[fieldName] = outDF[fieldName].astype(fieldType)
846
+ if(forceInt64 == True):
847
+ # Cast all integer fields as Int64 whether this is necessary or not. This is useful when trying to merge
848
+ # dataframes with mixed int32 and Int64 values.
849
+ outDF[fieldName] = outDF[fieldName].astype("Int64")
850
+ else:
851
+ # Try to cast the field as an "int". This will fail if nulls are present.
852
+ outDF[fieldName] = outDF[fieldName].astype("int")
849
853
  except:
850
854
  try:
851
855
  # Try to cast as "Int64", which supports nulls. This will fail if the fractional part is non-zero.
@@ -1038,6 +1042,139 @@ def load_spatial_data(sourcePath, layerName=None, driverName=None, archiveDir=No
1038
1042
 
1039
1043
  return gdf
1040
1044
 
1045
+ # Load tabular data
1046
def load_tabular_data(sourcePath, sheetName=None, fileType=None, archiveDir=None, archiveFileName=None, verbose=True, sep=None, encoding=None):
    """Often we want to make a copy of some input data and work with the copy, for example to protect
    the original data or to create an archival copy of it so that we can replicate the process later.
    The `load_tabular_data()` function simplifies the process of reading the data and (optionally) making
    an archival copy.

    Example usage: df = morpc.load_tabular_data("somefile.xlsx", sheetName="Sheet1", archiveDir="./input_data")

    Parameters
    ----------
    sourcePath : str
        The path to the tabular data. It may be a file path or URL.
    sheetName : str
        Optional. The name of the sheet that you wish to extract from an Excel workbook. If unspecified, the
        function will read the first sheet in the workbook.
    fileType : str
        Optional. One of "csv" or "xlsx" or "xls". If unspecified, the function will attempt to infer it from
        the file extension in sourcePath (case-insensitive).
    archiveDir : str
        Optional. The path to the directory where a copy of the data should be archived. If this is specified,
        the data will be copied to this location.
    archiveFileName : str
        Optional. If `archiveDir` is specified, you may use this to specify the name of the archived file.
        If this is unspecified, the function will preserve the original filename as-is.
    verbose : bool
        Set verbose to False to reduce the text output from the function.
    sep : str
        Optional. Delimiter to use for delimited text files. Defaults to "," (i.e. CSV file). Tabs ("\\t")
        and pipes ("|") are also common.
    encoding : str
        Optional. Character encoding to use for delimited text files. Defaults to "utf-8" which works in most cases.
        Sometimes other encodings are required. Notably, Census PEP tables require the "ISO-8859-1" encoding.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        A Pandas DataFrame constructed from the data at the location specified by sourcePath and sheetName

    Raises
    ------
    RuntimeError
        If the file type cannot be inferred from the file extension or is not one of the supported types.
    """

    import pandas as pd
    import os

    if(verbose):
        print("morpc.load_tabular_data | INFO | Loading tabular data from location: {}".format(sourcePath))

    # Stays None unless the data has to be staged locally (Census FTP case below). It must be defined
    # on every path because it is inspected again at the end of the function.
    tempDir = None

    # Due to changes at the Census, pd.read_csv(), pd.read_excel(), and requests.get() are blocked. Using wget as a workaround.
    if('www2.census.gov' in sourcePath):
        if(verbose):
            print("morpc.load_tabular_data | INFO | Attempting to load data from Census FTP site. Using wget to retrieve file.")
            print("morpc.load_tabular_data | WARNING | Data from Census FTP must be temp saved. Using ./temp_data.")
        tempDir = os.path.normpath('./temp_data')
        os.makedirs(tempDir, exist_ok=True)
        wget(url = sourcePath, archive_dir = tempDir)
        # From here on, read the local copy rather than the remote file
        sourcePath = os.path.join(tempDir, os.path.split(sourcePath)[-1])

    if(fileType is None):
        if(verbose):
            print("morpc.load_tabular_data | INFO | File type is unspecified. Will attempt to infer file type from file extension in source path.")
        fileExt = os.path.splitext(sourcePath)[1]
        supportedExtensions = {".csv": "csv", ".xlsx": "xlsx", ".xls": "xls"}
        fileType = supportedExtensions.get(fileExt.lower())
        if(fileType is None):
            message = "morpc.load_tabular_data | ERROR | File extension is unsupported: {}.".format(fileExt)
            print(message)
            raise RuntimeError(message)
        if(verbose):
            print("morpc.load_tabular_data | INFO | Selecting file type {} based on file extension {}".format(fileType, fileExt))
    else:
        if(verbose):
            print("morpc.load_tabular_data | INFO | Using file type {} as specified by user.".format(fileType))

    isExcel = fileType in ("xlsx", "xls")

    # Apply the documented default delimiter. Passing sep=None through to pandas would trigger delimiter
    # sniffing on read and would be rejected outright by DataFrame.to_csv() when archiving.
    delimiter = "," if sep is None else sep

    if(sheetName is None and isExcel):
        print("morpc.load_tabular_data | WARNING | Sheet name was not specified. Will load first sheet in workbook.")

    if(verbose):
        print("morpc.load_tabular_data | INFO | Reading tabular data...")

    if(fileType == "csv"):
        df = pd.read_csv(sourcePath, sep=delimiter, encoding=encoding)
    elif(isExcel):
        # sheet_name=None would make pandas return a dict of ALL sheets, so request the first sheet
        # (index 0) explicitly when the user did not name one.
        df = pd.read_excel(sourcePath, sheet_name=(0 if sheetName is None else sheetName))
    else:
        message = "morpc.load_tabular_data | ERROR | File type {} is not handled. Troubleshoot function.".format(fileType)
        print(message)
        raise RuntimeError(message)

    # If the user has specified an archive directory, create an archival copy of the data
    if(archiveDir is not None):
        # If no file name was specified, we need to assign one
        if(archiveFileName is None):
            # First try to determine whether we are retrieving data from an API. In this case we may not be able to extract
            # a file name from the source path. Specifically, look for a "?" character in the path. This is forbidden in
            # Windows file paths and suggests that a query string is present.
            if("?" in sourcePath):
                if(verbose):
                    print("morpc.load_tabular_data | INFO | File name is unspecified and source path appears to be an API query. Will assign an alternate file name.")
                # If the sheet name is specified, use that as the file name. Otherwise use a generic file name.
                if(sheetName is not None):
                    archiveFileName = "{0}.{1}".format(sheetName, fileType)
                else:
                    archiveFileName = "tabularData.{}".format(fileType)
            # If the source path doesn't look like an API query, then attempt to extract the file name from the path
            else:
                if(verbose):
                    print("morpc.load_tabular_data | INFO | File name is unspecified. Will infer file name from source path.")
                archiveFileName = os.path.split(sourcePath)[-1]
            if(verbose):
                print("morpc.load_tabular_data | INFO | Using automatically-selected file name: {}".format(archiveFileName))

        archivePath = os.path.join(archiveDir, archiveFileName)

        if(verbose):
            print("morpc.load_tabular_data | INFO | Creating archival copy of tabular data at {}".format(archivePath))
        if(fileType == "csv"):
            df.to_csv(archivePath, sep=delimiter, encoding=encoding, index=False)
        else:
            # Only Excel types can reach this point; anything else was rejected before reading.
            # DataFrame.to_excel() needs a string sheet name, so fall back to the pandas default.
            archiveSheetName = sheetName if isinstance(sheetName, str) else "Sheet1"
            df.to_excel(archivePath, sheet_name=archiveSheetName, index=False)

    # The staged Census download is intentionally left on disk; ./temp_data may hold other files, so
    # the directory is not removed automatically.
    if(tempDir is not None and verbose):
        print("morpc.load_tabular_data | INFO | Temporary copy of Census file was retained in: {}".format(tempDir))

    return df
1177
+
1041
1178
  # Assign geographic identifiers
1042
1179
  # Sometimes we have a set of locations and we would like to know what geography (county, zipcode, etc.) they fall in. The
1043
1180
  # `assign_geo_identifiers()` function takes a set of georeference points and a list of geography levels and determines for each
@@ -1063,7 +1200,36 @@ def load_spatial_data(sourcePath, layerName=None, driverName=None, archiveDir=No
1063
1200
  # that has not yet been implemented, please contact Adam Porr (or implement it yourself).
1064
1201
  def assign_geo_identifiers(points, geographies):
1065
1202
  """
1066
- TODO: add docstring
1203
+ Assign geographic identifiers
1204
+ Sometimes we have a set of locations and we would like to know what geography (county, zipcode, etc.) they fall in. The
1205
+ `assign_geo_identifiers()` function takes a set of georeference points and a list of geography levels and determines for each
1206
+ level which area each point falls in
1207
+
1208
+ Parameters
1209
+ ----------
1210
+ points : geopandas.GeoDataFrame
1211
+ a GeoPandas GeoDataFrame consisting of the points of interest
1212
+ geographies : list of str
1213
+ A Python list of one or more strings in which each element corresponds to a geography level. You can specify as
1214
+ many levels as you want from the following list, however note that the function must download the polygons and perform the analysis
1215
+ for each level so if you specify many levels it may take a long time.
1216
+ - "county" - County (Census TIGER)
1217
+ - "tract" - *Not currently implemented*
1218
+ - "blockgroup" - *Not currently implemented*
1219
+ - "block" - *Not currently implemented*
1220
+ - "zcta" - Census ZCTA (tl_2024_us_zcta520)
1221
+ - "place" - Census place (Census TIGER)
1222
+ - "placecombo" - *Not currently implemented*
1223
+ - "juris" - *Not currently implemented*
1224
+ - "region15County" - *Not currently implemented*
1225
+ - "region10County" - *Not currently implemented*
1226
+ - "regionCORPO" - *Not currently implemented*
1227
+ - "regionMPO" - *Not currently implemented*
1228
+
1229
+ Returns
1230
+ -------
1231
+ geopandas.GeoDataFrame
1232
+ A geodataframe with column name id_{geographies} representing the id from the geographies passed
1067
1233
  """
1068
1234
  import geopandas as gpd
1069
1235
  import pyogrio
@@ -1098,8 +1264,10 @@ def assign_geo_identifiers(points, geographies):
1098
1264
  print("ERROR: Geography is currently unsupported: {}".format(geography))
1099
1265
  raise RuntimeError
1100
1266
  elif(geography == "zcta"):
1101
- print("ERROR: Geography is currently unsupported: {}".format(geography))
1102
- raise RuntimeError
1267
+ filePath = "https://www2.census.gov/geo/tiger/TIGER2024/ZCTA520/tl_2024_us_zcta520.zip"
1268
+ layerName = None
1269
+ driverName = "Census Shapefile"
1270
+ polyIdField = ""
1103
1271
  elif(geography == "place"):
1104
1272
  filePath = "https://www2.census.gov/geo/tiger/TIGER2020/PLACE/tl_2020_39_place.zip"
1105
1273
  layerName = None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: morpc
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: Data management tools used by MORPC
5
5
  Author-email: MORPC data team <dataandmaps@morpc.org>
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes