morpc 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morpc/__init__.py +5 -0
- morpc/census/__init__.py +1 -0
- morpc/census/census.py +344 -0
- morpc/frictionless/__init__.py +1 -0
- morpc/frictionless/frictionless.py +621 -0
- morpc/morpc.py +2168 -0
- morpc-0.2.0.dist-info/METADATA +28 -0
- morpc-0.2.0.dist-info/RECORD +10 -0
- morpc-0.2.0.dist-info/WHEEL +5 -0
- morpc-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,621 @@
|
|
|
1
|
+
|
|
2
|
+
# md5() computes the MD5 checksum for a file. When the original checksum is known, the current checksum can be compared
|
|
3
|
+
# to it to determine whether the file has changed.
|
|
4
|
+
#
|
|
5
|
+
# Input parameters:
|
|
6
|
+
# - fname is a string representing the path to the file for which the checksum is to be computed
|
|
7
|
+
#
|
|
8
|
+
# Returns:
|
|
9
|
+
# - MD5 checksum for the file
|
|
10
|
+
def md5(fname):
|
|
11
|
+
import hashlib
|
|
12
|
+
hash_md5 = hashlib.md5()
|
|
13
|
+
with open(fname, "rb") as f:
|
|
14
|
+
for chunk in iter(lambda: f.read(4096), b""):
|
|
15
|
+
hash_md5.update(chunk)
|
|
16
|
+
return hash_md5.hexdigest()
|
|
17
|
+
|
|
18
|
+
# Functions for manipulating schemas in Frictionless TableSchema format
|
|
19
|
+
# Reference: https://specs.frictionlessdata.io/table-schema/
|
|
20
|
+
|
|
21
|
+
# Given the path to a Frictionless schema file in JSON or YAML format, load the file into memory as a Frictionless
|
|
22
|
+
# Schema object.
|
|
23
|
+
def load_schema(path):
|
|
24
|
+
import frictionless
|
|
25
|
+
return frictionless.Schema(path)
|
|
26
|
+
|
|
27
|
+
# Given the path to a Frictionless Resource file in JSON or YAML format, load the file into memory as a Frictionless
|
|
28
|
+
# Resource object.
|
|
29
|
+
def load_resource(path):
|
|
30
|
+
import frictionless
|
|
31
|
+
return frictionless.Resource(path)
|
|
32
|
+
|
|
33
|
+
# Given a Frictionless TableSchema object, return a list containing the names of the fields defined in the schema.
|
|
34
|
+
# NOTE: This is implemented natively using the TableSchema.field_names() method. Functional implementation is just to provide
|
|
35
|
+
# consistency with morpc.avro_get_field_names()
|
|
36
|
+
def get_field_names(schema):
|
|
37
|
+
import frictionless
|
|
38
|
+
return schema.field_names
|
|
39
|
+
|
|
40
|
+
# Given a Frictionless TableSchema object, return a dictionary mapping each field name to the corresponding data type
|
|
41
|
+
# specified in the schema. The resulting dictionary is suitable for use by the pandas.DataFrame.astype() method (for example)
|
|
42
|
+
def name_to_dtype_map(schema):
|
|
43
|
+
import frictionless
|
|
44
|
+
return {schema.fields[i].name:schema.fields[i].type for i in range(len(schema.fields))}
|
|
45
|
+
|
|
46
|
+
# Given a Frictionless TableSchema object, return a dictionary mapping each field name to the corresponding description
|
|
47
|
+
# specified in the schema.
|
|
48
|
+
def name_to_desc_map(schema):
|
|
49
|
+
import frictionless
|
|
50
|
+
return {schema.fields[i].name:schema.fields[i].description for i in range(len(schema.fields))}
|
|
51
|
+
|
|
52
|
+
# Given a dataframe and the Frictionless Schema object (see load_schema), recast each of the fields in the
|
|
53
|
+
# dataframe to the data type specified in the schema.
|
|
54
|
+
def cast_field_types(df, schema, forceInteger=False, handleMissingFields="error", verbose=True):
|
|
55
|
+
import frictionless
|
|
56
|
+
import pandas as pd
|
|
57
|
+
import shapely
|
|
58
|
+
import json
|
|
59
|
+
outDF = df.copy()
|
|
60
|
+
|
|
61
|
+
for field in schema.fields:
|
|
62
|
+
fieldName = field.name
|
|
63
|
+
fieldType = field.type
|
|
64
|
+
if(not fieldName in df.columns):
|
|
65
|
+
if(handleMissingFields == "ignore"):
|
|
66
|
+
print("cast_field_types | WARNING | Skipping field {} which is not present in dataframe".format(fieldName))
|
|
67
|
+
continue
|
|
68
|
+
elif(handleMissingFields == "add"):
|
|
69
|
+
print("cast_field_types | WARNING | Adding field {} which is not present in dataframe".format(fieldName))
|
|
70
|
+
add_missing_fields(df, schema, fieldNames=fieldName, verbose=verbose)
|
|
71
|
+
continue
|
|
72
|
+
else:
|
|
73
|
+
print("cast_field_types | ERROR | Field {} is not present in dataframe. To handle missing fields, see argument handleMissingFields.".format(fieldName))
|
|
74
|
+
raise RuntimeError
|
|
75
|
+
|
|
76
|
+
if(verbose):
|
|
77
|
+
print("cast_field_types | INFO | Casting field {} as type {}.".format(fieldName, fieldType))
|
|
78
|
+
# The following section is necessary because the pandas "int" type does not support null values. If null values are present,
|
|
79
|
+
# the field must be cast as "Int64" instead.
|
|
80
|
+
if((fieldType == "int") or (fieldType == "integer")):
|
|
81
|
+
try:
|
|
82
|
+
# Try to cast the field as an "int". This will fail if nulls are present.
|
|
83
|
+
outDF[fieldName] = outDF[fieldName].astype("int")
|
|
84
|
+
except:
|
|
85
|
+
try:
|
|
86
|
+
# Try to cast as "Int64", which supports nulls. This will fail if the fractional part is non-zero.
|
|
87
|
+
if(verbose):
|
|
88
|
+
print("cast_field_types | WARNING | Failed conversion of fieldname {} to type 'int'. Trying type 'Int64' instead.".format(fieldName))
|
|
89
|
+
outDF[fieldName] = outDF[fieldName].astype("Int64")
|
|
90
|
+
except:
|
|
91
|
+
if(forceInteger == True):
|
|
92
|
+
# If the user has allowed coercion of the values to integers, then round the values to the ones place prior to
|
|
93
|
+
# converting to "Int64"
|
|
94
|
+
print("cast_field_types | WARNING | Failed conversion of fieldname {} to type 'Int64'. Trying to round first.".format(fieldName))
|
|
95
|
+
outDF[fieldName] = outDF[fieldName].astype("float").round(0).astype("Int64")
|
|
96
|
+
else:
|
|
97
|
+
# If the user has not allow coercion of the values to integers, then throw an error.
|
|
98
|
+
print("cast_field_types | ERROR | Unable to coerce value to Int64 type. Ensure that fractional part of values is zero, or set forceInteger=True")
|
|
99
|
+
raise RuntimeError
|
|
100
|
+
elif(fieldType == "number"):
|
|
101
|
+
outDF[fieldName] = outDF[fieldName].astype("float")
|
|
102
|
+
elif(fieldType == "date"):
|
|
103
|
+
outDF[fieldName] = pd.to_datetime(outDF[fieldName])
|
|
104
|
+
elif(fieldType == "geojson"):
|
|
105
|
+
try:
|
|
106
|
+
print(f"cast_field_types | INFO | Fieldname {fieldName} as geojson. Attempting to convert to geometry.")
|
|
107
|
+
outDF[fieldName] = [shapely.geometry.shape(json.loads(x)) for x in outDF[fieldName]]
|
|
108
|
+
except RuntimeError as r:
|
|
109
|
+
print(f"cast_field_types | ERROR | Unable to convert to geometry.")
|
|
110
|
+
finally:
|
|
111
|
+
print(f"cast_field_types | INFO | Fieldname {fieldName} cast as geometry.")
|
|
112
|
+
elif(fieldType == "boolean"):
|
|
113
|
+
if(outDF[fieldName].dtype == "bool"):
|
|
114
|
+
print("cast_field_types | WARNING | Fieldname {} already cast as boolean type. Skipping casting for this field.".format(fieldName))
|
|
115
|
+
continue
|
|
116
|
+
elif(outDF[fieldName].dtype != "string"):
|
|
117
|
+
print("cast_field_types | WARNING | Standardizing fieldname {} as a string prior to conversion to boolean.".format(fieldName))
|
|
118
|
+
outDF[fieldName] = outDF[fieldName].astype("string")
|
|
119
|
+
|
|
120
|
+
# The field definition in the schema may contain properties trueValues and/or falseValues which specify what values
|
|
121
|
+
# represent True and False, respectively. If trueVales or falseValues are unspecified, Frictionless recognizes the
|
|
122
|
+
# following values by default:
|
|
123
|
+
# trueValues: ['true', 'True', 'TRUE', '1']
|
|
124
|
+
# falseValues: ['false', 'False', 'FALSE', '0']
|
|
125
|
+
trueValues = field.true_values
|
|
126
|
+
falseValues = field.false_values
|
|
127
|
+
|
|
128
|
+
# Map each of the true and false values to the appropriate Python boolean values
|
|
129
|
+
truthMap = {}
|
|
130
|
+
for value in trueValues:
|
|
131
|
+
truthMap[value] = True
|
|
132
|
+
for value in falseValues:
|
|
133
|
+
truthMap[value] = False
|
|
134
|
+
|
|
135
|
+
# Compare the values found in the field to the set of valid true and false values. If there are values in the
|
|
136
|
+
# data that are among the valid values, throw an error.
|
|
137
|
+
validValuesSet = set(list(truthMap.keys()))
|
|
138
|
+
foundValuesSet = set(outDF[fieldName].unique())
|
|
139
|
+
if(foundValuesSet > validValuesSet):
|
|
140
|
+
print("cast_field_types | ERROR | Fieldname {0} contains values that are not recognized as true or false: {1}".format(fieldName, ", ".join(list(foundValuesSet-validValuesSet))))
|
|
141
|
+
raise RuntimeError
|
|
142
|
+
|
|
143
|
+
# Now that we are confident that all of the values are valid in string form, map them to actual boolean values
|
|
144
|
+
outDF[fieldName] = outDF[fieldName].map(truthMap)
|
|
145
|
+
|
|
146
|
+
# Finally, make the change official by changing the pandas field type to "bool".
|
|
147
|
+
outDF[fieldName] = outDF[fieldName].astype("bool")
|
|
148
|
+
|
|
149
|
+
else:
|
|
150
|
+
outDF[fieldName] = outDF[fieldName].astype(fieldType)
|
|
151
|
+
|
|
152
|
+
return outDF
|
|
153
|
+
|
|
154
|
+
# Given a dataframe and the Frictionless Schema object (see load_schema), add any fields in the schema that
|
|
155
|
+
# are missing in the dataframe. If fieldNames == None, any fields missing from the schema will be added to the dataframe
|
|
156
|
+
# with the correct type and null values. If fieldNames is a string or list of strings, only those fields will be added.
|
|
157
|
+
def add_missing_fields(df, schema, fieldNames=None, verbose=True):
|
|
158
|
+
import frictionless
|
|
159
|
+
outDF = df.copy()
|
|
160
|
+
|
|
161
|
+
if(fieldNames == None):
|
|
162
|
+
myFieldNames = schema.field_names
|
|
163
|
+
elif(type(fieldNames) == str):
|
|
164
|
+
myFieldNames = [fieldNames]
|
|
165
|
+
elif(type(fieldNames) == list):
|
|
166
|
+
myFieldNames = fieldNames
|
|
167
|
+
else:
|
|
168
|
+
print("add_missing_fields | ERROR | If provided, argument fieldNames must be a string containing a single field name or a list of strings")
|
|
169
|
+
raise RuntimeError
|
|
170
|
+
|
|
171
|
+
# Iterate through all of the fields defined in the schema
|
|
172
|
+
for field in schema.fields:
|
|
173
|
+
fieldName = field.name
|
|
174
|
+
fieldType = field.type
|
|
175
|
+
|
|
176
|
+
# If this field is not in the list of fields to add, skip it and move on to the next
|
|
177
|
+
if(not fieldName in myFieldNames):
|
|
178
|
+
continue
|
|
179
|
+
|
|
180
|
+
# If the requested field is actually missing then add it. Otherwise notify the user that it is already present and skip it.
|
|
181
|
+
if(not fieldName in df.columns):
|
|
182
|
+
# If the field is missing, add it.
|
|
183
|
+
if(verbose == True):
|
|
184
|
+
print("add_missing_fields | INFO | Adding missing field {0}, type {1}, filled with null values.".format(fieldName, fieldType))
|
|
185
|
+
outDF[fieldName] = None
|
|
186
|
+
|
|
187
|
+
if((fieldType == "int") or (fieldType == "integer")):
|
|
188
|
+
if(verbose == True):
|
|
189
|
+
print("add_missing_fields | WARNING | Field {0} specified as type {1} (pandas type 'int'), which does not support null values in pandas. Casting field as pandas type 'Int64' instead.".format(fieldName, fieldType))
|
|
190
|
+
df[fieldName] = df[fieldName].astype("Int64")
|
|
191
|
+
elif(fieldType == "number"):
|
|
192
|
+
outDF[fieldName] = outDF[fieldName].astype("float")
|
|
193
|
+
else:
|
|
194
|
+
outDF[fieldName] = outDF[fieldName].astype(fieldType)
|
|
195
|
+
else:
|
|
196
|
+
# If the field is not missing, skip it
|
|
197
|
+
print("add_missing_fields | WARNING | User-specified field {0} is already present in the dataframe. Skipping it.".format(fieldName))
|
|
198
|
+
continue
|
|
199
|
+
|
|
200
|
+
return outDF
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def create_resource(dataPath, title=None, name=None, description=None, resourcePath=None, schemaPath=None, resFormat=None,
|
|
205
|
+
resProfile=None, resMediaType=None, computeHash=True, computeBytes=True, ignoreSchema=False,
|
|
206
|
+
writeResource=False, validate=False):
|
|
207
|
+
"""Create a Frictionless resource object using sane default values for some attributes. Optionally, write the
|
|
208
|
+
resource file to disk and validate the resource file, schema, and data.
|
|
209
|
+
|
|
210
|
+
Parameters
|
|
211
|
+
----------
|
|
212
|
+
dataPath : str
|
|
213
|
+
The path to the data file that the resource file will describe, as you want it to appear in the resource file.
|
|
214
|
+
Typically the data lives in the same directory as the resource file, in which case dataPath is simply the data file name.
|
|
215
|
+
Could instead be a relative path (RELATIVE TO THE LOCATION OF THE RESOURCE FILE) or a URL. It may NOT be an absolute path.
|
|
216
|
+
title : str
|
|
217
|
+
Optional. The value for the title attribute in the resource file. A human-readable title that describes the data. If
|
|
218
|
+
unspecified, defaults to a title derived from the data file name.
|
|
219
|
+
name : str
|
|
220
|
+
Optional. The value for the name attribute in the resource file. A unique, machine-readable string to refer to the resource.
|
|
221
|
+
Must be lowercase and must not contain spaces. If unspecified, defaults to a name derived from the data file name.
|
|
222
|
+
description : str
|
|
223
|
+
Optional. The value for the description attribute in the resource file. A human-readable detailed description of the data and
|
|
224
|
+
any interpretation or usage guidelines as required. If unspecified, defaults to a generic description attributing
|
|
225
|
+
the data to MORPC.
|
|
226
|
+
resourcePath : str
|
|
227
|
+
Optional. If you wish to write the resource object to disk as a resource file (see writeResource), you may specify the target
|
|
228
|
+
path here. Can be an absolute path or a path RELATIVE TO THE CURRENT WORKING DIRECTORY of the script. The values for dataPath
|
|
229
|
+
and schemaPath typically should be specified relative to this location. If unspecified, the resource will be created in the
|
|
230
|
+
directory specified or implied by dataPath. In that case it will have the same basename as the data file but with
|
|
231
|
+
the extension replaced by ".resource.yaml"
|
|
232
|
+
schemaPath : str
|
|
233
|
+
Optional. The path to the schema file that describes the data. Typically the schema lives in the same directory as the
|
|
234
|
+
resource file, in which case this is just the schema file name. Could instead by a relative path (RELATIVE TO THE LOCATION OF THE
|
|
235
|
+
RESOURCE file) or a URL. It may NOT be an absolute path. If unspecified, it will be assumed that the schema is in the same
|
|
236
|
+
directory as the data and that it hase same basename as the data file but with the extension replaced by ".schema.yaml". If
|
|
237
|
+
ignoreSchema is True, the schema will be omitted from the resource, regardless of whether a path is specified.
|
|
238
|
+
resFormat : str
|
|
239
|
+
Optional. The value for the format attribute in the resource file. The file type in which the data is formatted (e.g. csv, xlsx,
|
|
240
|
+
json). If unspecified, will attempt to infer this from the extension of the data file. See Frictionless documentation for supported formats and EXTENSION_MAP in the function code for the subset of formats that can be inferred.
|
|
241
|
+
resProfile : str
|
|
242
|
+
Optional. The value for the profile attribute in the resource file. If unspecified, defaults to "data-resource". Typically you will
|
|
243
|
+
not have to change this. See Frictionless documentation for other supported profiles.
|
|
244
|
+
resMediaType : str
|
|
245
|
+
Optional. The value for the mediatype attribute in the resource file. The MIME type that best describes the data file. If
|
|
246
|
+
unspecified, will attempt to infer this from the extension of the data file. If you need to specify it manually, search the internet for the appropriate MIME type. See EXTENSION_MAP in the function code for the subset of mediatypes that can be inferred.
|
|
247
|
+
computeHash : bool
|
|
248
|
+
Optional. If True, compute the MD5 hash for the data file and include it in the hash attribute in the resource. Defaults to True. If resourcePath is not specified, assume the data path is relative to the current working directory.
|
|
249
|
+
computeBytes : bool
|
|
250
|
+
Optional. If True, compute the file size for the data file and include it in the bytes attribute in the resource. Defaults to True. If resourcePath is not specified, assume the data path is relative to the current working directory.
|
|
251
|
+
ignoreSchema : bool
|
|
252
|
+
Optional. If True, no schema information will be included in the resource even if a path is provided.
|
|
253
|
+
writeResource : bool
|
|
254
|
+
Optional. If True, write the resource file to disk. Defaults to false. If resourcePath is provided, use that path. If resourcePath is not provided, write the resource to the current working directory.
|
|
255
|
+
validate : bool
|
|
256
|
+
Optional. If True, the resource file, schema file, and data file will be validated. Note that writeResource must be True to
|
|
257
|
+
use this option.
|
|
258
|
+
|
|
259
|
+
Returns
|
|
260
|
+
-------
|
|
261
|
+
resource : frictionless.resources.table.TableResource
|
|
262
|
+
A Frictionless TableResource object which describes the data
|
|
263
|
+
"""
|
|
264
|
+
import os
|
|
265
|
+
import re
|
|
266
|
+
import frictionless
|
|
267
|
+
|
|
268
|
+
EXTENSION_MAP = {
|
|
269
|
+
".gpkg": {
|
|
270
|
+
"format":"gpkg",
|
|
271
|
+
"mediatype":"geopackage+sqlite3"
|
|
272
|
+
},
|
|
273
|
+
".csv": {
|
|
274
|
+
"format":"csv",
|
|
275
|
+
"mediatype":"text/csv"
|
|
276
|
+
},
|
|
277
|
+
".xls": {
|
|
278
|
+
"format":"xls",
|
|
279
|
+
"mediatype":"application/vnd.ms-excel"
|
|
280
|
+
},
|
|
281
|
+
".xlsx": {
|
|
282
|
+
"format":"xlsx",
|
|
283
|
+
"mediatype":"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
284
|
+
},
|
|
285
|
+
".dbf": {
|
|
286
|
+
"format":"dbf",
|
|
287
|
+
"mediatype":"application/dbf"
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
dataFilePath = os.path.normpath(dataPath)
|
|
292
|
+
dataFileName = os.path.splitext(os.path.basename(dataFilePath))[0]
|
|
293
|
+
dataFileExtension = os.path.splitext(os.path.basename(dataFilePath))[1]
|
|
294
|
+
|
|
295
|
+
if(os.path.basename(dataFilePath) != os.path.normpath(dataFilePath)):
|
|
296
|
+
# If dataFilePath is not simply a filename
|
|
297
|
+
print("morpc.create_resource | WARNING | You seem to have specified a data path that is not simply a file name. This implies that the data is located in a different directory than the resource file. Typically the data is located in the same directory as the resource file and the path is simply the filename.")
|
|
298
|
+
|
|
299
|
+
resourceFilePath = None
|
|
300
|
+
if(resourcePath != None):
|
|
301
|
+
if(not writeResource):
|
|
302
|
+
# Warn the user if they specified a resource file location but did not enable writeResource
|
|
303
|
+
print("morpc.create_resource | WARNING | You specified a path for the resource file, however writeResource is not set to True. Resource file will not be written to disk.")
|
|
304
|
+
|
|
305
|
+
# If the user has specified a path to the resource file, we'll use it without modification. Warn the user if the choice is unusual.
|
|
306
|
+
if(os.path.basename(dataFilePath) != os.path.normpath(dataFilePath)):
|
|
307
|
+
# If dataFilePath is not simply a filename
|
|
308
|
+
if(os.path.dirname(os.path.abspath(resourcePath)) != os.path.dirname(os.path.abspath(dataFilePath))):
|
|
309
|
+
# If the absolute path to the resource file and the absolute path to the data put them in different directories
|
|
310
|
+
print("morpc.create_resource | WARNING | You seem to have specified a path for the resource file that is in a different directory than the data. Typically the data is located in the same directory as the resource file and the path is simply the filename.")
|
|
311
|
+
resourceFilePath = os.path.normpath(resourcePath)
|
|
312
|
+
|
|
313
|
+
if resFormat != None:
|
|
314
|
+
resourceFormat = resFormat
|
|
315
|
+
else:
|
|
316
|
+
if dataFileExtension.lower() in EXTENSION_MAP:
|
|
317
|
+
resourceFormat = EXTENSION_MAP[dataFileExtension.lower()]["format"]
|
|
318
|
+
print("morpc.create_resource | INFO | Format not specified. Using format derived from data file extension: {}".format(resourceFormat))
|
|
319
|
+
else:
|
|
320
|
+
print("morpc.create_resource | ERROR | Format not specified and could not be determined from data file extension.")
|
|
321
|
+
raise RuntimeError
|
|
322
|
+
|
|
323
|
+
if(not ignoreSchema):
|
|
324
|
+
# If ignoreSchema is False, determine the schema file path
|
|
325
|
+
if(schemaPath != None):
|
|
326
|
+
# If the user has specified a path to the resource file, we'll use it without modification. Warn the user if the choice is unusual.
|
|
327
|
+
if(os.path.basename(dataFilePath) != os.path.normpath(dataFilePath)):
|
|
328
|
+
# If dataFilePath is not simply a filename
|
|
329
|
+
if(os.path.dirname(os.path.abspath(schemaPath)) != os.path.dirname(os.path.abspath(dataFilePath))):
|
|
330
|
+
# If the absolute path to the schema file and the absolute path to the data put them in different directories
|
|
331
|
+
print("morpc.create_resource | WARNING | You seem to have specified a path for the schema file that is in a different directory than the data. Typically the schema is located in the same directory as the data.")
|
|
332
|
+
schemaFilePath = os.path.normpath(schemaPath)
|
|
333
|
+
else:
|
|
334
|
+
# If the user has not specified a path to the schema file, we'll assume that it should go in the same directory as the data. In that
|
|
335
|
+
# case, derive the path from the data path.
|
|
336
|
+
schemaFilePath = dataFilePath.replace(dataFileExtension, ".schema.yaml")
|
|
337
|
+
print("morpc.create_resource | INFO | Schema path not specified. Using path derived from data file path: {}".format(schemaFilePath))
|
|
338
|
+
|
|
339
|
+
if title != None:
|
|
340
|
+
resourceTitle = title
|
|
341
|
+
else:
|
|
342
|
+
resourceTitle = dataFileName
|
|
343
|
+
print("morpc.create_resource | INFO | Title not specified. Using placeholder value derived from data filename: {}".format(resourceTitle))
|
|
344
|
+
|
|
345
|
+
if name != None:
|
|
346
|
+
resourceName = name
|
|
347
|
+
else:
|
|
348
|
+
resourceName = re.sub(r"\W+", "-", dataFileName).lower()
|
|
349
|
+
print("morpc.create_resource | INFO | Name not specified. Using placeholder value derived from data filename: {}".format(resourceName))
|
|
350
|
+
|
|
351
|
+
if description != None:
|
|
352
|
+
resourceDescription = description
|
|
353
|
+
else:
|
|
354
|
+
resourceDescription = "This dataset was produced by MORPC. For more information, please contact dataandmaps@morpc.org."
|
|
355
|
+
print("morpc.create_resource | INFO | Description not specified. Using boilerplate placeholder value: {}".format(resourceDescription))
|
|
356
|
+
|
|
357
|
+
if resMediaType != None:
|
|
358
|
+
resourceMediaType = resMediaType
|
|
359
|
+
else:
|
|
360
|
+
if dataFileExtension.lower() in EXTENSION_MAP:
|
|
361
|
+
resourceMediaType = EXTENSION_MAP[dataFileExtension.lower()]["mediatype"]
|
|
362
|
+
else:
|
|
363
|
+
print("morpc.create_resource | ERROR | Media type not specified and could not be determined from data file extension.")
|
|
364
|
+
raise RuntimeError
|
|
365
|
+
|
|
366
|
+
if resProfile != None:
|
|
367
|
+
resourceProfile = resProfile
|
|
368
|
+
else:
|
|
369
|
+
resourceProfile = "data-resource"
|
|
370
|
+
|
|
371
|
+
resource = frictionless.Resource.from_descriptor({
|
|
372
|
+
"name": resourceName,
|
|
373
|
+
"title": resourceTitle,
|
|
374
|
+
"description": resourceDescription,
|
|
375
|
+
"profile": resourceProfile,
|
|
376
|
+
"path": dataFilePath,
|
|
377
|
+
"format": resourceFormat,
|
|
378
|
+
"mediatype": resourceMediaType,
|
|
379
|
+
})
|
|
380
|
+
|
|
381
|
+
if(not ignoreSchema):
|
|
382
|
+
resource.schema = schemaFilePath
|
|
383
|
+
|
|
384
|
+
unlocatedDataWarningIssued = False
|
|
385
|
+
if(computeHash):
|
|
386
|
+
if(resourceFilePath != None):
|
|
387
|
+
resource.hash = md5(os.path.join(os.path.dirname(resourceFilePath), dataFilePath))
|
|
388
|
+
else:
|
|
389
|
+
try:
|
|
390
|
+
print("morpc.create_resource | WARNING | Data path is specified relative to resource file, however no resource file path was specified. Assuming data path is relative to current working directory.")
|
|
391
|
+
unlocatedDataWarningIssued = True
|
|
392
|
+
resource.hash = md5(dataFilePath)
|
|
393
|
+
except:
|
|
394
|
+
print("morpc.create_resource | ERROR | Unable to compute MD5 hash. Data file could not be located.")
|
|
395
|
+
raise RuntimeError
|
|
396
|
+
|
|
397
|
+
if(computeBytes):
|
|
398
|
+
# If the data path is relative, we need to know the resource file path
|
|
399
|
+
if(resourceFilePath != None):
|
|
400
|
+
resource.bytes = os.path.getsize(os.path.join(os.path.dirname(resourceFilePath), dataFilePath))
|
|
401
|
+
else:
|
|
402
|
+
try:
|
|
403
|
+
if(not unlocatedDataWarningIssued):
|
|
404
|
+
print("morpc.create_resource | WARNING | Data path is specified relative to resource file, however no resource file path was specified. Assuming data path is relative to current working directory.")
|
|
405
|
+
resource.hash = md5(dataFilePath)
|
|
406
|
+
except:
|
|
407
|
+
print("morpc.create_resource | ERROR | Unable to compute file size (bytes). Data file could not be located.")
|
|
408
|
+
raise RuntimeError
|
|
409
|
+
|
|
410
|
+
if(writeResource):
|
|
411
|
+
if(resourceFilePath != None):
|
|
412
|
+
print("morpc.create_resource | INFO | Writing Frictionless Resource file to {}".format(resourceFilePath))
|
|
413
|
+
write_resource(resource, resourceFilePath)
|
|
414
|
+
else:
|
|
415
|
+
print("morpc.create_resource | ERROR | Unable to validate resource. No resource file path specified.")
|
|
416
|
+
raise RuntimeError
|
|
417
|
+
|
|
418
|
+
if(validate == True):
|
|
419
|
+
if(resourceFilePath != None):
|
|
420
|
+
print("morpc.create_resource | INFO | Validating resource on disk.")
|
|
421
|
+
validate_resource(resourceFilePath)
|
|
422
|
+
else:
|
|
423
|
+
print("morpc.create_resource | ERROR | Unable to validate resource. No resource file path specified.")
|
|
424
|
+
raise RuntimeError
|
|
425
|
+
|
|
426
|
+
return resource
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def write_resource(resource, resourcePath):
|
|
431
|
+
"""Given a Frictionless resource object and a path to a target file, this function writes the resource to disk in YAML
|
|
432
|
+
format. It is a wrapper for frictionless.Resource.to_yaml() that is necessary when the paths to the data and/or schema
|
|
433
|
+
files are specified as relative paths.
|
|
434
|
+
|
|
435
|
+
Parameters
|
|
436
|
+
----------
|
|
437
|
+
resource : frictionless.resources.table.TableResource
|
|
438
|
+
A Frictionless TableResource object which describes the data
|
|
439
|
+
resourcePath : str
|
|
440
|
+
The path to the Frictionless Resource file that describes the data.
|
|
441
|
+
"""
|
|
442
|
+
|
|
443
|
+
import os
|
|
444
|
+
import frictionless
|
|
445
|
+
|
|
446
|
+
cwd = os.getcwd()
|
|
447
|
+
|
|
448
|
+
try:
|
|
449
|
+
os.chdir(os.path.dirname(resourcePath))
|
|
450
|
+
resource.to_yaml(os.path.basename(resourcePath))
|
|
451
|
+
except Exception as e:
|
|
452
|
+
os.chdir(cwd)
|
|
453
|
+
print("ERROR: An unhandled error occurred while trying to write the Frictionless resource: {}".format(e))
|
|
454
|
+
raise RuntimeError
|
|
455
|
+
|
|
456
|
+
os.chdir(cwd)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def validate_resource(resourcePath, verbose=True):
|
|
460
|
+
import os
|
|
461
|
+
import frictionless
|
|
462
|
+
cwd = os.getcwd()
|
|
463
|
+
|
|
464
|
+
try:
|
|
465
|
+
os.chdir(os.path.dirname(resourcePath))
|
|
466
|
+
|
|
467
|
+
if(verbose):
|
|
468
|
+
print("morpc.validate_resource | INFO | Validating resource on disk (including data and schema). This may take some time.")
|
|
469
|
+
resourceOnDisk = frictionless.Resource(os.path.basename(resourcePath))
|
|
470
|
+
results = resourceOnDisk.validate()
|
|
471
|
+
|
|
472
|
+
except Exception as e:
|
|
473
|
+
os.chdir(cwd)
|
|
474
|
+
print("morpc.validate_resource | ERROR | An unhandled error occurred while trying to validate the Frictionless resource: {}".format(e))
|
|
475
|
+
raise RuntimeError
|
|
476
|
+
|
|
477
|
+
os.chdir(cwd)
|
|
478
|
+
|
|
479
|
+
if(results.valid == True):
|
|
480
|
+
if(verbose):
|
|
481
|
+
print("morpc.validate_resource | INFO | Resource is valid")
|
|
482
|
+
return True
|
|
483
|
+
else:
|
|
484
|
+
if(verbose):
|
|
485
|
+
print("morpc.validate_resource | ERROR | Resource is NOT valid. Errors follow.")
|
|
486
|
+
print(results)
|
|
487
|
+
return False
|
|
488
|
+
|
|
489
|
+
def load_data(resourcePath, archiveDir=None, validate=False, verbose=True):
|
|
490
|
+
"""Often we want to make a copy of some input data and work with the copy, for example to protect
|
|
491
|
+
the original data or to create an archival copy of it so that we can replicate the process later.
|
|
492
|
+
The `load_data()` function simplifies the process of reading the data and
|
|
493
|
+
(optionally) validating the data and/or making an archival copy.
|
|
494
|
+
|
|
495
|
+
Parameters
|
|
496
|
+
----------
|
|
497
|
+
resourcePath : str
|
|
498
|
+
The path to the Frictionless Resource file that describes the data.
|
|
499
|
+
archiveDir : str
|
|
500
|
+
Optional. The path to the directory where a copy of a data should be archived. If this is specified,
|
|
501
|
+
the Resource file, schema file, and data file will be archived in this location.
|
|
502
|
+
validate : bool
|
|
503
|
+
Optional. If True, the resource file, schema file, and data file will be validated. If archiveDir is
|
|
504
|
+
specified, the copies of the files will be validated. If not, the original files will be validated.
|
|
505
|
+
|
|
506
|
+
Returns
|
|
507
|
+
-------
|
|
508
|
+
df : pandas.core.frame.DataFrame
|
|
509
|
+
A GeoPandas GeoDataframe constructed from the data at the location specified by sourcePath and layerName
|
|
510
|
+
resource : frictionless.resources.table.TableResource
|
|
511
|
+
A Frictionless TableResource object which describes the data
|
|
512
|
+
schema : frictionless.schema.schema.Schema
|
|
513
|
+
A Frictionless Schema object which describes the data
|
|
514
|
+
"""
|
|
515
|
+
|
|
516
|
+
import frictionless
|
|
517
|
+
import pandas as pd
|
|
518
|
+
import os
|
|
519
|
+
import json
|
|
520
|
+
import shutil
|
|
521
|
+
|
|
522
|
+
myResourcePath = os.path.normpath(resourcePath)
|
|
523
|
+
|
|
524
|
+
print("morpc.load_data | INFO | Loading Frictionless Resource file at location {}".format(myResourcePath))
|
|
525
|
+
|
|
526
|
+
resource = load_resource(myResourcePath)
|
|
527
|
+
|
|
528
|
+
sourceDir = os.path.dirname(myResourcePath)
|
|
529
|
+
resourceFilename = os.path.basename(myResourcePath)
|
|
530
|
+
dataFileExtension = os.path.splitext(resource.path)[1]
|
|
531
|
+
|
|
532
|
+
if(archiveDir != None):
|
|
533
|
+
|
|
534
|
+
targetResource = os.path.join(archiveDir, resourceFilename)
|
|
535
|
+
targetData = os.path.join(archiveDir, resource.path)
|
|
536
|
+
# Surely there is a more convenient way to get the schema path from the Resource object?
|
|
537
|
+
targetSchema = os.path.join(archiveDir, json.loads(resource.to_json())["schema"])
|
|
538
|
+
|
|
539
|
+
try:
|
|
540
|
+
print("morpc.load_data | INFO | Copying data, resource file, and schema to directory {}".format(archiveDir))
|
|
541
|
+
|
|
542
|
+
shutil.copyfile(os.path.join(sourceDir, resourceFilename), targetResource)
|
|
543
|
+
shutil.copyfile(os.path.join(sourceDir, resource.path), targetData)
|
|
544
|
+
shutil.copyfile(os.path.join(sourceDir, json.loads(resource.to_json())["schema"]), targetSchema)
|
|
545
|
+
except Exception as e:
|
|
546
|
+
print("morpc.load_data | ERROR | Unhandled exception when trying to copy data and associated Frictionless files: {}".format(e))
|
|
547
|
+
raise RuntimeError
|
|
548
|
+
|
|
549
|
+
else:
|
|
550
|
+
targetResource = os.path.join(sourceDir, resourceFilename)
|
|
551
|
+
targetData = os.path.join(sourceDir, resource.path)
|
|
552
|
+
targetSchema = os.path.join(sourceDir, json.loads(resource.to_json())["schema"])
|
|
553
|
+
|
|
554
|
+
print("morpc.load_data | INFO | Loading data, resource file, and schema from their source locations")
|
|
555
|
+
|
|
556
|
+
print("morpc.load_data | INFO | --> Data file: {}".format(targetData))
|
|
557
|
+
print("morpc.load_data | INFO | --> Resource file: {}".format(targetResource))
|
|
558
|
+
print("morpc.load_data | INFO | --> Schema file: {}".format(targetSchema))
|
|
559
|
+
|
|
560
|
+
if(validate):
|
|
561
|
+
print("morpc.load_data | INFO | Validating resource including data and schema.")
|
|
562
|
+
resourceValid = validate_resource(targetResource)
|
|
563
|
+
if(not resourceValid):
|
|
564
|
+
print("morpc.load_data | ERROR | Validation failed. Errors should be described above.")
|
|
565
|
+
raise RuntimeError
|
|
566
|
+
|
|
567
|
+
print("morpc.load_data | INFO | Loading data.")
|
|
568
|
+
if(dataFileExtension == ".csv"):
|
|
569
|
+
df = pd.read_csv(targetData, dtype="string")
|
|
570
|
+
elif(dataFileExtension == ".xlsx"):
|
|
571
|
+
df = pd.read_excel(targetData)
|
|
572
|
+
else:
|
|
573
|
+
print("morpc.load_data | ERROR | Unknown data file extension: {}".format(dataFileExtension))
|
|
574
|
+
raise RuntimeError
|
|
575
|
+
|
|
576
|
+
df = cast_field_types(df, resource.schema, verbose=verbose)
|
|
577
|
+
|
|
578
|
+
return df, resource, resource.schema
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
# Given the path to a schema document in Avro format, load the Avro schema and reformat it as a
|
|
582
|
+
# Frictionless Schema object in memory
|
|
583
|
+
# WARNING: This function has not been extensively tested. Be sure to validate the resulting
|
|
584
|
+
# Frictionless schema
|
|
585
|
+
def schema_from_avro(path):
|
|
586
|
+
import frictionless
|
|
587
|
+
import os
|
|
588
|
+
|
|
589
|
+
fieldList = []
|
|
590
|
+
avroSchema = load_avro_schema(os.path.normpath(path))
|
|
591
|
+
for field in avroSchema["fields"]:
|
|
592
|
+
thisField = {}
|
|
593
|
+
for key in field:
|
|
594
|
+
if key == "name":
|
|
595
|
+
thisField["name"] = field[key]
|
|
596
|
+
elif key == "type":
|
|
597
|
+
if field[key] == "int":
|
|
598
|
+
thisField["type"] = "integer"
|
|
599
|
+
elif field[key] == "float":
|
|
600
|
+
thisField["type"] = "number"
|
|
601
|
+
else:
|
|
602
|
+
thisField["type"] = field[key]
|
|
603
|
+
elif key == "doc":
|
|
604
|
+
thisField["description"] = field[key]
|
|
605
|
+
fieldList.append(thisField)
|
|
606
|
+
|
|
607
|
+
frictionlessSchemaDescriptor = {
|
|
608
|
+
"fields": fieldList
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
results = frictionless.Schema.validate_descriptor(frictionlessSchemaDescriptor)
|
|
612
|
+
if(results.valid == True):
|
|
613
|
+
print("Schema is valid")
|
|
614
|
+
else:
|
|
615
|
+
print("ERROR: Schema is NOT valid. Errors follow.")
|
|
616
|
+
print(results)
|
|
617
|
+
raise RuntimeError
|
|
618
|
+
|
|
619
|
+
frictionlessSchema = frictionless.Schema.from_descriptor(frictionlessSchemaDescriptor)
|
|
620
|
+
|
|
621
|
+
return frictionlessSchema
|