morpc 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,621 @@
1
+
2
def md5(fname):
    """Compute the MD5 checksum of a file.

    When the original checksum is known, the current checksum can be compared to it to determine
    whether the file has changed.

    Parameters
    ----------
    fname : str
        Path to the file for which the checksum is to be computed.

    Returns
    -------
    str
        The MD5 checksum of the file contents as a hexadecimal string.
    """
    import hashlib

    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        # Read the file in 4096-byte pieces so that it is never held in memory all at once.
        while True:
            block = stream.read(4096)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
17
+
18
+ # Functions for manipulating schemas in Frictionless TableSchema format
19
+ # Reference: https://specs.frictionlessdata.io/table-schema/
20
+
21
def load_schema(path):
    """Load a Frictionless schema file into memory as a Frictionless Schema object.

    Parameters
    ----------
    path : str
        Path to a Frictionless schema file in JSON or YAML format.

    Returns
    -------
    frictionless.Schema
        The object produced by calling frictionless.Schema on the path.
    """
    from frictionless import Schema

    return Schema(path)
26
+
27
def load_resource(path):
    """Load a Frictionless Resource file into memory as a Frictionless Resource object.

    Parameters
    ----------
    path : str
        Path to a Frictionless Resource file in JSON or YAML format.

    Returns
    -------
    frictionless.Resource
        The object produced by calling frictionless.Resource on the path.
    """
    from frictionless import Resource

    return Resource(path)
32
+
33
def get_field_names(schema):
    """Return the names of the fields defined in a Frictionless schema.

    This simply exposes the schema's own ``field_names`` attribute. The functional form exists only
    to provide consistency with morpc.avro_get_field_names().

    Parameters
    ----------
    schema : frictionless.Schema
        The schema whose field names are wanted (see load_schema).

    Returns
    -------
    list
        The value of ``schema.field_names``.
    """
    # No import of frictionless is needed here: only an attribute of the supplied object is read.
    return schema.field_names
39
+
40
def name_to_dtype_map(schema):
    """Map each field name in a Frictionless schema to the data type specified for it.

    Parameters
    ----------
    schema : frictionless.Schema
        The schema to summarize (see load_schema).

    Returns
    -------
    dict
        ``{field name: field type}`` for every field in ``schema.fields``. The types are the type names
        as they appear in the schema. NOTE(review): the original comment describes the result as
        suitable for pandas.DataFrame.astype(); that presumably holds only for type names that pandas
        also recognizes - confirm before relying on it (cast_field_types handles the translation).
    """
    return {field.name: field.type for field in schema.fields}
45
+
46
def name_to_desc_map(schema):
    """Map each field name in a Frictionless schema to the description specified for it.

    Parameters
    ----------
    schema : frictionless.Schema
        The schema to summarize (see load_schema).

    Returns
    -------
    dict
        ``{field name: field description}`` for every field in ``schema.fields``.
    """
    return {field.name: field.description for field in schema.fields}
51
+
52
def cast_field_types(df, schema, forceInteger=False, handleMissingFields="error", verbose=True):
    """Recast each field of a dataframe to the data type specified in a Frictionless schema.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to recast. It is not modified; a copy is returned.
    schema : frictionless.Schema
        The schema describing the fields (see load_schema).
    forceInteger : bool
        Optional. If True, integer fields whose values have a non-zero fractional part are rounded to the
        ones place before conversion. If False (default), such values cause an error.
    handleMissingFields : str
        Optional. What to do when a field in the schema is not present in the dataframe: "ignore" skips
        the field, "add" adds it filled with nulls (see add_missing_fields), and any other value
        (default "error") raises an error.
    verbose : bool
        Optional. If True (default), print informational messages while casting.

    Returns
    -------
    pandas.DataFrame
        A copy of df with the fields recast.

    Raises
    ------
    RuntimeError
        If a field is missing and handleMissingFields is not "ignore" or "add", if an integer field cannot
        be converted, if a geojson field cannot be converted to geometry, or if a boolean field contains
        values that are not among the schema's true/false values.
    """
    import json
    import pandas as pd

    outDF = df.copy()

    for field in schema.fields:
        fieldName = field.name
        fieldType = field.type

        if fieldName not in outDF.columns:
            if handleMissingFields == "ignore":
                print("cast_field_types | WARNING | Skipping field {} which is not present in dataframe".format(fieldName))
                continue
            elif handleMissingFields == "add":
                print("cast_field_types | WARNING | Adding field {} which is not present in dataframe".format(fieldName))
                # add_missing_fields returns a new dataframe, so the result must be kept. It also gives the
                # new field its type, so no further casting is needed for this field.
                outDF = add_missing_fields(outDF, schema, fieldNames=fieldName, verbose=verbose)
                continue
            else:
                message = "Field {} is not present in dataframe. To handle missing fields, see argument handleMissingFields.".format(fieldName)
                print("cast_field_types | ERROR | " + message)
                raise RuntimeError(message)

        if verbose:
            print("cast_field_types | INFO | Casting field {} as type {}.".format(fieldName, fieldType))

        if fieldType in ("int", "integer"):
            # The pandas "int" type does not support null values. If null values are present, the field must
            # be cast as "Int64" instead.
            try:
                # Try to cast the field as an "int". This will fail if nulls are present.
                outDF[fieldName] = outDF[fieldName].astype("int")
            except Exception:
                if verbose:
                    print("cast_field_types | WARNING | Failed conversion of fieldname {} to type 'int'. Trying type 'Int64' instead.".format(fieldName))
                try:
                    # Try to cast as "Int64", which supports nulls. This will fail if the fractional part is non-zero.
                    outDF[fieldName] = outDF[fieldName].astype("Int64")
                except Exception as e:
                    if forceInteger:
                        # The user has allowed coercion of the values to integers, so round the values to the
                        # ones place prior to converting to "Int64".
                        print("cast_field_types | WARNING | Failed conversion of fieldname {} to type 'Int64'. Trying to round first.".format(fieldName))
                        outDF[fieldName] = outDF[fieldName].astype("float").round(0).astype("Int64")
                    else:
                        message = "Unable to coerce value to Int64 type. Ensure that fractional part of values is zero, or set forceInteger=True"
                        print("cast_field_types | ERROR | " + message)
                        raise RuntimeError(message) from e
        elif fieldType == "number":
            outDF[fieldName] = outDF[fieldName].astype("float")
        elif fieldType == "date":
            outDF[fieldName] = pd.to_datetime(outDF[fieldName])
        elif fieldType == "geojson":
            # Imported here so that shapely is only required when a geojson field is actually present.
            import shapely.geometry
            if verbose:
                print(f"cast_field_types | INFO | Fieldname {fieldName} as geojson. Attempting to convert to geometry.")
            try:
                outDF[fieldName] = [shapely.geometry.shape(json.loads(x)) for x in outDF[fieldName]]
            except Exception as e:
                print("cast_field_types | ERROR | Unable to convert to geometry.")
                raise RuntimeError("Unable to convert field {} to geometry.".format(fieldName)) from e
            # Only report success when the conversion actually succeeded.
            if verbose:
                print(f"cast_field_types | INFO | Fieldname {fieldName} cast as geometry.")
        elif fieldType == "boolean":
            if outDF[fieldName].dtype == "bool":
                print("cast_field_types | WARNING | Fieldname {} already cast as boolean type. Skipping casting for this field.".format(fieldName))
                continue
            elif outDF[fieldName].dtype != "string":
                print("cast_field_types | WARNING | Standardizing fieldname {} as a string prior to conversion to boolean.".format(fieldName))
                outDF[fieldName] = outDF[fieldName].astype("string")

            # The field definition in the schema may contain properties trueValues and/or falseValues which
            # specify what values represent True and False, respectively. If trueValues or falseValues are
            # unspecified, Frictionless recognizes the following values by default:
            #   trueValues: ['true', 'True', 'TRUE', '1']
            #   falseValues: ['false', 'False', 'FALSE', '0']
            # Map each of the true and false values to the appropriate Python boolean values.
            truthMap = {}
            for value in field.true_values:
                truthMap[value] = True
            for value in field.false_values:
                truthMap[value] = False

            # Compare the non-null values found in the field to the set of valid true and false values. If
            # any value in the data is NOT among the valid values, throw an error. (A subset test is
            # required here; a proper-superset test would let a mix of valid and invalid values through.)
            hasNulls = bool(outDF[fieldName].isna().any())
            foundValuesSet = set(outDF[fieldName].dropna().unique())
            invalidValues = foundValuesSet - set(truthMap)
            if invalidValues:
                message = "Fieldname {0} contains values that are not recognized as true or false: {1}".format(
                    fieldName, ", ".join(sorted(str(v) for v in invalidValues)))
                print("cast_field_types | ERROR | " + message)
                raise RuntimeError(message)

            # Now that we are confident that all of the values are valid in string form, map them to actual
            # boolean values.
            outDF[fieldName] = outDF[fieldName].map(truthMap)

            # Finally, make the change official by changing the pandas field type. The pandas "bool" type
            # cannot represent nulls (they would silently become True), so fall back to the nullable
            # "boolean" type when nulls are present, mirroring the "int"/"Int64" handling above.
            if hasNulls:
                print("cast_field_types | WARNING | Fieldname {} contains null values, which pandas type 'bool' does not support. Casting as nullable type 'boolean' instead.".format(fieldName))
                outDF[fieldName] = outDF[fieldName].astype("boolean")
            else:
                outDF[fieldName] = outDF[fieldName].astype("bool")
        else:
            # Any other schema type name is handed to pandas unchanged.
            outDF[fieldName] = outDF[fieldName].astype(fieldType)

    return outDF
153
+
154
def add_missing_fields(df, schema, fieldNames=None, verbose=True):
    """Add fields that are defined in a Frictionless schema but missing from a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to extend. It is not modified; a copy is returned.
    schema : frictionless.Schema
        The schema describing the fields (see load_schema).
    fieldNames : str or list of str
        Optional. If None (default), every schema field that is missing from the dataframe is added. If a
        string or a list of strings, only those fields are considered.
    verbose : bool
        Optional. If True (default), print informational messages for each field that is added.

    Returns
    -------
    pandas.DataFrame
        A copy of df with the missing fields added, filled with null values and cast to a type that
        corresponds to the schema type.

    Raises
    ------
    RuntimeError
        If fieldNames is neither None, a string, nor a list.
    """
    import pandas as pd

    outDF = df.copy()

    if fieldNames is None:
        myFieldNames = schema.field_names
    elif isinstance(fieldNames, str):
        myFieldNames = [fieldNames]
    elif isinstance(fieldNames, list):
        myFieldNames = fieldNames
    else:
        message = "If provided, argument fieldNames must be a string containing a single field name or a list of strings"
        print("add_missing_fields | ERROR | " + message)
        raise RuntimeError(message)

    # Iterate through all of the fields defined in the schema
    for field in schema.fields:
        fieldName = field.name
        fieldType = field.type

        # If this field is not in the list of fields to add, skip it and move on to the next
        if fieldName not in myFieldNames:
            continue

        # If the requested field is already present, notify the user and skip it.
        if fieldName in outDF.columns:
            print("add_missing_fields | WARNING | User-specified field {0} is already present in the dataframe. Skipping it.".format(fieldName))
            continue

        if verbose:
            print("add_missing_fields | INFO | Adding missing field {0}, type {1}, filled with null values.".format(fieldName, fieldType))
        outDF[fieldName] = None

        if fieldType in ("int", "integer"):
            if verbose:
                print("add_missing_fields | WARNING | Field {0} specified as type {1} (pandas type 'int'), which does not support null values in pandas. Casting field as pandas type 'Int64' instead.".format(fieldName, fieldType))
            # The new column lives in outDF; the caller's dataframe does not have it and must not be touched.
            outDF[fieldName] = outDF[fieldName].astype("Int64")
        elif fieldType == "number":
            outDF[fieldName] = outDF[fieldName].astype("float")
        elif fieldType == "date":
            # "date" is not a pandas dtype name; convert the same way cast_field_types does.
            outDF[fieldName] = pd.to_datetime(outDF[fieldName])
        else:
            outDF[fieldName] = outDF[fieldName].astype(fieldType)

    return outDF
201
+
202
+
203
+
204
def create_resource(dataPath, title=None, name=None, description=None, resourcePath=None, schemaPath=None, resFormat=None,
                    resProfile=None, resMediaType=None, computeHash=True, computeBytes=True, ignoreSchema=False,
                    writeResource=False, validate=False):
    """Create a Frictionless resource object using sane default values for some attributes. Optionally, write the
    resource file to disk and validate the resource file, schema, and data.

    Parameters
    ----------
    dataPath : str
        The path to the data file that the resource file will describe, as you want it to appear in the resource file.
        Typically the data lives in the same directory as the resource file, in which case dataPath is simply the data file name.
        Could instead be a relative path (RELATIVE TO THE LOCATION OF THE RESOURCE FILE) or a URL. It may NOT be an absolute path.
    title : str
        Optional. The value for the title attribute in the resource file. A human-readable title that describes the data. If
        unspecified, defaults to a title derived from the data file name.
    name : str
        Optional. The value for the name attribute in the resource file. A unique, machine-readable string to refer to the resource.
        Must be lowercase and must not contain spaces. If unspecified, defaults to a name derived from the data file name.
    description : str
        Optional. The value for the description attribute in the resource file. A human-readable detailed description of the data and
        any interpretation or usage guidelines as required. If unspecified, defaults to a generic description attributing
        the data to MORPC.
    resourcePath : str
        Optional. The path of the resource file on disk. Can be an absolute path or a path RELATIVE TO THE CURRENT WORKING
        DIRECTORY of the script. The values for dataPath and schemaPath typically should be specified relative to this location.
        Required when writeResource or validate is True. When provided, it is also used to locate the data file for computeHash
        and computeBytes.
    schemaPath : str
        Optional. The path to the schema file that describes the data. Typically the schema lives in the same directory as the
        resource file, in which case this is just the schema file name. Could instead be a relative path (RELATIVE TO THE LOCATION OF THE
        RESOURCE file) or a URL. It may NOT be an absolute path. If unspecified, it will be assumed that the schema is in the same
        directory as the data and that it has the same basename as the data file but with the extension replaced by ".schema.yaml". If
        ignoreSchema is True, the schema will be omitted from the resource, regardless of whether a path is specified.
    resFormat : str
        Optional. The value for the format attribute in the resource file. The file type in which the data is formatted (e.g. csv, xlsx,
        json). If unspecified, will attempt to infer this from the extension of the data file. See Frictionless documentation for
        supported formats and EXTENSION_MAP in the function code for the subset of formats that can be inferred.
    resProfile : str
        Optional. The value for the profile attribute in the resource file. If unspecified, defaults to "data-resource". Typically you will
        not have to change this. See Frictionless documentation for other supported profiles.
    resMediaType : str
        Optional. The value for the mediatype attribute in the resource file. The MIME type that best describes the data file. If
        unspecified, will attempt to infer this from the extension of the data file. If you need to specify it manually, search the
        internet for the appropriate MIME type. See EXTENSION_MAP in the function code for the subset of mediatypes that can be inferred.
    computeHash : bool
        Optional. If True, compute the MD5 hash for the data file and include it in the hash attribute in the resource. Defaults to True.
        If resourcePath is not specified, assume the data path is relative to the current working directory.
    computeBytes : bool
        Optional. If True, compute the file size for the data file and include it in the bytes attribute in the resource. Defaults to
        True. If resourcePath is not specified, assume the data path is relative to the current working directory.
    ignoreSchema : bool
        Optional. If True, no schema information will be included in the resource even if a path is provided.
    writeResource : bool
        Optional. If True, write the resource file to disk at resourcePath. Defaults to False. An error is raised if resourcePath
        is not provided.
    validate : bool
        Optional. If True, the resource file, schema file, and data file will be validated. The resource file at resourcePath is
        what gets validated, so it must exist on disk (normally by also setting writeResource to True).

    Returns
    -------
    resource : frictionless.resources.table.TableResource
        A Frictionless TableResource object which describes the data

    Raises
    ------
    RuntimeError
        If the format or media type is unspecified and cannot be inferred, if the data file cannot be located when resourcePath
        is not given, or if writeResource or validate is requested without a resourcePath.
    """
    import os
    import re
    import frictionless

    # NOTE(review): the registered MIME type for GeoPackage appears to be "application/geopackage+sqlite3"; the value
    # below is kept unchanged because existing resource files may depend on it. Confirm before changing.
    EXTENSION_MAP = {
        ".gpkg": {
            "format":"gpkg",
            "mediatype":"geopackage+sqlite3"
        },
        ".csv": {
            "format":"csv",
            "mediatype":"text/csv"
        },
        ".xls": {
            "format":"xls",
            "mediatype":"application/vnd.ms-excel"
        },
        ".xlsx": {
            "format":"xlsx",
            "mediatype":"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        },
        ".dbf": {
            "format":"dbf",
            "mediatype":"application/dbf"
        }
    }

    dataFilePath = os.path.normpath(dataPath)
    dataFileName, dataFileExtension = os.path.splitext(os.path.basename(dataFilePath))

    # True when dataFilePath includes a directory component, i.e. it is not simply a filename
    dataPathHasDirectory = (os.path.basename(dataFilePath) != os.path.normpath(dataFilePath))

    if dataPathHasDirectory:
        print("morpc.create_resource | WARNING | You seem to have specified a data path that is not simply a file name. This implies that the data is located in a different directory than the resource file. Typically the data is located in the same directory as the resource file and the path is simply the filename.")

    resourceFilePath = None
    if resourcePath is not None:
        if not writeResource:
            # Warn the user if they specified a resource file location but did not enable writeResource
            print("morpc.create_resource | WARNING | You specified a path for the resource file, however writeResource is not set to True. Resource file will not be written to disk.")

        # If the user has specified a path to the resource file, we'll use it without modification. Warn the user if the choice is unusual.
        if dataPathHasDirectory:
            if os.path.dirname(os.path.abspath(resourcePath)) != os.path.dirname(os.path.abspath(dataFilePath)):
                # If the absolute path to the resource file and the absolute path to the data put them in different directories
                print("morpc.create_resource | WARNING | You seem to have specified a path for the resource file that is in a different directory than the data. Typically the data is located in the same directory as the resource file and the path is simply the filename.")
        resourceFilePath = os.path.normpath(resourcePath)

    if resFormat is not None:
        resourceFormat = resFormat
    elif dataFileExtension.lower() in EXTENSION_MAP:
        resourceFormat = EXTENSION_MAP[dataFileExtension.lower()]["format"]
        print("morpc.create_resource | INFO | Format not specified. Using format derived from data file extension: {}".format(resourceFormat))
    else:
        message = "Format not specified and could not be determined from data file extension."
        print("morpc.create_resource | ERROR | " + message)
        raise RuntimeError(message)

    if not ignoreSchema:
        # If ignoreSchema is False, determine the schema file path
        if schemaPath is not None:
            # If the user has specified a path to the schema file, we'll use it without modification. Warn the user if the choice is unusual.
            if dataPathHasDirectory:
                if os.path.dirname(os.path.abspath(schemaPath)) != os.path.dirname(os.path.abspath(dataFilePath)):
                    # If the absolute path to the schema file and the absolute path to the data put them in different directories
                    print("morpc.create_resource | WARNING | You seem to have specified a path for the schema file that is in a different directory than the data. Typically the schema is located in the same directory as the data.")
            schemaFilePath = os.path.normpath(schemaPath)
        else:
            # If the user has not specified a path to the schema file, we'll assume that it should go in the same directory as the
            # data. In that case, derive the path from the data path. Only the trailing extension is swapped; a plain
            # str.replace() would also alter matching text elsewhere in the path and misbehave when there is no extension.
            schemaFilePath = os.path.splitext(dataFilePath)[0] + ".schema.yaml"
            print("morpc.create_resource | INFO | Schema path not specified. Using path derived from data file path: {}".format(schemaFilePath))

    if title is not None:
        resourceTitle = title
    else:
        resourceTitle = dataFileName
        print("morpc.create_resource | INFO | Title not specified. Using placeholder value derived from data filename: {}".format(resourceTitle))

    if name is not None:
        resourceName = name
    else:
        # Collapse each run of non-word characters into a single hyphen and lowercase the result
        resourceName = re.sub(r"\W+", "-", dataFileName).lower()
        print("morpc.create_resource | INFO | Name not specified. Using placeholder value derived from data filename: {}".format(resourceName))

    if description is not None:
        resourceDescription = description
    else:
        resourceDescription = "This dataset was produced by MORPC. For more information, please contact dataandmaps@morpc.org."
        print("morpc.create_resource | INFO | Description not specified. Using boilerplate placeholder value: {}".format(resourceDescription))

    if resMediaType is not None:
        resourceMediaType = resMediaType
    elif dataFileExtension.lower() in EXTENSION_MAP:
        resourceMediaType = EXTENSION_MAP[dataFileExtension.lower()]["mediatype"]
    else:
        message = "Media type not specified and could not be determined from data file extension."
        print("morpc.create_resource | ERROR | " + message)
        raise RuntimeError(message)

    if resProfile is not None:
        resourceProfile = resProfile
    else:
        resourceProfile = "data-resource"

    resource = frictionless.Resource.from_descriptor({
        "name": resourceName,
        "title": resourceTitle,
        "description": resourceDescription,
        "profile": resourceProfile,
        "path": dataFilePath,
        "format": resourceFormat,
        "mediatype": resourceMediaType,
    })

    if not ignoreSchema:
        resource.schema = schemaFilePath

    unlocatedDataWarningIssued = False
    if computeHash:
        # If the data path is relative, we need to know the resource file path
        if resourceFilePath is not None:
            resource.hash = md5(os.path.join(os.path.dirname(resourceFilePath), dataFilePath))
        else:
            print("morpc.create_resource | WARNING | Data path is specified relative to resource file, however no resource file path was specified. Assuming data path is relative to current working directory.")
            unlocatedDataWarningIssued = True
            try:
                resource.hash = md5(dataFilePath)
            except OSError as e:
                message = "Unable to compute MD5 hash. Data file could not be located."
                print("morpc.create_resource | ERROR | " + message)
                raise RuntimeError(message) from e

    if computeBytes:
        # If the data path is relative, we need to know the resource file path
        if resourceFilePath is not None:
            resource.bytes = os.path.getsize(os.path.join(os.path.dirname(resourceFilePath), dataFilePath))
        else:
            if not unlocatedDataWarningIssued:
                print("morpc.create_resource | WARNING | Data path is specified relative to resource file, however no resource file path was specified. Assuming data path is relative to current working directory.")
            try:
                # Record the file size here (not the hash, which is handled above).
                resource.bytes = os.path.getsize(dataFilePath)
            except OSError as e:
                message = "Unable to compute file size (bytes). Data file could not be located."
                print("morpc.create_resource | ERROR | " + message)
                raise RuntimeError(message) from e

    if writeResource:
        if resourceFilePath is not None:
            print("morpc.create_resource | INFO | Writing Frictionless Resource file to {}".format(resourceFilePath))
            write_resource(resource, resourceFilePath)
        else:
            message = "Unable to write resource. No resource file path specified."
            print("morpc.create_resource | ERROR | " + message)
            raise RuntimeError(message)

    if validate:
        if resourceFilePath is not None:
            print("morpc.create_resource | INFO | Validating resource on disk.")
            validate_resource(resourceFilePath)
        else:
            message = "Unable to validate resource. No resource file path specified."
            print("morpc.create_resource | ERROR | " + message)
            raise RuntimeError(message)

    return resource
427
+
428
+
429
+
430
def write_resource(resource, resourcePath):
    """Given a Frictionless resource object and a path to a target file, this function writes the resource to disk in YAML
    format. It is a wrapper for frictionless.Resource.to_yaml() that is necessary when the paths to the data and/or schema
    files are specified as relative paths.

    The current working directory is temporarily changed to the directory that will contain the resource file and is
    always restored before returning, whether or not the write succeeds.

    Parameters
    ----------
    resource : frictionless.resources.table.TableResource
        A Frictionless TableResource object which describes the data
    resourcePath : str
        The path to the Frictionless Resource file that describes the data.

    Raises
    ------
    RuntimeError
        If changing directory or writing the file fails for any reason.
    """
    import os

    cwd = os.getcwd()
    # os.chdir("") raises, so stay in the current directory when resourcePath is a bare file name.
    targetDir = os.path.dirname(resourcePath) or os.curdir

    try:
        os.chdir(targetDir)
        resource.to_yaml(os.path.basename(resourcePath))
    except Exception as e:
        print("ERROR: An unhandled error occurred while trying to write the Frictionless resource: {}".format(e))
        raise RuntimeError("Unable to write the Frictionless resource: {}".format(e)) from e
    finally:
        os.chdir(cwd)
457
+
458
+
459
def validate_resource(resourcePath, verbose=True):
    """Validate a Frictionless Resource file on disk, including the data and schema it refers to.

    The current working directory is temporarily changed to the directory containing the resource file (so that
    relative data and schema paths resolve) and is always restored before returning.

    Parameters
    ----------
    resourcePath : str
        The path to the Frictionless Resource file that describes the data.
    verbose : bool
        Optional. If True (default), print progress messages and, when the resource is not valid, the validation
        results.

    Returns
    -------
    bool
        True if the validation results report the resource as valid, otherwise False.

    Raises
    ------
    RuntimeError
        If changing directory, loading the resource, or running the validation fails for any reason.
    """
    import os
    import frictionless

    cwd = os.getcwd()
    # os.chdir("") raises, so stay in the current directory when resourcePath is a bare file name.
    targetDir = os.path.dirname(resourcePath) or os.curdir

    try:
        os.chdir(targetDir)

        if verbose:
            print("morpc.validate_resource | INFO | Validating resource on disk (including data and schema). This may take some time.")
        resourceOnDisk = frictionless.Resource(os.path.basename(resourcePath))
        results = resourceOnDisk.validate()
    except Exception as e:
        print("morpc.validate_resource | ERROR | An unhandled error occurred while trying to validate the Frictionless resource: {}".format(e))
        raise RuntimeError("Unable to validate the Frictionless resource: {}".format(e)) from e
    finally:
        os.chdir(cwd)

    if results.valid == True:
        if verbose:
            print("morpc.validate_resource | INFO | Resource is valid")
        return True

    if verbose:
        print("morpc.validate_resource | ERROR | Resource is NOT valid. Errors follow.")
        print(results)
    return False
488
+
489
def load_data(resourcePath, archiveDir=None, validate=False, verbose=True):
    """Often we want to make a copy of some input data and work with the copy, for example to protect
    the original data or to create an archival copy of it so that we can replicate the process later.
    The `load_data()` function simplifies the process of reading the data and
    (optionally) validating the data and/or making an archival copy.

    Parameters
    ----------
    resourcePath : str
        The path to the Frictionless Resource file that describes the data.
    archiveDir : str
        Optional. The path to the directory where a copy of a data should be archived. If this is specified,
        the Resource file, schema file, and data file will be archived in this location.
    validate : bool
        Optional. If True, the resource file, schema file, and data file will be validated. If archiveDir is
        specified, the copies of the files will be validated. If not, the original files will be validated.
    verbose : bool
        Optional. Passed to cast_field_types to control its informational messages. Defaults to True.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        A pandas DataFrame constructed from the data file referenced by the resource, with fields cast
        according to the resource's schema (see cast_field_types)
    resource : frictionless.resources.table.TableResource
        A Frictionless TableResource object which describes the data
    schema : frictionless.schema.schema.Schema
        A Frictionless Schema object which describes the data

    Raises
    ------
    RuntimeError
        If the files cannot be copied to archiveDir, if validation fails, or if the data file extension is
        not one of the supported ones (.csv, .xlsx).
    """
    import json
    import os
    import shutil
    import pandas as pd

    myResourcePath = os.path.normpath(resourcePath)

    print("morpc.load_data | INFO | Loading Frictionless Resource file at location {}".format(myResourcePath))

    resource = load_resource(myResourcePath)

    sourceDir = os.path.dirname(myResourcePath)
    resourceFilename = os.path.basename(myResourcePath)
    # Lowercase so that e.g. ".CSV" is recognized too
    dataFileExtension = os.path.splitext(resource.path)[1].lower()

    # Surely there is a more convenient way to get the schema path from the Resource object?
    # Serialize the descriptor once. The schema entry is only usable as a file path when it is a string; it may
    # be absent, or (presumably, when the schema is embedded in the resource) not a string - in which case there
    # is no separate schema file to copy or report.
    schemaRef = json.loads(resource.to_json()).get("schema")
    schemaRelPath = schemaRef if isinstance(schemaRef, str) else None

    if archiveDir is not None:
        targetDir = archiveDir
        try:
            print("morpc.load_data | INFO | Copying data, resource file, and schema to directory {}".format(archiveDir))

            shutil.copyfile(os.path.join(sourceDir, resourceFilename), os.path.join(archiveDir, resourceFilename))
            shutil.copyfile(os.path.join(sourceDir, resource.path), os.path.join(archiveDir, resource.path))
            if schemaRelPath is not None:
                shutil.copyfile(os.path.join(sourceDir, schemaRelPath), os.path.join(archiveDir, schemaRelPath))
        except Exception as e:
            print("morpc.load_data | ERROR | Unhandled exception when trying to copy data and associated Frictionless files: {}".format(e))
            raise RuntimeError("Unable to copy data and associated Frictionless files: {}".format(e)) from e
    else:
        targetDir = sourceDir
        print("morpc.load_data | INFO | Loading data, resource file, and schema from their source locations")

    # From here on, work with the archived copies if an archive was made, otherwise with the originals
    targetResource = os.path.join(targetDir, resourceFilename)
    targetData = os.path.join(targetDir, resource.path)
    targetSchema = os.path.join(targetDir, schemaRelPath) if schemaRelPath is not None else None

    print("morpc.load_data | INFO | --> Data file: {}".format(targetData))
    print("morpc.load_data | INFO | --> Resource file: {}".format(targetResource))
    print("morpc.load_data | INFO | --> Schema file: {}".format(targetSchema))

    if validate:
        print("morpc.load_data | INFO | Validating resource including data and schema.")
        resourceValid = validate_resource(targetResource)
        if not resourceValid:
            message = "Validation failed. Errors should be described above."
            print("morpc.load_data | ERROR | " + message)
            raise RuntimeError(message)

    print("morpc.load_data | INFO | Loading data.")
    if dataFileExtension == ".csv":
        # Read everything as strings; the schema-driven cast below assigns the real types
        df = pd.read_csv(targetData, dtype="string")
    elif dataFileExtension == ".xlsx":
        df = pd.read_excel(targetData)
    else:
        message = "Unknown data file extension: {}".format(dataFileExtension)
        print("morpc.load_data | ERROR | " + message)
        raise RuntimeError(message)

    df = cast_field_types(df, resource.schema, verbose=verbose)

    return df, resource, resource.schema
579
+
580
+
581
def schema_from_avro(path):
    """Load an Avro schema document and reformat it as a Frictionless Schema object in memory.

    For each Avro field, the name, type, and doc entries (when present) become the name, type, and description
    of the Frictionless field. Avro "int" and "long" become "integer", "float" and "double" become "number", and
    a nullable union such as ["null", "int"] is treated as its single non-null member. Any other type is passed
    through unchanged and left for the Frictionless validation below to accept or reject.

    WARNING: This function has not been extensively tested. Be sure to validate the resulting Frictionless schema.

    Parameters
    ----------
    path : str
        The path to a schema document in Avro format.

    Returns
    -------
    frictionless.Schema
        The Frictionless schema built from the Avro field definitions.

    Raises
    ------
    RuntimeError
        If the generated schema descriptor is reported as not valid by Frictionless.
    """
    import os
    import frictionless

    # Avro numeric primitives and the Frictionless types that correspond to them
    AVRO_TYPE_MAP = {
        "int": "integer",
        "long": "integer",
        "float": "number",
        "double": "number",
    }

    avroSchema = load_avro_schema(os.path.normpath(path))

    fieldList = []
    for field in avroSchema["fields"]:
        thisField = {}
        if "name" in field:
            thisField["name"] = field["name"]
        if "type" in field:
            avroType = field["type"]
            if isinstance(avroType, list):
                # Avro expresses an optional value as a union with "null"; use the single remaining member
                nonNullTypes = [t for t in avroType if t != "null"]
                if len(nonNullTypes) == 1:
                    avroType = nonNullTypes[0]
            if isinstance(avroType, str):
                avroType = AVRO_TYPE_MAP.get(avroType, avroType)
            thisField["type"] = avroType
        if "doc" in field:
            thisField["description"] = field["doc"]
        fieldList.append(thisField)

    frictionlessSchemaDescriptor = {
        "fields": fieldList
    }

    results = frictionless.Schema.validate_descriptor(frictionlessSchemaDescriptor)
    if results.valid == True:
        print("Schema is valid")
    else:
        print("ERROR: Schema is NOT valid. Errors follow.")
        print(results)
        raise RuntimeError("Schema generated from Avro document is not valid")

    return frictionless.Schema.from_descriptor(frictionlessSchemaDescriptor)