nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +22 -2
- nmdc_runtime/api/core/idgen.py +36 -6
- nmdc_runtime/api/db/mongo.py +0 -12
- nmdc_runtime/api/endpoints/find.py +65 -225
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
- nmdc_runtime/api/endpoints/objects.py +4 -11
- nmdc_runtime/api/endpoints/operations.py +0 -27
- nmdc_runtime/api/endpoints/queries.py +22 -0
- nmdc_runtime/api/endpoints/sites.py +0 -24
- nmdc_runtime/api/endpoints/util.py +57 -35
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +84 -60
- nmdc_runtime/api/models/util.py +12 -5
- nmdc_runtime/api/openapi.py +116 -180
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/minter/adapters/repository.py +21 -0
- nmdc_runtime/minter/domain/model.py +20 -0
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +632 -11
- nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
- nmdc_runtime/site/graphs.py +7 -0
- nmdc_runtime/site/ops.py +92 -34
- nmdc_runtime/site/repository.py +2 -0
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +87 -1
- nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
- nmdc_runtime/api/endpoints/ids.py +0 -192
- nmdc_runtime/client/__init__.py +0 -0
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/__init__.py +0 -0
- nmdc_runtime/core/db/Database.py +0 -13
- nmdc_runtime/core/db/__init__.py +0 -0
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/__init__.py +0 -0
- nmdc_runtime/domain/users/__init__.py +0 -0
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/models/user.py +0 -1
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -33
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -825
- nmdc_runtime/lib/nmdc_etl_class.py +0 -396
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/__init__.py +0 -0
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
- nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,1117 +0,0 @@
|
|
|
1
|
-
## author: Bill Duncan
|
|
2
|
-
## summary: Contains methods for transforming data in NMDC ETL pipeline.
|
|
3
|
-
|
|
4
|
-
import io
|
|
5
|
-
import json
|
|
6
|
-
import pkgutil
|
|
7
|
-
from collections import namedtuple
|
|
8
|
-
from datetime import datetime
|
|
9
|
-
|
|
10
|
-
import jsonasobj
|
|
11
|
-
|
|
12
|
-
## system level modules
|
|
13
|
-
import pandas as pds
|
|
14
|
-
|
|
15
|
-
## add all classes for local nmdc.py
|
|
16
|
-
## this is the file of python classes generated by linkml
|
|
17
|
-
from nmdc_schema import nmdc
|
|
18
|
-
|
|
19
|
-
import nmdc_runtime.lib.nmdc_dataframes as nmdc_dataframes
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def has_raw_value(obj, attribute: str) -> bool:
    """
    Helper function that reports whether an attribute of obj carries a
    non-null has_raw_value.
    E.g.: "lat_lon": {"has_raw_value": "-33.460524 150.168149"}

    Args:
        obj (dict or object): the container holding the attribute; dicts are
            read by key, any other object by attribute access.
        attribute (string): the name of the attribute in obj to check.

    Returns:
        boolean: True if the attribute's value has a has_raw_value entry
            (dict key or instance attribute) that is not null. False when the
            attribute is missing, is None, or has no has_raw_value at all.
    """
    ## dicts keep their data under keys, everything else under attributes;
    ## a missing key/attribute is treated the same as an unset one
    if isinstance(obj, dict):
        val = obj.get(attribute)
    else:
        val = getattr(obj, attribute, None)

    if val is None:  # check that value exists
        return False

    if isinstance(val, dict):
        raw = val.get("has_raw_value")
    else:
        ## plain values (str, int, ...) have no __dict__; vars() would raise
        ## TypeError on them, so fall back to an empty mapping
        raw = getattr(val, "__dict__", {}).get("has_raw_value")

    ## pds.notnull(None) is False, so a missing has_raw_value also yields False
    return pds.notnull(raw)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def record_has_field(nmdc_record: namedtuple, attribute_field: str) -> bool:
    """
    Returns True/False if a field is in nmdc_record (a namedtuple).

    Args:
        nmdc_record (namedtuple): the nmdc record
        attribute_field (string): the name of the attribute, optionally
            followed by a data type (e.g., "file_size_bytes, int")

    Returns:
        bool: True if the record has the field. False if it does not, or if
            nmdc_record is None / not a namedtuple-like record.
    """
    ## anything without _fields (None, NaN, scalars) cannot hold a field
    fields = getattr(nmdc_record, "_fields", None)
    if fields is None:
        return False

    ## only the part before the first comma names the field;
    ## e.g., "file_size_bytes, int" -> "file_size_bytes"
    field = attribute_field.partition(",")[0].strip()

    return field in fields
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def coerce_value(value, dtype: str):
    """
    Coerces value into the type specified by dtype and returns the coerced value.

    For the builtin dtypes "int", "float" and "bool" the value is cast
    directly. String values are first read as a Python literal when possible
    (so "10.5" cast to "int" still yields 10); strings that are not valid
    literals (e.g. "012") are handed to the cast as-is.

    Args:
        value: the value to coerce
        dtype (str): the data type to coerce/cast value into

    Returns:
        the value cast into the data type specified by dtype;
        None if value is None

    Raises:
        ValueError / TypeError: if the value cannot be cast to the dtype
    """
    import ast  # local import: ast is not among this module's file-level imports

    if value is None:
        return None

    if dtype == "str":
        return f"{value}"

    casters = {"int": int, "float": float, "bool": bool}
    caster = casters.get(dtype)

    if caster is None:
        ## SECURITY NOTE(review): for any other dtype name the legacy eval is
        ## kept so existing mappings keep working; it evaluates record data as
        ## Python source and must not be fed untrusted input
        return eval(f"""{dtype}({value})""")  # convert value to specified datatype

    if isinstance(value, str):
        try:
            ## literal_eval only accepts literals, so it cannot run code
            value = ast.literal_eval(value.strip())
        except (ValueError, SyntaxError):
            pass  # not a literal; let the cast parse (or reject) the raw string

    return caster(value)
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def get_dtype_from_attribute_field(attribute_field) -> str:
    """
    Return data type part of attribute_field (e.g. 'file_size, int').
    If no dtype is given, "str" is returned.

    Args:
        attribute_field: the attribute field to get the data type from;
            either a string such as "file_size_bytes, int" or a dict that
            wraps such a string under a "$const" or "$field" key

    Returns:
        str: the string representation of the attribute field's data type
    """
    if isinstance(attribute_field, dict):
        ## unwrap {"$const": ...} / {"$field": ...}; "$const" wins if both exist
        for wrapper_key in ("$const", "$field"):
            if wrapper_key in attribute_field:
                ## NB: RECURSIVE CALL
                return get_dtype_from_attribute_field(attribute_field[wrapper_key])
        return "str"  # a dict without a wrapped spec defaults to string

    if "," in attribute_field:  # e.g., "file_size_bytes, int"
        return attribute_field.split(",")[1].strip()

    return "str"  # default to string datatype
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
def get_field_and_dtype_from_attribute_field(attribute_field) -> tuple:
    """
    Returns both the field and data type parts of attribute_field (e.g. 'file_size, int').
    If no dtype is given, a dtype of "str" is returned.

    Args:
        attribute_field: the name of the attribute field; either a string
            such as "file_size_bytes, int" or a dict that wraps such a string
            under a "$const" or "$field" key

    Returns:
        tuple: contains the (field, data type)
    """
    if isinstance(attribute_field, dict):
        ## unwrap {"$const": ...} / {"$field": ...}; "$const" wins if both exist
        for wrapper_key in ("$const", "$field"):
            if wrapper_key in attribute_field:
                ## NB: RECURSIVE CALL
                return get_field_and_dtype_from_attribute_field(
                    attribute_field[wrapper_key]
                )
        ## no wrapped spec: hand the dict itself back as the "field"
        return attribute_field, "str"

    if "," in attribute_field:  # e.g., "file_size_bytes, int"
        ## take the first two segments; anything after a second comma is
        ## ignored instead of raising on tuple unpacking, matching how
        ## get_dtype_from_attribute_field reads the dtype
        parts = attribute_field.split(",")
        return parts[0].strip(), parts[1].strip()

    return attribute_field.strip(), "str"  # default to string datatype
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
def get_record_attr(record: namedtuple, attribute_field, return_field_if_none=True):
    """
    Returns the value specified by attribute_field in the record.
    E.g., get_record_attr(Record(id='gold:001', name='foo'), 'id') would return 'gold:001'.

    In some cases, the attribute_field may be used for a constant value (e.g., unit: meter).
    In these cases return_field_if_none (default True) specifies whether to return the
    constant value (e.g., return 'meter' instead of None).

    Args:
        record (namedtuple): the record containing the data
        attribute_field: the name of the field that contains the data,
            optionally followed by a data type (e.g., "file_size_bytes, int"),
            or a dict such as {"$const": "meter"}
        return_field_if_none (bool, optional): Defaults to True.

    Returns:
        the value of record's field coerced to the requested data type,
        or None if no non-null value was found
    """
    ## an explicit constant never consults the record
    if isinstance(attribute_field, dict) and "$const" in attribute_field:
        field, dtype = get_field_and_dtype_from_attribute_field(
            attribute_field["$const"]
        )
        return coerce_value(field, dtype)

    ## get field name and data type
    field, dtype = get_field_and_dtype_from_attribute_field(attribute_field)

    ## get value from record, falling back to the field name itself
    ## (i.e. an implicit constant) only when the caller allows it
    if record_has_field(record, field):
        val = getattr(record, field)
    elif return_field_if_none:
        val = field
    else:
        val = None

    return coerce_value(val, dtype) if pds.notnull(val) else None
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
def make_constructor_args_from_record(
    constructor_map: dict, nmdc_record: namedtuple
) -> dict:
    """
    Returns the constructor arguments as a dict that are needed to build an object.
    E.g., If the constructor map specifies that a Study object requires an id and name in
    the constructor, this function would return {id: gold:001, name: foo}.

    Args:
        constructor_map (dict): the arguments specified to build an object
        nmdc_record (namedtuple): holds the data that is used to build an object

    Returns:
        dict: the constructor arguments needed to build the object
    """
    constructor_dict = {}
    for key, field in constructor_map.items():
        if isinstance(field, dict) and field:
            ## a non-empty dict describes a nested value, e.g.,
            ## {'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon', '$class_type': 'GeolocationValue'}
            record_dict = make_record_dict(nmdc_record, field)
            if "$class_type" in field:
                ## instantiate the named class from the collected values
                class_type = make_nmdc_class(field["$class_type"])
                constructor_dict[key] = class_type(**record_dict)
            else:
                constructor_dict[key] = record_dict
        elif isinstance(field, list) and field:
            ## a non-empty list yields one record value per listed field
            constructor_dict[key] = [get_record_attr(nmdc_record, f) for f in field]
        else:
            ## everything else is looked up as a single field
            constructor_dict[key] = get_record_attr(nmdc_record, field)

    return constructor_dict
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def make_dict_from_nmdc_obj(nmdc_obj) -> dict:
    """
    Returns a dict based on the nmdc_obj.

    Values that carry no information (None, null, empty strings/lists/dicts,
    and dicts without a usable id or has_raw_value) are left out.

    Args:
        nmdc_obj: an object containing nmdc data, or a list of such objects

    Returns:
        dict: representation of the object (a list of them if nmdc_obj is a
            list); inputs that do not expose _as_dict are returned unchanged
    """

    def is_value(variable):
        """
        Checks if variable has a value. Returns True if:
        - variable is not None and
        - has length > 0 if variable is a list, dict or str and
        - has an id and/or has raw value key if variable is a dict
        """
        if variable is None:
            return False

        if isinstance(variable, (list, dict, str)):
            ## check for zero len variable
            if len(variable) == 0:
                return False
        elif pds.isnull(variable):
            return False  ## check for null

        ## if variable is a dict, make sure it has an id or raw value
        if isinstance(variable, dict):
            if "id" in variable:
                return is_value(variable["id"])  # check if id has a value
            if "has_raw_value" in variable:
                return is_value(variable["has_raw_value"])
            return False  # there wasn't an id or has_raw_value

        return True  # if it makes it here, all good

    def make_dict(obj):
        """
        Transforms an nmdc object into a dict
        """
        if obj is None:
            return None  # make sure the object has a value

        ## check if obj can convert to dict
        if not hasattr(obj, "_as_dict"):
            return obj

        obj_dict = {}
        for key, val in jsonasobj.as_dict(obj).items():
            ## include only valid values in lists and dicts
            if isinstance(val, dict):
                val = {k: v for k, v in val.items() if is_value(v)}
            elif isinstance(val, list):
                val = [element for element in val if is_value(element)]

            ## drop the key if filtering left {} or [] (or it never had a value)
            if is_value(val):
                obj_dict[key] = val

        return obj_dict

    if isinstance(nmdc_obj, list):
        return [make_dict(o) for o in nmdc_obj if is_value(o)]

    return make_dict(nmdc_obj)
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
def set_nmdc_object(
    nmdc_obj, nmdc_record: namedtuple, attribute_map: dict, attribute_field
):
    """
    Sets the properties of nmdc_obj using the values stored in the nmdc_record.
    The updated nmdc_obj is returned.

    Args:
        nmdc_obj: the nmdc object that will be modified
        nmdc_record (namedtuple): the record whose data will be used to set the values of the nmdc_obj
        attribute_map (dict): a dict/map based on the sssom file used to update the object's field
        attribute_field: the nmdc_obj's field to be set

    Returns:
        updated nmdc_obj
    """
    ## by default property values are represented as dicts
    ## the exception is when a value is created using '$class_type'
    ## e.g. {'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon', '$class_type': 'GeolocationValue'}
    represent_as_dict = True

    if isinstance(attribute_field, dict):
        ## e.g. part_of: gold_study_id; only the first key/value pair is used
        field, val = list(attribute_field.items())[0]
        if isinstance(val, list):
            ## e.g. has_output: ["data_object_id, str"]
            av = make_object_from_list(nmdc_record, val)
        elif isinstance(val, dict):
            ## e.g. has_output: {id: gold:0001, name: 'foo', $class_type: Study}
            ## a $class_type means the value stays an instantiated object
            if "$class_type" in val:
                represent_as_dict = False
            av = make_object_from_dict(nmdc_record, val)
        elif isinstance(val, str):
            ## e.g. has_output: "data_object_id, str" (not a list)
            av = get_record_attr(nmdc_record, val)
        else:
            ## val names the field in the record
            av = make_attribute_value_from_record(nmdc_record, val)
    elif isinstance(attribute_field, str):
        ## e.g., "file_size_bytes, int" -> the attribute is "file_size_bytes"
        field = attribute_field.partition(",")[0].strip()
        av = get_record_attr(nmdc_record, attribute_field)
    else:
        field = attribute_field
        av = make_attribute_value_from_record(nmdc_record, field)

    ## convert attribute value into a dict
    if represent_as_dict:
        av = make_dict_from_nmdc_obj(av)

    ## use the sssom-mapped attribute name when one exists
    target = attribute_map.get(field, field) if attribute_map else field
    setattr(nmdc_obj, target, av)

    return nmdc_obj
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
def make_attribute_value_from_record(nmdc_record: namedtuple, field, object_type=""):
    """
    Creates an attribute value object linked to the value in the nmdc record's field.

    Args:
        nmdc_record (namedtuple): holds the data
        field: the field to get the data from
        object_type (str, optional): accepted for backward compatibility;
            defaults to "". It is not forwarded, because make_attribute_value
            takes the value as its only argument (passing it on raised a
            TypeError on every call).

    Returns:
        an attribute value object with the has_raw_value property set to value in field
    """
    val = get_record_attr(nmdc_record, field)
    return make_attribute_value(val)
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
def make_attribute_map(sssom_map_file: str = "") -> dict:
    """
    Returns a dict based on the SSSOM mapping.
    By default the SSSOM mapping comes from the nmdc-schema package,
    but an optional path to an SSSOM formatted tsv may be used.

    Args:
        sssom_map_file (str): an optional path to the sssom file; when empty
            (or None) the mapping packaged with nmdc_schema is used

    Returns:
        dict: map relating each subject_label to its object_label; rows read
            from sssom_map_file are limited to skos:exactMatch
    """
    if sssom_map_file:
        ## load sssom mapping file and subset to skos:exactMatch
        mapping_df = nmdc_dataframes.make_dataframe(
            sssom_map_file, comment_str="#"
        ).query("predicate_id == 'skos:exactMatch'")
    else:
        ## NOTE(review): unlike the branch above, the packaged mapping is not
        ## narrowed to skos:exactMatch rows -- confirm whether that is intended
        sssom = io.BytesIO(pkgutil.get_data("nmdc_schema", "gold-to-mixs.sssom.tsv"))
        mapping_df = pds.read_csv(sssom, sep="\t", comment="#", encoding="utf-8")

    ## build attribute dict; a later row wins if a subject_label repeats
    return dict(zip(mapping_df["subject_label"], mapping_df["object_label"]))
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
def make_attribute_value(val):
    """
    Builds an nmdc AttributeValue whose has_raw_value is val.

    Args:
        val: the value to store as has_raw_value

    Returns:
        a new AttributeValue; has_raw_value is only assigned when val is not null
    """
    attribute_value = nmdc.AttributeValue()

    ## a null val leaves the freshly built object untouched
    if not pds.notnull(val):
        return attribute_value

    attribute_value.has_raw_value = val
    return attribute_value
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
def make_nmdc_class(class_type):
    """
    Returns the NMDC class from the NMDC module as specified by class_type.

    Args:
        class_type: the type of class to return; either the class name as a
            string (e.g., '$class_type': 'GeolocationValue') or the class itself

    Returns:
        the specified class reference (not string) that can be used to build an object

    Raises:
        AttributeError: if class_type is a string that names no attribute of the nmdc module
    """
    ## a name (any str, including str subclasses) is resolved on the nmdc module;
    ## anything else is assumed to already be a class reference
    if isinstance(class_type, str):
        return getattr(nmdc, class_type)
    return class_type
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
def make_record_dict(
    nmdc_record: namedtuple, object_dict: dict, return_field_if_none=True
) -> dict:
    """
    Transforms nmdc_record into a dict in which the record field/properties are the keys.

    Args:
        nmdc_record (namedtuple): the record/tuple that holds the data
        object_dict (dict): holds the specification of fields to get data from
        return_field_if_none (bool, optional): defaults to True;
            specifies return type if field doesn't have any data
            this is useful returning constants; e.g: depth {has_unit: meter} will return
            'meter' for the has_unit property even though 'has_unit' is not a field in the record

    Returns:
        dict: a dict representation of the nmdc record
    """
    ## build record from the field names in the object dict
    ## note: $class_type is a special key that is ignored
    record_dict = {}
    for field_key, field in object_dict.items():
        if field_key == "$class_type":
            continue

        if isinstance(field, dict):
            ## if the object value is a dict (e.g., {has_unit: {$const: 'meter'}})
            ## then set the value to the constant as-is
            ## needed if a field name conflicts with constant (e.g, if there was field named 'meter')
            ## dicts without a $const (including empty ones) contribute nothing
            if "$const" in field:
                record_dict[field_key] = field["$const"]
        else:
            ## get records value from nmdc record
            ## note: if the field is not in the nmdc record and return_field_if_none=True, the field is returned
            ## e.g., adding a constant or type: {has_raw_value: '10', type: QuantityValue}
            record_dict[field_key] = get_record_attr(
                nmdc_record, field, return_field_if_none
            )

    return record_dict
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
def make_object_from_dict(nmdc_record: namedtuple, object_dict: dict):
    """
    Builds an "object" from nmdc_record as described by object_dict.
    With a $class_type key in object_dict the result is an instance of that
    class; without one it is a plain dict.

    Args:
        nmdc_record (namedtuple): the record that holds the data
        object_dict (dict): the dict that specifies the field/data (key/value) pairings

    Returns:
        the instantiated object, or the dict of record values
    """
    field_values = make_record_dict(nmdc_record, object_dict)

    ## no class requested: the collected values are the result
    if "$class_type" not in object_dict.keys():
        return field_values

    target_class = make_nmdc_class(object_dict["$class_type"])
    return target_class(**field_values)  # build object
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
def make_object_from_list_item_dict(nmdc_record: namedtuple, item: dict) -> list:
    """
    When the item in the list is a dict; e.g.;
    [{id: 'gold_id, int', name: project_name, $class_type: Study}]
    A list of objects is returned that were created from the keys
    in the dict.

    This function is called from make_object_from_list.

    Args:
        nmdc_record (namedtuple): the record that holds the data values
        item (dict): holds the information needed to build the object; the
            special keys $class_type and $split_val configure the build and
            are not treated as fields. item is not modified.

    Returns:
        list: holds objects built from data in the record; instances of
            $class_type when given, plain dicts otherwise
    """
    from itertools import zip_longest  # local import: not imported at module level

    ## set split value for values in dict (globally)
    split_val = item.get("$split_val", ",")

    ## get class type if present
    class_type = (
        make_nmdc_class(item["$class_type"]) if "$class_type" in item else None
    )

    ## work on a filtered view instead of popping keys: the same spec dict is
    ## reused for every record, so it must stay intact
    field_specs = {
        key: spec
        for key, spec in item.items()
        if key not in ("$class_type", "$split_val")
    }

    ## get list of record values from nmdc record and split
    ## e.g., [{id: 'gold_id, int', name: project_name, $class_type: Study}]
    ##       -> [['gold:001', 'gold:0002'], ['name 1', 'name 2']]
    record_values = []
    for field_name in field_specs.values():
        val = get_record_attr(nmdc_record, field_name, return_field_if_none=False)
        if val is None:
            record_values.append([None])
            continue

        dtype = get_dtype_from_attribute_field(field_name)  # determine data type

        ## check for local split val; e.g., [{id: {$field: 'gold_id, int', $split_val:'|'}}
        mysplit = (
            field_name["$split_val"]
            if isinstance(field_name, dict) and "$split_val" in field_name
            else split_val
        )

        record_values.append(
            [coerce_value(v.strip(), dtype) for v in str(val).split(mysplit)]
        )

    ## build list of objects
    ## first the values are collated position by position; e.g.:
    ##   zip_longest(*[['gold:001', 'gold:0002'], ['name 1', 'name 2']])
    ##   -> ('gold:001', 'name 1'), ('gold:0002', 'name 2')
    ## (shorter value lists are padded with None)
    ## then each collated tuple is paired with the keys; e.g.:
    ##   -> [{id: gold:001, name: 'name 1'}, {id: gold:0002, name: 'name 2'}]
    keys = list(field_specs)
    obj_list = []
    for rv in zip_longest(*record_values):
        obj_dict = dict(zip(keys, rv))
        if class_type is not None:
            ## add the instantiated object; e.g. Study(id='gold:001', ...)
            obj_list.append(class_type(**obj_dict))
        else:
            ## simply add the dict; e.g., {id: gold:001, name: name1}
            obj_list.append(obj_dict)

    return obj_list
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
def make_value_from_list_item_dict(nmdc_record: namedtuple, item: dict) -> list:
    """
    Build a list of values from a list item that is specified as a dict; e.g.:
        [{$field: 'data_object_id, int'}]
        [{$field: 'data_object_id, int', $split_val: ','}]

    This function is called from make_object_from_list.

    Args:
        nmdc_record (namedtuple): the record that holds the data values
        item (dict): holds the information needed to build the values:
            "$field" (required) names the record field and is also used to
                determine the data type;
            "$split_val" (optional) is the separator used to split the record
                value; defaults to ","; None disables splitting;
            "$const" (optional) is a constant used instead of a record value

    Returns:
        list: values retrieved from data in the record; [None] when the
            record holds no value for the field

    Raises:
        KeyError: if item has no "$field" key
    """
    ## the data type is always derived from the $field entry
    dtype = get_dtype_from_attribute_field(item["$field"])

    ## a constant takes precedence over anything held in the record
    if "$const" in item:
        return [coerce_value(item["$const"], dtype)]

    ## get record value for the field
    ## returns None if the field is not in record
    record_val = get_record_attr(
        nmdc_record, item["$field"], return_field_if_none=False
    )
    if record_val is None:
        return [None]  # note: a list is returned

    ## the value must be a string, both for splitting and for stripping;
    ## previously the conversion was skipped when no split was requested,
    ## so non-string values (e.g., ints) failed on .strip()
    record_text = str(record_val)

    ## set value to split on
    split_val = item.get("$split_val", ",")
    if split_val is None:
        return [coerce_value(record_text.strip(), dtype)]

    return [coerce_value(rv.strip(), dtype) for rv in record_text.split(split_val)]
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
def make_object_from_list(nmdc_record: namedtuple, nmdc_list: list) -> list:
    """
    Expand a list specification into a flat list of values and/or objects.

    Each entry of nmdc_list is handled according to its form; e.g.:
        'gold_id, str'
            a plain field; the record value is split on "," and coerced
        {$field: data_object_id, $split_val: ','}
            a dict with a $field key; see make_value_from_list_item_dict
        {id: gold_id, name: project_name, $class_type: Study}
            any other dict; see make_object_from_list_item_dict

    Args:
        nmdc_record (namedtuple): the record that holds the data values
        nmdc_list (list): the entries (strings or dicts) to expand

    Returns:
        list: the values/objects produced by all entries, in order; a plain
            field with no value in the record contributes a single None
    """
    collected = []

    for entry in nmdc_list:
        if type(entry) is not dict:
            ## e.g., ['gold_id, str']
            dtype = get_dtype_from_attribute_field(entry)  # determine the data type
            raw = get_record_attr(nmdc_record, entry)
            if raw is None:
                collected.append(None)
            else:
                pieces = str(raw).split(",")
                collected.extend([coerce_value(piece.strip(), dtype) for piece in pieces])
            continue

        if "$field" in entry:
            ## e.g., [{$field: data_object_id, $split_val: ','}]
            produced = make_value_from_list_item_dict(nmdc_record, entry)
        else:
            ## e.g., [{id: gold_id, name: project_name, $class_type: Study}]
            produced = make_object_from_list_item_dict(nmdc_record, entry)
        collected.extend(produced)

    return collected
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
def dataframe_to_dict(
    nmdc_df: pds.DataFrame,
    nmdc_class,
    constructor_map={},
    attribute_fields=[],
    attribute_map={},
    transform_map={},
) -> list:
    """
    Main interface of the module: turns each row of nmdc_df into an object of
    nmdc_class and returns the objects as a list of dicts.

    The steps are: run the "pre" transforms on the dataframe, build one object
    per row, blank out dict-valued attributes that have neither an "id" nor a
    "has_raw_value" key, run the "post" transforms on the objects, and convert
    each object with make_dict_from_nmdc_obj.

    NOTE: the transform functions are looked up with eval() on the "function"
    string of each transform entry, so transform_map must come from a trusted
    source.

    NOTE: the default arguments are shared mutable objects; this function only
    reads them, but they are handed to the post transforms as keyword arguments.

    Args:
        nmdc_df (pds.DataFrame): the Pandas dataframe to be transformed
        nmdc_class: the NMDC class used to build objects
        constructor_map (dict, optional): constructor arguments needed to build the object; defaults to {}
        attribute_fields (list, optional): data fields to use as properties/keys; defaults to []
        attribute_map (dict, optional): maps data fields to MIxS (or other standard) fields; defaults to {}
        transform_map (dict, optional): pre/post transformations to perform on the data,
            held under the keys "pre" and "post"; defaults to {}

    Returns:
        list: list of dicts that represent the dataframe
    """

    def make_nmdc_object(nmdc_record: namedtuple, nmdc_class):
        """
        Creates an object of the type nmdc_class from a single record.

        Args:
            nmdc_record (namedtuple): the record that holds the data
            nmdc_class ([type]): the class that will be instantiated

        Returns:
            an object of the type specified by nmdc_class
        """
        ## the constructor_map holds the parameters necessary to instantiate the class
        if len(constructor_map) == 0:
            nmdc_obj = nmdc_class()
        else:
            nmdc_obj = nmdc_class(
                **make_constructor_args_from_record(constructor_map, nmdc_record)
            )

        ## add info about the type of entity it is
        nmdc_obj.type = nmdc_class.class_class_curie

        ## apply the mapping of every attribute field
        for af in attribute_fields:
            nmdc_obj = set_nmdc_object(nmdc_obj, nmdc_record, attribute_map, af)

        return nmdc_obj

    ## keyword arguments handed to every post transform
    tx_kwargs = dict(
        nmdc_class=nmdc_class,
        constructor_map=constructor_map,
        attribute_fields=attribute_fields,
        attribute_map=attribute_map,
    )
    pre_transforms = transform_map.get("pre", [])
    post_transforms = transform_map.get("post", [])

    ## pre transformations operate on (and replace) the dataframe
    for transform in pre_transforms:
        pre_function = eval(transform["function"])  # dynamically load function
        nmdc_df = pre_function(nmdc_df, transform["attributes"])

    ## transform each record into an nmdc object and store in list
    ## NB: SSSOM mapping is performed during this step
    nmdc_objs = []
    for record in nmdc_df.itertuples(index=False):
        nmdc_objs.append(make_nmdc_object(record, nmdc_class))

    ## conversions may leave junk dicts behind: any dict value that has
    ## neither an id nor a has_raw_value key is replaced with None
    for built_obj in nmdc_objs:
        obj_attrs = built_obj.__dict__
        for attr_name, attr_val in obj_attrs.items():
            if type(attr_val) is not dict:
                continue
            if "id" not in attr_val and "has_raw_value" not in attr_val:
                obj_attrs[attr_name] = None

    ## post transformations operate on (and replace) the list of nmdc objects
    for transform in post_transforms:
        post_function = eval(transform["function"])  # dynamically load function
        nmdc_objs = post_function(nmdc_objs, transform["attributes"], **tx_kwargs)

    ## transform each nmdc object into a dict and return the list of dicts
    return [make_dict_from_nmdc_obj(built_obj) for built_obj in nmdc_objs]
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
def test_pre_transform(
    nmdc_df: pds.DataFrame, tx_attributes: list, **kwargs
) -> pds.DataFrame:
    """
    Dummy function to test pre-transform declarations.

    Prints a marker line and returns the dataframe unchanged; tx_attributes
    and any keyword arguments are accepted but not used.

    Args:
        nmdc_df (pds.DataFrame): the dataframe handed to the pre transform
        tx_attributes (list): attributes named in the transform declaration (ignored)

    Returns:
        pds.DataFrame: the same nmdc_df that was passed in
    """
    print("*** test pre-transform ****")
    return nmdc_df
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
def merge_value_range_fields(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
    """
    Takes each nmdc object (either a dict or class instance) and merges two
    attributes into a single attribute separated by a "-".
    Additionally, the has_minimum_numeric_value and has_maximum_numeric_value
    attributes are given values.
    The unit of the first attribute is preserved.
    The second attribute is removed.

    For example, if an object has the attributes "depth: 5.0, has_unit: meter"
    and "depth2: 10.0", the two attributes are merged into a single attribute
    with the form:

        depth: 5.0-10.0
        has_unit: meter
        has_minimum_numeric_value: 5.0
        has_maximum_numeric_value: 10.0

    Negative values are wrapped in parentheses, e.g. "(-5.0)-(-1.0)".

    Objects in which either attribute is missing, is None, or has a missing or
    blank has_raw_value are left untouched. The objects are modified in place.

    Args:
        nmdc_objs (list): list of objects (dicts or class instances) to be updated
        tx_attributes (list): list of two attributes whose values need to be merged
        **kwargs: accepted for compatibility with other post transforms; not used

    Returns:
        list: the same nmdc_objs list, with has_minimum_numeric_value and
            has_maximum_numeric_value values in the first attribute; the
            second attribute is removed

    Raises:
        ValueError: if tx_attributes does not hold exactly two attributes, or
            if a raw value cannot be converted to a float
    """

    def get_member(container, name):
        # read a key (dict) or attribute (class instance); None when absent
        if isinstance(container, dict):
            return container.get(name)
        return getattr(container, name, None)

    def get_raw_value(field_obj):
        # the usable has_raw_value of a field, or None when there is none;
        # only None and blank text count as missing, so a raw value of 0 is kept
        raw = get_member(field_obj, "has_raw_value")
        if raw is None or str(raw).strip() == "":
            return None
        return raw

    def format_val(val) -> str:
        # if val is negative, put it in parens
        text = str(val)
        return f"({text})" if text.startswith("-") else text

    def add_min_max(field_obj, val1, val2):
        # everything is computed before anything is written, so a raw value
        # that is not numeric leaves field_obj untouched
        updates = {
            "has_raw_value": f"{format_val(val1)}-{format_val(val2)}",
            "has_minimum_numeric_value": float(val1),
            "has_maximum_numeric_value": float(val2),
        }

        if isinstance(field_obj, dict):
            field_obj.update(updates)
            # remove simple number value
            field_obj.pop("has_numeric_value", None)
        else:
            for name, value in updates.items():
                setattr(field_obj, name, value)
            # remove simple number value
            if hasattr(field_obj, "has_numeric_value"):
                delattr(field_obj, "has_numeric_value")

    print(f"*** executing merge_value_range_fields for attributes {tx_attributes}")

    if len(tx_attributes) != 2:
        raise ValueError("This function only accepts two arguments.")

    # get fields to be merged
    field1, field2 = tx_attributes

    for obj in nmdc_objs:
        field_obj1 = get_member(obj, field1)
        val1 = get_raw_value(field_obj1)
        val2 = get_raw_value(get_member(obj, field2))

        # both raw values are needed to build a range
        if val1 is None or val2 is None:
            continue

        # modify obj's field1 to hold min/max ranges
        add_min_max(field_obj1, val1, val2)

        # remove field2, no longer needed
        if isinstance(obj, dict):
            obj.pop(field2, None)
        else:
            delattr(obj, field2)

    return nmdc_objs
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
def make_quantity_value(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
    """
    Adds has_numeric_value and has_unit information to the given attributes
    of each nmdc object.

    The attribute's has_raw_value is split at the first space: the text before
    it is converted to a float and stored as has_numeric_value, the text after
    it (if any) is stored as has_unit; e.g., "5.0 meter" gives 5.0 and "meter".
    Surrounding whitespace of the raw value is ignored. When the first part is
    not a number, has_numeric_value is left untouched.

    The attribute itself is read with getattr from the object; its value may
    be either a dict or a class instance. Objects are modified in place.

    Args:
        nmdc_objs (list): list of objects to be updated with has_numeric_value and/or has_unit values
        tx_attributes (list): list of attributes whose values need to be updated
        **kwargs: accepted for compatibility with other post transforms; not used

    Returns:
        list: updated nmdc_objs with has_numeric_value and/or has_unit values
    """
    print(f"*** executing make_quantity_value for attributes {tx_attributes}")
    for attribute in tx_attributes:
        for obj in nmdc_objs:
            if not has_raw_value(obj, attribute):
                continue

            val = getattr(obj, attribute)
            is_dict = isinstance(val, dict)
            raw = val["has_raw_value"] if is_dict else getattr(val, "has_raw_value")

            ## split raw value after first space; stripping first keeps a
            ## leading space from being mistaken for the separator
            number_text, _, unit_text = str(raw).strip().partition(" ")
            unit_text = unit_text.strip()

            ## assign numeric quantity value
            try:
                numeric_value = float(number_text)
            except ValueError:
                numeric_value = None  # not a number; keep has_numeric_value as it is

            if numeric_value is not None:
                if is_dict:
                    val["has_numeric_value"] = numeric_value
                else:
                    val.has_numeric_value = numeric_value

            ## assign unit if present
            if unit_text:
                if is_dict:
                    val["has_unit"] = unit_text
                else:
                    val.has_unit = unit_text

    return nmdc_objs
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
def make_iso_8601_date_value(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
    """
    Converts date values to ISO-8601 format.
    E.g., "30-OCT-14 12.00.00.000000000 AM" -> "30-OCT-14" is converted to "2014-10-30".

    Only the text before the first space is used. Values that are None (or the
    string "None") are skipped. Values that cannot be parsed with the format
    "%d-%b-%y" are reported with print and left unchanged.

    Parameters
    ----------
    nmdc_objs : list
        List of objects whose attributes will be converted to ISO-8601 format.
        The objects are modified in place.
    tx_attributes : list
        List of attributes whose values need to be updated to ISO-8601 format.
    **kwargs
        Accepted for compatibility with other post transforms; not used.

    Returns
    -------
    list
        List of updated nmdc_objs with ISO-8601 formatted strings as values.
    """
    print(f"*** executing make_iso_8601_date for attributes {tx_attributes}")
    for attribute in tx_attributes:
        for obj in nmdc_objs:
            # check if object has a date field (attribute)
            if not hasattr(obj, attribute):
                continue

            # get the current date string value and return just the date part
            # e.g.: "30-OCT-14 12.00.00.000000000 AM" -> "30-OCT-14"
            date_str = str(getattr(obj, attribute)).split(" ", 1)[0]

            # a None value shows up here as the string "None"
            if date_str == "None":
                continue

            # convert date string in ISO-8601
            # e.g.: "30-OCT-14" -> "2014-10-30"
            try:
                date_val = datetime.strptime(date_str, "%d-%b-%y").strftime("%Y-%m-%d")
            except ValueError as ex:
                # the object may have no id; reporting must not raise on its own
                print(getattr(obj, "id", None), f"property {attribute}", "error:", ex)
            else:
                setattr(obj, attribute, date_val)

    return nmdc_objs
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
def get_json(file_path: str, replace_single_quote=False):
    """
    Reads the file specified by file_path and returns its parsed json content.

    Args:
        file_path (string): path to the file holding the json
        replace_single_quote (bool, optional): when True, every "'" in the file
            text is replaced with '"' before the text is parsed; defaults to False

    Returns:
        the parsed json data (e.g., a dict or list)
    """
    with open(file_path, "r") as in_file:
        if not replace_single_quote:
            ## the file can be parsed as it is
            return json.load(in_file)

        ## swap the quote characters in the whole text, then parse it
        quoted_text = in_file.read().replace("'", '"')
        return json.loads(quoted_text)
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
def save_json(json_data: str, file_path):
    """
    Writes json_data to the file specified by file_path, indented by 2 spaces.

    Args:
        json_data: the json data; either a json string or already parsed data
        file_path (string): path to where the json is saved

    Returns:
        the json data that was written; a json string is returned in its
        parsed form
    """
    ## a string is parsed first, so that it is written as json data
    ## (with its "\" escape characters resolved) and not as one quoted string
    parsed = json.loads(json_data) if type(json_data) is str else json_data

    with open(file_path, "w") as out_file:
        json.dump(parsed, out_file, indent=2)

    return parsed
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
if __name__ == "__main__":
|
|
1112
|
-
## code for testing
|
|
1113
|
-
file_path = "../output/nmdc_etl/test.json"
|
|
1114
|
-
# test_json = collapse_json_file(file_path, 'part_of')
|
|
1115
|
-
# test_json = collapse_json_file(file_path, 'has_input')
|
|
1116
|
-
# test_json = collapse_json_file(file_path, "has_output")
|
|
1117
|
-
# print(test_json)
|