nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (77) hide show
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,1117 +0,0 @@
1
- ## author: Bill Duncan
2
- ## summary: Contains methods for transforming data in NMDC ETL pipeline.
3
-
4
- import io
5
- import json
6
- import pkgutil
7
- from collections import namedtuple
8
- from datetime import datetime
9
-
10
- import jsonasobj
11
-
12
- ## system level modules
13
- import pandas as pds
14
-
15
- ## add all classes for local nmdc.py
16
- ## this is the file of python classes generated by linkml
17
- from nmdc_schema import nmdc
18
-
19
- import nmdc_runtime.lib.nmdc_dataframes as nmdc_dataframes
20
-
21
-
22
def has_raw_value(obj, attribute: str) -> bool:
    """
    Report whether ``attribute`` of ``obj`` carries a non-null ``has_raw_value``.

    E.g.: "lat_lon": {"has_raw_value": "-33.460524 150.168149"}

    Args:
        obj (dict or object): the container to inspect; a dict is read by key,
            anything else is read with getattr.
        attribute (string): the name of the attribute in obj to check.

    Returns:
        boolean: True if the attribute's value has a has_raw_value property
        that is not null; False if the attribute is missing, is None, or its
        value has no has_raw_value property.
    """
    ## a missing attribute/key is treated the same as a None value
    if isinstance(obj, dict):
        val = obj.get(attribute)
    else:
        val = getattr(obj, attribute, None)

    if val is None:
        return False

    ## a dict value is searched directly; any other value is searched through
    ## its instance attributes (same mapping that vars() would return)
    if isinstance(val, dict):
        members = val
    else:
        members = getattr(val, "__dict__", None)
        if members is None:
            ## e.g. plain strings and numbers have no instance attributes
            return False

    if "has_raw_value" not in members:
        return False

    return pds.notnull(members["has_raw_value"])
56
-
57
-
58
def record_has_field(nmdc_record: namedtuple, attribute_field: str) -> bool:
    """
    Returns True/False if a field is in nmdc_record (a namedtuple).

    Args:
        nmdc_record (namedtuple): the nmdc record
        attribute_field (string): the name of the attribute, optionally
            followed by a data type; e.g., "file_size_bytes, int"

    Returns:
        bool: True if the record has the field; False otherwise, including
        when the record is None/null or is not a namedtuple.
    """
    ## anything without _fields (None, NaN, plain values) cannot hold the field
    fields = getattr(nmdc_record, "_fields", None)
    if fields is None:
        return False

    ## only the part before the first comma names the field
    field = attribute_field.split(",")[0].strip()

    return field in fields
78
-
79
-
80
def coerce_value(value, dtype: str):
    """
    Cast value to the type named by dtype and return the result.

    Args:
        value: the value to coerce; None is passed through unchanged
        dtype (str): the name of the data type to cast value into (e.g. "int")

    Returns:
        the value cast into the data type specified by dtype, or None when
        value is None
    """
    if value is None:
        return None

    ## strings are simply formatted; no evaluation is needed
    if dtype == "str":
        return f"{value}"

    ## SECURITY NOTE(review): both dtype and value are interpolated into source
    ## text and evaluated; this is only safe while they come from trusted
    ## configuration/data. Confirm before exposing to external input.
    expression = f"{dtype}({value})"
    return eval(expression)
98
-
99
-
100
def get_dtype_from_attribute_field(attribute_field) -> str:
    """
    Return data type part of attribute_field (e.g. 'file_size, int').
    If no dtype is given, "str" is returned.

    Args:
        attribute_field: the attribute field to get the data type from; either
            a string such as "file_size_bytes, int" or a dict (any dict
            subclass) whose "$const" entry holds such a string

    Returns:
        str: the string representation of the attribute field's data type
    """
    if isinstance(attribute_field, dict):
        if "$const" in attribute_field:
            ## NB: RECURSIVE CALL
            return get_dtype_from_attribute_field(attribute_field["$const"])
        ## dicts without a $const entry default to string
        return "str"

    if "," in attribute_field:  # e.g., "file_size_bytes, int"
        return attribute_field.split(",")[1].strip()

    return "str"  # default to string datatype
123
-
124
-
125
def get_field_and_dtype_from_attribute_field(attribute_field) -> tuple:
    """
    Returns both the field and data type parts of attribute_field (e.g. 'file_size, int').
    If no dtype is given, a dtype of "str" is returned.

    Args:
        attribute_field: the name of the attribute field; either a string such
            as "file_size_bytes, int" or a dict whose "$const" or "$field"
            entry holds such a string

    Returns:
        tuple: contains the (field, data type)
    """
    if isinstance(attribute_field, dict):
        ## "$const" takes precedence over "$field"
        for key in ("$const", "$field"):
            if key in attribute_field:
                ## NB: RECURSIVE CALL
                return get_field_and_dtype_from_attribute_field(attribute_field[key])
        ## a dict with neither key is returned unchanged as the field
        return attribute_field, "str"

    if "," in attribute_field:  # e.g., "file_size_bytes, int"
        ## only the first two parts are used, so extra commas no longer raise;
        ## this matches how get_dtype_from_attribute_field reads the string
        parts = attribute_field.split(",")
        return parts[0].strip(), parts[1].strip()

    return attribute_field.strip(), "str"  # default to string datatype
156
-
157
-
158
def get_record_attr(record: namedtuple, attribute_field, return_field_if_none=True):
    """
    Look up the value that attribute_field designates in the record.
    E.g., get_record_attr(Record(id='gold:001', name='foo'), 'id') would return 'gold:001'.

    The attribute_field may also stand for a constant value (e.g., unit: meter)
    rather than a field of the record. return_field_if_none (default True)
    controls what happens then: the field text itself is returned
    (e.g., 'meter') instead of None.

    Args:
        record (namedtuple): the record containing the data
        attribute_field: the name of the field that contains the data, or a
            dict with a "$const" entry holding a constant
        return_field_if_none (bool, optional): Defaults to True.

    Returns:
        the value of the record's field coerced to the requested data type,
        or None when there is no (non-null) value
    """
    ## an explicit constant never consults the record
    if type(attribute_field) is dict and "$const" in attribute_field:
        const_value, const_dtype = get_field_and_dtype_from_attribute_field(
            attribute_field["$const"]
        )
        return coerce_value(const_value, const_dtype)

    field, dtype = get_field_and_dtype_from_attribute_field(attribute_field)

    ## read from the record, or fall back to the field text / None
    if record_has_field(record, field):
        val = getattr(record, field)
    elif return_field_if_none:
        val = field
    else:
        val = None

    return coerce_value(val, dtype) if pds.notnull(val) else None
195
-
196
-
197
def make_constructor_args_from_record(
    constructor_map: dict, nmdc_record: namedtuple
) -> dict:
    """
    Build the keyword arguments needed to construct an object from a record.
    E.g., If the constructor map specifies that a Study object requires an id and name in
    the constructor, this function would return {id: gold:001, name: foo}.

    Args:
        constructor_map (dict): maps each constructor parameter to the
            specification (string, list or dict) of where its value comes from
        nmdc_record (namedtuple): holds the data that is used to build an object

    Returns:
        dict: the constructor arguments needed to build the object
    """
    args = {}
    for param, spec in constructor_map.items():
        if type(spec) is dict and len(spec) > 0:
            ## a dict spec describes a nested value, e.g.
            ## {'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon', '$class_type': 'GeolocationValue'}
            values = make_record_dict(nmdc_record, spec)
            if "$class_type" in spec:
                ## instantiate the named class from the collected values
                args[param] = make_nmdc_class(spec["$class_type"])(**values)
            else:
                args[param] = values
        elif type(spec) is list and len(spec) > 0:
            ## a list spec yields one value per listed field
            args[param] = [get_record_attr(nmdc_record, f) for f in spec]
        else:
            args[param] = get_record_attr(nmdc_record, spec)

    return args
235
-
236
-
237
def make_dict_from_nmdc_obj(nmdc_obj) -> dict:
    """
    Returns a dict based on the nmdc_obj.

    Args:
        nmdc_obj: an object containing nmdc data, or a list of such objects

    Returns:
        dict: representation of the object with empty/null values removed.
        A list input gives a list with one entry per element that has a value.
        Inputs that have no ``_as_dict`` attribute are returned unchanged.
    """

    def is_value(variable):
        """
        Checks if variable has a value. Returns True if:
        - variable is not None and
        - has length > 0 if variable is a list, dict or str and
        - has an id and/or has raw value key if variable is a dict
        """
        if variable is None:
            return False

        ## exact type checks are kept on purpose: subclasses of list/dict/str
        ## are handled by the null check, as they always were
        if type(variable) in (list, dict, str):
            if len(variable) == 0:
                return False
        elif pds.isnull(variable):
            return False

        ## if variable is a dict, make sure it has an id or raw value
        if type(variable) is dict:
            if "id" in variable:
                return is_value(variable["id"])
            if "has_raw_value" in variable:
                return is_value(variable["has_raw_value"])
            return False  # there wasn't an id or has_raw_value

        return True

    def make_dict(obj):
        """
        Transforms an nmdc object into a dict
        """
        if obj is None:
            return None

        ## objects that cannot be converted are passed through unchanged
        if not hasattr(obj, "_as_dict"):
            return obj

        ## include only valid values in lists and dicts
        cleaned = {}
        for key, val in jsonasobj.as_dict(obj).items():
            if type(val) is dict:
                cleaned[key] = {k: v for k, v in val.items() if is_value(v)}
            elif type(val) is list:
                cleaned[key] = [element for element in val if is_value(element)]
            else:
                cleaned[key] = val

        ## drop entries that are empty, including {} or [] left by the filtering
        return {key: val for key, val in cleaned.items() if is_value(val)}

    if type(nmdc_obj) is list:
        return [make_dict(o) for o in nmdc_obj if is_value(o)]

    return make_dict(nmdc_obj)
365
-
366
-
367
def set_nmdc_object(
    nmdc_obj, nmdc_record: namedtuple, attribute_map: dict, attribute_field
):
    """
    Set one property of nmdc_obj from the values stored in the nmdc_record
    and return the updated nmdc_obj.

    Args:
        nmdc_obj: the nmdc object that will be modified
        nmdc_record (namedtuple): the record whose data is used to set the value
        attribute_map (dict): maps field names to the attribute names to set on
            the object; fields that are not mapped are set under their own name
        attribute_field: the specification of the field to set; a string
            (optionally "name, dtype"), a one-entry dict {field: spec}, or
            another value used directly as the field

    Returns:
        updated nmdc_obj
    """
    ## values are converted to dicts unless they were built through
    ## '$class_type', e.g.
    ## {latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon', '$class_type': 'GeolocationValue'}
    as_dict = True

    if type(attribute_field) is dict:
        ## only the first key/value pair is used; e.g. part_of: gold_study_id
        field = [*attribute_field][0]
        spec = attribute_field[field]
        if type(spec) is list:
            ## e.g. has_output: ["data_object_id, str"]
            value = make_object_from_list(nmdc_record, spec)
        elif type(spec) is dict:
            ## e.g. has_output: {id: gold:0001, name: 'foo', $class_type: Study}
            if "$class_type" in spec:
                as_dict = False
            value = make_object_from_dict(nmdc_record, spec)
        elif type(spec) is str:
            ## e.g. has_output: "data_object_id, str" (not a list)
            value = get_record_attr(nmdc_record, spec)
        else:
            ## spec names the field in the record
            value = make_attribute_value_from_record(nmdc_record, spec)
    elif type(attribute_field) is str:
        ## e.g., "file_size_bytes, int": the part before the comma is the field
        if "," in attribute_field:
            field = attribute_field.split(",")[0].strip()
        else:
            field = attribute_field.strip()
        value = get_record_attr(nmdc_record, attribute_field)
    else:
        field = attribute_field
        value = make_attribute_value_from_record(nmdc_record, field)

    if as_dict:
        value = make_dict_from_nmdc_obj(value)

    ## use the mapped attribute name when the field has a mapping
    if len(attribute_map) > 0 and field in attribute_map:
        target = attribute_map[field]
    else:
        target = field
    setattr(nmdc_obj, target, value)

    return nmdc_obj
431
-
432
-
433
def make_attribute_value_from_record(nmdc_record: namedtuple, field, object_type=""):
    """
    Creates an attribute value object linked the value in the nmdc record's field.

    Args:
        nmdc_record (namedtuple): holds the data
        field: the field to get the data from
        object_type (str, optional): name of the nmdc class (or the class
            itself) to instantiate instead of the default attribute value;
            defaults to ""

    Returns:
        an attribute value object (by default) with the has_raw_value property
        set to the value in field when that value is not null
    """
    val = get_record_attr(nmdc_record, field)

    ## make_attribute_value accepts only the value, so a custom object type is
    ## built here; passing object_type through used to raise a TypeError
    if object_type:
        av = make_nmdc_class(object_type)()
        if pds.notnull(val):
            av.has_raw_value = val
        return av

    return make_attribute_value(val)
450
-
451
-
452
def make_attribute_map(sssom_map_file: str = "") -> dict:
    """
    Build a subject-to-object dict from an SSSOM mapping.
    By default the mapping is the "gold-to-mixs.sssom.tsv" resource of the
    nmdc_schema package, but a path to an SSSOM formatted tsv may be given.

    Args:
        sssom_map_file (str): an optional path to the sssom file

    Returns:
        dict: maps each subject_label to its object_label
    """
    if len(sssom_map_file) > 0:
        ## load sssom mapping file and subset to skos:exactMatch
        loaded = nmdc_dataframes.make_dataframe(sssom_map_file, comment_str="#")
        mapping_df = loaded.query("predicate_id == 'skos:exactMatch'")
    else:
        ## NOTE(review): unlike the branch above, the packaged mapping is not
        ## filtered to skos:exactMatch -- confirm whether that is intended
        packaged = pkgutil.get_data("nmdc_schema", "gold-to-mixs.sssom.tsv")
        mapping_df = pds.read_csv(
            io.BytesIO(packaged), sep="\t", comment="#", encoding="utf-8"
        )

    ## each row becomes a (subject_label, object_label) pair; later rows win
    label_pairs = mapping_df[["subject_label", "object_label"]].itertuples(
        index=False, name=None
    )
    return dict(label_pairs)
480
-
481
-
482
def make_attribute_value(val, object_type=""):
    """
    Creates an attribute value object that has_raw_value val.

    Args:
        val: the value that is set as the value of has_raw_value
        object_type (str, optional): name of the nmdc class (or the class
            itself) to instantiate; defaults to "", which builds an
            nmdc.AttributeValue

    Returns:
        attribute value object that has_raw_value val; has_raw_value is left
        untouched when val is null
    """
    ## the optional object_type lets callers that pass a type work instead of
    ## failing with a TypeError
    if not object_type:
        value_class = nmdc.AttributeValue
    elif isinstance(object_type, str):
        value_class = getattr(nmdc, object_type)
    else:
        value_class = object_type

    av = value_class()
    if pds.notnull(val):
        av.has_raw_value = val

    return av
497
-
498
-
499
def make_nmdc_class(class_type):
    """
    Resolve class_type to a class of the NMDC module.

    Args:
        class_type: the type of class to return; either the class name as a
            string (e.g., '$class_type': 'GeolocationValue') or a class

    Returns:
        the specified class reference (not string) that can be used to build
        an object; anything that is not a string is returned unchanged
    """
    ## only names need to be looked up in the nmdc module
    return getattr(nmdc, class_type) if type(class_type) is str else class_type
513
-
514
-
515
def make_record_dict(
    nmdc_record: namedtuple, object_dict: dict, return_field_if_none=True
) -> dict:
    """
    Collect the values named by object_dict from nmdc_record into a dict.

    Args:
        nmdc_record (namedtuple): the record/tuple that holds the data
        object_dict (dict): holds the specification of fields to get data from;
            the special key "$class_type" is ignored
        return_field_if_none (bool, optional): defaults to True;
            specifies what is returned if a field is not in the record.
            This is useful for returning constants; e.g: depth {has_unit: meter} will return
            'meter' for the has_unit property even though 'has_unit' is not a field in the record

    Returns:
        dict: the values keyed by the keys of object_dict
    """
    record_dict = {}
    for name, spec in object_dict.items():
        ## note: $class_type is a special key that is ignored
        if name == "$class_type":
            continue

        if type(spec) is dict:
            ## a dict whose first key is "$const" (e.g., {has_unit: {$const: 'meter'}})
            ## supplies its value directly; this avoids a clash when a record
            ## field has the same name as the constant (e.g., a field named 'meter')
            ## NOTE(review): dicts whose first key is not "$const" add nothing
            ## to the result -- confirm this is intended
            if [*spec][0] == "$const":
                record_dict[name] = [*spec.values()][0]
            continue

        ## note: if the field is not in the nmdc record and return_field_if_none=True,
        ## the field itself is returned
        ## e.g., adding a constant or type: {has_raw_value: '10', type: QuantityValue}
        record_dict[name] = get_record_attr(nmdc_record, spec, return_field_if_none)

    return record_dict
552
-
553
-
554
def make_object_from_dict(nmdc_record: namedtuple, object_dict: dict):
    """
    Build an "object" from nmdc_record as described by object_dict.
    If the object_dict has a $class_type key, an instantiated object is returned.
    Otherwise, a dict is returned.

    Args:
        nmdc_record (namedtuple): the record that holds the data
        object_dict (dict): the dict that specifies the field/data (key/value) pairings

    Returns:
        an object built from the record and object_dict information
    """
    values = make_record_dict(nmdc_record, object_dict)

    ## without a class type the collected values are the result
    if "$class_type" not in object_dict:
        return values

    return make_nmdc_class(object_dict["$class_type"])(**values)
576
-
577
-
578
def make_object_from_list_item_dict(nmdc_record: namedtuple, item: dict) -> list:
    """
    When the item in the list is a dict; e.g.;
    [{id: 'gold_id, int', name: project_name, $class_type: Study}]
    A list of objects is returned that were created from the keys
    in the dict.

    This function is called from make_object_from_list.

    The special keys "$split_val" (separator used for every field, "," by
    default) and "$class_type" (class to instantiate) are read but are not
    treated as fields. The item is not modified, so the same specification
    can be reused for every record.

    Args:
        nmdc_record (namedtuple): the record that holds the data values
        item (dict): holds the information needed to build the object

    Returns:
        list: holds objects built from data in the record; instances of the
        class named by $class_type when given, dicts otherwise
    """
    from itertools import zip_longest

    ## item-wide settings; read without popping so the caller's dict is intact
    split_val = item.get("$split_val", ",")
    class_type = item.get("$class_type")
    if class_type is not None:
        class_type = make_nmdc_class(class_type)  # convert to a type

    ## everything else maps an output key to a field specification
    fields = {
        key: spec
        for key, spec in item.items()
        if key not in ("$split_val", "$class_type")
    }

    ## get list of record values from nmdc record and split
    ## e.g., [{id: 'gold_id, int', name: project_name, $class_type: Study}]
    ##       -> [['gold:001', 'gold:0002'], ['name 1', 'name 2']]
    record_values = []
    for spec in fields.values():
        val = get_record_attr(nmdc_record, spec, return_field_if_none=False)
        if val is None:
            record_values.append([None])
            continue

        dtype = get_dtype_from_attribute_field(spec)  # determine data type

        ## a field may override the separator;
        ## e.g., {id: {$field: 'gold_id, int', $split_val: '|'}}
        if isinstance(spec, dict) and "$split_val" in spec:
            separator = spec["$split_val"]
        else:
            separator = split_val

        record_values.append(
            [coerce_value(v.strip(), dtype) for v in str(val).split(separator)]
        )

    ## build list of objects
    ## the values are collated position by position; e.g.:
    ##   [['gold:001', 'gold:0002'], ['name 1', 'name 2']]
    ##   -> ('gold:001', 'name 1'), ('gold:0002', 'name 2')
    ## shorter value lists are padded with None
    ## then each group is paired with the keys; e.g.:
    ##   -> [{id: gold:001, name: 'name 1'}, {id: gold:0002, name: 'name 2'}]
    ## NOTE(review): this loop was commented out, which made the function always
    ## return an empty list; confirm that restoring it is wanted
    keys = list(fields)
    obj_list = []
    for row in zip_longest(*record_values):
        obj_dict = dict(zip(keys, row))
        if class_type is not None:
            ## add the instantiated object; e.g. Study(id='gold:001')
            obj_list.append(class_type(**obj_dict))
        else:
            ## simply add the dict; e.g., {id: gold:001, name: name1}
            obj_list.append(obj_dict)

    return obj_list
653
-
654
-
655
def make_value_from_list_item_dict(nmdc_record: namedtuple, item: dict) -> list:
    """
    When the item in the list is a dict; e.g.;
    [{$field: 'data_object_id, int'}]
    [{$field: 'data_object_id, int', $split_val: ','}]
    A list of values is returned that were created from the keys
    in the dict.

    This function is called from make_object_from_list.

    Args:
        nmdc_record (namedtuple): the record that holds the data values
        item (dict): holds the information needed to build the values;
            "$field" names the record field (and optional data type),
            "$split_val" is the separator ("," by default; None disables
            splitting) and "$const" supplies a constant instead of a field

    Returns:
        list: values retrieved from data in the record; [None] when the item
        names no field or the record has no value for it
    """
    ## the data type comes from the $field specification; default to string
    field_spec = item.get("$field")
    dtype = (
        get_dtype_from_attribute_field(field_spec) if field_spec is not None else "str"
    )

    ## set value to split on
    split_val = item.get("$split_val", ",")

    ## a constant is returned without consulting the record
    if "$const" in item:
        return [coerce_value(item["$const"], dtype)]

    if field_spec is None:
        return [None]  # note: a list is returned

    ## returns None if the field is not in record
    record_val = get_record_attr(nmdc_record, field_spec, return_field_if_none=False)
    if record_val is None:
        return [None]  # note: a list is returned

    ## make sure record_val is a string; needed for both strip and split
    record_val = str(record_val)

    if split_val is None:
        return [coerce_value(record_val.strip(), dtype)]

    return [coerce_value(rv.strip(), dtype) for rv in record_val.split(split_val)]
708
-
709
-
710
def make_object_from_list(nmdc_record: namedtuple, nmdc_list: list) -> list:
    """
    When a list is specified as the value of a field; e.g.:
      ['gold_id, str']
      [{$field: data_object_id, $split_val: ','}]
      [{id: gold_id, name: project_name, $class_type: Study}]
    A list of items (either values or objects) is returned.

    Args:
        nmdc_record (namedtuple): the record that holds the data values
        nmdc_list (list): the specifications to resolve; each entry is either
            a dict or a field string such as 'gold_id, str'

    Returns:
        list: the values/objects produced by all entries, in order
    """
    results = []
    for spec in nmdc_list:
        if type(spec) is dict:
            ## a dict with $field yields values, e.g., [{$field: data_object_id, $split_val: ','}];
            ## any other dict yields objects, e.g., [{id: gold_id, name: project_name, $class_type: Study}]
            if "$field" in spec:
                builder = make_value_from_list_item_dict
            else:
                builder = make_object_from_list_item_dict
            results.extend(builder(nmdc_record, spec))
            continue

        ## e.g., ['gold_id, str']
        dtype = get_dtype_from_attribute_field(spec)  # determine the data type
        record_val = get_record_attr(nmdc_record, spec)
        if record_val is None:
            results.append(None)
        else:
            ## the value is split on commas and each piece is coerced
            pieces = str(record_val).split(",")
            results.extend([coerce_value(piece.strip(), dtype) for piece in pieces])

    return results
749
-
750
-
751
def dataframe_to_dict(
    nmdc_df: pds.DataFrame,
    nmdc_class,
    constructor_map=None,
    attribute_fields=None,
    attribute_map=None,
    transform_map=None,
) -> list:
    """
    This is the main interface for the module.
    The nmdc dataframe (nmdc_df) is transformed and returned as a list of dicts.

    Processing order:
      1. every "pre" transform is applied to the dataframe,
      2. each dataframe row is turned into an instance of nmdc_class,
      3. plain-dict attribute values lacking both an "id" and a
         "has_raw_value" key are reset to None,
      4. every "post" transform is applied to the list of objects,
      5. each object is converted to a dict.

    Args:
        nmdc_df (pds.DataFrame): the Pandas dataframe to be transformed
        nmdc_class: the NMDC class used to build objects
        constructor_map (dict, optional): specifies constructor arguments needed to build the object; defaults to an empty dict
        attribute_fields (list, optional): specifies which data fields to use as properties/keys; defaults to an empty list
        attribute_map (dict, optional): maps data fields to MIxS (or other standard) fields; defaults to an empty dict
        transform_map (dict, optional): specifies pre/post transformations to perform on the data.
            The "pre" and "post" keys each hold a list of dicts with a "function"
            entry (an expression naming the transform) and an "attributes" entry;
            defaults to an empty dict

    Returns:
        list: list of dicts that represent the dataframe
    """
    # Fresh containers per call: mutable defaults in the signature would be
    # shared between calls and are handed to post-transforms through tx_kwargs.
    constructor_map = {} if constructor_map is None else constructor_map
    attribute_fields = [] if attribute_fields is None else attribute_fields
    attribute_map = {} if attribute_map is None else attribute_map
    transform_map = {} if transform_map is None else transform_map

    def make_nmdc_object(nmdc_record: namedtuple, nmdc_class):
        """
        Creates an object of the type nmdc_class from a single nmdc record.

        Args:
            nmdc_record (namedtuple): the record that holds the data
            nmdc_class ([type]): the class that the object will instantiate

        Returns:
            an object of the type specified by nmdc_class
        """
        ## check for constructor_map containing the parameters necessary to instantiate the class
        if constructor_map:
            constructor_args = make_constructor_args_from_record(
                constructor_map, nmdc_record
            )
            nmdc_obj = nmdc_class(**constructor_args)
        else:
            nmdc_obj = nmdc_class()

        ## add info about the type of entity it is
        nmdc_obj.type = nmdc_class.class_class_curie

        ## get mappings for attribute fields
        for af in attribute_fields:
            nmdc_obj = set_nmdc_object(nmdc_obj, nmdc_record, attribute_map, af)

        return nmdc_obj

    ## create transform kwargs and pre and post transform lists
    tx_kwargs = {
        "nmdc_class": nmdc_class,
        "constructor_map": constructor_map,
        "attribute_fields": attribute_fields,
        "attribute_map": attribute_map,
    }
    pre_transforms = transform_map.get("pre") or []
    post_transforms = transform_map.get("post") or []

    ## execute specified pre transformations; note: this transforms the dataframe
    for transform in pre_transforms:
        # SECURITY: eval() executes whatever expression the transform map holds.
        # Only use transform maps that come from trusted configuration.
        tx_function = eval(transform["function"])  # dynamically load function
        tx_attributes = transform["attributes"]  # get list of attributes

        ## apply transform function
        nmdc_df = tx_function(nmdc_df, tx_attributes)

    ## transform each record into an nmdc object and store in list
    ## NB (original author's note): SSSOM mapping is performed during this step
    nmdc_objs = [
        make_nmdc_object(record, nmdc_class)
        for record in nmdc_df.itertuples(index=False)
    ]

    ## set value to None for fields that have dicts as values
    ## but not an id or has_raw_value key
    ## this is needed in case conversions resulted in junk values
    for obj in nmdc_objs:
        for key, val in obj.__dict__.items():
            # exact-type check on purpose: dict subclasses are left untouched
            if type(val) is dict and "id" not in val and "has_raw_value" not in val:
                obj.__dict__[key] = None

    ## execute specified post transformations; note: this transforms the nmdc objects
    for transform in post_transforms:
        # SECURITY: see the note on eval() above.
        tx_function = eval(transform["function"])  # dynamically load function
        tx_attributes = transform["attributes"]  # get list of attributes

        ## apply transform function
        nmdc_objs = tx_function(nmdc_objs, tx_attributes, **tx_kwargs)

    ## transform each nmdc object into a dict and return the list of dicts
    return [make_dict_from_nmdc_obj(obj) for obj in nmdc_objs]
854
-
855
-
856
- def test_pre_transform(
857
- nmdc_df: pds.DataFrame, tx_attributes: list, **kwargs
858
- ) -> pds.DataFrame:
859
- """
860
- Dummy function to test pre-transform declarations.
861
- """
862
- print("*** test pre-transform ****")
863
- return nmdc_df
864
-
865
-
866
def merge_value_range_fields(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
    """
    Takes each nmdc object (either a dict or class instance) and merges two
    attributes into a single attribute separated by a "-".
    Additionally, the has_minimum_numeric_value and has_maximum_numeric_value
    attributes are given values.
    Any other members of the first attribute (e.g., has_unit) are left as they are.
    The first attribute's has_numeric_value, if present, is removed.
    The second attribute is removed.

    For example, if an object has the attributes "depth: 5.0, has_unit: meter"
    and "depth2: 10.0", the two attributes are merged into a single attribute
    with the form:

        depth: 5.0-10.0
        has_unit: meter
        has_minimum_numeric_value: 5.0
        has_maximum_numeric_value: 10.0

    Negative values are wrapped in parentheses, e.g. "(-5)-10".
    Objects for which either attribute is missing, is None, or has no
    (or a blank) has_raw_value are left untouched.

    Args:
        nmdc_objs (list): list of objects (dicts or class instances) to be updated
        tx_attributes (list): list of exactly two attributes whose values need to be merged

    Returns:
        list: the same nmdc_objs list, with has_minimum_numeric_value and
        has_maximum_numeric_value set on the first attribute; the second attribute is removed

    Raises:
        ValueError: if tx_attributes does not hold exactly two attributes, or if
            a raw value cannot be converted to a float
    """

    def get_member(container, name):
        # read a key (dict) or attribute (instance); None when absent
        if isinstance(container, dict):
            return container.get(name)
        return getattr(container, name, None)

    def set_member(container, name, value):
        # write a key (dict) or attribute (instance)
        if isinstance(container, dict):
            container[name] = value
        else:
            setattr(container, name, value)

    def drop_member(container, name):
        # remove a key (dict) or attribute (instance) if it is present
        if isinstance(container, dict):
            container.pop(name, None)
        elif hasattr(container, name):
            delattr(container, name)

    def is_blank(raw_value):
        return raw_value is None or str(raw_value).strip() == ""

    def format_val(val):
        # if val is negative, put it in parens so the "-" separator stays readable
        text = str(val)
        return f"({text})" if text.startswith("-") else text

    print(f"*** executing merge_value_range_fields for attributes {tx_attributes}")

    if len(tx_attributes) != 2:
        raise ValueError("This function only accepts two arguments.")

    # get fields to be merged
    field1, field2 = tx_attributes

    for obj in nmdc_objs:
        field_obj = get_member(obj, field1)
        val1 = get_member(field_obj, "has_raw_value")
        val2 = get_member(get_member(obj, field2), "has_raw_value")

        # nothing to merge unless both fields carry a raw value
        if is_blank(val1) or is_blank(val2):
            continue

        # convert before mutating so a bad value cannot leave the field half-updated
        minimum = float(val1)
        maximum = float(val2)

        # set value range and min/max numeric values, e.g., {val1}-{val2}
        set_member(field_obj, "has_raw_value", f"{format_val(val1)}-{format_val(val2)}")
        set_member(field_obj, "has_minimum_numeric_value", minimum)
        set_member(field_obj, "has_maximum_numeric_value", maximum)

        # remove simple number value; it no longer describes the merged range
        drop_member(field_obj, "has_numeric_value")

        # remove field2, no longer needed
        drop_member(obj, field2)

    return nmdc_objs
980
-
981
-
982
def make_quantity_value(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
    """
    Adds has_numeric_value and has_unit information to attributes of each nmdc object.

    For every listed attribute that has a raw value, the raw value is split
    at the first space: the leading part is stored as has_numeric_value when
    it converts to a float, and the remainder (if any) is stored as has_unit.

    Args:
        nmdc_objs (list): list of objects to be updated with has_numeric_value and/or has_unit values
        tx_attributes (list): list of attributes whose values need to be updated

    Returns:
        list: the same nmdc_objs list, updated with has_numeric_value and/or has_unit values
    """
    print(f"*** executing make_quantity_value for attributes {tx_attributes}")
    for attribute in tx_attributes:
        for obj in nmdc_objs:
            if not has_raw_value(obj, attribute):
                continue

            val = getattr(obj, attribute)
            # the attribute value is either a plain dict or an object with attributes
            as_dict = type(val) == type({})

            ## split raw value after first space
            raw = val["has_raw_value"] if as_dict else getattr(val, "has_raw_value")
            pieces = str(raw).split(" ", 1)

            ## assign numeric quantity value; best effort, a failed
            ## conversion or assignment leaves the value unchanged
            try:
                number = float(pieces[0].strip())
                if as_dict:
                    val["has_numeric_value"] = number
                else:
                    val.has_numeric_value = number
            except Exception:
                pass

            ## assign unit if present
            if len(pieces) > 1:
                unit = pieces[1].strip()
                if as_dict:
                    val["has_unit"] = unit
                else:
                    val.has_unit = unit

    return nmdc_objs
1026
-
1027
-
1028
def make_iso_8601_date_value(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
    """
    Converts date values to ISO-8601 format.
    E.g., "30-OCT-14 12.00.00.000000000 AM" -> "30-OCT-14" is converted to "2014-10-30".

    Only the text before the first space is parsed, using the "%d-%b-%y"
    format. Values that cannot be parsed are reported on stdout and left
    unchanged; attributes whose value is None are skipped.

    Parameters
    ----------
    nmdc_objs : list
        List of objects whose attributes will be converted to ISO-8601 format.
    tx_attributes : list
        List of attributes whose values need to be updated to ISO-8601 format.

    Returns
    -------
    list
        The same nmdc_objs list, with ISO-8601 formatted strings as values.
    """
    print(f"*** executing make_iso_8601_date for attributes {tx_attributes}")
    for attribute in tx_attributes:
        for obj in nmdc_objs:
            # check if object has a date field (attribute)
            if not hasattr(obj, attribute):
                continue

            # get the current date string value and return just the date part
            # e.g.: "30-OCT-14 12.00.00.000000000 AM" -> "30-OCT-14"
            date_str = str(getattr(obj, attribute)).split(" ", 1)[0]

            # str(None) yields "None": nothing to convert
            if date_str == "None":
                continue

            # convert date string to ISO-8601
            # e.g.: "30-OCT-14" -> "2014-10-30"
            try:
                date_val = datetime.strptime(date_str, "%d-%b-%y").strftime(
                    "%Y-%m-%d"
                )
            except ValueError as ex:
                # report and keep the original value; the object may not have an id
                print(getattr(obj, "id", None), f"property {attribute}", "error:", ex)
            else:
                setattr(obj, attribute, date_val)

    return nmdc_objs
1066
-
1067
-
1068
def get_json(file_path: str, replace_single_quote=False):
    """
    Returns a json object from the file specified by file_path.

    The file is read as UTF-8 regardless of the platform's default encoding.

    Args:
        file_path (string): path to the file holding json
        replace_single_quote (bool, optional): specifies if "'" is replaced with '"'
            before parsing; defaults to False.
            NOTE: every single quote in the text is replaced, including
            apostrophes inside string values.

    Returns:
        json object
    """
    ## load json
    with open(file_path, "r", encoding="utf-8") as in_file:
        if replace_single_quote:
            return json.loads(in_file.read().replace("'", '"'))
        return json.load(in_file)
1087
-
1088
-
1089
def save_json(json_data: str, file_path):
    """
    Saves json_data to the file specified by file_path.

    Args:
        json_data: json data, either already parsed or as a json string
        file_path (string): path to where json is saved

    Returns:
        the json data that was written (parsed, if a string was passed in)
    """
    ## a json string is parsed first so that it is written out as
    ## structured json rather than as one escaped string literal
    payload = json.loads(json_data) if type(json_data) is str else json_data

    ## save json, indented by two spaces
    with open(file_path, "w") as out_file:
        json.dump(payload, out_file, indent=2)

    return payload
1109
-
1110
-
1111
- if __name__ == "__main__":
1112
- ## code for testing
1113
- file_path = "../output/nmdc_etl/test.json"
1114
- # test_json = collapse_json_file(file_path, 'part_of')
1115
- # test_json = collapse_json_file(file_path, 'has_input')
1116
- # test_json = collapse_json_file(file_path, "has_output")
1117
- # print(test_json)