nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (100) hide show
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +55 -4
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +33 -28
  76. nmdc_runtime/site/ops.py +97 -237
  77. nmdc_runtime/site/repair/database_updater.py +8 -0
  78. nmdc_runtime/site/repository.py +7 -117
  79. nmdc_runtime/site/resources.py +4 -4
  80. nmdc_runtime/site/translation/gold_translator.py +22 -21
  81. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  82. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  83. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  84. nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
  85. nmdc_runtime/site/translation/translator.py +63 -1
  86. nmdc_runtime/site/util.py +8 -3
  87. nmdc_runtime/site/validation/util.py +10 -5
  88. nmdc_runtime/util.py +9 -321
  89. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  90. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  91. nmdc_runtime/site/translation/emsl.py +0 -43
  92. nmdc_runtime/site/translation/gold.py +0 -53
  93. nmdc_runtime/site/translation/jgi.py +0 -32
  94. nmdc_runtime/site/translation/util.py +0 -132
  95. nmdc_runtime/site/validation/jgi.py +0 -43
  96. nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
  97. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  98. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  99. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  100. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,788 @@
1
+ import builtins
2
+ import inspect
3
+ import json
4
+ from collections import defaultdict, namedtuple
5
+ from functools import lru_cache
6
+ from io import StringIO
7
+ from pathlib import Path
8
+ from types import ModuleType
9
+ from typing import Optional, Dict, List, Tuple, Any, Union
10
+
11
+ from bson.json_util import dumps as bson_dumps
12
+ import pandas as pd
13
+ import pandas as pds
14
+ from fastapi import HTTPException
15
+ from jsonschema import Draft7Validator
16
+ from linkml_runtime.utils.schemaview import SchemaView
17
+ from nmdc_schema import nmdc
18
+ from nmdc_schema.nmdc_data import get_nmdc_schema_definition
19
+ from pymongo.database import Database as MongoDatabase
20
+ from starlette import status
21
+ from toolz.dicttoolz import dissoc, assoc_in, get_in
22
+
23
+ from nmdc_runtime.api.models.metadata import ChangesheetIn
24
+ from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_names
25
+
26
# Custom named tuple holding per-segment schema properties for a dotted path:
#   slots: linkml slot names along the path (may differ from the path names)
#   ranges: range (type name) of each slot
#   multivalues: "True"/"False" strings marking whether each slot is multivalued
SchemaPathProperties = namedtuple(
    "SchemaPathProperties", ["slots", "ranges", "multivalues"]
)

# Accepted changesheet inputs: a filesystem path or an in-memory text buffer.
FilePathOrBuffer = Union[Path, StringIO]
32
+
33
+
34
def load_changesheet(
    filename: FilePathOrBuffer, mongodb: MongoDatabase, sep="\t"
) -> pds.DataFrame:
    """
    Creates a dataframe from the input file that includes extra columns used for
    determining the path for updating a Mongo document and the data type of the
    updated data.

    Returns
    -------
    Pandas DataFrame

    Parameters
    ----------
    filename : FilePathOrBuffer
        Name of the file containing the change sheet.
    mongodb : MongoDatabase
        The Mongo database that the change sheet will update.
    sep : str
        Column separator in file.

    Raises
    ------
    ValueError
        If input file lacks an id column.
    ValueError
        If input file lacks an attribute column.
    ValueError
        If input file lacks an action column.
    ValueError
        If a document's class cannot be unambiguously inferred.
    Exception
        If a document id is not found in the Mongo database.
    """
    # load dataframe replacing NaN with ''
    df = pds.read_csv(filename, sep=sep, dtype="string").fillna("")

    # add a group id column, but copy only IRIs (values containing ":")
    try:
        df["group_id"] = df["id"].map(lambda x: x if ":" in x else "")
    except KeyError:
        raise ValueError("change sheet lacks 'id' column.")

    # fill in blank group ids: a blank row belongs to the id of the row above it
    for i in range(len(df)):
        if len(str(df.loc[i, "group_id"]).strip()) < 1:
            df.loc[i, "group_id"] = df.loc[i - 1, "group_id"]

    # fill in blank action columns (same inherit-from-previous-row rule)
    try:
        for i in range(len(df)):
            if len(str(df.loc[i, "action"]).strip()) < 1:
                df.loc[i, "action"] = df.loc[i - 1, "action"]
    except KeyError:
        raise ValueError("change sheet lacks 'action' column.")

    # build dict of the variables (non-IRI values) defined in the id column;
    # selecting the "attribute" column here also validates its presence
    try:
        var_dict = {
            id_val: None
            for id_val, attr in df[["id", "attribute"]].values
            if len(id_val) > 0 and ":" not in id_val
        }
    except KeyError:
        # note: the presence of the id column is checked above
        raise ValueError("change sheet lacks 'attribute' column.")

    # add group_var column to hold values from the id column
    # that are being used as variables/blank nodes
    df["group_var"] = ""
    for ix, id_val, attr, value in df[["id", "attribute", "value"]].itertuples():
        if id_val in var_dict.keys() and value in var_dict.keys():
            # a variable row that itself defines a nested variable
            var_dict[value] = f"{var_dict[id_val]}.{attr}"
            var_dict[f"{id_val}.{value}"] = f"{var_dict[id_val]}.{attr}"
            df.loc[ix, "group_var"] = f"{id_val}.{value}"
        elif value in var_dict.keys():
            var_dict[value] = attr
            df.loc[ix, "group_var"] = value
        elif id_val in var_dict.keys():
            df.loc[ix, "group_var"] = id_val

    # add path column used to hold the dotted path to the data that will be
    # changed, e.g. principal_investigator.name
    df["path"] = ""
    # split into id groups; this allows each id group to have its own local
    # variables (i.e., the same var name can be used with different ids)
    for _, df_id in df.groupby("group_id"):
        # split into var groups
        for _, df_var in df_id.groupby("group_var"):
            for ix, attr, value, group_var in df_var[
                ["attribute", "value", "group_var"]
            ].itertuples():
                if "" == group_var:
                    # no grouping variable -> simple (top-level) property
                    df.loc[ix, "path"] = attr
                elif value not in var_dict.keys():
                    # nested property; the value is not itself a var,
                    # so we are at the bottom level of the path
                    df.loc[ix, "path"] = f"{var_dict[group_var]}.{attr}"

    # create map between id and collection, then record each row's collection
    id_dict = map_id_to_collection(mongodb)
    df["collection_name"] = ""
    prev_id = ""
    for ix, group_id in df[["group_id"]].itertuples():
        if group_id != prev_id:  # only re-resolve when the id changes
            prev_id = group_id
            collection_name = get_collection_for_id(group_id, id_dict)

        if collection_name is None:
            raise Exception("Cannot find ID", group_id, "in any collection")

        df.loc[ix, "collection_name"] = collection_name

    # add linkml class name for each id
    df["linkml_class"] = ""
    class_name_dict = map_schema_class_names(nmdc)
    for ix, id_, collection_name in df[["group_id", "collection_name"]].itertuples():
        data = mongodb[collection_name].find_one({"id": id_})

        # find the type of class the data instantiates
        if "type" in data:
            # get part after the ":"
            class_name = data["type"].split(":")[-1]
            class_name = class_name_dict[class_name]
        else:
            class_names = collection_name_to_class_names[collection_name]
            if len(class_names) > 1:
                raise ValueError(
                    "cannot unambiguously infer class of document"
                    f" with `id` {id_} in collection {collection_name}."
                    " Please ensure explicit `type` is present in document."
                )
            class_name = class_name_dict[class_names[0]]

        # fix: set the class for THIS row only. The previous code assigned the
        # whole column (`df["linkml_class"] = class_name`) on every iteration,
        # clobbering earlier rows' classes with that of the last id processed.
        df.loc[ix, "linkml_class"] = class_name

    # info about properties of slots in the property path
    df["linkml_slots"] = ""
    df["ranges"] = ""
    df["multivalues"] = ""
    sd = get_nmdc_schema_definition()
    view = SchemaView(sd)
    for ix, attribute, path, class_name in df[
        ["attribute", "path", "linkml_class"]
    ].itertuples():
        # fetch the properties for the path; fall back to the bare attribute
        # when no dotted path was derived for the row
        if len(path) > 0:
            spp = fetch_schema_path_properties(view, path, class_name)
        else:
            spp = fetch_schema_path_properties(view, attribute, class_name)

        df.loc[ix, "linkml_slots"] = str.join("|", spp.slots)
        df.loc[ix, "ranges"] = str.join("|", spp.ranges)
        df.loc[ix, "multivalues"] = str.join("|", spp.multivalues)

    df = df.astype({"value": object})
    for ix, value, ranges in list(df[["value", "ranges"]].itertuples()):
        # Infer python builtin type for coercion via <https://w3id.org/linkml/base>.
        # If base is a member of builtins, e.g. `int` or `float`, coercion succeeds;
        # otherwise, keep the value as is (a `str`).
        # Note: Mongo BSON has a decimal type, but e.g.
        # <https://w3id.org/nmdc/DecimalDegree> has a specified `base` of `float`,
        # and it's best not to "re-interpret" what LinkML specifies. Can revisit by
        # e.g. overriding `base` when `uri` is a "known" type (`xsd:decimal` for
        # DecimalDegree).
        try:
            base_type = view.induced_type(ranges.rsplit("|", maxsplit=1)[-1]).base
            if base_type == "Decimal":
                # `Decimal` is not a builtin (use of bson.decimal128.Decimal128
                # would require changing JSON encoding/decoding); use `float` to
                # preserve existing (expected) behavior.
                df.at[ix, "value"] = float(value)
            else:
                df.at[ix, "value"] = getattr(builtins, base_type)(value)
        except Exception:
            # fix: narrowed from a bare `except:`. Any lookup/coercion failure
            # simply leaves the value as its original string.
            continue
    return df
222
+
223
+
224
def map_schema_class_names(nmdc_mod: ModuleType) -> Dict[str, str]:
    """Map class names in the nmdc.py module (NMDC Schema PyPI library) to the
    class names used in the linkml schema.

    Parameters
    ----------
    nmdc_mod : ModuleType
        The nmdc.py module in the NMDC Schema library.

    Returns
    -------
    Dict[str, str]
        Maps the class as named in the module to the class name in the linkml
        schema. E.g., BiosampleProcessing -> biosample processing
    """
    return {
        name: member.class_name
        for name, member in inspect.getmembers(nmdc_mod, inspect.isclass)
        if hasattr(member, "class_name")
    }
244
+
245
+
246
@lru_cache
def fetch_schema_path_properties(
    view: SchemaView, schema_path: str, class_name: str
) -> SchemaPathProperties:
    """Returns properties for each slot along a dotted path in the linkml schema.

    Parameters
    ----------
    view : SchemaView
        The SchemaView object holding the linkml schema.
    schema_path : str
        The dotted path in the Mongo database to the value,
        e.g. "principal_investigator.name".
    class_name : str
        The name of the class with the slot(s) at the root of the path.

    Returns
    -------
    SchemaPathProperties
        A namedtuple of form ("slots", "ranges", "multivalues") that holds the
        property information about each slot in the path:
        slots: the linkml slot names (may differ from the path names)
        ranges: the range for each slot in the slots list
        multivalues: "True"/"False" strings specifying if each slot is multivalued

    Raises
    ------
    AttributeError
        If a path segment is not found as a slot in the linkml schema.
    """
    # lists to hold properties for each segment in the path
    slots = []
    ranges = []
    multivalues = []
    for path in schema_path.split("."):
        schema_class = view.get_class(class_name)  # class at this path level
        spaced = path.replace("_", " ")  # schema slot names may use spaces

        # Resolution order: class-level slot_usage overrides (induced slot),
        # then class attributes (induced slot), then schema-wide slots.
        # fix: this is now a single elif-chain. Previously the slot_usage
        # branches were followed by an *independent* if-chain, so a slot found
        # via slot_usage was re-resolved through view.get_slot() (discarding
        # the induced overrides) or could wrongly hit the final raise.
        if path in schema_class.slot_usage.keys():
            schema_slot = view.induced_slot(path, class_name)
        elif spaced in schema_class.slot_usage.keys():
            schema_slot = view.induced_slot(spaced, class_name)
        elif path in schema_class.attributes.keys():
            schema_slot = view.induced_slot(path, class_name)
        elif spaced in schema_class.attributes.keys():
            schema_slot = view.induced_slot(spaced, class_name)
        elif path in view.all_slots().keys():
            schema_slot = view.get_slot(path)
        elif spaced in view.all_slots().keys():
            schema_slot = view.get_slot(spaced)
        else:
            # slot not found anywhere in the schema
            raise AttributeError(f"slot '{path}' not found for '{schema_class.name}'")

        # record properties as strings (strings are needed for the dataframe)
        slots.append(str(schema_slot.name))

        if schema_slot.range is None:
            ranges.append("string")
        else:
            ranges.append(str(schema_slot.range))

        if schema_slot.multivalued is None:
            multivalues.append("False")
        else:
            multivalues.append(str(schema_slot.multivalued))

        # descend: the next path segment is resolved against this slot's range
        class_name = schema_slot.range

    return SchemaPathProperties(slots, ranges, multivalues)
323
+
324
+
325
def make_vargroup_updates(df: pds.DataFrame) -> List:
    """Returns a list of update commands to execute on the Mongo database
    when updates are grouped with a grouping variable.

    The rows are folded into a single nested object (via `assoc_in`) keyed by
    the row paths, each value's arity is corrected from the recorded
    multivalued flags, and one update command is emitted for the whole object.

    Parameters
    ----------
    df : pds.DataFrame
        The dataframe that contains the values associated with the grouping variable.

    Returns
    -------
    List
        A list of Mongo update commands for that grouping variable.
    """
    # all rows in this frame share one document id
    id_ = df["group_id"].values[0]
    path_multivalued_dict = {}  # dotted-path prefix -> "True"/"False" flag
    update_key = ""  # top-level document property targeted by the update
    path_lists = []  # split path segments for every value row
    obj_dict = {}  # nested object accumulated from all rows
    for (
        action,
        attribute,
        value,
        path,
        multivalues,
    ) in df[
        [
            "action",
            "attribute",
            "value",
            "path",
            "multivalues",
        ]
    ].itertuples(index=False):
        # a row without a path names the grouping attribute itself
        if len(path) < 1:
            update_key = attribute
        else:
            # gather path lists
            path_list = path.split(".")
            path_lists.append(path_list)

            # determine if value is a list
            multivalues_list = multivalues.split("|")
            value = make_mongo_update_value(action, value, multivalues_list)

            # build dictionary that merges all keys and
            # values into a single object, e.g:
            # {'has_credit_associations': {
            #      'applied_role': 'Conceptualization',
            #      'applies_to_person': {
            #          'name': 'CREDIT NAME 1',
            #          'email': 'CREDIT_NAME_1@foo.edu',
            #          'orcid': 'orcid:0000-0000-0000-0001'}}}
            obj_dict = assoc_in(obj_dict, path_list, value)

            # for each prefix of the path list, record whether that
            # prefix's value is multivalued (reuses `value` as a temp)
            for i in range(len(path_list)):
                key, value = ".".join(path_list[0 : i + 1]), multivalues_list[i]
                path_multivalued_dict[key] = value

    # sort path lists by length, longest first
    # NOTE(review): assumes at least one row had a non-empty path; an
    # all-header frame would raise IndexError here — confirm with callers
    path_lists = list(reversed(sorted(path_lists, key=len)))
    longest = len(path_lists[0])

    # modify the values to have correct arity: walk from the deepest
    # path level outward and wrap non-list values where the schema
    # says the prefix is multivalued
    for i in range(longest, 0, -1):
        for path_list in path_lists:
            # determine if this path prefix is multivalued
            # note the use of the 0 to i portion of path list
            path_portion = path_list[0:i]
            is_multivalued = path_multivalued_dict[".".join(path_portion)]

            # modify object so that the key has correct multivalue
            temp = get_in(path_portion, obj_dict)
            if "True" == is_multivalued and (not isinstance(temp, list)):
                obj_dict = assoc_in(obj_dict, path_portion, [temp])

    # NOTE(review): `action` is the value left over from the LAST row of the
    # loop above — assumes all rows of a var group share one action; confirm
    update_dict = make_mongo_update_command_dict(
        action, id_, update_key, obj_dict[update_key]
    )

    return [update_dict]
410
+
411
+
412
def make_updates(var_group: Tuple) -> List:
    """
    Creates a list of update commands to execute on the Mongo database.

    Parameters
    ----------
    var_group : Tuple
        Group of change sheet records based on the id column (generated by
        pandas.groupby()):
        var_group[0] -> the value (if any) in the group_var column
        var_group[1] -> the dataframe with group_var variables

    Returns
    -------
    List
        A list of Mongo update commands.
    """
    frame = var_group[1]  # dataframe with group_var variables
    doc_id = frame["group_id"].values[0]  # document id shared by the group

    update_commands = []  # collected update commands
    rows = frame[["action", "value", "path", "multivalues"]].itertuples(index=False)
    for action, value, path, multivalues in rows:
        # only rows with a path carry a value to be updated
        if len(path) == 0:
            continue

        action = action.strip()  # remove extra white space

        # wrap the value in a list when the slot is multivalued
        update_value = make_mongo_update_value(action, value, multivalues.split("|"))

        update_commands.append(
            make_mongo_update_command_dict(action, doc_id, path, update_value)
        )

    return update_commands
462
+
463
+
464
def make_mongo_update_value(action: str, value: Any, multivalues_list: List) -> Any:
    """Determine whether the value for a Mongo update operation needs to be a list.

    Parameters
    ----------
    action : str
        The type of update that will be performed (e.g., insert items, replace).
        Currently unused; kept for interface stability.
    value : Any
        The value used for the update operation.
    multivalues_list : List
        List of 'True'/'False' strings indicating if the value is to be
        multivalued (i.e., an array); the last entry applies to the leaf slot.

    Returns
    -------
    Any
        The value, wrapped in a list when the leaf slot is multivalued or the
        raw string contains a pipe; otherwise the stripped scalar.
    """
    wants_list = multivalues_list[-1] == "True"
    has_pipe_separator = isinstance(value, str) and "|" in value

    if wants_list or has_pipe_separator:
        # array field: split on pipe, trimming whitespace and dropping empties
        return [part.strip() for part in value.split("|") if part.strip()]

    if isinstance(value, str):
        return value.strip()  # remove extra white space

    return value
489
+
490
+
491
def make_mongo_update_command_dict(
    action: str, doc_id: str, update_key: str, update_value: Any
) -> Dict:
    """Build the dict needed to execute a Mongo update operation.

    Parameters
    ----------
    action : str
        The kind of update being performed (e.g., insert item, replace).
    doc_id : str
        The id of the Mongo document being updated.
    update_key : str
        The property of the document whose values are being updated.
    update_value : Any
        The new value used for updating.

    Returns
    -------
    Dict
        The Mongo command that, when executed, will update the document.

    Raises
    ------
    ValueError
        If the action is not one of the recognized update kinds.
    """
    query = {"id": f"{doc_id}"}

    if action in ("insert", "insert items", "insert item"):
        operation = {"$addToSet": {update_key: {"$each": update_value}}}
    elif action in ("remove items", "remove item"):
        operation = {"$pull": {update_key: {"$in": update_value}}}
    elif action in ("update", "set", "replace", "replace items"):
        operation = {"$set": {update_key: update_value}}
    elif action in ("remove", "delete"):
        # remove the property from the object; the value passed to $unset is
        # irrelevant to Mongo — it is retained only for debugging visibility
        operation = {"$unset": {update_key: update_value}}
    else:
        raise ValueError(f"cannot execute action '{action}'")

    return {"q": query, "u": operation}
539
+
540
+
541
def map_id_to_collection(mongodb: MongoDatabase) -> Dict:
    """Map each ``*_set`` collection name to the set of document ids it contains.

    Only collections that have an ``id_1`` index are included.

    Parameters
    ----------
    mongodb : MongoDatabase
        The Mongo database on which to build the dict.

    Returns
    -------
    Dict
        key: collection name
        value: set of document ids in that collection
    """
    id_dict = {}
    for name in mongodb.list_collection_names():
        # only the schema's "*_set" collections hold identified documents
        if not name.endswith("_set"):
            continue
        collection = mongodb[name]
        # skip collections without an index on "id" (distinct would be slow
        # and such collections are not id-addressable)
        if "id_1" in collection.index_information():
            id_dict[name] = set(collection.distinct("id"))
    return id_dict
565
+
566
+
567
def get_collection_for_id(
    id_: str, id_map: Dict, replace_underscore: bool = False
) -> Optional[str]:
    """
    Return the name of the collection that contains the document with this id.

    Parameters
    ----------
    id_ : str
        The identifier of the document.
    id_map : Dict
        A dict mapping collection names to document ids.
        key: collection name
        value: set of document ids
    replace_underscore : bool
        If True, underscores in the returned collection name are replaced
        with spaces.

    Returns
    -------
    Optional[str]
        Collection name containing the document, or None if the id was
        not found in any collection.
    """
    for name, ids in id_map.items():
        if id_ not in ids:
            continue
        # `is True` mirrors the original strict check (truthy non-True
        # values do not trigger the replacement)
        return name.replace("_", " ") if replace_underscore is True else name
    return None
597
+
598
+
599
def mongo_update_command_for(df_change: pds.DataFrame) -> Dict[str, list]:
    """
    Creates a dictionary of update commands to be executed against the Mongo
    database.

    Parameters
    ----------
    df_change : pds.DataFrame
        A dataframe containing change sheet information.

    Returns
    -------
    Dict
        A dict of the update commands to be executed:
        key: document id (group_id)
        value: {"update": collection name, "updates": list of update commands}
    """
    commands = {}

    # one outer group per document id (e.g., gold:Gs0103573)
    for group_id, df_id in df_change.groupby("group_id"):
        per_id_updates = []

        # one inner group per grouping variable (e.g., v1, v2); the empty
        # group_var collects the simple (non-variable) rows
        for group_var, df_var in df_id.groupby("group_var"):
            if group_var.strip():
                per_id_updates.extend(make_vargroup_updates(df_var))
            else:
                per_id_updates.extend(make_updates((group_var, df_var)))

        commands[group_id] = {
            "update": df_id["collection_name"].values[0],
            "updates": per_id_updates,
        }

    return commands
641
+
642
+
643
def copy_docs_in_update_cmd(
    update_cmd, mdb_from: MongoDatabase, mdb_to: MongoDatabase, drop_mdb_to: bool = True
) -> Dict[str, str]:
    """
    Copy the documents targeted by ``update_cmd`` between Mongo databases.

    Useful to apply and inspect updates on a test database.

    Parameters
    ----------
    update_cmd : Dict
        Map of document id to its update command document.
    mdb_from : MongoDatabase
        Database from which data is being copied (i.e., source).
    mdb_to : MongoDatabase
        Database which data is being copied into (i.e., destination).
    drop_mdb_to : bool
        If True, drop the destination database before copying.

    Returns
    -------
    Dict
        Dict with collection name as the key, and a message of number of docs
        inserted as value.
    """
    # gather the target ids per collection
    ids_by_collection = defaultdict(list)
    for id_, cmd_doc in update_cmd.items():
        ids_by_collection[cmd_doc["update"]].append(id_)

    if drop_mdb_to:
        mdb_to.client.drop_database(mdb_to.name)

    results = {}
    for collection_name, ids in ids_by_collection.items():
        # strip Mongo's internal _id so the copies get fresh ones on insert
        docs = [
            dissoc(doc, "_id")
            for doc in mdb_from[collection_name].find({"id": {"$in": ids}})
        ]
        inserted = mdb_to[collection_name].insert_many(docs).inserted_ids
        results[collection_name] = f"{len(inserted)} docs inserted"
    return results
679
+
680
+
681
def update_mongo_db(mdb: MongoDatabase, update_cmd: Dict):
    """
    Updates the Mongo database using the commands in the update_cmd dict.

    Parameters
    ----------
    mdb : MongoDatabase
        Mongo database to be updated.
    update_cmd : Dict
        Contains update commands to be executed, keyed by document id.

    Returns
    -------
    list
        Per-document dicts with the doc before/after, the raw update result,
        and any schema-validation error messages.
    """
    strict_validator = Draft7Validator(get_nmdc_jsonschema_dict())
    lenient_validator = Draft7Validator(
        get_nmdc_jsonschema_dict(enforce_id_patterns=False)
    )

    # legacy ids (gold/emsl/igsn) in these collections predate the nmdc id
    # pattern, so id-pattern enforcement is skipped for them
    legacy_collections = {"study_set", "biosample_set", "omics_processing_set"}
    legacy_prefixes = {"gold", "emsl", "igsn"}

    results = []
    for id_, cmd_doc in update_cmd.items():
        collection_name = cmd_doc["update"]

        doc_before = dissoc(mdb[collection_name].find_one({"id": id_}), "_id")
        # run the raw update command; round-trip through bson_dumps so the
        # result is plain JSON-serializable data
        update_info = json.loads(bson_dumps(mdb.command(cmd_doc)))
        doc_after = dissoc(mdb[collection_name].find_one({"id": id_}), "_id")

        if collection_name in legacy_collections and id_.split(":")[0] in legacy_prefixes:
            validator = lenient_validator
        else:
            validator = strict_validator

        errors = list(validator.iter_errors({collection_name: [doc_after]}))
        results.append(
            {
                "id": id_,
                "doc_before": doc_before,
                "update_info": update_info,
                "doc_after": doc_after,
                "validation_errors": [e.message for e in errors],
            }
        )

    return results
728
+
729
+
730
def _validate_changesheet(df_change: pd.DataFrame, mdb: MongoDatabase):
    """Dry-run a changesheet against a scratch database and report the outcome.

    Copies the affected documents into a scratch database, applies the update
    commands there, and collects schema-validation and write errors.

    Raises
    ------
    HTTPException
        422 if any validation or write errors were produced.
    """
    update_cmd = mongo_update_command_for(df_change)

    # apply the updates to a scratch database, never the live one
    mdb_to_inspect = mdb.client["nmdc_changesheet_submission_results"]
    results_of_copy = copy_docs_in_update_cmd(
        update_cmd, mdb_from=mdb, mdb_to=mdb_to_inspect
    )
    results_of_updates = update_mongo_db(mdb_to_inspect, update_cmd)

    rv = {
        "update_cmd": update_cmd,
        "inspection_info": {
            "mdb_name": mdb_to_inspect.name,
            "results_of_copy": results_of_copy,
        },
        "results_of_updates": results_of_updates,
    }

    # collect schema-validation errors and Mongo write errors
    validation_errors = []
    for result in results_of_updates:
        schema_errors = result.get("validation_errors", [])
        if len(schema_errors) > 0:
            validation_errors.append(schema_errors)
        write_errors = result.get("update_info", {}).get("writeErrors", {})
        if len(write_errors) > 0:
            validation_errors.append(write_errors)

    if not validation_errors:
        return rv

    raise HTTPException(
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        detail={
            "update_cmd": rv["update_cmd"],
            "validation_errors": validation_errors,
        },
    )
766
+
767
+
768
def df_from_sheet_in(sheet_in: ChangesheetIn, mdb: MongoDatabase) -> pd.DataFrame:
    """Parse an uploaded changesheet into a validated DataFrame.

    Parameters
    ----------
    sheet_in : ChangesheetIn
        The uploaded changesheet (name, content type, and text payload).
    mdb : MongoDatabase
        The Mongo database the changesheet targets.

    Raises
    ------
    HTTPException
        400 for an unsupported content type, or if the sheet fails to load.
    """
    content_types = {
        "text/csv": ",",
        "text/tab-separated-values": "\t",
    }
    content_type = sheet_in.content_type
    filename = sheet_in.name
    if content_type not in content_types:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=(
                # fix: report the uploaded file's name. `filename` was assigned
                # but unused, while the message hardcoded "(unknown)".
                f"file {filename} has content type '{content_type}'. "
                f"Only {list(content_types)} files are permitted."
            ),
        )
    sep = content_types[content_type]
    try:
        df = load_changesheet(StringIO(sheet_in.text), mdb, sep=sep)
    except Exception as e:
        # surface the underlying parse/validation error to the client,
        # chaining the cause for server-side tracebacks
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)
        ) from e
    return df