nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,777 @@
1
+ import builtins
2
+ import inspect
3
+ import json
4
+ from collections import defaultdict, namedtuple
5
+ from functools import lru_cache
6
+ from io import StringIO
7
+ from pathlib import Path
8
+ from types import ModuleType
9
+ from typing import Optional, Dict, List, Tuple, Any, Union
10
+
11
+ from bson.json_util import dumps as bson_dumps
12
+ import pandas as pd
13
+ import pandas as pds
14
+ from fastapi import HTTPException
15
+ from linkml_runtime.utils.schemaview import SchemaView
16
+ from nmdc_schema import nmdc
17
+ from nmdc_schema.nmdc_data import get_nmdc_schema_definition
18
+ from pymongo.database import Database as MongoDatabase
19
+ from starlette import status
20
+ from toolz.dicttoolz import dissoc, assoc_in, get_in
21
+
22
+ from nmdc_runtime.api.models.metadata import ChangesheetIn
23
+ from nmdc_runtime.util import collection_name_to_class_names, get_nmdc_schema_validator
24
+
25
# Custom named tuple holding, for each step along a dotted schema path,
# three parallel lists: the LinkML slot names, each slot's range, and a
# "True"/"False" string saying whether each slot is multivalued.
SchemaPathProperties = namedtuple(
    "SchemaPathProperties", ["slots", "ranges", "multivalues"]
)

# Type alias: a changesheet source may be a filesystem path or an in-memory buffer.
FilePathOrBuffer = Union[Path, StringIO]
31
+
32
+
33
def load_changesheet(
    filename: FilePathOrBuffer, mongodb: MongoDatabase, sep: str = "\t"
) -> pds.DataFrame:
    """
    Creates a dataframe from the input file that includes extra columns used for
    determining the path for updating a Mongo document and the data type of the
    updated data.

    Returns
    -------
    Pandas DataFrame
        The changesheet rows augmented with columns: group_id, group_var, path,
        collection_name, linkml_class, linkml_slots, ranges, multivalues.

    Parameters
    ----------
    filename : FilePathOrBuffer
        Name of the file (or in-memory buffer) containing the change sheet.
    mongodb : MongoDatabase
        The Mongo database that the change sheet will update.
    sep : str
        Column separator in file.

    Raises
    ------
    ValueError
        If input file lacks an id column.
    ValueError
        If input file lacks an attribute column.
    ValueError
        If input file lacks an action column.
    Exception
        If a document id is not found in the Mongo database.
    ValueError
        If a class name cannot be unambiguously determined.
    """
    # load dataframe replacing NaN with '' so blank cells compare as empty strings
    df = pds.read_csv(filename, sep=sep, dtype="string").fillna("")

    # add a group id column, but copy only IRIs (values containing ":");
    # non-IRI values in the id column are treated as local variable names
    try:
        df["group_id"] = df["id"].map(lambda x: x if ":" in x else "")
    except KeyError:
        raise ValueError("change sheet lacks 'id' column.")

    # forward-fill blank group ids from the previous row
    for i in range(len(df)):
        if len(str(df.loc[i, "group_id"]).strip()) < 1:
            df.loc[i, "group_id"] = df.loc[i - 1, "group_id"]

    # forward-fill blank action cells from the previous row
    try:
        for i in range(len(df)):
            if len(str(df.loc[i, "action"]).strip()) < 1:
                df.loc[i, "action"] = df.loc[i - 1, "action"]
    except KeyError:
        raise ValueError("change sheet lacks 'action' column.")

    # build dict to hold variables that have been defined
    # in the id column of the change sheet
    try:
        # collect vars in the id column (non-empty, non-IRI values)
        var_dict = {
            id_val: None
            for id_val, attr in df[["id", "attribute"]].values
            if len(id_val) > 0 and ":" not in id_val
        }
    except KeyError:
        # note: the presence of the id column is checked above
        raise ValueError("change sheet lacks 'attribute' column.")

    # add group_var column to hold values from the id column
    # that are being used as variable/blank nodes
    df["group_var"] = ""
    for ix, id_val, attr, value in df[["id", "attribute", "value"]].itertuples():
        # a var defined relative to another var: record the nested path
        if id_val in var_dict.keys() and value in var_dict.keys():
            var_dict[value] = f"{var_dict[id_val]}.{attr}"
            var_dict[f"{id_val}.{value}"] = f"{var_dict[id_val]}.{attr}"
            df.loc[ix, "group_var"] = f"{id_val}.{value}"
        # a var introduced at the top level of the document
        elif value in var_dict.keys():
            var_dict[value] = attr
            df.loc[ix, "group_var"] = value
        # an ordinary attribute row grouped under an existing var
        elif id_val in var_dict.keys():
            df.loc[ix, "group_var"] = id_val

    # add path column used to hold the path in the data to the data that will
    # be changed, e.g. principal_investigator.name
    df["path"] = ""
    # split into id groups; this allows each id group to have its own local
    # variables, i.e., the same var name can be used with different ids
    group_ids = df.groupby("group_id")
    for group_id in group_ids:
        df_id = group_id[1]  # dataframe for value group_id

        # split into var groups
        var_groups = df_id.groupby("group_var")
        for var_group in var_groups:
            df_var = var_group[1]  # dataframe for value group_var

            for ix, attr, value, group_var in df_var[
                ["attribute", "value", "group_var"]
            ].itertuples():
                # if group_var is empty, it is a simple property
                if "" == group_var:
                    df.loc[ix, "path"] = attr

                # otherwise, it is a nested property;
                # if the value is not a var, then we are at the bottom level
                elif value not in var_dict.keys():
                    df.loc[ix, "path"] = f"{var_dict[group_var]}.{attr}"

    # create map between id and collection
    id_dict = map_id_to_collection(mongodb)
    # add collection for each id
    df["collection_name"] = ""
    prev_id = ""
    for ix, group_id in df[["group_id"]].itertuples():
        # only re-resolve the collection when the id changes
        if group_id != prev_id:
            prev_id = group_id  # update prev id
            collection_name = get_collection_for_id(group_id, id_dict)

            if collection_name is None:
                raise Exception("Cannot find ID", group_id, "in any collection")

        df.loc[ix, "collection_name"] = collection_name

    # add linkml class name for each id
    df["linkml_class"] = ""
    class_name_dict = map_schema_class_names(nmdc)
    for ix, id_, collection_name in df[["group_id", "collection_name"]].itertuples():
        data = mongodb[collection_name].find_one({"id": id_})

        # find the type of class the data instantiates
        if "type" in list(data.keys()):
            # get part after the ":"
            class_name = data["type"].split(":")[-1]
            class_name = class_name_dict[class_name]
        else:
            # no explicit `type`: infer from the collection, which must map
            # to exactly one schema class
            class_names = collection_name_to_class_names[collection_name]
            if len(class_names) > 1:
                raise ValueError(
                    "cannot unambiguously infer class of document"
                    f" with `id` {id_} in collection {collection_name}."
                    " Please ensure explicit `type` is present in document."
                )
            class_name = class_name_dict[class_names[0]]

        # set class name for id
        # NOTE(review): this assigns the WHOLE column, not just row `ix`, so
        # every row ends up with the class of the last id processed — looks
        # like it should be `df.loc[ix, "linkml_class"] = class_name`; confirm.
        df["linkml_class"] = class_name

    # info about properties of slots in the property path
    df["linkml_slots"] = ""
    df["ranges"] = ""
    df["multivalues"] = ""
    sd = get_nmdc_schema_definition()
    view = SchemaView(sd)
    for ix, attribute, path, class_name in df[
        ["attribute", "path", "linkml_class"]
    ].itertuples():
        # fetch the properties for the path (fall back to the bare attribute
        # when no nested path was built for this row)
        if len(path) > 0:
            spp = fetch_schema_path_properties(view, path, class_name)
        else:
            spp = fetch_schema_path_properties(view, attribute, class_name)

        # join the parallel lists into "|"-delimited strings for df storage
        df.loc[ix, "linkml_slots"] = str.join("|", spp.slots)
        df.loc[ix, "ranges"] = str.join("|", spp.ranges)
        df.loc[ix, "multivalues"] = str.join("|", spp.multivalues)
    # allow non-string values in the `value` column before type coercion below
    df = df.astype({"value": object})
    for ix, value, ranges in list(df[["value", "ranges"]].itertuples()):
        # Infer python builtin type for coercion via <https://w3id.org/linkml/base>.
        # If base is member of builtins module, e.g. `int` or `float`, coercion will succeed.
        # Otherwise, keep value as is (as a `str`).
        # Note: Mongo BSON has a decimal type,
        # but e.g. <https://w3id.org/nmdc/DecimalDegree> has a specified `base` of `float`
        # and I think it's best to not "re-interpret" what LinkML specifies. Can revisit this decision
        # by e.g. overriding `base` when `uri` is a "known" type (`xsd:decimal` in the case of DecimalDegree).
        try:
            base_type = view.induced_type(ranges.rsplit("|", maxsplit=1)[-1]).base
            if base_type == "Decimal":
                # Note: Use of bson.decimal128.Decimal128 here would require changing JSON
                # encoding/decoding. Choosing to use `float` to preserve existing behavior.
                # The getattr below then raises AttributeError (builtins has no
                # `Decimal`), which is caught, leaving this float in place.
                df.at[ix, "value"] = float(value)
            df.at[ix, "value"] = getattr(builtins, base_type)(value)
        except:
            # NOTE(review): bare except deliberately swallows any failure
            # (unknown range, non-builtin base, bad cast), leaving value as-is.
            continue
    return df
221
+
222
+
223
def map_schema_class_names(nmdc_mod: ModuleType) -> Dict[str, str]:
    """Map Python class names in the nmdc module to their LinkML schema class names.

    Parameters
    ----------
    nmdc_mod : ModuleType
        The nmdc.py module in the NMDC Schema library.

    Returns
    -------
    Dict[str, str]
        Keys are class names as defined in the module; values are the
        corresponding `class_name` attribute from the LinkML schema,
        e.g., BiosampleProcessing -> biosample processing.
    """
    # only classes carrying a `class_name` attribute participate in the map
    return {
        name: member.class_name
        for name, member in inspect.getmembers(nmdc_mod, inspect.isclass)
        if hasattr(member, "class_name")
    }
243
+
244
+
245
@lru_cache
def fetch_schema_path_properties(
    view: SchemaView, schema_path: str, class_name: str
) -> SchemaPathProperties:
    """Returns properties for each slot along a dotted path in the LinkML schema.

    Parameters
    ----------
    view : SchemaView
        The SchemaView object holding the linkml schema.
    schema_path : str
        The dotted path in the Mongo document to the value
        (e.g., "principal_investigator.name").
    class_name : str
        The name of the class owning the first slot in the path.

    Returns
    -------
    SchemaPathProperties
        A namedtuple ("slots", "ranges", "multivalues") of parallel lists:
        slots: the linkml slot name for each path segment (may differ from
            the path segment itself),
        ranges: the range of each slot ("string" when unspecified),
        multivalues: "True"/"False" strings saying whether each slot is
            multivalued ("False" when unspecified).

    Raises
    ------
    AttributeError
        If a path segment cannot be resolved to a slot in the linkml schema.
    """
    slots = []
    ranges = []
    multivalues = []
    for path in schema_path.split("."):
        schema_class = view.get_class(class_name)  # get class from schema

        # LinkML slot names may use spaces where Mongo paths use underscores.
        spaced = path.replace("_", " ")

        # Resolve the slot for this segment, in priority order:
        #   1. class slot_usage (induced slot — properties may be overridden)
        #   2. class attributes (induced slot)
        #   3. schema-wide slot definitions (plain slot)
        # BUGFIX: this was previously two separate if-chains, so a slot found
        # via slot_usage was overwritten by the later view.get_slot() lookup
        # (discarding the induced overrides) — or, if absent from the second
        # chain, hit the `raise` even though a slot had been found. A single
        # elif chain preserves the intended priority.
        if path in schema_class.slot_usage.keys():
            schema_slot = view.induced_slot(path, class_name)
        elif spaced in schema_class.slot_usage.keys():
            schema_slot = view.induced_slot(spaced, class_name)
        elif path in schema_class.attributes.keys():
            schema_slot = view.induced_slot(path, class_name)
        elif spaced in schema_class.attributes.keys():
            schema_slot = view.induced_slot(spaced, class_name)
        elif path in view.all_slots().keys():
            schema_slot = view.get_slot(path)
        elif spaced in view.all_slots().keys():
            schema_slot = view.get_slot(spaced)
        else:
            raise AttributeError(f"slot '{path}' not found for '{schema_class.name}'")

        # properties to lists as strings (strings are needed for dataframe)
        slots.append(str(schema_slot.name))

        if schema_slot.range is None:
            ranges.append("string")
        else:
            ranges.append(str(schema_slot.range))

        if schema_slot.multivalued is None:
            multivalues.append("False")
        else:
            multivalues.append(str(schema_slot.multivalued))

        # descend: the slot's range is the class for the next path segment
        class_name = schema_slot.range

    return SchemaPathProperties(slots, ranges, multivalues)
322
+
323
+
324
def make_vargroup_updates(df: pds.DataFrame) -> List:
    """Returns a list of update commands to execute on the Mongo database
    when updates are grouped with a grouping variable.

    Parameters
    ----------
    df : pds.DataFrame
        The dataframe that contains the values associated with the grouping
        variable. Assumed non-empty, with all rows sharing one group_id.

    Returns
    -------
    List
        A single-element list holding the Mongo update command for that
        grouping variable.
    """
    id_ = df["group_id"].values[0]  # all rows in the group share this doc id
    path_multivalued_dict = {}  # dotted path prefix -> "True"/"False"
    update_key = ""  # top-level property being updated (row with empty path)
    path_lists = []  # each dotted path seen, split into segments
    obj_dict = {}  # nested object merged from all paths/values
    for (
        action,
        attribute,
        value,
        path,
        multivalues,
    ) in df[
        [
            "action",
            "attribute",
            "value",
            "path",
            "multivalues",
        ]
    ].itertuples(index=False):
        if len(path) < 1:
            # a row without a path names the top-level key for the group
            update_key = attribute
        else:
            # gather path lists
            path_list = path.split(".")
            path_lists.append(path_list)

            # determine if value is a list
            multivalues_list = multivalues.split("|")
            value = make_mongo_update_value(action, value, multivalues_list)

            # build dictionary that merges all keys and
            # values into a single object, e.g:
            # {'has_credit_associations': {
            #     'applied_role': 'Conceptualization',
            #     'applies_to_person': {
            #         'name': 'CREDIT NAME 1',
            #         'email': 'CREDIT_NAME_1@foo.edu',
            #         'orcid': 'orcid:0000-0000-0000-0001'}}}
            obj_dict = assoc_in(obj_dict, path_list, value)

            # for each prefix of the path, record whether that level is
            # multivalued (note: rebinds `value` — safe here because `value`
            # is not read again within this iteration)
            for i in range(len(path_list)):
                key, value = ".".join(path_list[0 : i + 1]), multivalues_list[i]
                path_multivalued_dict[key] = value

    # sort path lists by length, longest first
    path_lists = list(reversed(sorted(path_lists, key=len)))
    longest = len(path_lists[0])

    # modify the values to have correct arity:
    # start at the end of each path list and determine
    # if that path's value is multivalued
    for i in range(longest, 0, -1):
        for path_list in path_lists:
            # determine if path is multivalued
            # note the use of the 0 to i portion of path list
            path_portion = path_list[0:i]
            is_multivalued = path_multivalued_dict[".".join(path_portion)]

            # wrap the value at this level in a list when the schema says the
            # slot is multivalued and it is not already a list
            temp = get_in(path_portion, obj_dict)
            if "True" == is_multivalued and (not isinstance(temp, list)):
                obj_dict = assoc_in(obj_dict, path_portion, [temp])

    # NOTE(review): `action` here is the leaked loop variable from the last
    # row of the group — this assumes every row in a var group shares one
    # action (and that the group is non-empty); confirm with callers.
    update_dict = make_mongo_update_command_dict(
        action, id_, update_key, obj_dict[update_key]
    )

    return [update_dict]
409
+
410
+
411
def make_updates(var_group: Tuple) -> List:
    """
    Creates a list of update commands to execute on the Mongo database.

    Parameters
    ----------
    var_group : Tuple
        Group of change sheet records based on the id column (generated by
        pandas.groupby()):
        var_group[0] -> the value (if any) in the group_var column
        var_group[1] -> the dataframe with group_var variables

    Returns
    -------
    List
        A list of Mongo update commands.
    """
    _, frame = var_group  # only the dataframe is needed here
    doc_id = frame["group_id"].values[0]  # document id shared by the group

    commands = []
    rows = frame[["action", "value", "path", "multivalues"]].itertuples(index=False)
    for raw_action, raw_value, path, multivalues in rows:
        # a row without a path carries no value to update
        if len(path) == 0:
            continue

        action = raw_action.strip()  # remove extra white space

        # coerce the value to a list when the target field is multivalued
        value = make_mongo_update_value(action, raw_value, multivalues.split("|"))

        commands.append(make_mongo_update_command_dict(action, doc_id, path, value))

    return commands
461
+
462
+
463
def make_mongo_update_value(action: str, value: Any, multivalues_list: List) -> Any:
    """Based on the params, determines whether the value for a Mongo update
    operation needs to be a list, and normalizes it accordingly.

    Parameters
    ----------
    action : str
        The type of update that will be performed (e.g., insert items, replace).
        Currently unused here; kept for interface stability.
    value : Any
        The value used for the update operation. Usually a string, but may
        already have been type-coerced (e.g., to int/float) upstream.
    multivalues_list : List
        List of 'True'/'False' strings indicating whether each level of the
        path is multivalued; only the last (leaf) level matters here.

    Returns
    -------
    Any
        The value, wrapped in a list when the target field is an array,
        otherwise stripped of surrounding whitespace (strings only).
    """
    is_multivalued = multivalues_list[-1] == "True"
    # an array field is being updated, or the string encodes multiple values
    if is_multivalued or (isinstance(value, str) and "|" in value):
        if isinstance(value, str):
            # split on pipe, trimming whitespace and dropping empty entries
            value = [v.strip() for v in value.split("|") if len(v.strip()) > 0]
        else:
            # BUGFIX: a non-string scalar (e.g., a value already coerced to
            # int/float upstream) destined for an array field used to crash
            # on .split(); wrap it in a single-element list instead.
            value = [value]
    elif isinstance(value, str):
        value = value.strip()  # remove extra white space

    return value
488
+
489
+
490
def make_mongo_update_command_dict(
    action: str, doc_id: str, update_key: str, update_value: Any
) -> Dict:
    """Build the single-update entry for a Mongo `update` database command.

    Parameters
    ----------
    action : str
        The kind of update being performed (e.g., insert item, replace).
    doc_id : str
        The id of the Mongo document being updated.
    update_key : str
        The property of the document whose values are being updated.
    update_value : Any
        The new value used for updating.

    Returns
    -------
    Dict
        A dict of the form {"q": <query>, "u": <update operation>} that,
        when executed, will update the document.

    Raises
    ------
    ValueError
        If the action is not one of the recognized action names.
    """
    query = {"id": f"{doc_id}"}

    if action in ("insert", "insert items", "insert item"):
        # add each element, skipping any already present
        operation = {"$addToSet": {update_key: {"$each": update_value}}}
    elif action in ("remove items", "remove item"):
        # pull every listed element out of the array
        operation = {"$pull": {update_key: {"$in": update_value}}}
    elif action in ("update", "set", "replace", "replace items"):
        operation = {"$set": {update_key: update_value}}
    elif action in ("remove", "delete"):
        # remove the property from the object entirely; $unset ignores the
        # supplied value — it is kept only as a debugging aid
        operation = {"$unset": {update_key: update_value}}
    else:
        raise ValueError(f"cannot execute action '{action}'")

    return {"q": query, "u": operation}
538
+
539
+
540
def map_id_to_collection(mongodb: MongoDatabase) -> Dict:
    """Returns a dict mapping collection names to the ids of their documents.

    Parameters
    ----------
    mongodb : MongoDatabase
        The Mongo database on which to build the dict.

    Returns
    -------
    Dict
        key: collection name (only names ending in "_set" that have an
        index on `id`)
        value: set of document ids in that collection
    """
    id_dict = {}
    for name in mongodb.list_collection_names():
        # only schema collections (conventionally suffixed "_set") qualify
        if not name.endswith("_set"):
            continue
        # only collections with an index on `id` participate
        if "id_1" not in mongodb[name].index_information():
            continue
        id_dict[name] = set(mongodb[name].distinct("id"))
    return id_dict
564
+
565
+
566
def get_collection_for_id(
    id_: str, id_map: Dict, replace_underscore: bool = False
) -> Optional[str]:
    """
    Returns the name of the collection containing the document with the given id.

    Parameters
    ----------
    id_ : str
        The identifier of the document.
    id_map : Dict
        A dict mapping collection names to sets of document ids.
    replace_underscore : bool
        If true, underscores in the returned collection name are replaced
        with spaces.

    Returns
    -------
    Optional[str]
        Collection name containing the document, or None if the id was not
        found in any collection.
    """
    for name, ids in id_map.items():
        if id_ not in ids:
            continue
        # found it — optionally convert to the space-separated form
        return name.replace("_", " ") if replace_underscore is True else name
    return None
596
+
597
+
598
def mongo_update_command_for(df_change: pds.DataFrame) -> Dict[str, list]:
    """
    Creates a dictionary of update commands to be executed against the Mongo
    database.

    Parameters
    ----------
    df_change : pds.DataFrame
        A dataframe containing change sheet information.

    Returns
    -------
    Dict
        key: group id (e.g., gold:Gs0103573)
        value: {"update": <collection name>, "updates": <list of commands>},
        suitable for MongoDB's `update` database command.
    """
    update_cmd = {}

    # split data into groups by values in the group_id column
    for id_, df_id in df_change.groupby("group_id"):
        collected = []  # update commands for this id group

        # split further by the group_var column (e.g., v1, v2)
        for group_var, df_var in df_id.groupby("group_var"):
            if group_var.strip():
                # rows tied to a grouping variable build one nested object
                collected.extend(make_vargroup_updates(df_var))
            else:
                # plain rows become one command each
                collected.extend(make_updates((group_var, df_var)))

        update_cmd[id_] = {
            "update": df_id["collection_name"].values[0],
            "updates": collected,
        }
    return update_cmd
640
+
641
+
642
def copy_docs_in_update_cmd(
    update_cmd, mdb_from: MongoDatabase, mdb_to: MongoDatabase, drop_mdb_to: bool = True
) -> Dict[str, str]:
    """
    Copies the documents targeted by update_cmd between Mongo databases.
    Useful to apply and inspect updates on a test database.

    Parameters
    ----------
    update_cmd : Dict
        Update commands keyed by document id; each value names the target
        collection under "update".
    mdb_from : MongoDatabase
        Database from which data is being copied (i.e., source).
    mdb_to : MongoDatabase
        Database which data is being copied into (i.e., destination).
    drop_mdb_to : bool
        If true (the default), drop the destination database first.

    Returns
    -------
    Dict[str, str]
        key: collection name; value: message with number of docs inserted.
    """
    # group the target document ids by their collection
    ids_by_collection = defaultdict(list)
    for id_, cmd_doc in update_cmd.items():
        ids_by_collection[cmd_doc["update"]].append(id_)

    if drop_mdb_to:
        mdb_to.client.drop_database(mdb_to.name)

    results = {}
    for collection_name, ids in ids_by_collection.items():
        # strip Mongo's internal _id so inserts get fresh ids
        docs = [
            dissoc(doc, "_id")
            for doc in mdb_from[collection_name].find({"id": {"$in": ids}})
        ]
        inserted = mdb_to[collection_name].insert_many(docs).inserted_ids
        results[collection_name] = f"{len(inserted)} docs inserted"
    return results
678
+
679
+
680
def update_mongo_db(mdb: MongoDatabase, update_cmd: Dict):
    """
    Updates the Mongo database using commands in the update_cmd dict.

    Parameters
    ----------
    mdb : MongoDatabase
        Mongo database to be updated.
    update_cmd : Dict
        Update commands keyed by document id; each value is a dict of the
        form {"update": <collection name>, "updates": [...]} suitable for
        passing to MongoDB's `update` database command.

    Returns
    -------
    results : List[Dict]
        One entry per updated document, with the document before and after
        the update, the raw command result, and any schema validation errors.
    """
    results = []
    validator = get_nmdc_schema_validator()
    for id_, update_cmd_doc in update_cmd.items():
        collection_name = update_cmd_doc["update"]
        # snapshot the document before the update (drop Mongo's internal _id)
        doc_before = dissoc(mdb[collection_name].find_one({"id": id_}), "_id")
        # run the raw `update` database command; round-trip through bson_dumps
        # so the result is plain JSON-serializable data
        update_result = json.loads(bson_dumps(mdb.command(update_cmd_doc)))
        doc_after = dissoc(mdb[collection_name].find_one({"id": id_}), "_id")
        # validate the updated document against the NMDC schema, wrapped as a
        # single-collection Database instance
        report = validator.validate(
            {collection_name: [doc_after]}, target_class="Database"
        )
        results.append(
            {
                "id": id_,
                "doc_before": doc_before,
                "update_info": update_result,
                "doc_after": doc_after,
                "validation_errors": [e.message for e in report.results],
            }
        )

    return results
717
+
718
+
719
def _validate_changesheet(df_change: pd.DataFrame, mdb: MongoDatabase):
    """Dry-run a changesheet against a scratch database and report the results.

    Copies the affected documents into a scratch database, applies the update
    commands there, and raises HTTP 422 if any schema-validation or write
    errors occur; otherwise returns the commands and their results.
    """
    update_cmd = mongo_update_command_for(df_change)
    # scratch database used for inspection; real data is never modified here
    mdb_to_inspect = mdb.client["nmdc_changesheet_submission_results"]
    results_of_copy = copy_docs_in_update_cmd(
        update_cmd,
        mdb_from=mdb,
        mdb_to=mdb_to_inspect,
    )
    results_of_updates = update_mongo_db(mdb_to_inspect, update_cmd)
    rv = {
        "update_cmd": update_cmd,
        "inspection_info": {
            "mdb_name": mdb_to_inspect.name,
            "results_of_copy": results_of_copy,
        },
        "results_of_updates": results_of_updates,
    }

    # collect schema-validation errors and Mongo write errors per document
    validation_errors = []
    for result in results_of_updates:
        schema_errors = result.get("validation_errors", [])
        if len(schema_errors) > 0:
            validation_errors.append(schema_errors)
        write_errors = result.get("update_info", {}).get("writeErrors", {})
        if len(write_errors) > 0:
            validation_errors.append(write_errors)

    if validation_errors:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail={
                "update_cmd": rv["update_cmd"],
                "validation_errors": validation_errors,
            },
        )
    return rv
755
+
756
+
757
def df_from_sheet_in(sheet_in: ChangesheetIn, mdb: MongoDatabase) -> pd.DataFrame:
    """Parse an uploaded changesheet into a prepared dataframe.

    Parameters
    ----------
    sheet_in : ChangesheetIn
        The uploaded sheet: `name`, `content_type`, and raw `text`.
    mdb : MongoDatabase
        The Mongo database the changesheet targets.

    Returns
    -------
    pd.DataFrame
        The dataframe produced by load_changesheet.

    Raises
    ------
    HTTPException
        400 if the content type is unsupported or the sheet fails to load.
    """
    # supported content types and the column separator each implies
    content_types = {
        "text/csv": ",",
        "text/tab-separated-values": "\t",
    }
    content_type = sheet_in.content_type
    filename = sheet_in.name
    if content_type not in content_types:
        # BUGFIX: `filename` was assigned but unused and the message said
        # literally "file (unknown)"; include the actual filename instead.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=(
                f"file {filename} has content type '{content_type}'. "
                f"Only {list(content_types)} files are permitted."
            ),
        )
    sep = content_types[content_type]
    try:
        df = load_changesheet(StringIO(sheet_in.text), mdb, sep=sep)
    except Exception as e:
        # surface parse/lookup failures as a client error with the message
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
    return df