nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/api/__init__.py +0 -0
- nmdc_runtime/api/analytics.py +70 -0
- nmdc_runtime/api/boot/__init__.py +0 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/__init__.py +0 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +170 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/__init__.py +0 -0
- nmdc_runtime/api/db/mongo.py +447 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/__init__.py +0 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +794 -0
- nmdc_runtime/api/endpoints/ids.py +192 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +105 -0
- nmdc_runtime/api/endpoints/queries.py +679 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +229 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +774 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/main.py +401 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/__init__.py +0 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/__init__.py +0 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/minter.py +0 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +253 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +242 -0
- nmdc_runtime/config.py +55 -4
- nmdc_runtime/core/db/Database.py +1 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -9
- nmdc_runtime/lib/extract_nmdc_data.py +0 -8
- nmdc_runtime/lib/nmdc_dataframes.py +3 -7
- nmdc_runtime/lib/nmdc_etl_class.py +1 -7
- nmdc_runtime/minter/adapters/repository.py +1 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +35 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/export/ncbi_xml.py +1 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
- nmdc_runtime/site/graphs.py +33 -28
- nmdc_runtime/site/ops.py +97 -237
- nmdc_runtime/site/repair/database_updater.py +8 -0
- nmdc_runtime/site/repository.py +7 -117
- nmdc_runtime/site/resources.py +4 -4
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/util.py +9 -321
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
- nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/api/core/metadata.py (new file):

@@ -0,0 +1,788 @@
import builtins
import inspect
import json
from collections import defaultdict, namedtuple
from functools import lru_cache
from io import StringIO
from pathlib import Path
from types import ModuleType
from typing import Optional, Dict, List, Tuple, Any, Union

from bson.json_util import dumps as bson_dumps
import pandas as pd
import pandas as pds
from fastapi import HTTPException
from jsonschema import Draft7Validator
from linkml_runtime.utils.schemaview import SchemaView
from nmdc_schema import nmdc
from nmdc_schema.nmdc_data import get_nmdc_schema_definition
from pymongo.database import Database as MongoDatabase
from starlette import status
from toolz.dicttoolz import dissoc, assoc_in, get_in

from nmdc_runtime.api.models.metadata import ChangesheetIn
from nmdc_runtime.util import get_nmdc_jsonschema_dict, collection_name_to_class_names

# custom named tuple to hold path property information
SchemaPathProperties = namedtuple(
    "SchemaPathProperties", ["slots", "ranges", "multivalues"]
)

FilePathOrBuffer = Union[Path, StringIO]

def load_changesheet(
    filename: FilePathOrBuffer, mongodb: MongoDatabase, sep="\t"
) -> pds.DataFrame:
    """
    Creates a dataframe from the input file that includes extra columns used for
    determining the path for updating a Mongo document and the data type of the updated data.

    Returns
    -------
    Pandas DataFrame

    Parameters
    ----------
    filename : FilePathOrBuffer
        Name of the file containing the change sheet.
    mongodb : MongoDatabase
        The Mongo database that the change sheet will update.
    sep : str
        Column separator in file.

    Raises
    ------
    ValueError
        If input file lacks an id column.
    ValueError
        If input file lacks an attribute column.
    ValueError
        If input file lacks an action column.
    Exception
        If a document id is not found in the Mongo database.
    Exception
        If a class name cannot be determined.
    """
    # load dataframe, replacing NaN with ''
    df = pds.read_csv(filename, sep=sep, dtype="string").fillna("")

    # add a group id column, but copy only IRIs (i.e., values with ":" in them)
    try:
        df["group_id"] = df["id"].map(lambda x: x if ":" in x else "")
    except KeyError:
        raise ValueError("change sheet lacks 'id' column.")

    # fill in blank group ids
    for i in range(len(df)):
        if len(str(df.loc[i, "group_id"]).strip()) < 1:
            df.loc[i, "group_id"] = df.loc[i - 1, "group_id"]

    # fill in blank action columns
    try:
        for i in range(len(df)):
            if len(str(df.loc[i, "action"]).strip()) < 1:
                df.loc[i, "action"] = df.loc[i - 1, "action"]
    except KeyError:
        raise ValueError("change sheet lacks 'action' column.")

    # build dict to hold variables that have been defined
    # in the id column of the change sheet
    try:
        # collect vars in the id column
        var_dict = {
            id_val: None
            for id_val, attr in df[["id", "attribute"]].values
            if len(id_val) > 0 and ":" not in id_val
        }
    except KeyError:
        # note: the presence of the id column is checked above
        raise ValueError("change sheet lacks 'attribute' column.")

    # add group_var column to hold values from the id column
    # that are being used as variable/blank nodes
    df["group_var"] = ""
    for ix, id_val, attr, value in df[["id", "attribute", "value"]].itertuples():
        if id_val in var_dict.keys() and value in var_dict.keys():
            var_dict[value] = f"{var_dict[id_val]}.{attr}"
            var_dict[f"{id_val}.{value}"] = f"{var_dict[id_val]}.{attr}"
            df.loc[ix, "group_var"] = f"{id_val}.{value}"
        elif value in var_dict.keys():
            var_dict[value] = attr
            df.loc[ix, "group_var"] = value
        elif id_val in var_dict.keys():
            df.loc[ix, "group_var"] = id_val

    # add path column used to hold the path in the data to the data that will be changed,
    # e.g. principal_investigator.name
    df["path"] = ""
    # split into id groups; this allows each id group to have its own local variables,
    # i.e., the same var name can be used with different ids
    group_ids = df.groupby("group_id")
    for group_id in group_ids:
        df_id = group_id[1]  # dataframe for value group_id

        # split into var groups
        var_groups = df_id.groupby("group_var")
        for var_group in var_groups:
            df_var = var_group[1]  # dataframe for value group_var

            for ix, attr, value, group_var in df_var[
                ["attribute", "value", "group_var"]
            ].itertuples():
                # if group_var is empty, it is a simple property
                if "" == group_var:
                    df.loc[ix, "path"] = attr

                # otherwise, it is a nested property;
                # if the value is not a var, then we are at the bottom level
                elif value not in var_dict.keys():
                    df.loc[ix, "path"] = f"{var_dict[group_var]}.{attr}"

    # create map between id and collection
    id_dict = map_id_to_collection(mongodb)
    # add collection for each id
    df["collection_name"] = ""
    prev_id = ""
    for ix, group_id in df[["group_id"]].itertuples():
        # check if there is a new id
        if group_id != prev_id:
            prev_id = group_id  # update prev id
            collection_name = get_collection_for_id(group_id, id_dict)

            if collection_name is None:
                raise Exception("Cannot find ID", group_id, "in any collection")

        df.loc[ix, "collection_name"] = collection_name

    # add linkml class name for each id
    df["linkml_class"] = ""
    class_name_dict = map_schema_class_names(nmdc)
    for ix, id_, collection_name in df[["group_id", "collection_name"]].itertuples():
        data = mongodb[collection_name].find_one({"id": id_})

        # find the type of class the data instantiates
        if "type" in list(data.keys()):
            # get part after the ":"
            class_name = data["type"].split(":")[-1]
            class_name = class_name_dict[class_name]
        else:
            class_names = collection_name_to_class_names[collection_name]
            if len(class_names) > 1:
                raise ValueError(
                    "cannot unambiguously infer class of document"
                    f" with `id` {id_} in collection {collection_name}."
                    " Please ensure explicit `type` is present in document."
                )
            class_name = class_name_dict[class_names[0]]

        # set class name for id
        df["linkml_class"] = class_name

    # info about properties of slots in the property path
    df["linkml_slots"] = ""
    df["ranges"] = ""
    df["multivalues"] = ""
    sd = get_nmdc_schema_definition()
    view = SchemaView(sd)
    for ix, attribute, path, class_name in df[
        ["attribute", "path", "linkml_class"]
    ].itertuples():
        # fetch the properties for the path
        if len(path) > 0:
            spp = fetch_schema_path_properties(view, path, class_name)
        else:
            spp = fetch_schema_path_properties(view, attribute, class_name)

        df.loc[ix, "linkml_slots"] = str.join("|", spp.slots)
        df.loc[ix, "ranges"] = str.join("|", spp.ranges)
        df.loc[ix, "multivalues"] = str.join("|", spp.multivalues)
    df = df.astype({"value": object})
    for ix, value, ranges in list(df[["value", "ranges"]].itertuples()):
        # Infer python builtin type for coercion via <https://w3id.org/linkml/base>.
        # If base is a member of the builtins module, e.g. `int` or `float`, coercion will succeed.
        # Otherwise, keep value as is (as a `str`).
        # Note: Mongo BSON has a decimal type,
        # but e.g. <https://w3id.org/nmdc/DecimalDegree> has a specified `base` of `float`,
        # and I think it's best to not "re-interpret" what LinkML specifies. Can revisit this decision
        # by e.g. overriding `base` when `uri` is a "known" type (`xsd:decimal` in the case of DecimalDegree).
        try:
            base_type = view.induced_type(ranges.rsplit("|", maxsplit=1)[-1]).base
            if base_type == "Decimal":
                # Note: Use of bson.decimal128.Decimal128 here would require changing JSON encoding/decoding.
                # Choosing to use `float` to preserve existing (expected) behavior.
                # (For "Decimal", the getattr lookup below raises AttributeError and is
                # caught by the bare except, so this float coercion is what sticks.)
                df.at[ix, "value"] = float(value)
            df.at[ix, "value"] = getattr(builtins, base_type)(value)
        except:
            continue
    return df

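For orientation: a changesheet is a TSV/CSV whose required columns are `id`, `action`, `attribute`, and `value`; blank `id` and `action` cells inherit the value from the row above, and the `group_var` machinery lets an `id` cell hold a local variable name for nested updates. A minimal usage sketch, not part of the diff (hypothetical id and values; assumes a live `pymongo` database handle holding NMDC collections):

```python
from io import StringIO
from pymongo import MongoClient
from nmdc_runtime.api.core.metadata import load_changesheet

mongodb = MongoClient()["nmdc"]  # assumption: local Mongo with NMDC collections
sheet = StringIO(
    "id\taction\tattribute\tvalue\n"
    "nmdc:sty-00-000001\tupdate\tname\tA new study name\n"
    "\t\tdescription\tA new study description\n"  # blank id/action inherit from row above
)
df = load_changesheet(sheet, mongodb)
print(df[["group_id", "action", "path", "collection_name", "linkml_class"]])
```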
def map_schema_class_names(nmdc_mod: ModuleType) -> Dict[str, str]:
    """Returns dict that maps the classes in the nmdc.py module (within the NMDC Schema PyPI library)
    to the class names used in the linkml schema.

    Parameters
    ----------
    nmdc_mod : ModuleType
        The nmdc.py module in the NMDC Schema library.

    Returns
    -------
    Dict[str, str]
        Maps the class as named in the module to the class name in the linkml schema.
        E.g., BiosampleProcessing -> biosample processing
    """
    class_dict = {}
    for name, member in inspect.getmembers(nmdc_mod):
        if inspect.isclass(member) and hasattr(member, "class_name"):
            class_dict[name] = member.class_name
    return class_dict

@lru_cache
def fetch_schema_path_properties(
    view: SchemaView, schema_path: str, class_name: str
) -> SchemaPathProperties:
    """Returns properties for a slot in the linkml schema.

    Parameters
    ----------
    view : SchemaView
        The SchemaView object holding the linkml schema
    schema_path : str
        The path in the Mongo database to the value
    class_name : str
        The name of the class with the slot(s)

    Returns
    -------
    SchemaPathProperties
        A namedtuple of form "SchemaPathProperties", ["slots", "ranges", "multivalues"]
        that holds the property information about the slot.
        slots: a list of the linkml slots; these may differ from the path names
        ranges: a list of the range for each slot in the slots list
        multivalues: a list of True/False strings specifying if the slot is multivalued

    Raises
    ------
    AttributeError
        If the slot is not found in the linkml schema, an AttributeError is raised.
    """
    # lists to hold properties for a value in the path
    slots = []
    ranges = []
    multivalues = []
    paths = schema_path.split(".")
    for path in paths:
        schema_class = view.get_class(class_name)  # get class from schema

        # first check if it is an induced slot,
        # i.e., if slot properties have been overridden
        if path in schema_class.slot_usage.keys():
            schema_slot = view.induced_slot(path, class_name)
        elif path.replace("_", " ") in schema_class.slot_usage.keys():
            schema_slot = view.induced_slot(path.replace("_", " "), class_name)

        # if slot has not been overridden, check class attributes
        if path in schema_class.attributes.keys():
            schema_slot = view.induced_slot(path, class_name)
        elif path.replace("_", " ") in schema_class.attributes.keys():
            schema_slot = view.induced_slot(path.replace("_", " "), class_name)

        # if slot has not been overridden and is not an attribute, get slot properties from view
        elif path in view.all_slots().keys():
            schema_slot = view.get_slot(path)
        elif path.replace("_", " ") in view.all_slots().keys():
            schema_slot = view.get_slot(path.replace("_", " "))

        # raise error if the slot is not found
        else:
            raise AttributeError(f"slot '{path}' not found for '{schema_class.name}'")

        # properties to lists as strings (strings are needed for dataframe)
        slots.append(str(schema_slot.name))

        if schema_slot.range is None:
            ranges.append("string")
        else:
            ranges.append(str(schema_slot.range))

        if schema_slot.multivalued is None:
            multivalues.append("False")
        else:
            multivalues.append(str(schema_slot.multivalued))

        # update the class name to the range of the slot
        class_name = schema_slot.range

    return SchemaPathProperties(slots, ranges, multivalues)

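As an illustration of the returned shape for a two-segment path (hedged; the exact slot ranges depend on the installed nmdc-schema version):

```python
sd = get_nmdc_schema_definition()
view = SchemaView(sd)
spp = fetch_schema_path_properties(view, "principal_investigator.name", "Study")
# spp.slots       -> e.g. ["principal_investigator", "name"]
# spp.ranges      -> e.g. ["PersonValue", "string"]
# spp.multivalues -> e.g. ["False", "False"]
```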
def make_vargroup_updates(df: pds.DataFrame) -> List:
    """Returns a list of update commands to execute on the Mongo database
    when updates are grouped with a grouping variable.

    Parameters
    ----------
    df : pds.DataFrame
        The dataframe that contains the values associated with the grouping variable.

    Returns
    -------
    List
        A list of Mongo update commands for that grouping variable.
    """
    id_ = df["group_id"].values[0]
    path_multivalued_dict = {}
    update_key = ""
    path_lists = []
    obj_dict = {}
    for (
        action,
        attribute,
        value,
        path,
        multivalues,
    ) in df[
        [
            "action",
            "attribute",
            "value",
            "path",
            "multivalues",
        ]
    ].itertuples(index=False):
        if len(path) < 1:
            update_key = attribute
        else:
            # gather path lists
            path_list = path.split(".")
            path_lists.append(path_list)

            # determine if value is a list
            multivalues_list = multivalues.split("|")
            value = make_mongo_update_value(action, value, multivalues_list)

            # build dictionary that merges all keys and
            # values into a single object, e.g.:
            # {'has_credit_associations': {
            #      'applied_role': 'Conceptualization',
            #      'applies_to_person': {
            #          'name': 'CREDIT NAME 1',
            #          'email': 'CREDIT_NAME_1@foo.edu',
            #          'orcid': 'orcid:0000-0000-0000-0001'}}}
            obj_dict = assoc_in(obj_dict, path_list, value)

            # for each potential path in the path list,
            # determine if the value is multivalued
            for i in range(len(path_list)):
                key, value = ".".join(path_list[0 : i + 1]), multivalues_list[i]
                path_multivalued_dict[key] = value

    # sort path lists by length and reverse
    path_lists = list(reversed(sorted(path_lists, key=len)))
    longest = len(path_lists[0])

    # modify the values to have correct arity:
    # start at the end of each path list and determine
    # if that path's value is multivalued
    for i in range(longest, 0, -1):
        for path_list in path_lists:
            # determine if path is multivalued
            # note the use of the 0 to i portion of path list
            path_portion = path_list[0:i]
            is_multivalued = path_multivalued_dict[".".join(path_portion)]

            # modify object so that the key has correct multivalue
            temp = get_in(path_portion, obj_dict)
            if "True" == is_multivalued and (not isinstance(temp, list)):
                obj_dict = assoc_in(obj_dict, path_portion, [temp])

    update_dict = make_mongo_update_command_dict(
        action, id_, update_key, obj_dict[update_key]
    )

    return [update_dict]

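The nested object is assembled with `toolz`; a standalone sketch of how `assoc_in` and `get_in` (imported above) produce the kind of object shown in the docstring:

```python
from toolz.dicttoolz import assoc_in, get_in

obj = {}
obj = assoc_in(obj, ["has_credit_associations", "applied_role"], "Conceptualization")
obj = assoc_in(obj, ["has_credit_associations", "applies_to_person", "name"], "CREDIT NAME 1")
# get_in walks the same path back out:
assert get_in(["has_credit_associations", "applied_role"], obj) == "Conceptualization"
# the arity pass above wraps a level in a list when the schema says it is multivalued:
obj = assoc_in(obj, ["has_credit_associations"], [get_in(["has_credit_associations"], obj)])
```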
def make_updates(var_group: Tuple) -> List:
    """
    Creates a list of update commands to execute on the Mongo database.

    Parameters
    ----------
    var_group : Tuple
        Group of change sheet records based on the id column (generated by pandas.groupby()).
        var_group[0] -> the value (if any) in the group_var column
        var_group[1] -> the dataframe with group_var variables

    Returns
    -------
    List
        A list of Mongo update commands.
    """
    df = var_group[1]  # dataframe with group_var variables
    id_ = df["group_id"].values[0]  # get id for group

    updates = []  # collected properties/values to be updated
    for (
        action,
        value,
        path,
        multivalues,
    ) in df[
        [
            "action",
            "value",
            "path",
            "multivalues",
        ]
    ].itertuples(index=False):
        # note: if a path is present, there is a value to be updated
        if len(path) > 0:
            action = action.strip()  # remove extra white space

            # determine if value is a list
            value = make_mongo_update_value(action, value, multivalues.split("|"))

            update_dict = make_mongo_update_command_dict(action, id_, path, value)
            updates.append(update_dict)  # add update command to list

    return updates

def make_mongo_update_value(action: str, value: Any, multivalues_list: List) -> Any:
    """Based on the params, determines whether the value for a Mongo update operation needs to be a list.

    Parameters
    ----------
    action : str
        The type of update that will be performed (e.g., insert items, replace)
    value : Any
        The value used for the update operation.
    multivalues_list : List
        List of 'True'/'False' values indicating if the value is to be multivalued (i.e., an array).

    Returns
    -------
    Any
        The value, which may or may not be encapsulated in a list.
    """
    # if an array field is being updated, split based on pipe
    if multivalues_list[-1] == "True" or (isinstance(value, str) and "|" in value):
        value = [v.strip() for v in value.split("|") if len(v.strip()) > 0]
    elif isinstance(value, str):
        value = value.strip()  # remove extra white space

    return value

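Behavior sketch (hypothetical inputs): pipe-delimited strings, or values whose last path segment is multivalued, become lists; scalar strings are just stripped:

```python
make_mongo_update_value("insert items", "a | b |", ["True"])  # -> ["a", "b"]
make_mongo_update_value("update", "  42  ", ["False"])        # -> "42"
```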
def make_mongo_update_command_dict(
    action: str, doc_id: str, update_key: str, update_value: Any
) -> Dict:
    """Returns a dict of the command needed to execute a Mongo update operation.

    Parameters
    ----------
    action : str
        The kind of update being performed (e.g., insert item, replace).
    doc_id : str
        The id of the Mongo document being updated.
    update_key : str
        The property of the document whose values are being updated.
    update_value : Any
        The new value used for updating.

    Returns
    -------
    Dict
        The Mongo command that, when executed, will update the document.
    """
    # build dict of update commands for Mongo
    if action in ["insert", "insert items", "insert item"]:
        update_dict = {
            "q": {"id": f"{doc_id}"},
            "u": {"$addToSet": {update_key: {"$each": update_value}}},
        }
    elif action in ["remove items", "remove item"]:
        update_dict = {
            "q": {"id": f"{doc_id}"},
            "u": {"$pull": {update_key: {"$in": update_value}}},
        }
    elif action in ["update", "set", "replace", "replace items"]:
        update_dict = {
            "q": {"id": f"{doc_id}"},
            "u": {"$set": {update_key: update_value}},
        }
    elif action in ["remove", "delete"]:  # remove the property from the object
        # note: the update_value in an $unset operation doesn't matter;
        # it is included so that we see it during debugging
        update_dict = {
            "q": {"id": f"{doc_id}"},
            "u": {"$unset": {update_key: update_value}},
        }
    else:
        raise ValueError(f"cannot execute action '{action}'")

    return update_dict

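For instance (hypothetical id), the command bodies produced for two of the supported actions:

```python
make_mongo_update_command_dict(
    "insert item", "nmdc:bsm-00-000001", "alternative_identifiers", ["xyz:123"]
)
# -> {"q": {"id": "nmdc:bsm-00-000001"},
#     "u": {"$addToSet": {"alternative_identifiers": {"$each": ["xyz:123"]}}}}

make_mongo_update_command_dict("update", "nmdc:bsm-00-000001", "name", "new name")
# -> {"q": {"id": "nmdc:bsm-00-000001"}, "u": {"$set": {"name": "new name"}}}
```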
def map_id_to_collection(mongodb: MongoDatabase) -> Dict:
    """Returns dict using the collection name as a key and the ids of documents as values.

    Parameters
    ----------
    mongodb : MongoDatabase
        The Mongo database on which to build the dict.

    Returns
    -------
    Dict
        Dict mapping collection names to the set of document ids in the collection.
        key: collection name
        value: set(id of document)
    """
    collection_names = [
        name for name in mongodb.list_collection_names() if name.endswith("_set")
    ]
    id_dict = {
        name: set(mongodb[name].distinct("id"))
        for name in collection_names
        if "id_1" in mongodb[name].index_information()
    }
    return id_dict


def get_collection_for_id(
    id_: str, id_map: Dict, replace_underscore: bool = False
) -> Optional[str]:
    """
    Returns the name of the collection that contains the document identified by the id.

    Parameters
    ----------
    id_ : str
        The identifier of the document.
    id_map : Dict
        A dict mapping collection names to document ids.
        key: collection name
        value: set of document ids
    replace_underscore : bool
        If true, underscores in the collection name are replaced with spaces.

    Returns
    -------
    Optional[str]
        Collection name containing the document.
        None if the id was not found.
    """
    for collection_name in id_map:
        if id_ in id_map[collection_name]:
            if replace_underscore is True:
                return collection_name.replace("_", " ")
            else:
                return collection_name
    return None

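A toy lookup (hypothetical ids; collection names follow the `*_set` convention filtered on above):

```python
id_map = {
    "study_set": {"nmdc:sty-00-000001"},
    "biosample_set": {"nmdc:bsm-00-000001"},
}
get_collection_for_id("nmdc:bsm-00-000001", id_map)  # -> "biosample_set"
get_collection_for_id("nmdc:missing", id_map)        # -> None
```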
def mongo_update_command_for(df_change: pds.DataFrame) -> Dict[str, list]:
    """
    Creates a dictionary of update commands to be executed against the Mongo database.

    Parameters
    ----------
    df_change : pds.DataFrame
        A dataframe containing change sheet information

    Returns
    -------
    Dict
        A dict of the update commands to be executed.
        key: document (group) id
        value: dict with the target collection ("update") and its list of update specs ("updates")
    """
    update_cmd = {}  # dict of mongo update commands, keyed by group id

    # split data into groups by values in the group_id column (e.g., gold:Gs0103573)
    id_group = df_change.groupby("group_id")
    for ig in id_group:
        # ig[0] -> id_: group_id for data
        # ig[1] -> df_id: dataframe with rows having the group_id
        id_, df_id = ig

        # split data into groups by values in the group_var column (e.g., v1, v2)
        var_group = df_id.groupby("group_var")
        ig_updates = []  # update commands for the id group
        for vg in var_group:
            # vg[0] -> group_var for data
            # vg[1] -> dataframe with rows having the group_var
            if len(vg[0].strip()) > 0:
                ig_updates.extend(make_vargroup_updates(vg[1]))
            else:
                ig_updates.extend(make_updates(vg))

        # add update commands for the group id to dict
        update_cmd[id_] = {
            "update": df_id["collection_name"].values[0],
            "updates": ig_updates,
        }
    return update_cmd

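The result is keyed by document id, and each value is the body of a raw MongoDB `update` command, which is how `update_mongo_db` below runs it via `mdb.command(...)`. A hypothetical shape:

```python
{
    "nmdc:sty-00-000001": {
        "update": "study_set",  # target collection
        "updates": [
            {"q": {"id": "nmdc:sty-00-000001"},
             "u": {"$set": {"name": "A new study name"}}},
        ],
    },
}
```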
def copy_docs_in_update_cmd(
    update_cmd, mdb_from: MongoDatabase, mdb_to: MongoDatabase, drop_mdb_to: bool = True
) -> Dict[str, str]:
    """
    Copies the documents targeted by update_cmd between Mongo databases.
    Useful to apply and inspect updates on a test database.

    Parameters
    ----------
    update_cmd : Dict
        The update commands (as returned by mongo_update_command_for),
        used to determine which documents to copy.
    mdb_from : MongoDatabase
        Database from which data is being copied (i.e., source).
    mdb_to : MongoDatabase
        Database into which data is being copied (i.e., destination).
    drop_mdb_to : bool
        If true, the destination database is dropped before copying.

    Returns
    -------
    results : Dict
        Dict with collection name as the key, and a message with the number of docs inserted as value.
    """
    doc_specs = defaultdict(list)
    for id_, update_cmd_doc in update_cmd.items():
        collection_name = update_cmd_doc["update"]
        doc_specs[collection_name].append(id_)

    if drop_mdb_to:
        mdb_to.client.drop_database(mdb_to.name)
    results = {}
    for collection_name, ids in doc_specs.items():
        docs = [
            dissoc(d, "_id")
            for d in mdb_from[collection_name].find({"id": {"$in": ids}})
        ]
        results[collection_name] = (
            f"{len(mdb_to[collection_name].insert_many(docs).inserted_ids)} docs inserted"
        )
    return results

def update_mongo_db(mdb: MongoDatabase, update_cmd: Dict):
    """
    Updates the Mongo database using commands in the update_cmd dict.

    Parameters
    ----------
    mdb : MongoDatabase
        Mongo database to be updated.
    update_cmd : Dict
        Contains update commands to be executed.

    Returns
    -------
    results : List
        Per-document information about what was updated in the Mongo database.
    """
    results = []
    validator_strict = Draft7Validator(get_nmdc_jsonschema_dict())
    validator_noidpatterns = Draft7Validator(
        get_nmdc_jsonschema_dict(enforce_id_patterns=False)
    )

    for id_, update_cmd_doc in update_cmd.items():
        collection_name = update_cmd_doc["update"]
        doc_before = dissoc(mdb[collection_name].find_one({"id": id_}), "_id")
        update_result = json.loads(bson_dumps(mdb.command(update_cmd_doc)))
        doc_after = dissoc(mdb[collection_name].find_one({"id": id_}), "_id")
        if collection_name in {
            "study_set",
            "biosample_set",
            "omics_processing_set",
        } and id_.split(":")[0] in {"gold", "emsl", "igsn"}:
            validator = validator_noidpatterns
        else:
            validator = validator_strict
        errors = list(validator.iter_errors({collection_name: [doc_after]}))
        results.append(
            {
                "id": id_,
                "doc_before": doc_before,
                "update_info": update_result,
                "doc_after": doc_after,
                "validation_errors": [e.message for e in errors],
            }
        )

    return results

def _validate_changesheet(df_change: pd.DataFrame, mdb: MongoDatabase):
    update_cmd = mongo_update_command_for(df_change)
    mdb_to_inspect = mdb.client["nmdc_changesheet_submission_results"]
    results_of_copy = copy_docs_in_update_cmd(
        update_cmd,
        mdb_from=mdb,
        mdb_to=mdb_to_inspect,
    )
    results_of_updates = update_mongo_db(mdb_to_inspect, update_cmd)
    rv = {
        "update_cmd": update_cmd,
        "inspection_info": {
            "mdb_name": mdb_to_inspect.name,
            "results_of_copy": results_of_copy,
        },
        "results_of_updates": results_of_updates,
    }
    validation_errors = []
    for result in results_of_updates:
        if len(result.get("validation_errors", [])) > 0:
            validation_errors.append(result["validation_errors"])
        if (
            len(write_errors := result.get("update_info", {}).get("writeErrors", {}))
            > 0
        ):
            validation_errors.append(write_errors)

    if validation_errors:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail={
                "update_cmd": rv["update_cmd"],
                "validation_errors": validation_errors,
            },
        )
    return rv

def df_from_sheet_in(sheet_in: ChangesheetIn, mdb: MongoDatabase) -> pd.DataFrame:
    content_types = {
        "text/csv": ",",
        "text/tab-separated-values": "\t",
    }
    content_type = sheet_in.content_type
    sep = content_types.get(content_type)
    filename = sheet_in.name
    if content_type not in content_types:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=(
                f"file {filename} has content type '{content_type}'. "
                f"Only {list(content_types)} files are permitted."
            ),
        )
    try:
        df = load_changesheet(StringIO(sheet_in.text), mdb, sep=sep)
    except Exception as e:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
    return df
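Taken together, the request-handling flow these helpers support looks roughly like this (a sketch, assuming an uploaded TSV and a live `mdb` handle; `ChangesheetIn` carries `name`, `content_type`, and `text` as used above):

```python
sheet_in = ChangesheetIn(
    name="changes.tsv",
    content_type="text/tab-separated-values",
    text=tsv_text,  # assumption: the uploaded file's text
)
df = df_from_sheet_in(sheet_in, mdb)    # parse + annotate; HTTP 400 on bad input
rv = _validate_changesheet(df, mdb)     # dry run in a scratch db; HTTP 422 on errors
update_mongo_db(mdb, rv["update_cmd"])  # apply against the live database
```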