nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/api/__init__.py +0 -0
- nmdc_runtime/api/analytics.py +70 -0
- nmdc_runtime/api/boot/__init__.py +0 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/__init__.py +0 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +170 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/__init__.py +0 -0
- nmdc_runtime/api/db/mongo.py +447 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/__init__.py +0 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +794 -0
- nmdc_runtime/api/endpoints/ids.py +192 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +105 -0
- nmdc_runtime/api/endpoints/queries.py +679 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +229 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +774 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/main.py +401 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/__init__.py +0 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/__init__.py +0 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/minter.py +0 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +253 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +242 -0
- nmdc_runtime/config.py +7 -8
- nmdc_runtime/core/db/Database.py +1 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -9
- nmdc_runtime/lib/extract_nmdc_data.py +0 -8
- nmdc_runtime/lib/nmdc_dataframes.py +3 -7
- nmdc_runtime/lib/nmdc_etl_class.py +1 -7
- nmdc_runtime/minter/adapters/repository.py +1 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +35 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/export/ncbi_xml.py +1 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
- nmdc_runtime/site/graphs.py +1 -22
- nmdc_runtime/site/ops.py +60 -152
- nmdc_runtime/site/repository.py +0 -112
- nmdc_runtime/site/translation/gold_translator.py +4 -12
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/util.py +3 -47
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
- nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/api/endpoints/util.py (new file)

@@ -0,0 +1,774 @@

```python
import logging
import os
import tempfile
from datetime import datetime
from functools import lru_cache
from json import JSONDecodeError
from pathlib import Path
from time import time_ns
from typing import Dict, List, Optional, Set, Tuple
from zoneinfo import ZoneInfo

from bson import json_util
from dagster import DagsterRunStatus
from dagster_graphql import DagsterGraphQLClientError
from fastapi import HTTPException
from gridfs import GridFS
from nmdc_runtime.api.core.idgen import generate_one_id, local_part
from nmdc_runtime.api.core.util import (
    dotted_path_for,
    expiry_dt_from_now,
    raise404_if_none,
)
from nmdc_runtime.api.db.mongo import get_mongo_db
from nmdc_runtime.api.models.job import Job, JobClaim, JobOperationMetadata
from nmdc_runtime.api.models.object import (
    DrsId,
    DrsObject,
    DrsObjectIn,
    PortableFilename,
)
from nmdc_runtime.api.models.operation import Operation
from nmdc_runtime.api.models.run import (
    RunUserSpec,
    _add_run_fail_event,
    _add_run_requested_event,
    _add_run_started_event,
    get_dagster_graphql_client,
)
from nmdc_runtime.api.models.site import Site
from nmdc_runtime.api.models.user import User
from nmdc_runtime.api.models.util import (
    FindRequest,
    ListRequest,
    ResultT,
)
from nmdc_runtime.util import drs_metadata_for
from pymongo.collection import Collection as MongoCollection
from pymongo.database import Database as MongoDatabase
from pymongo.errors import DuplicateKeyError
from starlette import status
from toolz import assoc_in, concat, dissoc, get_in, merge

BASE_URL_INTERNAL = os.getenv("API_HOST")
BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]


def does_num_matching_docs_exceed_threshold(
    collection: MongoCollection, filter_: dict, threshold: int
) -> bool:
    """Check whether a MongoDB collection contains more than `threshold` documents matching the filter."""
    if threshold < 0:
        raise ValueError("Threshold must be at least 0.")

    limited_num_matching_docs = collection.count_documents(
        filter=filter_,
        limit=threshold + 1,
    )
    return limited_num_matching_docs > threshold
```
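Passing `limit=threshold + 1` to `count_documents` lets the server stop counting as soon as the threshold is provably exceeded, instead of counting every match. A minimal usage sketch, assuming a reachable local MongoDB; the `biosample_set` collection name and the filter are hypothetical:

```python
from pymongo import MongoClient

mdb = MongoClient("mongodb://localhost:27017").nmdc

# "Are there more than 25 matching documents?" -- the server counts at most 26.
if does_num_matching_docs_exceed_threshold(
    collection=mdb["biosample_set"],
    filter_={"ecosystem_category": "Terrestrial"},  # hypothetical filter
    threshold=25,
):
    print("More than one page of results; pagination will kick in.")
```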
```python
def check_filter(filter_: str):
    """A pass-through function that checks if `filter_` is parsable as a JSON object. Raises otherwise."""
    filter_ = filter_.strip()
    if not filter_.startswith("{") or not filter_.endswith("}"):
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"The given `filter` is not a valid JSON object, which must start with '{{' and end with '}}'.",
        )
    try:
        json_util.loads(filter_)
    except JSONDecodeError as e:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Given `filter` is not valid JSON: {e}",
        )
    return filter_
```
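Illustrative behavior, assuming the function is imported from this module: valid JSON object strings pass through (stripped), and anything else raises HTTP 422:

```python
from fastapi import HTTPException

assert check_filter(' {"name": "soil"} ') == '{"name": "soil"}'

try:
    check_filter("name == soil")  # not a JSON object
except HTTPException as e:
    assert e.status_code == 422
```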
```python
def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
    r"""
    Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.

    Note: If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter
    criteria than can fit on a page of that size, this function will paginate the resources.
    """

    id_field = "id"
    if "id_1" not in mdb[collection_name].index_information():
        logging.warning(
            f"list_resources: no index set on 'id' for collection {collection_name}"
        )
        id_field = (
            "_id"  # currently expected for `functional_annotation_agg` collection
        )
    max_page_size = req.max_page_size
    filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
    projection = (
        list(set(comma_separated_values(req.projection)) | {id_field})
        if req.projection
        else None
    )
    if req.page_token:
        doc = mdb.page_tokens.find_one({"_id": req.page_token, "ns": collection_name})
        if doc is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST, detail="Bad page_token"
            )
        last_id = doc["last_id"]
        mdb.page_tokens.delete_one({"_id": req.page_token})
    else:
        last_id = None
    if last_id is not None:
        if id_field in filter_:
            filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
        else:
            filter_ = merge(filter_, {id_field: {"$gt": last_id}})

    # Determine whether we will paginate the results.
    #
    # Note: We will paginate them unless either:
    # - the `max_page_size` is not a positive integer
    # - the number of documents matching the filter does not exceed `max_page_size`
    #
    will_paginate = True
    if not isinstance(max_page_size, int):
        will_paginate = False
    elif max_page_size < 1:
        will_paginate = False
    elif not does_num_matching_docs_exceed_threshold(
        collection=mdb[collection_name], filter_=filter_, threshold=max_page_size
    ):
        will_paginate = False

    if not will_paginate:
        rv = {
            "resources": list(
                mdb[collection_name].find(filter=filter_, projection=projection)
            )
        }
        return rv
    else:
        resources = list(
            mdb[collection_name].find(
                filter=filter_,
                projection=projection,
                limit=max_page_size,
                sort=[(id_field, 1)],
                allow_disk_use=True,
            )
        )
        last_id = resources[-1][id_field]
        token = generate_one_id(mdb, "page_tokens")
        # TODO unify with `/queries:run` query continuation model
        #  => {_id: cursor/token, query: <full query>, last_id: <>, last_modified: <>}
        mdb.page_tokens.insert_one(
            {"_id": token, "ns": collection_name, "last_id": last_id}
        )
        return {"resources": resources, "next_page_token": token}
```
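A sketch of walking a collection page by page with the token flow above. This assumes a live database, and that `ListRequest` accepts these keyword arguments, per their use in the function; note that each `next_page_token` is single-use, since the code deletes it once redeemed:

```python
req = ListRequest(filter='{"type": "nmdc:Biosample"}', max_page_size=100)
while True:
    rv = list_resources(req, mdb, "biosample_set")
    for doc in rv["resources"]:
        ...  # process one document
    if "next_page_token" not in rv:
        break  # final (or only) page
    req = ListRequest(
        filter=req.filter,
        max_page_size=100,
        page_token=rv["next_page_token"],
    )
```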
```python
def coerce_to_float_if_possible(val):
    r"""
    Converts the specified value into a floating-point number if possible;
    returns the value unchanged if conversion is not possible.
    """
    try:
        return float(val)
    except ValueError:
        return val


def comma_separated_values(s: str):
    r"""
    Returns a list of the comma-delimited substrings of the specified string. Discards any whitespace
    surrounding each substring.

    Reference: https://docs.python.org/3/library/re.html#re.split

    >>> comma_separated_values("apple, banana, cherry")
    ['apple', 'banana', 'cherry']
    """
    return [v.strip() for v in s.split(",")]


def get_mongo_filter(filter_str):
    r"""
    Convert a str in the domain-specific language (DSL) solicited by `nmdc_runtime.api.models.util.FindRequest.filter`
    -- i.e., a comma-separated list of `attribute:value` pairs, where the `value` can include a comparison operator
    (e.g. `>=`) and where if the attribute is of type _string_ and has the suffix `.search` appended to its name
    then the server should perform a full-text search
    -- to a corresponding MongoDB filter representation for e.g. passing to a collection `find` call.
    """
    filter_ = {}
    if not filter_str:
        return filter_

    pairs = comma_separated_values(filter_str)
    if not all(len(split) == 2 for split in (p.split(":", maxsplit=1) for p in pairs)):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Filter must be of form: attribute:spec[,attribute:spec]*",
        )

    for attr, spec in (p.split(":", maxsplit=1) for p in pairs):
        if attr.endswith(".search"):
            actual_attr = attr[: -len(".search")]
            filter_[actual_attr] = {"$regex": spec}
        else:
            for op, key in {("<", "$lt"), ("<=", "$lte"), (">", "$gt"), (">=", "$gte")}:
                if spec.startswith(op):
                    filter_[attr] = {key: coerce_to_float_if_possible(spec[len(op) :])}
                    break
            else:
                filter_[attr] = spec
    return filter_
```
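Illustrative translations of the filter DSL above. `coerce_to_float_if_possible` turns numeric operands into floats; also note that because the operator/key pairs live in a `set`, a spec beginning with `>=` or `<=` may be matched by `>` or `<` first, depending on iteration order:

```python
assert get_mongo_filter("depth:>100") == {"depth": {"$gt": 100.0}}
assert get_mongo_filter("name.search:soil") == {"name": {"$regex": "soil"}}
assert get_mongo_filter("ecosystem:Environmental") == {"ecosystem": "Environmental"}
```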
```python
def get_mongo_sort(sort_str) -> Optional[List[Tuple[str, int]]]:
    """
    Parse `sort_str`, a str of the form "attribute:spec[,attribute:spec]*",
    where spec is `asc` (ascending -- the default if no spec) or `desc` (descending),
    and return a value suitable to pass as a `sort` kwarg to a mongo collection `find` call.
    """
    sort_ = []
    if not sort_str:
        return None

    pairs = comma_separated_values(sort_str)
    for p in pairs:
        components = p.split(":", maxsplit=1)
        if len(components) == 1:
            attr, spec = components[0], ""
        else:
            attr, spec = components
        for op, key in {("", 1), ("asc", 1), ("desc", -1)}:
            if spec == op:
                sort_.append((attr, key))
                break
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=(
                    "Sort must be of form: attribute:spec[,attribute:spec]* "
                    "where spec is `asc` (ascending -- the default if no spec) "
                    "or `desc` (descending)."
                ),
            )
    return sort_
```
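And the sort DSL, illustratively:

```python
assert get_mongo_sort("depth:desc,id") == [("depth", -1), ("id", 1)]
assert get_mongo_sort("name") == [("name", 1)]  # `asc` is the default
assert get_mongo_sort(None) is None
```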
```python
def strip_oid(doc: dict) -> dict:
    r"""
    Returns a copy of the specified dictionary that has no `_id` key.
    """
    return dissoc(doc, "_id")


def timeit(cursor):
    """Collect results from `cursor` and return them along with the time taken in milliseconds."""
    tic = time_ns()
    results = list(cursor)
    toc = time_ns()
    return results, int(round((toc - tic) / 1e6))


def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
    """Find nmdc schema collection entities that match the FindRequest.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).
    """
    if req.group_by:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail="I don't yet know how to ?group_by=",
        )
    if req.search:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail=(
                "I don't yet know how to ?search=. "
                "Use ?filter=<attribute>.search:<spec> instead."
            ),
        )

    filter_ = get_mongo_filter(req.filter)
    projection = (
        list(set(comma_separated_values(req.fields)) | {"id"}) if req.fields else None
    )
    sort_ = get_mongo_sort(req.sort)

    total_count = mdb[collection_name].count_documents(filter=filter_)

    if req.page:
        skip = (req.page - 1) * req.per_page
        if skip > 10_000:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Use cursor-based pagination for paging beyond 10,000 items",
            )
        limit = req.per_page
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_,
                skip=skip,
                limit=limit,
                sort=sort_,
                projection=projection,
            )
        )
        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": [[a, s] for a, s in sort_] if sort_ else None,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields

    else:  # req.cursor is not None
        if req.cursor != "*":
            doc = mdb.page_tokens.find_one({"_id": req.cursor, "ns": collection_name})
            if doc is None:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST, detail="Bad cursor value"
                )
            last_id = doc["last_id"]
            mdb.page_tokens.delete_one({"_id": req.cursor})
        else:
            last_id = None

        if last_id is not None:
            if "id" in filter_:
                filter_["id"] = merge(filter_["id"], {"$gt": last_id})
            else:
                filter_ = merge(filter_, {"id": {"$gt": last_id}})

        if "id_1" not in mdb[collection_name].index_information():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Cursor-based pagination is not enabled for this resource.",
            )

        limit = req.per_page
        sort_for_cursor = (sort_ or []) + [("id", 1)]
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_, limit=limit, sort=sort_for_cursor, projection=projection
            )
        )
        last_id = results[-1]["id"]

        # Is this the last id overall? Then next_cursor should be None.
        filter_eager = filter_
        if "id" in filter_:
            filter_eager["id"] = merge(filter_["id"], {"$gt": last_id})
        else:
            filter_eager = merge(filter_, {"id": {"$gt": last_id}})
        more_results = (
            mdb[collection_name].count_documents(filter=filter_eager, limit=limit) > 0
        )
        if more_results:
            token = generate_one_id(mdb, "page_tokens")
            mdb.page_tokens.insert_one(
                {"_id": token, "ns": collection_name, "last_id": last_id}
            )
        else:
            token = None

        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": sort_for_cursor,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": None,
                "per_page": req.per_page,
                "next_cursor": token,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields
    return rv
```
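A sketch of the cursor flow above from a caller's perspective, assuming a live database and that `FindRequest` accepts these keyword arguments, per their use in the function. The initial cursor is the sentinel `"*"`, and `next_cursor` is `None` once the last page has been served:

```python
req = FindRequest(filter="ecosystem_category:Terrestrial", cursor="*", per_page=200)
while True:
    rv = find_resources(req, mdb, "biosample_set")
    for doc in rv["results"]:
        ...  # process one document
    if rv["meta"]["next_cursor"] is None:
        break
    req = FindRequest(
        filter=req.filter, cursor=rv["meta"]["next_cursor"], per_page=200
    )
```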
```python
def find_resources_spanning(
    req: FindRequest, mdb: MongoDatabase, collection_names: Set[str]
):
    """Find nmdc schema collection entities -- here, across multiple collections -- that match the FindRequest.

    This is useful for collections that house documents that are subclasses of a common ancestor class.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).
    """
    if req.cursor or not req.page:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="This resource only supports page-based pagination",
        )

    if len(collection_names) == 0:
        return {
            "meta": {
                "mongo_filter_dict": get_mongo_filter(req.filter),
                "count": 0,
                "db_response_time_ms": 0,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [],
            "group_by": [],
        }

    responses = {name: find_resources(req, mdb, name) for name in collection_names}
    rv = {
        "meta": {
            "mongo_filter_dict": next(
                r["meta"]["mongo_filter_dict"] for r in responses.values()
            ),
            "count": sum(r["meta"]["count"] for r in responses.values()),
            "db_response_time_ms": sum(
                r["meta"]["db_response_time_ms"] for r in responses.values()
            ),
            "page": req.page,
            "per_page": req.per_page,
        },
        "results": list(concat(r["results"] for r in responses.values())),
        "group_by": [],
    }
    return rv


def exists(collection: MongoCollection, filter_: dict):
    r"""
    Returns True if there are any documents in the collection that meet the filter requirements.
    """
    return collection.count_documents(filter_) > 0


def persist_content_and_get_drs_object(
    content: str,
    description: str,
    username="(anonymous)",
    filename=None,
    content_type="application/json",
    id_ns="json-metadata-in",
    exists_ok=False,
):
    """Persist a Data Repository Service (DRS) object.

    An object may be a blob, analogous to a file, or a bundle, analogous to a folder. Sites register objects,
    and sites must ensure that these objects are accessible to the NMDC data broker.
    An object may be associated with one or more object types, useful for triggering workflows.

    Reference: https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.1.0/docs/#_drs_datatypes
    """
    mdb = get_mongo_db()
    drs_id = local_part(generate_one_id(mdb, ns=id_ns, shoulder="gfs0"))
    filename = filename or drs_id
    PortableFilename(filename)  # validates
    DrsId(drs_id)  # validates

    mdb_fs = GridFS(mdb)
    mdb_fs.put(
        content,
        _id=drs_id,
        filename=filename,
        content_type=content_type,
        encoding="utf-8",
    )
    with tempfile.TemporaryDirectory() as save_dir:
        filepath = str(Path(save_dir).joinpath(filename))
        with open(filepath, "w") as f:
            f.write(content)
        now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat(
            timespec="minutes"
        )
        object_in = DrsObjectIn(
            **drs_metadata_for(
                filepath,
                base={
                    "description": (
                        description
                        + f" (created by/for {username}"
                        + f" at {now_to_the_minute})"
                    ),
                    "access_methods": [{"access_id": drs_id}],
                },
                timestamp=now_to_the_minute,
            )
        )
    self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}"
    return _create_object(
        mdb,
        object_in,
        mgr_site="nmdc-runtime",
        drs_id=drs_id,
        self_uri=self_uri,
        exists_ok=exists_ok,
    )
```
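A usage sketch, assuming `API_HOST_EXTERNAL` is set and MongoDB/GridFS are reachable; the username and payload are hypothetical:

```python
import json

drs_object_doc = persist_content_and_get_drs_object(
    content=json.dumps({"biosample_set": []}),
    description="example metadata submission",
    username="jdoe",
    content_type="application/json",
    exists_ok=True,  # on a checksum collision, return the existing object
)
print(drs_object_doc["self_uri"])  # drs://<HOSTNAME_EXTERNAL>/<drs_id>
```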
```python
def _create_object(
    mdb: MongoDatabase,
    object_in: DrsObjectIn,
    mgr_site,
    drs_id,
    self_uri,
    exists_ok=False,
):
    """Helper function for creating a Data Repository Service (DRS) object."""
    drs_obj = DrsObject(
        **object_in.model_dump(exclude_unset=True),
        id=drs_id,
        self_uri=self_uri,
    )
    doc = drs_obj.model_dump(exclude_unset=True)
    doc["_mgr_site"] = mgr_site  # manager site
    try:
        mdb.objects.insert_one(doc)
    except DuplicateKeyError as e:
        if e.details["keyPattern"] == {"checksums.type": 1, "checksums.checksum": 1}:
            if exists_ok:
                return mdb.objects.find_one(
                    {
                        "checksums": {
                            "$elemMatch": {
                                "type": e.details["keyValue"]["checksums.type"],
                                "checksum": e.details["keyValue"]["checksums.checksum"],
                            }
                        }
                    }
                )
            else:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"provided checksum matches existing object: {e.details['keyValue']}",
                )
        else:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="duplicate key error",
            )
    return doc
```
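The `DuplicateKeyError` branch above presupposes a unique compound index on the checksum fields of the `objects` collection, presumably created elsewhere during database bootstrapping. A sketch of that assumption, for context:

```python
# Assumption: an index like this exists on `objects`. Without it, inserting two
# objects with identical checksums would succeed, and the 409-on-duplicate
# behavior above would never trigger.
mdb.objects.create_index(
    [("checksums.type", 1), ("checksums.checksum", 1)],
    unique=True,
)
```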
```python
def _claim_job(job_id: str, mdb: MongoDatabase, site: Site):
    r"""
    Create an operation whereby site `site` claims the job with id `job_id`, after
    checking that the site has the capabilities required by the job's workflow.
    """
    job_doc = raise404_if_none(mdb.jobs.find_one({"id": job_id}))
    job = Job(**job_doc)
    # check that site satisfies the job's workflow's required capabilities.
    capabilities_required = job.workflow.capability_ids or []
    for cid in capabilities_required:
        if cid not in site.capability_ids:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail=f"client site does not have capability {cid} required to claim job",
            )

    # For now, allow site to claim same job multiple times,
    # to re-submit results given same job input config.
    job_op_for_site = mdb.operations.find_one(
        {"metadata.job.id": job.id, "metadata.site_id": site.id}
    )
    if job_op_for_site is not None:
        # raise HTTPException(
        #     status_code=status.HTTP_409_CONFLICT,
        #     detail={
        #         "msg": (
        #             f"client site already claimed job -- "
        #             f"see operation {job_op_for_site['id']}"
        #         ),
        #         "id": job_op_for_site["id"],
        #     },
        # )
        pass

    op_id = generate_one_id(mdb, "op")
    job.claims = (job.claims or []) + [JobClaim(op_id=op_id, site_id=site.id)]
    op = Operation[ResultT, JobOperationMetadata](
        **{
            "id": op_id,
            "expire_time": expiry_dt_from_now(days=30),
            "metadata": {
                "job": Job(
                    **{
                        "id": job.id,
                        "workflow": job.workflow,
                        "config": job.config,
                    }
                ).model_dump(exclude_unset=True),
                "site_id": site.id,
                "model": dotted_path_for(JobOperationMetadata),
            },
        }
    )
    mdb.operations.insert_one(op.model_dump())
    mdb.jobs.replace_one({"id": job.id}, job.model_dump(exclude_unset=True))

    return op.model_dump(exclude_unset=True)
```
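A sketch of a site client claiming a job; the job id is hypothetical, and `site` is assumed to be a `Site` authenticated upstream. The returned operation document is what the site later polls and updates with results:

```python
op_doc = _claim_job("nmdc:sys0abc123", mdb, site)  # hypothetical job id
print(op_doc["id"])                   # operation id for the site to update
print(op_doc["metadata"]["site_id"])  # the claiming site
# The operation expires ~30 days after the claim, per expiry_dt_from_now(days=30).
```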
```python
@lru_cache
def map_nmdc_workflow_id_to_dagster_job_name():
    """Returns a dictionary mapping nmdc_workflow_id to dagster_job_name."""
    return {
        "metadata-in-1.0.0": "apply_metadata_in",
        "export-study-biosamples-as-csv-1.0.0": "export_study_biosamples_metadata",
        "gold_study_to_database": "gold_study_to_database",
    }


def ensure_run_config_data(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
):
    r"""
    Ensures that run_config_data has entries for certain nmdc workflow ids.
    Returns run_config_data.
    """
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "username"],
            user.username,
        )
        return run_config_data
    if nmdc_workflow_id == "gold_study_to_database":
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "export_json_to_drs", "config", "username"],
            user.username,
        )
        return run_config_data
    else:
        return run_config_data
```
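Illustrative effect for the GOLD workflow; the study id is hypothetical, and `toolz.assoc_in` creates the intermediate dicts as needed:

```python
cfg = ensure_run_config_data(
    nmdc_workflow_id="gold_study_to_database",
    nmdc_workflow_inputs=["gold:Gs0114663"],  # hypothetical study id
    run_config_data={},
    mdb=mdb,
    user=user,
)
assert cfg["ops"]["get_gold_study_pipeline_inputs"]["config"]["study_id"] == (
    "gold:Gs0114663"
)
assert cfg["ops"]["export_json_to_drs"]["config"]["username"] == user.username
```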
```python
def inputs_for(nmdc_workflow_id, run_config_data):
    """Returns a URI path for given nmdc_workflow_id, constructed from run_config_data."""
    if nmdc_workflow_id == "metadata-in-1.0.0":
        return [
            "/objects/"
            + get_in(["ops", "get_json_in", "config", "object_id"], run_config_data)
        ]
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        return [
            "/studies/"
            + get_in(
                ["ops", "get_study_biosamples_metadata", "config", "study_id"],
                run_config_data,
            )
        ]
    if nmdc_workflow_id == "gold_study_to_database":
        return [
            "/studies/"
            + get_in(
                ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
                run_config_data,
            )
        ]


def _request_dagster_run(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    extra_run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
    repository_location_name=None,
    repository_name=None,
):
    r"""
    Requests a Dagster run using the specified parameters.
    Returns a json dictionary indicating the job's success or failure.
    This is a generic wrapper.
    """
    dagster_job_name = map_nmdc_workflow_id_to_dagster_job_name()[nmdc_workflow_id]

    extra_run_config_data = ensure_run_config_data(
        nmdc_workflow_id, nmdc_workflow_inputs, extra_run_config_data, mdb, user
    )

    # add REQUESTED RunEvent
    nmdc_run_id = _add_run_requested_event(
        run_spec=RunUserSpec(
            job_id=nmdc_workflow_id,
            run_config=extra_run_config_data,
            inputs=inputs_for(nmdc_workflow_id, extra_run_config_data),
        ),
        mdb=mdb,
        user=user,
    )

    dagster_client = get_dagster_graphql_client()
    try:
        dagster_run_id: str = dagster_client.submit_job_execution(
            dagster_job_name,
            repository_location_name=repository_location_name,
            repository_name=repository_name,
            run_config=extra_run_config_data,
        )

        # add STARTED RunEvent
        _add_run_started_event(run_id=nmdc_run_id, mdb=mdb)
        mdb.run_events.find_one_and_update(
            filter={"run.id": nmdc_run_id, "type": "STARTED"},
            update={"$set": {"run.facets.nmdcRuntime_dagsterRunId": dagster_run_id}},
            sort=[("time", -1)],
        )

        return {"type": "success", "detail": {"run_id": nmdc_run_id}}
    except DagsterGraphQLClientError as exc:
        # add FAIL RunEvent
        _add_run_fail_event(run_id=nmdc_run_id, mdb=mdb)

        return {
            "type": "error",
            "detail": {"run_id": nmdc_run_id, "error_detail": str(exc)},
        }


def _get_dagster_run_status(run_id: str):
    r"""
    Returns the status (either "success" or "error") of a requested Dagster run.
    """
    dagster_client = get_dagster_graphql_client()
    try:
        run_status: DagsterRunStatus = dagster_client.get_run_status(run_id)
        return {"type": "success", "detail": str(run_status.value)}
    except DagsterGraphQLClientError as exc:
        return {"type": "error", "detail": str(exc)}


def check_action_permitted(username: str, action: str):
    """Returns True if a Mongo database action is "allowed" and "not denied"."""
    db: MongoDatabase = get_mongo_db()
    filter_ = {"username": username, "action": action}
    denied = db["_runtime.api.deny"].find_one(filter_) is not None
    allowed = db["_runtime.api.allow"].find_one(filter_) is not None
    return (not denied) and allowed
```