nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (98)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +7 -8
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +1 -22
  76. nmdc_runtime/site/ops.py +60 -152
  77. nmdc_runtime/site/repository.py +0 -112
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/util.py +3 -47
  87. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  88. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  89. nmdc_runtime/site/translation/emsl.py +0 -43
  90. nmdc_runtime/site/translation/gold.py +0 -53
  91. nmdc_runtime/site/translation/jgi.py +0 -32
  92. nmdc_runtime/site/translation/util.py +0 -132
  93. nmdc_runtime/site/validation/jgi.py +0 -43
  94. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  95. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  96. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  97. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  98. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,774 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+ from datetime import datetime
5
+ from functools import lru_cache
6
+ from json import JSONDecodeError
7
+ from pathlib import Path
8
+ from time import time_ns
9
+ from typing import Dict, List, Optional, Set, Tuple
10
+ from zoneinfo import ZoneInfo
11
+
12
+ from bson import json_util
13
+ from dagster import DagsterRunStatus
14
+ from dagster_graphql import DagsterGraphQLClientError
15
+ from fastapi import HTTPException
16
+ from gridfs import GridFS
17
+ from nmdc_runtime.api.core.idgen import generate_one_id, local_part
18
+ from nmdc_runtime.api.core.util import (
19
+ dotted_path_for,
20
+ expiry_dt_from_now,
21
+ raise404_if_none,
22
+ )
23
+ from nmdc_runtime.api.db.mongo import get_mongo_db
24
+ from nmdc_runtime.api.models.job import Job, JobClaim, JobOperationMetadata
25
+ from nmdc_runtime.api.models.object import (
26
+ DrsId,
27
+ DrsObject,
28
+ DrsObjectIn,
29
+ PortableFilename,
30
+ )
31
+ from nmdc_runtime.api.models.operation import Operation
32
+ from nmdc_runtime.api.models.run import (
33
+ RunUserSpec,
34
+ _add_run_fail_event,
35
+ _add_run_requested_event,
36
+ _add_run_started_event,
37
+ get_dagster_graphql_client,
38
+ )
39
+ from nmdc_runtime.api.models.site import Site
40
+ from nmdc_runtime.api.models.user import User
41
+ from nmdc_runtime.api.models.util import (
42
+ FindRequest,
43
+ ListRequest,
44
+ ResultT,
45
+ )
46
+ from nmdc_runtime.util import drs_metadata_for
47
+ from pymongo.collection import Collection as MongoCollection
48
+ from pymongo.database import Database as MongoDatabase
49
+ from pymongo.errors import DuplicateKeyError
50
+ from starlette import status
51
+ from toolz import assoc_in, concat, dissoc, get_in, merge
52
+
53
+ BASE_URL_INTERNAL = os.getenv("API_HOST")
54
+ BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
55
+ HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]
56
+
57
+
58
def does_num_matching_docs_exceed_threshold(
    collection: MongoCollection, filter_: dict, threshold: int
) -> bool:
    """Report whether more than `threshold` documents in `collection` match `filter_`.

    The count is capped (via the `limit` kwarg) at `threshold + 1`, so the check
    stays cheap even on very large collections: reaching the cap is enough to
    prove the threshold is exceeded, without a full scan.

    Raises `ValueError` if `threshold` is negative.
    """
    if threshold < 0:
        raise ValueError("Threshold must be at least 0.")

    capped_count = collection.count_documents(filter=filter_, limit=threshold + 1)
    return capped_count > threshold
70
+
71
+
72
def check_filter(filter_: str):
    """Validate that `filter_` parses as a JSON object; return the stripped string.

    Raises an HTTP 422 error when the string is not brace-delimited or is not
    parsable by `bson.json_util.loads`.
    """
    candidate = filter_.strip()
    is_brace_delimited = candidate.startswith("{") and candidate.endswith("}")
    if not is_brace_delimited:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"The given `filter` is not a valid JSON object, which must start with '{{' and end with '}}'.",
        )
    try:
        json_util.loads(candidate)
    except JSONDecodeError as e:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Given `filter` is not valid JSON: {e}",
        )
    return candidate
88
+
89
+
90
def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
    r"""
    Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.

    Note: If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter
    criteria than can fit on a page of that size, this function will paginate the resources.

    Pagination is keyset-based: each page token stored in `mdb.page_tokens` records the
    last `id` seen, and the next page filters on `{id: {"$gt": last_id}}`. Page tokens
    are single-use -- they are deleted as soon as they are redeemed.

    Raises an HTTP 400 error if `req.page_token` does not match a stored token for this
    collection.
    """

    # Prefer paginating on `id`; fall back to `_id` for collections without an `id` index.
    id_field = "id"
    if "id_1" not in mdb[collection_name].index_information():
        logging.warning(
            f"list_resources: no index set on 'id' for collection {collection_name}"
        )
        id_field = (
            "_id"  # currently expected for `functional_annotation_agg` collection
        )
    max_page_size = req.max_page_size
    filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
    # Always include the pagination id field in any projection, so the last id of a
    # page can be recorded below.
    projection = (
        list(set(comma_separated_values(req.projection)) | {id_field})
        if req.projection
        else None
    )
    if req.page_token:
        # Redeem (and consume) the single-use page token.
        doc = mdb.page_tokens.find_one({"_id": req.page_token, "ns": collection_name})
        if doc is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST, detail="Bad page_token"
            )
        last_id = doc["last_id"]
        mdb.page_tokens.delete_one({"_id": req.page_token})
    else:
        last_id = None
    if last_id is not None:
        # Resume after the last document of the previous page.
        if id_field in filter_:
            filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
        else:
            filter_ = merge(filter_, {id_field: {"$gt": last_id}})

    # Determine whether we will paginate the results.
    #
    # Note: We will paginate them unless either:
    # - the `max_page_size` is not a positive integer
    # - the number of documents matching the filter does not exceed `max_page_size`
    #
    will_paginate = True
    if not isinstance(max_page_size, int):
        will_paginate = False
    elif max_page_size < 1:
        will_paginate = False
    elif not does_num_matching_docs_exceed_threshold(
        collection=mdb[collection_name], filter_=filter_, threshold=max_page_size
    ):
        will_paginate = False

    if not will_paginate:
        # Everything fits on one "page": return all matches, no token.
        rv = {
            "resources": list(
                mdb[collection_name].find(filter=filter_, projection=projection)
            )
        }
        return rv
    else:
        # Fetch one page, sorted by the pagination id so keyset resumption works.
        resources = list(
            mdb[collection_name].find(
                filter=filter_,
                projection=projection,
                limit=max_page_size,
                sort=[(id_field, 1)],
                allow_disk_use=True,
            )
        )
        last_id = resources[-1][id_field]
        token = generate_one_id(mdb, "page_tokens")
        # TODO unify with `/queries:run` query continuation model
        #   => {_id: cursor/token, query: <full query>, last_id: <>, last_modified: <>}
        mdb.page_tokens.insert_one(
            {"_id": token, "ns": collection_name, "last_id": last_id}
        )
        return {"resources": resources, "next_page_token": token}
170
+
171
+
172
def coerce_to_float_if_possible(val):
    r"""
    Returns `val` converted to a floating-point number if the conversion succeeds;
    otherwise, returns `val` unchanged.

    (The previous docstring incorrectly claimed a `ValueError` was raised on
    failure; this function deliberately falls back to the original value.)
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        # ValueError: unparsable strings (e.g. "abc").
        # TypeError: non-numeric, non-string inputs (e.g. None, dict).
        return val
181
+
182
+
183
def comma_separated_values(s: str):
    r"""
    Splits `s` on commas and returns the pieces, each stripped of surrounding
    whitespace.

    >>> comma_separated_values("apple, banana, cherry")
    ['apple', 'banana', 'cherry']
    """
    return list(map(str.strip, s.split(",")))
194
+
195
+
196
def get_mongo_filter(filter_str):
    r"""
    Convert a str in the domain-specific language (DSL) solicited by `nmdc_runtime.api.models.util.FindRequest.filter`
    -- i.e., a comma-separated list of `attribute:value` pairs, where the `value` can include a comparison operator
    (e.g. `>=`) and where if the attribute is of type _string_ and has the suffix `.search` appended to its name
    then the server should perform a full-text search
    -- to a corresponding MongoDB filter representation for e.g. passing to a collection `find` call.

    Raises an HTTP 400 error if any pair lacks a `:` separator.
    """
    filter_ = {}
    if not filter_str:
        return filter_

    pairs = comma_separated_values(filter_str)
    if not all(len(split) == 2 for split in (p.split(":", maxsplit=1) for p in pairs)):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Filter must be of form: attribute:spec[,attribute:spec]*",
        )

    # Bug fix: this used to iterate an unordered *set* of (op, key) pairs, so a
    # spec like "<=5" could match "<" before "<=" (set iteration order varies
    # under string-hash randomization), yielding {"$lt": "=5"} instead of
    # {"$lte": 5.0}. Check two-character operators before their one-character
    # prefixes by using an ordered list.
    comparison_ops = [("<=", "$lte"), (">=", "$gte"), ("<", "$lt"), (">", "$gt")]
    for attr, spec in (p.split(":", maxsplit=1) for p in pairs):
        if attr.endswith(".search"):
            # `<attr>.search:<spec>` requests a regex match on the bare attribute.
            actual_attr = attr[: -len(".search")]
            filter_[actual_attr] = {"$regex": spec}
        else:
            for op, key in comparison_ops:
                if spec.startswith(op):
                    filter_[attr] = {key: coerce_to_float_if_possible(spec[len(op) :])}
                    break
            else:
                # No comparison operator: match the literal value.
                filter_[attr] = spec
    return filter_
227
+
228
+
229
def get_mongo_sort(sort_str) -> Optional[List[Tuple[str, int]]]:
    """
    Parse `sort_str`, a str of the form "attribute:spec[,attribute:spec]*",
    where spec is `asc` (ascending -- the default if no spec) or `desc` (descending),
    and return a value suitable to pass as a `sort` kwarg to a mongo collection `find` call.

    Returns None when `sort_str` is empty. Raises an HTTP 400 error on an
    unrecognized spec.
    """
    if not sort_str:
        return None

    # Map each recognized spec (including "no spec") to a mongo sort direction.
    directions = {"": 1, "asc": 1, "desc": -1}
    sort_ = []
    for pair in comma_separated_values(sort_str):
        attr, _, spec = pair.partition(":")
        if spec not in directions:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                # Bug fix: a stray trailing comma previously made `detail` a
                # 1-tuple containing the message instead of the message itself.
                detail=(
                    "Sort must be of form: attribute:spec[,attribute:spec]* "
                    "where spec is `asc` (ascending -- the default if no spec) "
                    "or `desc` (descending)."
                ),
            )
        sort_.append((attr, directions[spec]))
    return sort_
260
+
261
+
262
def strip_oid(doc: dict) -> dict:
    r"""
    Returns a shallow copy of the given dictionary with the Mongo-internal
    `_id` key (if present) omitted.
    """
    return {key: value for key, value in doc.items() if key != "_id"}
267
+
268
+
269
def timeit(cursor):
    """Exhaust `cursor` into a list; return the list and the elapsed time in milliseconds."""
    started_at = time_ns()
    collected = list(cursor)
    elapsed_ns = time_ns() - started_at
    return collected, int(round(elapsed_ns / 1e6))
275
+
276
+
277
def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
    """Find nmdc schema collection entities that match the FindRequest.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).

    Pagination:
    - `req.page` selects skip/limit ("page-based") pagination, capped at 10,000 items deep;
    - otherwise `req.cursor` selects keyset ("cursor-based") pagination on `id`,
      where the special cursor value "*" means "start from the beginning".

    Raises an HTTP 418 error for the not-yet-implemented `group_by`/`search` params,
    and an HTTP 400 error for a bad cursor, paging too deep, or a collection without
    an `id` index (cursor mode only).
    """
    if req.group_by:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail="I don't yet know how to ?group_by=",
        )
    if req.search:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail=(
                "I don't yet know how to ?search=. "
                "Use ?filter=<attribute>.search:<spec> instead."
            ),
        )

    filter_ = get_mongo_filter(req.filter)
    # Always project `id` (when projecting at all) so pagination bookkeeping works.
    projection = (
        list(set(comma_separated_values(req.fields)) | {"id"}) if req.fields else None
    )
    sort_ = get_mongo_sort(req.sort)

    total_count = mdb[collection_name].count_documents(filter=filter_)

    if req.page:
        skip = (req.page - 1) * req.per_page
        if skip > 10_000:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Use cursor-based pagination for paging beyond 10,000 items",
            )
        limit = req.per_page
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_,
                skip=skip,
                limit=limit,
                sort=sort_,
                projection=projection,
            )
        )
        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": [[a, s] for a, s in sort_] if sort_ else None,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields

    else:  # req.cursor is not None
        if req.cursor != "*":
            # A concrete cursor is a single-use page token; redeem and consume it.
            doc = mdb.page_tokens.find_one({"_id": req.cursor, "ns": collection_name})
            if doc is None:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST, detail="Bad cursor value"
                )
            last_id = doc["last_id"]
            mdb.page_tokens.delete_one({"_id": req.cursor})
        else:
            last_id = None

        if last_id is not None:
            # Resume after the last document of the previous page.
            if "id" in filter_:
                filter_["id"] = merge(filter_["id"], {"$gt": last_id})
            else:
                filter_ = merge(filter_, {"id": {"$gt": last_id}})

        if "id_1" not in mdb[collection_name].index_information():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Cursor-based pagination is not enabled for this resource.",
            )

        limit = req.per_page
        sort_for_cursor = (sort_ or []) + [("id", 1)]
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_, limit=limit, sort=sort_for_cursor, projection=projection
            )
        )

        if results:
            last_id = results[-1]["id"]

            # Is this the last id overall? Then next_cursor should be None.
            #
            # Bug fix: build the look-ahead filter without mutating `filter_` in
            # place; the old code leaked the `{"$gt": last_id}` clause into the
            # reported `meta.mongo_filter_dict` when "id" was already filtered on.
            if "id" in filter_:
                filter_eager = merge(
                    filter_, {"id": merge(filter_["id"], {"$gt": last_id})}
                )
            else:
                filter_eager = merge(filter_, {"id": {"$gt": last_id}})
            more_results = (
                mdb[collection_name].count_documents(filter=filter_eager, limit=limit)
                > 0
            )
        else:
            # Bug fix: an empty page used to raise IndexError on `results[-1]`;
            # with no results there can be no next page.
            more_results = False

        if more_results:
            token = generate_one_id(mdb, "page_tokens")
            mdb.page_tokens.insert_one(
                {"_id": token, "ns": collection_name, "last_id": last_id}
            )
        else:
            token = None

        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": sort_for_cursor,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": None,
                "per_page": req.per_page,
                "next_cursor": token,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields
    return rv
402
+
403
+
404
def find_resources_spanning(
    req: FindRequest, mdb: MongoDatabase, collection_names: Set[str]
):
    """Find nmdc schema collection entities -- here, across multiple collections -- that match the FindRequest.

    This is useful for collections that house documents that are subclasses of a common ancestor class.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).
    """
    # Cursor-based pagination cannot span collections, so insist on page-based.
    if req.cursor or not req.page:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="This resource only supports page-based pagination",
        )

    if not collection_names:
        # Nothing to search: an empty result set with zeroed counters.
        return {
            "meta": {
                "mongo_filter_dict": get_mongo_filter(req.filter),
                "count": 0,
                "db_response_time_ms": 0,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [],
            "group_by": [],
        }

    # Run the per-collection queries, then fold their metadata together.
    responses = [find_resources(req, mdb, name) for name in collection_names]
    metas = [r["meta"] for r in responses]
    return {
        "meta": {
            "mongo_filter_dict": metas[0]["mongo_filter_dict"],
            "count": sum(m["count"] for m in metas),
            "db_response_time_ms": sum(m["db_response_time_ms"] for m in metas),
            "page": req.page,
            "per_page": req.per_page,
        },
        "results": [doc for r in responses for doc in r["results"]],
        "group_by": [],
    }
449
+
450
+
451
def exists(collection: MongoCollection, filter_: dict):
    r"""
    Returns True when at least one document in the collection matches `filter_`.
    """
    matching_count = collection.count_documents(filter_)
    return matching_count > 0
456
+
457
+
458
def persist_content_and_get_drs_object(
    content: str,
    description: str,
    username="(anonymous)",
    filename=None,
    content_type="application/json",
    id_ns="json-metadata-in",
    exists_ok=False,
):
    """Persist a Data Repository Service (DRS) object.

    An object may be a blob, analogous to a file, or a bundle, analogous to a folder. Sites register objects,
    and sites must ensure that these objects are accessible to the NMDC data broker.
    An object may be associated with one or more object types, useful for triggering workflows.

    The content is stored twice here: once in GridFS (under the minted DRS id), and
    once as metadata in the `objects` collection (via `_create_object`).

    Args:
        content: the payload to persist (written to GridFS and to a temp file
            from which DRS metadata, e.g. checksums, is derived).
        description: human-readable description; the username and a creation
            timestamp are appended to it.
        username: attributed creator; defaults to "(anonymous)".
        filename: stored filename; defaults to the minted DRS id.
        content_type: MIME type recorded for the blob.
        id_ns: id-minting namespace for the DRS id.
        exists_ok: passed through to `_create_object` (see its duplicate handling).

    Reference: https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.1.0/docs/#_drs_datatypes
    """
    mdb = get_mongo_db()
    # Mint a new id in `id_ns` and use its local part as the DRS id.
    drs_id = local_part(generate_one_id(mdb, ns=id_ns, shoulder="gfs0"))
    filename = filename or drs_id
    PortableFilename(filename)  # validates
    DrsId(drs_id)  # validates

    # Store the raw content in GridFS under the DRS id.
    mdb_fs = GridFS(mdb)
    mdb_fs.put(
        content,
        _id=drs_id,
        filename=filename,
        content_type=content_type,
        encoding="utf-8",
    )
    # Write the content to a temp file so `drs_metadata_for` can derive
    # file-based metadata (e.g. size, checksums) from it.
    with tempfile.TemporaryDirectory() as save_dir:
        filepath = str(Path(save_dir).joinpath(filename))
        with open(filepath, "w") as f:
            f.write(content)
        now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat(
            timespec="minutes"
        )
        object_in = DrsObjectIn(
            **drs_metadata_for(
                filepath,
                base={
                    "description": (
                        description
                        + f" (created by/for {username}"
                        + f" at {now_to_the_minute})"
                    ),
                    "access_methods": [{"access_id": drs_id}],
                },
                timestamp=now_to_the_minute,
            )
        )
    self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}"
    return _create_object(
        mdb,
        object_in,
        mgr_site="nmdc-runtime",
        drs_id=drs_id,
        self_uri=self_uri,
        exists_ok=exists_ok,
    )
519
+
520
+
521
def _create_object(
    mdb: MongoDatabase,
    object_in: DrsObjectIn,
    mgr_site,
    drs_id,
    self_uri,
    exists_ok=False,
):
    """Helper function for creating a Data Repository Service (DRS) object.

    Inserts the object document into `mdb.objects` and returns it. If insertion
    hits the unique index on (checksums.type, checksums.checksum) -- i.e., an
    object with identical content already exists -- then:
    - with `exists_ok=True`, the existing document is returned instead;
    - otherwise, an HTTP 409 error is raised.
    Any other duplicate-key error yields an HTTP 500 error.
    """
    drs_obj = DrsObject(
        **object_in.model_dump(exclude_unset=True),
        id=drs_id,
        self_uri=self_uri,
    )
    doc = drs_obj.model_dump(exclude_unset=True)
    doc["_mgr_site"] = mgr_site  # manager site
    try:
        mdb.objects.insert_one(doc)
    except DuplicateKeyError as e:
        # Distinguish a content-checksum collision from any other unique-index
        # violation by inspecting the key pattern reported by the server.
        if e.details["keyPattern"] == {"checksums.type": 1, "checksums.checksum": 1}:
            if exists_ok:
                # Same content already registered: hand back the existing object.
                return mdb.objects.find_one(
                    {
                        "checksums": {
                            "$elemMatch": {
                                "type": e.details["keyValue"]["checksums.type"],
                                "checksum": e.details["keyValue"]["checksums.checksum"],
                            }
                        }
                    }
                )
            else:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"provided checksum matches existing object: {e.details['keyValue']}",
                )
        else:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="duplicate key error",
            )
    return doc
563
+
564
+
565
def _claim_job(job_id: str, mdb: MongoDatabase, site: Site):
    r"""
    Claim the job with id `job_id` on behalf of `site`, creating a new Operation
    through which the site can report progress/results; returns the new operation
    as a dict.

    Raises an HTTP 404 error if no job has the given id, and an HTTP 403 error if
    the site lacks any capability required by the job's workflow.

    Note: a site may currently claim the same job multiple times (e.g. to
    re-submit results given the same job input config); the 409-conflict response
    below is deliberately disabled.
    """
    job_doc = raise404_if_none(mdb.jobs.find_one({"id": job_id}))
    job = Job(**job_doc)
    # check that site satisfies the job's workflow's required capabilities.
    capabilities_required = job.workflow.capability_ids or []
    for cid in capabilities_required:
        if cid not in site.capability_ids:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail=f"client site does not have capability {cid} required to claim job",
            )

    # For now, allow site to claim same job multiple times,
    # to re-submit results given same job input config.
    job_op_for_site = mdb.operations.find_one(
        {"metadata.job.id": job.id, "metadata.site_id": site.id}
    )
    if job_op_for_site is not None:
        # raise HTTPException(
        #     status_code=status.HTTP_409_CONFLICT,
        #     detail={
        #         "msg": (
        #             f"client site already claimed job -- "
        #             f"see operation {job_op_for_site['id']}"
        #         ),
        #         "id": job_op_for_site["id"],
        #     },
        # )
        pass

    op_id = generate_one_id(mdb, "op")
    # Record this claim on the job itself...
    job.claims = (job.claims or []) + [JobClaim(op_id=op_id, site_id=site.id)]
    # ...and create the corresponding operation, expiring 30 days from now. The
    # operation metadata embeds a trimmed copy of the job (id, workflow, config).
    op = Operation[ResultT, JobOperationMetadata](
        **{
            "id": op_id,
            "expire_time": expiry_dt_from_now(days=30),
            "metadata": {
                "job": Job(
                    **{
                        "id": job.id,
                        "workflow": job.workflow,
                        "config": job.config,
                    }
                ).model_dump(exclude_unset=True),
                "site_id": site.id,
                "model": dotted_path_for(JobOperationMetadata),
            },
        }
    )
    mdb.operations.insert_one(op.model_dump())
    mdb.jobs.replace_one({"id": job.id}, job.model_dump(exclude_unset=True))

    return op.model_dump(exclude_unset=True)
621
+
622
+
623
@lru_cache
def map_nmdc_workflow_id_to_dagster_job_name():
    """Returns the (cached) mapping from nmdc workflow id to dagster job name."""
    pairs = [
        ("metadata-in-1.0.0", "apply_metadata_in"),
        ("export-study-biosamples-as-csv-1.0.0", "export_study_biosamples_metadata"),
        ("gold_study_to_database", "gold_study_to_database"),
    ]
    return dict(pairs)
631
+
632
+
633
def ensure_run_config_data(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
):
    r"""
    Ensures that `run_config_data` has the entries required by certain nmdc
    workflow ids, and returns it. Workflow ids with no required entries get
    `run_config_data` back unchanged.

    (Fixes a docstring typo -- this returns `run_config_data`, not
    "return_config_data" -- and removes a redundant dangling `else`.)

    Note: `mdb` is currently unused here but kept for interface stability.
    """
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        # This workflow needs the target study id (its sole workflow input)
        # and the requesting user's name in its op config.
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "username"],
            user.username,
        )
        return run_config_data
    if nmdc_workflow_id == "gold_study_to_database":
        # This workflow needs the GOLD study id and the username for DRS export.
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "export_json_to_drs", "config", "username"],
            user.username,
        )
        return run_config_data
    # Unrecognized workflow ids require no extra run-config entries.
    return run_config_data
670
+
671
+
672
def inputs_for(nmdc_workflow_id, run_config_data):
    """Returns a one-element list with the input URI path for `nmdc_workflow_id`,
    constructed from `run_config_data`.

    Returns None for workflow ids this function does not recognize. (The previous
    docstring said "a URI path"; the actual return value is a list.)
    """

    def dig(path):
        # Walk `path` into the nested run-config mapping, returning None if any
        # step is missing (mirrors `toolz.get_in` with its default of None).
        node = run_config_data
        for step in path:
            try:
                node = node[step]
            except (KeyError, IndexError, TypeError):
                return None
        return node

    if nmdc_workflow_id == "metadata-in-1.0.0":
        return ["/objects/" + dig(["ops", "get_json_in", "config", "object_id"])]
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        return [
            "/studies/"
            + dig(["ops", "get_study_biosamples_metadata", "config", "study_id"])
        ]
    if nmdc_workflow_id == "gold_study_to_database":
        return [
            "/studies/"
            + dig(["ops", "get_gold_study_pipeline_inputs", "config", "study_id"])
        ]
    # Explicitly signal "no known inputs" for unrecognized workflow ids
    # (previously an implicit None via falling off the end of the function).
    return None
695
+
696
+
697
def _request_dagster_run(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    extra_run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
    repository_location_name=None,
    repository_name=None,
):
    r"""
    Requests a Dagster run using the specified parameters.
    Returns a json dictionary indicating the job's success or failure.
    This is a generic wrapper.

    Side effects: records REQUESTED and then STARTED (or FAIL) run events in
    `mdb.run_events`. Raises KeyError if `nmdc_workflow_id` has no mapped
    dagster job name.
    """
    dagster_job_name = map_nmdc_workflow_id_to_dagster_job_name()[nmdc_workflow_id]

    # Fill in any run-config entries this workflow requires (e.g. study id, username).
    extra_run_config_data = ensure_run_config_data(
        nmdc_workflow_id, nmdc_workflow_inputs, extra_run_config_data, mdb, user
    )

    # add REQUESTED RunEvent
    nmdc_run_id = _add_run_requested_event(
        run_spec=RunUserSpec(
            job_id=nmdc_workflow_id,
            run_config=extra_run_config_data,
            inputs=inputs_for(nmdc_workflow_id, extra_run_config_data),
        ),
        mdb=mdb,
        user=user,
    )

    dagster_client = get_dagster_graphql_client()
    try:
        dagster_run_id: str = dagster_client.submit_job_execution(
            dagster_job_name,
            repository_location_name=repository_location_name,
            repository_name=repository_name,
            run_config=extra_run_config_data,
        )

        # add STARTED RunEvent
        _add_run_started_event(run_id=nmdc_run_id, mdb=mdb)
        # Attach the Dagster run id to the most recent STARTED event for this run,
        # so callers can correlate the nmdc run with the underlying Dagster run.
        mdb.run_events.find_one_and_update(
            filter={"run.id": nmdc_run_id, "type": "STARTED"},
            update={"$set": {"run.facets.nmdcRuntime_dagsterRunId": dagster_run_id}},
            sort=[("time", -1)],
        )

        return {"type": "success", "detail": {"run_id": nmdc_run_id}}
    except DagsterGraphQLClientError as exc:
        # add FAIL RunEvent
        _add_run_fail_event(run_id=nmdc_run_id, mdb=mdb)

        return {
            "type": "error",
            "detail": {"run_id": nmdc_run_id, "error_detail": str(exc)},
        }
754
+
755
+
756
def _get_dagster_run_status(run_id: str):
    r"""
    Looks up the status of the given Dagster run via the Dagster GraphQL API.

    Returns {"type": "success", "detail": <status string>} on success, or
    {"type": "error", "detail": <error message>} when the GraphQL client fails.
    """
    client = get_dagster_graphql_client()
    try:
        run_status: DagsterRunStatus = client.get_run_status(run_id)
    except DagsterGraphQLClientError as exc:
        return {"type": "error", "detail": str(exc)}
    return {"type": "success", "detail": str(run_status.value)}
766
+
767
+
768
def check_action_permitted(username: str, action: str):
    """Returns True if a Mongo database action is "allowed" and "not denied" for the user."""
    db: MongoDatabase = get_mongo_db()
    criteria = {"username": username, "action": action}
    # A deny rule always wins; with no allow rule, the default is deny.
    if db["_runtime.api.deny"].find_one(criteria) is not None:
        return False
    return db["_runtime.api.allow"].find_one(criteria) is not None