nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131)
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,796 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+ from datetime import datetime
5
+ from functools import lru_cache
6
+ from json import JSONDecodeError
7
+ from pathlib import Path
8
+ from time import time_ns
9
+ from typing import List, Optional, Set, Tuple
10
+ from zoneinfo import ZoneInfo
11
+
12
+ from bson import json_util
13
+ from dagster import DagsterRunStatus
14
+ from dagster_graphql import DagsterGraphQLClientError
15
+ from fastapi import HTTPException
16
+ from gridfs import GridFS
17
+ from nmdc_runtime.api.core.idgen import generate_one_id, local_part
18
+ from nmdc_runtime.api.core.util import (
19
+ dotted_path_for,
20
+ expiry_dt_from_now,
21
+ raise404_if_none,
22
+ )
23
+ from nmdc_runtime.api.db.mongo import get_mongo_db
24
+ from nmdc_runtime.api.models.job import Job, JobClaim, JobOperationMetadata
25
+ from nmdc_runtime.api.models.object import (
26
+ DrsId,
27
+ DrsObject,
28
+ DrsObjectIn,
29
+ PortableFilename,
30
+ )
31
+ from nmdc_runtime.api.models.operation import Operation
32
+ from nmdc_runtime.api.models.run import (
33
+ RunUserSpec,
34
+ _add_run_fail_event,
35
+ _add_run_requested_event,
36
+ _add_run_started_event,
37
+ get_dagster_graphql_client,
38
+ )
39
+ from nmdc_runtime.api.models.site import Site
40
+ from nmdc_runtime.api.models.user import User
41
+ from nmdc_runtime.api.models.util import (
42
+ FindRequest,
43
+ ListRequest,
44
+ ResultT,
45
+ )
46
+ from nmdc_runtime.util import drs_metadata_for
47
+ from pymongo.collection import Collection as MongoCollection
48
+ from pymongo.database import Database as MongoDatabase
49
+ from pymongo.errors import DuplicateKeyError
50
+ from starlette import status
51
+ from toolz import assoc_in, concat, dissoc, get_in, merge
52
+
53
+ BASE_URL_INTERNAL = os.getenv("API_HOST")
54
+ BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
55
+ HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]
56
+
57
+
58
def is_num_matching_docs_within_limit(
    collection: MongoCollection, filter_: dict, limit: int
) -> bool:
    """
    Report whether no more than `limit` documents in `collection` match `filter_`.

    Raises a `ValueError` when `limit` is negative.
    """
    if limit < 0:
        raise ValueError("Limit must be at least 0.")

    # Counting up to `limit + 1` documents is sufficient: reaching that many
    # proves the limit is exceeded, and anything less proves it is not.
    capped_count = collection.count_documents(filter=filter_, limit=limit + 1)
    return capped_count <= limit
75
+
76
+
77
def check_filter(filter_: str):
    """
    Validate that `filter_` is parsable as a JSON object and return it (stripped).

    Raises HTTP 422 when the string does not look like, or does not parse as,
    a JSON object.
    """
    filter_ = filter_.strip()
    looks_like_json_object = filter_.startswith("{") and filter_.endswith("}")
    if not looks_like_json_object:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"The given `filter` is not a valid JSON object, which must start with '{{' and end with '}}'.",
        )
    try:
        # Parse with bson's json_util so Mongo extended-JSON is accepted.
        json_util.loads(filter_)
    except JSONDecodeError as e:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Given `filter` is not valid JSON: {e}",
        )
    return filter_
93
+
94
+
95
def list_resources(
    req: ListRequest, mdb: MongoDatabase, collection_name: str = ""
) -> dict:
    """
    Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.

    `mdb.page_tokens` docs are `{"_id": req.page_token, "ns": collection_name}`. Because `page_token` is globally
    unique, and because the `mdb.page_tokens.find_one({"_id": req.page_token})` document stores `collection_name` in
    the "ns" (namespace) field, the value for `collection_name` stored there takes precedence over any value supplied
    as an argument to this function's `collection_name` parameter.

    If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter criteria than
    can fit on a page of that size, this function will paginate the resources.

    Raises an HTTP 400 error if neither a collection name nor a valid page token is supplied.
    """
    if collection_name == "" and req.page_token is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Must specify a collection name if no page token is supplied.",
        )
    if req.page_token:
        # Resume a paginated listing: the token document records the target
        # collection ("ns") and the last id returned on the previous page.
        doc = mdb.page_tokens.find_one({"_id": req.page_token})
        if doc is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST, detail="`page_token` not found"
            )
        collection_name = doc["ns"]
        last_id = doc["last_id"]
        # Tokens are single-use: consume the token now that its contents are read.
        mdb.page_tokens.delete_one({"_id": req.page_token})
    else:
        last_id = None

    # Sort/continue on "id" when it is indexed; fall back to "_id" otherwise.
    id_field = "id"
    if "id_1" not in mdb[collection_name].index_information():
        logging.warning(
            f"list_resources: no index set on 'id' for collection {collection_name}"
        )
        id_field = "_id"  # expected for `functional_annotation_agg` collection

    max_page_size = req.max_page_size
    # `check_filter` validates that the filter is parsable JSON before loading it.
    filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
    # Always include the id field in the projection: it is needed below to
    # record the last id for the next page token.
    projection = (
        list(set(comma_separated_values(req.projection)) | {id_field})
        if req.projection
        else None
    )
    if last_id is not None:
        # Continue strictly after the last id seen, preserving any
        # user-supplied constraint on the id field.
        if id_field in filter_:
            filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
        else:
            filter_ = merge(filter_, {id_field: {"$gt": last_id}})

    # Determine whether we will paginate the results.
    #
    # Note: We will paginate them unless either (a) the `max_page_size` is less than 1,
    # or (b) the number of documents matching the filter can fit on a single page.
    #
    will_paginate = True
    if max_page_size < 1 or is_num_matching_docs_within_limit(
        collection=mdb[collection_name], filter_=filter_, limit=max_page_size
    ):
        will_paginate = False

    if not will_paginate:
        rv = {
            "resources": list(
                mdb[collection_name].find(filter=filter_, projection=projection)
            )
        }
        return rv
    else:
        resources = list(
            mdb[collection_name].find(
                filter=filter_,
                projection=projection,
                limit=max_page_size,
                sort=[(id_field, 1)],
                allow_disk_use=True,
            )
        )
        # `resources` is non-empty here: pagination implies more than
        # `max_page_size` (>= 1) matching documents.
        last_id = resources[-1][id_field]
        token = generate_one_id(mdb, "page_tokens")
        # TODO unify with `/queries:run` query continuation model
        # => {_id: cursor/token, query: <full query>, last_id: <>, last_modified: <>}
        mdb.page_tokens.insert_one(
            {"_id": token, "ns": collection_name, "last_id": last_id}
        )
        return {"resources": resources, "next_page_token": token}
182
+
183
+
184
def coerce_to_float_if_possible(val):
    r"""
    Return `val` converted to a float when the conversion succeeds;
    otherwise, return `val` unchanged.
    """
    try:
        coerced = float(val)
    except ValueError:
        return val
    return coerced
193
+
194
+
195
def comma_separated_values(s: str):
    r"""
    Split `s` on commas and return the pieces with surrounding whitespace removed.

    >>> comma_separated_values("apple, banana, cherry")
    ['apple', 'banana', 'cherry']
    """
    values = []
    for piece in s.split(","):
        values.append(piece.strip())
    return values
206
+
207
+
208
def get_mongo_filter(filter_str):
    r"""
    Convert a str in the domain-specific language (DSL) solicited by
    `nmdc_runtime.api.models.util.FindRequest.filter` -- i.e., a comma-separated list
    of `attribute:value` pairs, where the `value` can include a comparison operator
    (e.g. `>=`) and where if the attribute is of type _string_ and has the suffix
    `.search` appended to its name then the server should perform a full-text search
    -- to a corresponding MongoDB filter representation for e.g. passing to a
    collection `find` call.

    Returns `{}` for an empty/falsy `filter_str`.
    Raises an HTTP 400 error if any comma-separated item lacks a ":" separator.
    """
    if not filter_str:
        return {}

    # Split on commas and trim whitespace (inline of `comma_separated_values`).
    pairs = [part.strip() for part in filter_str.split(",")]
    components_list = [p.split(":", maxsplit=1) for p in pairs]
    if not all(len(components) == 2 for components in components_list):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Filter must be of form: attribute:spec[,attribute:spec]*",
        )

    def coerce_to_float_if_possible(val):
        # Local copy of the module-level helper so this parser is self-contained.
        try:
            return float(val)
        except ValueError:
            return val

    # Check two-character operators before one-character ones. The previous
    # implementation iterated over a *set* of (op, key) pairs, so whether
    # "<=10" parsed as {"$lte": 10.0} or {"$lt": "=10"} depended on
    # hash-randomized set iteration order; a fixed, longest-first order makes
    # parsing deterministic and correct.
    comparison_ops = [("<=", "$lte"), (">=", "$gte"), ("<", "$lt"), (">", "$gt")]

    filter_ = {}
    for attr, spec in components_list:
        if attr.endswith(".search"):
            # Full-text search request: strip the ".search" suffix and match via regex.
            actual_attr = attr[: -len(".search")]
            filter_[actual_attr] = {"$regex": spec}
        else:
            for op, key in comparison_ops:
                if spec.startswith(op):
                    filter_[attr] = {key: coerce_to_float_if_possible(spec[len(op):])}
                    break
            else:
                # No comparison operator: match the attribute value verbatim.
                filter_[attr] = spec
    return filter_
239
+
240
+
241
def get_mongo_sort(sort_str) -> Optional[List[Tuple[str, int]]]:
    """
    Parse `sort_str`, a str of the form "attribute:spec[,attribute:spec]*",
    where spec is `asc` (ascending -- the default if no spec) or `desc` (descending),
    and return a value suitable to pass as a `sort` kwarg to a mongo collection `find` call.

    Returns `None` for an empty/falsy `sort_str`.
    Raises an HTTP 400 error when a spec is not one of "", "asc", or "desc".
    """
    if not sort_str:
        return None

    # Map each allowed spec to its MongoDB sort direction.
    direction_by_spec = {"": 1, "asc": 1, "desc": -1}

    sort_ = []
    # Split on commas and trim whitespace (inline of `comma_separated_values`).
    for pair in (v.strip() for v in sort_str.split(",")):
        attr, _, spec = pair.partition(":")
        if spec not in direction_by_spec:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                # Fixed: `detail` was previously a one-element *tuple* due to a
                # trailing comma inside the parentheses, so error responses
                # rendered the detail as a list instead of a string.
                detail=(
                    "Sort must be of form: attribute:spec[,attribute:spec]* "
                    "where spec is `asc` (ascending -- the default if no spec) "
                    "or `desc` (descending)."
                ),
            )
        sort_.append((attr, direction_by_spec[spec]))
    return sort_
272
+
273
+
274
def strip_oid(doc: dict) -> dict:
    r"""
    Return a shallow copy of `doc` without its `_id` entry (if any).
    """
    return {key: value for key, value in doc.items() if key != "_id"}
279
+
280
+
281
def timeit(cursor):
    """Materialize `cursor` into a list, returning (results, elapsed_ms)."""
    started = time_ns()
    collected = list(cursor)
    elapsed_ms = int(round((time_ns() - started) / 1e6))
    return collected, elapsed_ms
287
+
288
+
289
def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
    """Find nmdc schema collection entities that match the FindRequest.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).

    Supports two pagination styles:
    - page-number-based (`req.page`): mongo `skip`/`limit`, capped at 10,000 skipped docs;
    - cursor-based (`req.cursor`): single-use tokens stored in `mdb.page_tokens`.
    """
    if req.group_by:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail="I don't yet know how to ?group_by=",
        )
    if req.search:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail=(
                "I don't yet know how to ?search=. "
                "Use ?filter=<attribute>.search:<spec> instead."
            ),
        )

    filter_ = get_mongo_filter(req.filter)
    # Always project "id": it is needed to build the next-page cursor.
    projection = (
        list(set(comma_separated_values(req.fields)) | {"id"}) if req.fields else None
    )
    sort_ = get_mongo_sort(req.sort)

    total_count = mdb[collection_name].count_documents(filter=filter_)

    if req.page:
        skip = (req.page - 1) * req.per_page
        if skip > 10_000:
            # Note: because _page number_-based pagination is currently implemented via MongoDB's `skip` and `limit`
            # parameters, a full (slow) collection scan is performed to skip to the requested page. This scan takes
            # longer and longer as `skip` increases, which is why cursor-based pagination is preferred for large
            # collections.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=(
                    "Use cursor-based pagination for paging beyond 10,000 items. "
                    "That is, instead of specifying the `page` query parameter for this endpoint, "
                    "specify the `cursor` query parameter. In particular, set `cursor` to `*` to get the first page, "
                    "and use the value of `meta.next_cursor` in the response, if not `null`, as the value to which "
                    "you set `cursor` in the next request."
                ),
            )
        limit = req.per_page
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_,
                skip=skip,
                limit=limit,
                sort=sort_,
                projection=projection,
            )
        )
        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": [[a, s] for a, s in sort_] if sort_ else None,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields

    else:  # req.cursor is not None
        if req.cursor != "*":
            doc = mdb.page_tokens.find_one({"_id": req.cursor, "ns": collection_name})
            if doc is None:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST, detail="Bad cursor value"
                )
            last_id = doc["last_id"]
            # Cursors are single-use: consume the token now.
            mdb.page_tokens.delete_one({"_id": req.cursor})
        else:
            last_id = None

        if last_id is not None:
            # Continue strictly after the last id seen, preserving any
            # user-supplied constraint on "id".
            if "id" in filter_:
                filter_["id"] = merge(filter_["id"], {"$gt": last_id})
            else:
                filter_ = merge(filter_, {"id": {"$gt": last_id}})

        if "id_1" not in mdb[collection_name].index_information():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Cursor-based pagination is not enabled for this resource.",
            )

        limit = req.per_page
        sort_for_cursor = (sort_ or []) + [("id", 1)]
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_, limit=limit, sort=sort_for_cursor, projection=projection
            )
        )

        if results:
            last_id = results[-1]["id"]

            # Is this the last id overall? Then next_cursor should be None.
            # Build `filter_eager` as a *new* dict. (Fixes a bug: the previous
            # code assigned into `filter_eager["id"]` while `filter_eager` still
            # aliased `filter_`, so the "$gt" continuation constraint leaked
            # into the filter echoed back in `meta.mongo_filter_dict`.)
            if "id" in filter_:
                filter_eager = merge(
                    filter_, {"id": merge(filter_["id"], {"$gt": last_id})}
                )
            else:
                filter_eager = merge(filter_, {"id": {"$gt": last_id}})
            more_results = (
                mdb[collection_name].count_documents(filter=filter_eager, limit=limit)
                > 0
            )
            if more_results:
                token = generate_one_id(mdb, "page_tokens")
                mdb.page_tokens.insert_one(
                    {"_id": token, "ns": collection_name, "last_id": last_id}
                )
            else:
                token = None
        else:
            # No documents matched this cursor page. (Fixes an IndexError the
            # previous code raised on `results[-1]` for an empty page.)
            token = None

        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": sort_for_cursor,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": None,
                "per_page": req.per_page,
                "next_cursor": token,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields
    return rv
424
+
425
+
426
def find_resources_spanning(
    req: FindRequest, mdb: MongoDatabase, collection_names: Set[str]
):
    """Find nmdc schema collection entities -- here, across multiple collections -- that match the FindRequest.

    This is useful for collections that house documents that are subclasses of a common ancestor class.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).
    """
    if req.cursor or not req.page:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="This resource only supports page-based pagination",
        )

    if not collection_names:
        # Nothing to search: return an empty result set with zeroed-out metadata.
        return {
            "meta": {
                "mongo_filter_dict": get_mongo_filter(req.filter),
                "count": 0,
                "db_response_time_ms": 0,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [],
            "group_by": [],
        }

    # Query each collection independently, then merge the per-collection responses.
    per_collection = [find_resources(req, mdb, name) for name in collection_names]

    combined_results = []
    total_count = 0
    total_db_ms = 0
    for response in per_collection:
        combined_results.extend(response["results"])
        total_count += response["meta"]["count"]
        total_db_ms += response["meta"]["db_response_time_ms"]

    return {
        "meta": {
            # Every per-collection response parsed the same request filter,
            # so reporting the first one suffices.
            "mongo_filter_dict": per_collection[0]["meta"]["mongo_filter_dict"],
            "count": total_count,
            "db_response_time_ms": total_db_ms,
            "page": req.page,
            "per_page": req.per_page,
        },
        "results": combined_results,
        "group_by": [],
    }
471
+
472
+
473
def exists(collection: MongoCollection, filter_: dict):
    r"""
    Report whether at least one document in `collection` matches `filter_`.
    """
    num_matching = collection.count_documents(filter_)
    return num_matching > 0
478
+
479
+
480
def persist_content_and_get_drs_object(
    content: str,
    description: str,
    username="(anonymous)",
    filename=None,
    content_type="application/json",
    id_ns="json-metadata-in",
    exists_ok=False,
):
    """Persist a Data Repository Service (DRS) object.

    An object may be a blob, analogous to a file, or a bundle, analogous to a folder. Sites register objects,
    and sites must ensure that these objects are accessible to the NMDC data broker.
    An object may be associated with one or more object types, useful for triggering workflows.

    Stores `content` in GridFS under a newly minted DRS id (`filename` defaults to that id),
    then registers and returns a corresponding DRS object document via `_create_object`.
    If `exists_ok` is True, a checksum collision returns the already-registered object
    instead of raising HTTP 409 (see `_create_object`).

    Reference: https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.1.0/docs/#_drs_datatypes
    """
    mdb = get_mongo_db()
    # Mint a new id in namespace `id_ns`; keep only its local part for the DRS id.
    drs_id = local_part(generate_one_id(mdb, ns=id_ns, shoulder="gfs0"))
    filename = filename or drs_id
    PortableFilename(filename)  # validates
    DrsId(drs_id)  # validates

    # Store the raw content in GridFS, keyed by the DRS id.
    mdb_fs = GridFS(mdb)
    mdb_fs.put(
        content,
        _id=drs_id,
        filename=filename,
        content_type=content_type,
        encoding="utf-8",
    )
    # Write the content to a temp file so `drs_metadata_for` can derive
    # metadata from a real file path (exact fields derived are defined in
    # `nmdc_runtime.util.drs_metadata_for`, not visible here).
    with tempfile.TemporaryDirectory() as save_dir:
        filepath = str(Path(save_dir).joinpath(filename))
        with open(filepath, "w") as f:
            f.write(content)
        now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat(
            timespec="minutes"
        )
        object_in = DrsObjectIn(
            **drs_metadata_for(
                filepath,
                base={
                    "description": (
                        description
                        + f" (created by/for {username}"
                        + f" at {now_to_the_minute})"
                    ),
                    "access_methods": [{"access_id": drs_id}],
                },
                timestamp=now_to_the_minute,
            )
        )
    self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}"
    return _create_object(
        mdb,
        object_in,
        mgr_site="nmdc-runtime",
        drs_id=drs_id,
        self_uri=self_uri,
        exists_ok=exists_ok,
    )
541
+
542
+
543
def _create_object(
    mdb: MongoDatabase,
    object_in: DrsObjectIn,
    mgr_site,
    drs_id,
    self_uri,
    exists_ok=False,
):
    """Helper function for creating a Data Repository Service (DRS) object.

    Inserts the object into `mdb.objects` and returns the inserted document.
    On a duplicate-checksum collision: returns the already-stored object when
    `exists_ok` is True, otherwise raises HTTP 409. Any other duplicate-key
    collision raises HTTP 500.
    """
    drs_obj = DrsObject(
        **object_in.model_dump(exclude_unset=True),
        id=drs_id,
        self_uri=self_uri,
    )
    doc = drs_obj.model_dump(exclude_unset=True)
    doc["_mgr_site"] = mgr_site  # manager site
    try:
        mdb.objects.insert_one(doc)
    except DuplicateKeyError as e:
        # Presumably a unique index on (checksums.type, checksums.checksum)
        # guards against re-registering identical content -- confirm against
        # the index definitions for `mdb.objects`.
        if e.details["keyPattern"] == {"checksums.type": 1, "checksums.checksum": 1}:
            if exists_ok:
                # Look up and return the existing object with the same checksum.
                return mdb.objects.find_one(
                    {
                        "checksums": {
                            "$elemMatch": {
                                "type": e.details["keyValue"]["checksums.type"],
                                "checksum": e.details["keyValue"]["checksums.checksum"],
                            }
                        }
                    }
                )
            else:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"provided checksum matches existing object: {e.details['keyValue']}",
                )
        else:
            # Some other unique index was violated; surface as a server error.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="duplicate key error",
            )
    return doc
585
+
586
+
587
def _claim_job(job_id: str, mdb: MongoDatabase, site: Site):
    r"""
    Claim the job identified by `job_id` on behalf of `site`.

    Verifies that `site` has every capability the job's workflow requires
    (raising HTTP 403 otherwise), appends a new claim to the job document, and
    creates an Operation (expiring in 30 days) that tracks the claimed work.

    Returns the newly created operation as a dict.
    Raises HTTP 404 (via `raise404_if_none`) when no job with `job_id` exists.
    """
    job_doc = raise404_if_none(mdb.jobs.find_one({"id": job_id}))
    job = Job(**job_doc)
    # check that site satisfies the job's workflow's required capabilities.
    capabilities_required = job.workflow.capability_ids or []
    for cid in capabilities_required:
        if cid not in site.capability_ids:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail=f"client site does not have capability {cid} required to claim job",
            )

    # For now, allow site to claim same job multiple times,
    # to re-submit results given same job input config.
    job_op_for_site = mdb.operations.find_one(
        {"metadata.job.id": job.id, "metadata.site_id": site.id}
    )
    if job_op_for_site is not None:
        # raise HTTPException(
        #     status_code=status.HTTP_409_CONFLICT,
        #     detail={
        #         "msg": (
        #             f"client site already claimed job -- "
        #             f"see operation {job_op_for_site['id']}"
        #         ),
        #         "id": job_op_for_site["id"],
        #     },
        # )
        pass

    op_id = generate_one_id(mdb, "op")
    job.claims = (job.claims or []) + [JobClaim(op_id=op_id, site_id=site.id)]
    op = Operation[ResultT, JobOperationMetadata](
        **{
            "id": op_id,
            "expire_time": expiry_dt_from_now(days=30),
            "metadata": {
                # Embed a slimmed-down copy of the job (id/workflow/config only).
                "job": Job(
                    **{
                        "id": job.id,
                        "workflow": job.workflow,
                        "config": job.config,
                    }
                ).model_dump(exclude_unset=True),
                "site_id": site.id,
                "model": dotted_path_for(JobOperationMetadata),
            },
        }
    )
    mdb.operations.insert_one(op.model_dump())
    # Persist the updated claims list on the job document.
    mdb.jobs.replace_one({"id": job.id}, job.model_dump(exclude_unset=True))

    return op.model_dump(exclude_unset=True)
643
+
644
+
645
@lru_cache
def map_nmdc_workflow_id_to_dagster_job_name():
    """Return the (cached) mapping from NMDC workflow id to Dagster job name."""
    workflow_to_job = {
        "metadata-in-1.0.0": "apply_metadata_in",
        "export-study-biosamples-as-csv-1.0.0": "export_study_biosamples_metadata",
        "gold_study_to_database": "gold_study_to_database",
    }
    return workflow_to_job
653
+
654
+
655
def ensure_run_config_data(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
):
    r"""
    Ensure `run_config_data` carries the op-config entries certain NMDC
    workflows require, and return the (possibly augmented) run config.

    Only "export-study-biosamples-as-csv-1.0.0" and "gold_study_to_database"
    need augmentation; any other workflow id gets `run_config_data` back
    unchanged.
    """
    # Each branch lists (path, value) additions; values are computed lazily
    # per-branch so e.g. `nmdc_workflow_inputs[0]` is only read when relevant.
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        additions = [
            (
                ["ops", "get_study_biosamples_metadata", "config", "study_id"],
                nmdc_workflow_inputs[0],
            ),
            (
                ["ops", "get_study_biosamples_metadata", "config", "username"],
                user.username,
            ),
        ]
    elif nmdc_workflow_id == "gold_study_to_database":
        additions = [
            (
                ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
                nmdc_workflow_inputs[0],
            ),
            (
                ["ops", "export_json_to_drs", "config", "username"],
                user.username,
            ),
        ]
    else:
        return run_config_data

    for path, value in additions:
        run_config_data = assoc_in(run_config_data, path, value)
    return run_config_data
692
+
693
+
694
def inputs_for(nmdc_workflow_id, run_config_data):
    """Return a single-element list with the resource URI path for the given
    nmdc_workflow_id, constructed from `run_config_data`.

    Returns None for an unrecognized workflow id.
    """
    # workflow id -> (URI path prefix, key path into the run config).
    prefix_and_path_by_workflow = {
        "metadata-in-1.0.0": (
            "/objects/",
            ["ops", "get_json_in", "config", "object_id"],
        ),
        "export-study-biosamples-as-csv-1.0.0": (
            "/studies/",
            ["ops", "get_study_biosamples_metadata", "config", "study_id"],
        ),
        "gold_study_to_database": (
            "/studies/",
            ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
        ),
    }
    if nmdc_workflow_id not in prefix_and_path_by_workflow:
        # Matches the original fall-through behavior for unknown workflow ids.
        return None
    prefix, key_path = prefix_and_path_by_workflow[nmdc_workflow_id]
    return [prefix + get_in(key_path, run_config_data)]
718
+
719
def _request_dagster_run(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    extra_run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
    repository_location_name=None,
    repository_name=None,
):
    r"""
    Requests a Dagster run using the specified parameters.
    Returns a json dictionary indicating the job's success or failure.
    This is a generic wrapper.

    On success: {"type": "success", "detail": {"run_id": <nmdc run id>}}.
    On submission failure: {"type": "error", "detail": {"run_id": ..., "error_detail": ...}}.

    Raises KeyError if `nmdc_workflow_id` has no known Dagster job mapping.
    """
    dagster_job_name = map_nmdc_workflow_id_to_dagster_job_name()[nmdc_workflow_id]

    # Augment the run config with workflow-specific required entries.
    extra_run_config_data = ensure_run_config_data(
        nmdc_workflow_id, nmdc_workflow_inputs, extra_run_config_data, mdb, user
    )

    # add REQUESTED RunEvent
    nmdc_run_id = _add_run_requested_event(
        run_spec=RunUserSpec(
            job_id=nmdc_workflow_id,
            run_config=extra_run_config_data,
            inputs=inputs_for(nmdc_workflow_id, extra_run_config_data),
        ),
        mdb=mdb,
        user=user,
    )

    dagster_client = get_dagster_graphql_client()
    try:
        dagster_run_id: str = dagster_client.submit_job_execution(
            dagster_job_name,
            repository_location_name=repository_location_name,
            repository_name=repository_name,
            run_config=extra_run_config_data,
        )

        # add STARTED RunEvent
        _add_run_started_event(run_id=nmdc_run_id, mdb=mdb)
        # Attach the Dagster run id to the most recent STARTED event for this run.
        mdb.run_events.find_one_and_update(
            filter={"run.id": nmdc_run_id, "type": "STARTED"},
            update={"$set": {"run.facets.nmdcRuntime_dagsterRunId": dagster_run_id}},
            sort=[("time", -1)],
        )

        return {"type": "success", "detail": {"run_id": nmdc_run_id}}
    except DagsterGraphQLClientError as exc:
        # add FAIL RunEvent
        _add_run_fail_event(run_id=nmdc_run_id, mdb=mdb)

        return {
            "type": "error",
            "detail": {"run_id": nmdc_run_id, "error_detail": str(exc)},
        }
776
+
777
+
778
def _get_dagster_run_status(run_id: str):
    r"""
    Look up the status of the Dagster run identified by `run_id`.

    Returns {"type": "success", "detail": <status string>} when the lookup
    succeeds, or {"type": "error", "detail": <error message>} when the
    Dagster GraphQL client raises.
    """
    client = get_dagster_graphql_client()
    try:
        status_of_run: DagsterRunStatus = client.get_run_status(run_id)
    except DagsterGraphQLClientError as exc:
        return {"type": "error", "detail": str(exc)}
    return {"type": "success", "detail": str(status_of_run.value)}
788
+
789
+
790
def check_action_permitted(username: str, action: str):
    """Returns True if a Mongo database action is "allowed" and "not denied"."""
    database: MongoDatabase = get_mongo_db()
    criteria = {"username": username, "action": action}
    # Both lists are always consulted; an entry in the deny list wins.
    denied = database["_runtime.api.deny"].find_one(criteria) is not None
    allowed = database["_runtime.api.allow"].find_one(criteria) is not None
    return allowed and not denied