nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,817 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+ from datetime import datetime
5
+ from functools import lru_cache
6
+ from json import JSONDecodeError
7
+ from pathlib import Path
8
+ from time import time_ns
9
+ from typing import List, Optional, Set, Tuple
10
+ from zoneinfo import ZoneInfo
11
+
12
+ from bson import json_util
13
+ from dagster import DagsterRunStatus
14
+ from dagster_graphql import DagsterGraphQLClientError
15
+ from fastapi import HTTPException
16
+ from gridfs import GridFS
17
+ from nmdc_runtime.api.core.idgen import generate_one_id, local_part
18
+ from nmdc_runtime.api.core.util import (
19
+ dotted_path_for,
20
+ expiry_dt_from_now,
21
+ raise404_if_none,
22
+ )
23
+ from nmdc_runtime.api.db.mongo import get_mongo_db
24
+ from nmdc_runtime.api.models.job import Job, JobClaim, JobOperationMetadata
25
+ from nmdc_runtime.api.models.object import (
26
+ DrsId,
27
+ DrsObject,
28
+ DrsObjectIn,
29
+ PortableFilename,
30
+ )
31
+ from nmdc_runtime.api.models.operation import Operation
32
+ from nmdc_runtime.api.models.run import (
33
+ RunUserSpec,
34
+ _add_run_fail_event,
35
+ _add_run_requested_event,
36
+ _add_run_started_event,
37
+ get_dagster_graphql_client,
38
+ )
39
+ from nmdc_runtime.api.models.site import Site
40
+ from nmdc_runtime.api.models.user import User
41
+ from nmdc_runtime.api.models.util import (
42
+ FindRequest,
43
+ ListRequest,
44
+ ResultT,
45
+ )
46
+ from nmdc_runtime.util import drs_metadata_for
47
+ from pymongo.collection import Collection as MongoCollection
48
+ from pymongo.database import Database as MongoDatabase
49
+ from pymongo.errors import DuplicateKeyError
50
+ from starlette import status
51
+ from toolz import assoc_in, concat, dissoc, get_in, merge
52
+
53
# Base URL used for requests made from inside the deployment (e.g. container-to-container).
BASE_URL_INTERNAL = os.getenv("API_HOST")
# Base URL advertised to external clients; also used below to build DRS `self_uri` values.
BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
# Hostname portion of the external base URL (scheme stripped, if one is present).
# NOTE(review): if API_HOST_EXTERNAL is unset, this raises AttributeError at import
# time (`None.split`) — confirm the deployment always provides this env var.
HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]
56
+
57
+
58
def is_num_matching_docs_within_limit(
    collection: MongoCollection, filter_: dict, limit: int
) -> bool:
    """
    Report whether the number of documents in `collection` matching `filter_`
    is no greater than `limit`.

    Raises `ValueError` when `limit` is negative. Only counts up to
    `limit + 1` documents — just enough to decide the question cheaply.
    """
    if limit < 0:
        raise ValueError("Limit must be at least 0.")

    # A capped count of at most `limit` means the matches fit within the limit;
    # a count of `limit + 1` means they exceed it.
    capped_count = collection.count_documents(filter=filter_, limit=limit + 1)
    return capped_count <= limit
75
+
76
+
77
def check_filter(filter_: str):
    """A pass-through check that `filter_` parses as a JSON object; returns the stripped string.

    Raises HTTP 422 when the trimmed string is not brace-delimited or fails
    (extended-)JSON parsing via `bson.json_util`.
    """
    trimmed = filter_.strip()
    looks_like_object = trimmed.startswith("{") and trimmed.endswith("}")
    if not looks_like_object:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="The given `filter` is not a valid JSON object, which must start with '{' and end with '}'.",
        )
    try:
        json_util.loads(trimmed)
    except JSONDecodeError as e:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Given `filter` is not valid JSON: {e}",
        )
    return trimmed
93
+
94
+
95
def list_resources(
    req: ListRequest, mdb: MongoDatabase, collection_name: str = ""
) -> dict:
    """
    Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.

    `mdb.page_tokens` docs are `{"_id": req.page_token, "ns": collection_name}`, Because `page_token` is globally
    unique, and because the `mdb.page_tokens.find_one({"_id": req.page_token})` document stores `collection_name` in
    the "ns" (namespace) field, the value for `collection_name` stored there takes precedence over any value supplied
    as an argument to this function's `collection_name` parameter.

    If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter criteria than
    can fit on a page of that size, this function will paginate the resources.
    """
    # A collection must be identifiable either directly or via a page token.
    if collection_name == "" and req.page_token is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Must specify a collection name if no page token is supplied.",
        )
    if req.page_token:
        # Resume a paginated listing. The token doc records the collection ("ns")
        # and the last-seen id; tokens are single-use, so delete after lookup.
        doc = mdb.page_tokens.find_one({"_id": req.page_token})
        if doc is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST, detail="`page_token` not found"
            )
        collection_name = doc["ns"]
        last_id = doc["last_id"]
        mdb.page_tokens.delete_one({"_id": req.page_token})
    else:
        last_id = None

    # Pagination orders by "id" when that field is indexed; otherwise fall back to "_id".
    id_field = "id"
    if "id_1" not in mdb[collection_name].index_information():
        # Note: This warning is displayed for the "functional_annotation_agg" and
        # "users" collections, for example.
        logging.warning(
            f"list_resources: no index set on 'id' for collection {collection_name}"
        )
        id_field = "_id"

    max_page_size = req.max_page_size
    # `req.filter` is (extended) JSON; `check_filter` raises HTTP 422 if malformed.
    filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
    # Always include the id field in the projection so pagination can read
    # the final document's id below.
    projection = (
        list(set(comma_separated_values(req.projection)) | {id_field})
        if req.projection
        else None
    )
    if last_id is not None:
        # Continue strictly after the last-seen id, preserving any
        # user-supplied constraint already present on the id field.
        if id_field in filter_:
            filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
        else:
            filter_ = merge(filter_, {id_field: {"$gt": last_id}})

    # Determine whether we will paginate the results.
    #
    # Note: We will paginate them unless either (a) the `max_page_size` is less than 1,
    # or (b) the number of documents matching the filter can fit on a single page.
    #
    will_paginate = True
    if max_page_size < 1 or is_num_matching_docs_within_limit(
        collection=mdb[collection_name], filter_=filter_, limit=max_page_size
    ):
        will_paginate = False

    if not will_paginate:
        rv = {
            "resources": list(
                mdb[collection_name].find(filter=filter_, projection=projection)
            )
        }
        return rv
    else:
        resources = list(
            mdb[collection_name].find(
                filter=filter_,
                projection=projection,
                limit=max_page_size,
                sort=[(id_field, 1)],
                allow_disk_use=True,
            )
        )
        # `resources` is non-empty here: the within-limit check above implies more
        # than `max_page_size` (>= 1) documents match the filter.
        last_id = resources[-1][id_field]
        token = generate_one_id(mdb, "page_tokens")
        # TODO unify with `/queries:run` query continuation model
        #   => {_id: cursor/token, query: <full query>, last_id: <>, last_modified: <>}
        mdb.page_tokens.insert_one(
            {"_id": token, "ns": collection_name, "last_id": last_id}
        )
        return {"resources": resources, "next_page_token": token}
184
+
185
+
186
def coerce_to_float_if_possible(val):
    r"""
    Converts the specified value into a floating-point number if possible;
    returning the value unchanged if not possible.

    Note: only `ValueError` (e.g. a non-numeric string) counts as "not
    possible" here; a `TypeError` (e.g. from `None`) still propagates.
    """
    # Bug fix (docs): the previous docstring claimed a ValueError was raised
    # when conversion failed, but the function has always returned `val` as-is.
    try:
        return float(val)
    except ValueError:
        return val
195
+
196
+
197
def comma_separated_values(s: str):
    r"""
    Returns a list of the comma-delimited substrings of the specified string. Discards any whitespace
    surrounding each substring.

    Reference: https://docs.python.org/3/library/stdtypes.html#str.split

    >>> comma_separated_values("apple, banana, cherry")
    ['apple', 'banana', 'cherry']
    """
    # Doc fix: the previous docstring referenced `re.split`, but the
    # implementation uses plain `str.split`.
    return [v.strip() for v in s.split(",")]
208
+
209
+
210
def get_mongo_filter(filter_str):
    r"""
    Convert a str in the domain-specific language (DSL) solicited by `nmdc_runtime.api.models.util.FindRequest.filter`
    -- i.e., a comma-separated list of `attribute:value` pairs, where the `value` can include a comparison operator
    (e.g. `>=`) and where if the attribute is of type _string_ and has the suffix `.search` appended to its name
    then the server should perform a full-text search
    -- to a corresponding MongoDB filter representation for e.g. passing to a collection `find` call.

    Raises HTTP 400 when any pair lacks the `attribute:spec` form.
    """
    filter_ = {}
    if not filter_str:
        return filter_

    pairs = comma_separated_values(filter_str)
    if not all(len(split) == 2 for split in (p.split(":", maxsplit=1) for p in pairs)):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Filter must be of form: attribute:spec[,attribute:spec]*",
        )

    for attr, spec in (p.split(":", maxsplit=1) for p in pairs):
        if attr.endswith(".search"):
            # `<attr>.search:<spec>` requests a regex ("full-text") match on <attr>.
            actual_attr = attr[: -len(".search")]
            filter_[actual_attr] = {"$regex": spec}
        else:
            # Bug fix: this was previously an unordered *set* of operator prefixes.
            # Because "<" is a prefix of "<=" (and ">" of ">="), set iteration order
            # could match "<" first and parse e.g. "<=5" as {"$lt": "=5"}. Check the
            # two-character operators first, in a fixed order.
            for op, key in (("<=", "$lte"), (">=", "$gte"), ("<", "$lt"), (">", "$gt")):
                if spec.startswith(op):
                    filter_[attr] = {key: coerce_to_float_if_possible(spec[len(op) :])}
                    break
            else:
                # No comparison operator: exact-match on the raw spec string.
                filter_[attr] = spec
    return filter_
241
+
242
+
243
def get_mongo_sort(sort_str) -> Optional[List[Tuple[str, int]]]:
    """
    Parse `sort_str`, a str of the form "attribute:spec[,attribute:spec]*",
    where spec is `asc` (ascending -- the default if no spec) or `desc` (descending),
    and return a value suitable to pass as a `sort` kwarg to a mongo collection `find` call.
    Returns None when `sort_str` is empty.

    Raises HTTP 400 when a spec is neither empty, `asc`, nor `desc`.
    """
    if not sort_str:
        return None

    # Accepted specs mapped to Mongo sort directions (empty spec defaults to ascending).
    directions = {"": 1, "asc": 1, "desc": -1}

    sort_ = []
    for p in comma_separated_values(sort_str):
        components = p.split(":", maxsplit=1)
        attr = components[0]
        spec = components[1] if len(components) == 2 else ""
        if spec not in directions:
            # Bug fix: `detail` was previously a 1-tuple (stray trailing comma
            # inside the parentheses), not a string.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=(
                    "Sort must be of form: attribute:spec[,attribute:spec]* "
                    "where spec is `asc` (ascending -- the default if no spec) "
                    "or `desc` (descending)."
                ),
            )
        sort_.append((attr, directions[spec]))
    return sort_
274
+
275
+
276
def strip_oid(doc: dict) -> dict:
    r"""
    Returns a shallow copy of the specified dictionary, that has no `_id` key.
    (A plain shallow copy when no `_id` key is present.)
    """
    # Idiom: a builtin dict comprehension replaces `toolz.dissoc` — same result,
    # one fewer third-party dependency for this module to lean on.
    return {k: v for k, v in doc.items() if k != "_id"}
281
+
282
+
283
def timeit(cursor):
    """Exhaust `cursor` into a list; return (results, elapsed time in milliseconds)."""
    started = time_ns()
    collected = list(cursor)
    elapsed_ms = int(round((time_ns() - started) / 1e6))
    return collected, elapsed_ms
289
+
290
+
291
def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
    """Find nmdc schema collection entities that match the FindRequest.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).

    Supports two pagination styles: page-number-based (`req.page`, via skip/limit,
    capped at 10,000 skipped items) and cursor-based (`req.cursor`, via `mdb.page_tokens`).

    TODO: Add type hint for function's return value (see `nmdc_runtime.api.models.util.FindResponse`).
    """
    if req.group_by:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail="I don't yet know how to ?group_by=",
        )
    if req.search:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail=(
                "I don't yet know how to ?search=. "
                "Use ?filter=<attribute>.search:<spec> instead."
            ),
        )

    filter_ = get_mongo_filter(req.filter)
    # Always project "id" so cursor pagination can read the final document's id.
    projection = (
        list(set(comma_separated_values(req.fields)) | {"id"}) if req.fields else None
    )
    sort_ = get_mongo_sort(req.sort)

    total_count = mdb[collection_name].count_documents(filter=filter_)

    if req.page:
        skip = (req.page - 1) * req.per_page
        if skip > 10_000:
            # Note: because _page number_-based pagination is currently implemented via MongoDB's `skip` and `limit`
            # parameters, a full (slow) collection scan is performed to skip to the requested page. This scan takes
            # longer and longer as `skip` increases, which is why cursor-based pagination is preferred for large
            # collections.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=(
                    "Use cursor-based pagination for paging beyond 10,000 items. "
                    "That is, instead of specifying the `page` query parameter for this endpoint, "
                    "specify the `cursor` query parameter. In particular, set `cursor` to `*` to get the first page, "
                    "and use the value of `meta.next_cursor` in the response, if not `null`, as the value to which "
                    "you set `cursor` in the next request."
                ),
            )
        limit = req.per_page
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_,
                skip=skip,
                limit=limit,
                sort=sort_,
                projection=projection,
            )
        )
        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": [[a, s] for a, s in sort_] if sort_ else None,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields

    else:  # req.cursor is not None
        if req.cursor != "*":
            # Resume from a previously issued (single-use) cursor token.
            doc = mdb.page_tokens.find_one({"_id": req.cursor, "ns": collection_name})
            if doc is None:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST, detail="Bad cursor value"
                )
            last_id = doc["last_id"]
            mdb.page_tokens.delete_one({"_id": req.cursor})
        else:
            last_id = None

        if last_id is not None:
            # Continue strictly after the last-seen id, preserving any
            # user-supplied constraint already on "id".
            if "id" in filter_:
                filter_["id"] = merge(filter_["id"], {"$gt": last_id})
            else:
                filter_ = merge(filter_, {"id": {"$gt": last_id}})

        # Cursor pagination requires an index on "id" for efficient ordered scans.
        if "id_1" not in mdb[collection_name].index_information():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Cursor-based pagination is not enabled for this resource.",
            )

        limit = req.per_page
        sort_for_cursor = (sort_ or []) + [("id", 1)]
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_, limit=limit, sort=sort_for_cursor, projection=projection
            )
        )

        # Check whether there are any results. If there aren't any, we refrain from
        # trying to access the `id` of the final one (since it doesn't exist).
        if len(results) > 0:
            last_id = results[-1]["id"]
        else:
            last_id = None

        # If we have a `last_id` value other than `None`, we check whether it belongs
        # to the final document overall (not just the final one on this page). On the
        # other hand, if `last_id` is `None`, we set the token to `None` (since there
        # is no "next page" of results to be retrieved).
        if last_id is not None:
            # Bug fix: previously `filter_eager = filter_` aliased the dict and then
            # mutated it in the `"id" in filter_` branch, corrupting the `filter_`
            # later reported in `meta.mongo_filter_dict`. Build a fresh dict instead.
            if "id" in filter_:
                filter_eager = merge(
                    filter_, {"id": merge(filter_["id"], {"$gt": last_id})}
                )
            else:
                filter_eager = merge(filter_, {"id": {"$gt": last_id}})
            more_results = (
                mdb[collection_name].count_documents(filter=filter_eager, limit=limit)
                > 0
            )
            # If the `last_id` does not belong to the final document overall, generate
            # a new pagination token and persist it to the database. Otherwise (i.e. if
            # the `last_id` _does_ belong to the final document overall), set the token
            # to `None` (since there is no "next page" to be retrieved).
            if more_results:
                token = generate_one_id(mdb, "page_tokens")
                mdb.page_tokens.insert_one(
                    {"_id": token, "ns": collection_name, "last_id": last_id}
                )
            else:
                token = None
        else:
            token = None

        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": sort_for_cursor,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": None,
                "per_page": req.per_page,
                "next_cursor": token,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields
    return rv
445
+
446
+
447
def find_resources_spanning(
    req: FindRequest, mdb: MongoDatabase, collection_names: Set[str]
):
    """Find nmdc schema collection entities -- here, across multiple collections -- that match the FindRequest.

    This is useful for collections that house documents that are subclasses of a common ancestor class.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).
    """
    # Spanning multiple collections only supports page-number-based pagination.
    if req.cursor or not req.page:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="This resource only supports page-based pagination",
        )

    if len(collection_names) == 0:
        # No collections to query: return an empty-but-well-formed response.
        return {
            "meta": {
                "mongo_filter_dict": get_mongo_filter(req.filter),
                "count": 0,
                "db_response_time_ms": 0,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [],
            "group_by": [],
        }

    # Query each collection independently, then merge the per-collection responses.
    responses = {name: find_resources(req, mdb, name) for name in collection_names}
    metas = [r["meta"] for r in responses.values()]
    combined_results = list(concat(r["results"] for r in responses.values()))
    return {
        "meta": {
            # Every response carries the same filter; take it from the first.
            "mongo_filter_dict": metas[0]["mongo_filter_dict"],
            "count": sum(m["count"] for m in metas),
            "db_response_time_ms": sum(m["db_response_time_ms"] for m in metas),
            "page": req.page,
            "per_page": req.per_page,
        },
        "results": combined_results,
        "group_by": [],
    }
492
+
493
+
494
def exists(collection: MongoCollection, filter_: dict):
    r"""
    Report whether at least one document in the collection meets the filter requirements.
    """
    matching_count = collection.count_documents(filter_)
    return matching_count > 0
499
+
500
+
501
def persist_content_and_get_drs_object(
    content: str,
    description: str,
    username="(anonymous)",
    filename=None,
    content_type="application/json",
    id_ns="json-metadata-in",
    exists_ok=False,
):
    """Persist a Data Repository Service (DRS) object.

    An object may be a blob, analogous to a file, or a bundle, analogous to a folder. Sites register objects,
    and sites must ensure that these objects are accessible to the NMDC data broker.
    An object may be associated with one or more object types, useful for triggering workflows.

    Stores `content` in GridFS under a freshly minted DRS id, then registers a
    corresponding DRS object document via `_create_object` and returns it.

    Reference: https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.1.0/docs/#_drs_datatypes
    """
    mdb = get_mongo_db()
    # Mint an id in the given namespace; shoulder "gfs0" marks GridFS-backed storage.
    drs_id = local_part(generate_one_id(mdb, ns=id_ns, shoulder="gfs0"))
    filename = filename or drs_id
    PortableFilename(filename)  # validates
    DrsId(drs_id)  # validates

    # Persist the raw content in GridFS, keyed by the DRS id.
    mdb_fs = GridFS(mdb)
    mdb_fs.put(
        content,
        _id=drs_id,
        filename=filename,
        content_type=content_type,
        encoding="utf-8",
    )
    # Write the content to a temporary file so `drs_metadata_for` can derive
    # file-based metadata (e.g. size/checksums) from an actual file path.
    with tempfile.TemporaryDirectory() as save_dir:
        filepath = str(Path(save_dir).joinpath(filename))
        with open(filepath, "w") as f:
            f.write(content)
        # Minute-resolution timestamp; note the hard-coded Pacific timezone.
        now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat(
            timespec="minutes"
        )
        object_in = DrsObjectIn(
            **drs_metadata_for(
                filepath,
                base={
                    "description": (
                        description
                        + f" (created by/for {username}"
                        + f" at {now_to_the_minute})"
                    ),
                    "access_methods": [{"access_id": drs_id}],
                },
                timestamp=now_to_the_minute,
            )
        )
    # The object's canonical DRS URI, addressed via the externally visible hostname.
    self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}"
    return _create_object(
        mdb,
        object_in,
        mgr_site="nmdc-runtime",
        drs_id=drs_id,
        self_uri=self_uri,
        exists_ok=exists_ok,
    )
562
+
563
+
564
def _create_object(
    mdb: MongoDatabase,
    object_in: DrsObjectIn,
    mgr_site,
    drs_id,
    self_uri,
    exists_ok=False,
):
    """Helper function for creating a Data Repository Service (DRS) object.

    Inserts the object into `mdb.objects` and returns the inserted document.
    If a document with the same checksum already exists: returns the existing
    document when `exists_ok` is True, otherwise raises HTTP 409. A duplicate
    key on any other index raises HTTP 500.
    """
    drs_obj = DrsObject(
        **object_in.model_dump(exclude_unset=True),
        id=drs_id,
        self_uri=self_uri,
    )
    doc = drs_obj.model_dump(exclude_unset=True)
    doc["_mgr_site"] = mgr_site  # manager site
    try:
        mdb.objects.insert_one(doc)
    except DuplicateKeyError as e:
        # A violation of the unique index on (checksums.type, checksums.checksum)
        # means content with this checksum was already registered.
        if e.details["keyPattern"] == {"checksums.type": 1, "checksums.checksum": 1}:
            if exists_ok:
                return mdb.objects.find_one(
                    {
                        "checksums": {
                            "$elemMatch": {
                                "type": e.details["keyValue"]["checksums.type"],
                                "checksum": e.details["keyValue"]["checksums.checksum"],
                            }
                        }
                    }
                )
            else:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"provided checksum matches existing object: {e.details['keyValue']}",
                )
        else:
            # Duplicate on some other unique index — unexpected, so surface a 500.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="duplicate key error",
            )
    return doc
606
+
607
+
608
def _claim_job(job_id: str, mdb: MongoDatabase, site: Site):
    r"""
    Claim the job with id `job_id` on behalf of `site`.

    Verifies the site has every capability the job's workflow requires
    (HTTP 403 otherwise; HTTP 404 if the job does not exist), appends a new
    `JobClaim` to the job document, and creates a 30-day `Operation` through
    which the site can report progress and results. Returns the created
    operation as a dict.
    """
    job_doc = raise404_if_none(mdb.jobs.find_one({"id": job_id}))
    job = Job(**job_doc)
    # check that site satisfies the job's workflow's required capabilities.
    capabilities_required = job.workflow.capability_ids or []
    for cid in capabilities_required:
        if cid not in site.capability_ids:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail=f"client site does not have capability {cid} required to claim job",
            )

    # For now, allow site to claim same job multiple times,
    # to re-submit results given same job input config.
    job_op_for_site = mdb.operations.find_one(
        {"metadata.job.id": job.id, "metadata.site_id": site.id}
    )
    if job_op_for_site is not None:
        # Deliberately disabled: rejecting duplicate claims with a 409.
        # Kept for reference in case the policy above is reversed.
        # raise HTTPException(
        #     status_code=status.HTTP_409_CONFLICT,
        #     detail={
        #         "msg": (
        #             f"client site already claimed job -- "
        #             f"see operation {job_op_for_site['id']}"
        #         ),
        #         "id": job_op_for_site["id"],
        #     },
        # )
        pass

    op_id = generate_one_id(mdb, "op")
    job.claims = (job.claims or []) + [JobClaim(op_id=op_id, site_id=site.id)]
    # The operation embeds a slimmed-down copy of the job (id/workflow/config)
    # plus the claiming site, typed via `JobOperationMetadata`.
    op = Operation[ResultT, JobOperationMetadata](
        **{
            "id": op_id,
            "expire_time": expiry_dt_from_now(days=30),
            "metadata": {
                "job": Job(
                    **{
                        "id": job.id,
                        "workflow": job.workflow,
                        "config": job.config,
                    }
                ).model_dump(exclude_unset=True),
                "site_id": site.id,
                "model": dotted_path_for(JobOperationMetadata),
            },
        }
    )
    mdb.operations.insert_one(op.model_dump())
    # Persist the updated claims list on the job document.
    mdb.jobs.replace_one({"id": job.id}, job.model_dump(exclude_unset=True))

    return op.model_dump(exclude_unset=True)
664
+
665
+
666
@lru_cache
def map_nmdc_workflow_id_to_dagster_job_name():
    """Returns a dictionary mapping nmdc_workflow_id to dagster_job_name (built once, then cached)."""
    workflow_to_job = [
        ("metadata-in-1.0.0", "apply_metadata_in"),
        ("export-study-biosamples-as-csv-1.0.0", "export_study_biosamples_metadata"),
        ("gold_study_to_database", "gold_study_to_database"),
    ]
    return dict(workflow_to_job)
674
+
675
+
676
def ensure_run_config_data(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
):
    r"""
    Ensures that run_config_data has entries for certain nmdc workflow ids.
    Returns run_config_data (possibly augmented; unchanged for other workflow ids).

    Doc fix: the previous docstring said "return_config_data".
    """
    # `assoc_in` returns a new dict, so the caller's dict is never mutated.
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        # Needs the target study id and the requesting username.
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "username"],
            user.username,
        )
    elif nmdc_workflow_id == "gold_study_to_database":
        # Needs the GOLD study id, plus the username for the DRS export op.
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "export_json_to_drs", "config", "username"],
            user.username,
        )
    return run_config_data
713
+
714
+
715
def inputs_for(nmdc_workflow_id, run_config_data):
    """Return a single-element list containing the input URI path for the given
    nmdc_workflow_id, constructed from run_config_data.

    Doc fix: this returns a *list* (not a bare path), and returns None for
    workflow ids with no known input mapping (previously an undocumented,
    implicit None).
    """
    if nmdc_workflow_id == "metadata-in-1.0.0":
        return [
            "/objects/"
            + get_in(["ops", "get_json_in", "config", "object_id"], run_config_data)
        ]
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        return [
            "/studies/"
            + get_in(
                ["ops", "get_study_biosamples_metadata", "config", "study_id"],
                run_config_data,
            )
        ]
    if nmdc_workflow_id == "gold_study_to_database":
        return [
            "/studies/"
            + get_in(
                ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
                run_config_data,
            )
        ]
    # Unknown workflow id: no inputs mapping.
    return None
739
+
740
def _request_dagster_run(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    extra_run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
    repository_location_name=None,
    repository_name=None,
):
    r"""
    Requests a Dagster run using the specified parameters.
    Returns a json dictionary indicating the job's success or failure.
    This is a generic wrapper.

    Emits run events in order: REQUESTED always; then STARTED on successful
    submission to Dagster, or FAIL if the GraphQL submission raises.
    """
    # KeyError here means the workflow id has no registered Dagster job.
    dagster_job_name = map_nmdc_workflow_id_to_dagster_job_name()[nmdc_workflow_id]

    # Fill in any op-config entries this workflow requires (e.g. study id, username).
    extra_run_config_data = ensure_run_config_data(
        nmdc_workflow_id, nmdc_workflow_inputs, extra_run_config_data, mdb, user
    )

    # add REQUESTED RunEvent
    nmdc_run_id = _add_run_requested_event(
        run_spec=RunUserSpec(
            job_id=nmdc_workflow_id,
            run_config=extra_run_config_data,
            inputs=inputs_for(nmdc_workflow_id, extra_run_config_data),
        ),
        mdb=mdb,
        user=user,
    )

    dagster_client = get_dagster_graphql_client()
    try:
        dagster_run_id: str = dagster_client.submit_job_execution(
            dagster_job_name,
            repository_location_name=repository_location_name,
            repository_name=repository_name,
            run_config=extra_run_config_data,
        )

        # add STARTED RunEvent
        _add_run_started_event(run_id=nmdc_run_id, mdb=mdb)
        # Attach the Dagster run id to the most recent STARTED event for this run.
        mdb.run_events.find_one_and_update(
            filter={"run.id": nmdc_run_id, "type": "STARTED"},
            update={"$set": {"run.facets.nmdcRuntime_dagsterRunId": dagster_run_id}},
            sort=[("time", -1)],
        )

        return {"type": "success", "detail": {"run_id": nmdc_run_id}}
    except DagsterGraphQLClientError as exc:
        # add FAIL RunEvent
        _add_run_fail_event(run_id=nmdc_run_id, mdb=mdb)

        return {
            "type": "error",
            "detail": {"run_id": nmdc_run_id, "error_detail": str(exc)},
        }
797
+
798
+
799
def _get_dagster_run_status(run_id: str):
    r"""
    Look up the status of a previously requested Dagster run.

    Returns {"type": "success", "detail": <status value>} when the lookup
    succeeds, or {"type": "error", "detail": <error message>} when the
    Dagster GraphQL client raises.
    """
    client = get_dagster_graphql_client()
    try:
        run_status: DagsterRunStatus = client.get_run_status(run_id)
    except DagsterGraphQLClientError as exc:
        return {"type": "error", "detail": str(exc)}
    return {"type": "success", "detail": str(run_status.value)}
809
+
810
+
811
def check_action_permitted(username: str, action: str):
    """Returns True if a Mongo database action is "allowed" and "not denied"."""
    mdb: MongoDatabase = get_mongo_db()
    criteria = {"username": username, "action": action}
    # Denial takes precedence; otherwise the pair must be explicitly allow-listed.
    if mdb["_runtime.api.deny"].find_one(criteria) is not None:
        return False
    return mdb["_runtime.api.allow"].find_one(criteria) is not None