nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131):
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,634 @@
1
+ import logging
2
+ from typing import Annotated
3
+
4
+ from fastapi import APIRouter, Depends, Path, Query
5
+ from pymongo.database import Database as MongoDatabase
6
+
7
+ from nmdc_schema.get_nmdc_view import ViewGetter
8
+ from nmdc_runtime.api.core.util import raise404_if_none
9
+ from nmdc_runtime.api.db.mongo import (
10
+ get_mongo_db,
11
+ get_planned_process_collection_names,
12
+ get_nonempty_nmdc_schema_collection_names,
13
+ )
14
+ from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances
15
+ from nmdc_runtime.api.endpoints.util import (
16
+ find_resources,
17
+ strip_oid,
18
+ find_resources_spanning,
19
+ )
20
+ from nmdc_runtime.api.models.metadata import Doc
21
+ from nmdc_runtime.api.models.util import (
22
+ FindResponse,
23
+ FindRequest,
24
+ )
25
+
26
+
27
# Router collecting all "find"-style endpoints in this module; it is mounted
# onto the main FastAPI application elsewhere in the project.
router = APIRouter()
28
+
29
+
30
@router.get(
    "/studies",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_studies(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    The `GET /studies` endpoint is a general purpose way to retrieve NMDC studies based on parameters provided by the user.
    Studies can be filtered and sorted based on the applicable [Study attributes](https://microbiomedata.github.io/nmdc-schema/Study/).
    """
    # All filtering, sorting, and pagination is handled by the shared helper;
    # this endpoint only names the collection to search.
    target_collection = "study_set"
    return find_resources(req, mdb, target_collection)
44
+
45
+
46
@router.get(
    "/studies/{study_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_study_by_id(
    study_id: Annotated[
        str,
        Path(
            title="Study ID",
            description="The `id` of the `Study` you want to find.\n\n_Example_: `nmdc:sty-11-abc123`",
            examples=["nmdc:sty-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    If the study identifier is known, a study can be retrieved directly using the GET /studies/{study_id} endpoint.
    \n Note that only one study can be retrieved at a time using this method.
    """
    # Fetch the single matching document; an HTTP 404 is raised when no
    # study has the given `id`.
    study_doc = mdb["study_set"].find_one({"id": study_id})
    return strip_oid(raise404_if_none(study_doc))
67
+
68
+
69
@router.get(
    "/biosamples",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_biosamples(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    The GET /biosamples endpoint is a general purpose way to retrieve biosample metadata using user-provided filter and sort criteria.
    Please see the applicable [Biosample attributes](https://microbiomedata.github.io/nmdc-schema/Biosample/).
    """
    # Delegate the query work to the shared resource finder, scoped to the
    # biosample collection.
    target_collection = "biosample_set"
    return find_resources(req, mdb, target_collection)
83
+
84
+
85
@router.get(
    "/biosamples/{sample_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_biosample_by_id(
    sample_id: Annotated[
        str,
        Path(
            title="Biosample ID",
            description="The `id` of the `Biosample` you want to find.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    If the biosample identifier is known, a biosample can be retrieved directly using the GET /biosamples/{sample_id}.
    \n Note that only one biosample metadata record can be retrieved at a time using this method.
    """
    # Look up the biosample document; respond with HTTP 404 when absent.
    biosample_doc = mdb["biosample_set"].find_one({"id": sample_id})
    return strip_oid(raise404_if_none(biosample_doc))
106
+
107
+
108
@router.get(
    "/data_objects",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_data_objects(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    To retrieve metadata about NMDC data objects (such as files, records, or omics data) the GET /data_objects endpoint
    may be used along with various parameters. Please see the applicable [Data Object](https://microbiomedata.github.io/nmdc-schema/DataObject/)
    attributes.
    """
    # The shared finder applies the request's filter/sort/pagination against
    # the data-object collection.
    target_collection = "data_object_set"
    return find_resources(req, mdb, target_collection)
123
+
124
+
125
@router.get(
    "/data_objects/study/{study_id}",
    response_model_exclude_unset=True,
    #
    # Customize the name that Swagger UI displays for the API endpoint.
    #
    # Note: By default, FastAPI derives the name of the API endpoint from the name of the decorated function. Here, we
    # are using a custom name that matches the derived one, except that the custom one ends with `(delayed)`.
    #
    # Note: Each word in the name will appear capitalized on Swagger UI.
    #
    name="Find data objects for study (delayed)",
    #
    # Customize the description that Swagger UI displays for the API endpoint.
    #
    # Note: By default, FastAPI derives the description of the API endpoint from the docstring of the decorated
    # function. Here, we are using a custom description that was written for an audience of API consumers,
    # as opposed to the derived description that was written for an audience of `nmdc-runtime` developers.
    #
    description=(
        "Gets all `DataObject`s related to all `Biosample`s related to the specified `Study`."
        "<br /><br />"  # newlines
        "**Note:** The data returned by this API endpoint can be up to 24 hours out of date "
        "with respect to the NMDC database. That's because the cache that underlies this API "
        "endpoint gets refreshed to match the NMDC database once every 24 hours."
    ),
)
def find_data_objects_for_study(
    study_id: Annotated[
        str,
        Path(
            title="Study ID",
            description="""The `id` of the `Study` having `Biosample`s with which you want to find
            associated `DataObject`s.\n\n_Example_: `nmdc:sty-11-abc123`""",
            examples=["nmdc:sty-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """This API endpoint is used to retrieve data objects associated with
    all the biosamples associated with a given study. This endpoint makes
    use of the `alldocs` collection for its implementation.

    :param study_id: NMDC study id for which data objects are to be retrieved
    :param mdb: PyMongo connection, defaults to Depends(get_mongo_db)
    :return: List of dictionaries, each of which has a `biosample_id` entry
        and a `data_objects` entry. The value of the `biosample_id` entry
        is the `Biosample`'s `id`. The value of the `data_objects` entry
        is a list of the `DataObject`s associated with that `Biosample`.
    """
    biosample_data_objects = []

    # Respond with an error if the specified `Study` does not exist.
    # Note: We project only the `_id` field, to minimize data transfer.
    raise404_if_none(
        mdb["study_set"].find_one({"id": study_id}, projection={"_id": 1}),
        detail="Study not found",
    )

    # Use the `get_linked_instances` function—which is the function that
    # underlies the `/nmdcschema/linked_instances` API endpoint—to get all
    # the `Biosample`s that are downstream of the specified `Study`.
    #
    # Note: The `get_linked_instances` function requires that a `max_page_size`
    # integer argument be passed in. In our case, we want to get _all_ of
    # the instances. Python has no "infinity" integer; and, even if it did,
    # if we were to specify too large of an integer, we'd get this error:
    # > "OverflowError: MongoDB can only handle up to 8-byte ints"
    # So, as a workaround, we pass in a number that is large enough that we
    # think it will account for all cases in practice (e.g., a study having
    # a trillion biosamples or a trillion data objects).
    #
    # TODO: Update the `get_linked_instances` function to optionally impose _no_ limit.
    #
    large_max_page_size: int = 1_000_000_000_000
    linked_biosamples_result: dict = get_linked_instances(
        ids=[study_id],
        types=["nmdc:Biosample"],
        hydrate=False,  # we'll only use their `id` values
        page_token=None,
        max_page_size=large_max_page_size,
        mdb=mdb,
    )
    biosample_ids = [d["id"] for d in linked_biosamples_result.get("resources", [])]
    # Note: Lazy %-style args avoid building the message when DEBUG is disabled.
    logging.debug("Found %d Biosamples for Study %s", len(biosample_ids), study_id)

    # Get all the `DataObject`s that are downstream from any of those `Biosample`s.
    data_objects_by_biosample_id = {}
    linked_data_objects_result: dict = get_linked_instances(
        ids=biosample_ids,
        types=["nmdc:DataObject"],
        hydrate=True,  # we want the full `DataObject` documents
        page_token=None,
        max_page_size=large_max_page_size,
        mdb=mdb,
    )
    for data_object in linked_data_objects_result.get("resources", []):
        upstream_biosample_id = data_object["_downstream_of"][0]

        # Strip away the metadata fields injected by `get_linked_instances()`.
        data_object.pop("_upstream_of", None)
        data_object.pop("_downstream_of", None)

        # Group the `DataObject` under its upstream `Biosample`'s `id`,
        # creating the group's list on first encounter.
        data_objects_by_biosample_id.setdefault(upstream_biosample_id, []).append(
            data_object
        )

    # Convert the `data_objects_by_biosample_id` dictionary into a list of dicts;
    # i.e., into the format returned by the initial version of this API endpoint,
    # which did not use the `get_linked_instances` function under the hood.
    for biosample_id, data_objects in data_objects_by_biosample_id.items():
        biosample_data_objects.append(
            {
                "biosample_id": biosample_id,
                "data_objects": data_objects,
            }
        )
    return biosample_data_objects
242
+
243
+
244
@router.get(
    "/data_objects/{data_object_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_data_object_by_id(
    data_object_id: Annotated[
        str,
        Path(
            title="DataObject ID",
            description="The `id` of the `DataObject` you want to find.\n\n_Example_: `nmdc:dobj-11-abc123`",
            examples=["nmdc:dobj-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    If the data object identifier is known, the metadata can be retrieved using the GET /data_objects/{data_object_id} endpoint.
    \n Note that only one data object metadata record may be retrieved at a time using this method.
    """
    # Retrieve the single matching document, translating "not found" into an
    # HTTP 404 response.
    data_object_doc = mdb["data_object_set"].find_one({"id": data_object_id})
    return strip_oid(raise404_if_none(data_object_doc))
267
+
268
+
269
@router.get(
    "/planned_processes",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_planned_processes(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    The GET /planned_processes endpoint is a general way to fetch metadata about various planned processes (e.g.
    workflow execution, material processing, etc.). Any "slot" (a.k.a. attribute) for
    [`PlannedProcess`](https://w3id.org/nmdc/PlannedProcess) may be used in the filter
    and sort parameters, including attributes of subclasses of *PlannedProcess*.

    For example, attributes used in subclasses such as [`Extraction`](https://w3id.org/nmdc/Extraction)
    (subclass of *PlannedProcess*), can be used as input criteria for the filter and sort parameters of this endpoint.
    """
    # Search only the planned-process collections that actually contain
    # documents (set intersection of the two name sets).
    searchable_collections = (
        get_planned_process_collection_names()
        & get_nonempty_nmdc_schema_collection_names(mdb)
    )
    return find_resources_spanning(req, mdb, searchable_collections)
293
+
294
+
295
@router.get(
    "/planned_processes/{planned_process_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_planned_process_by_id(
    planned_process_id: Annotated[
        str,
        Path(
            title="PlannedProcess ID",
            description="The `id` of the document that represents an instance of "
            "the `PlannedProcess` class or any of its subclasses",
            examples=[r"nmdc:wfmag-11-00jn7876.1"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Returns the document that has the specified `id` and represents an instance of the `PlannedProcess` class
    or any of its subclasses. If no such document exists, returns an HTTP 404 response.
    """
    # Note: Empty collections are excluded as a performance optimization
    # (we already know they don't contain the document).
    candidate_collections = (
        get_planned_process_collection_names()
        & get_nonempty_nmdc_schema_collection_names(mdb)
    )

    # Search each candidate collection in turn, returning as soon as a
    # document having the specified `id` turns up.
    for collection_name in candidate_collections:
        matching_doc = mdb[collection_name].find_one({"id": planned_process_id})
        if matching_doc is not None:
            return strip_oid(matching_doc)

    # No collection contained the document, so respond with HTTP 404.
    return raise404_if_none(None)
333
+
334
+
335
@router.get(
    "/workflow_executions/{workflow_execution_id}/related_resources",
    response_model_exclude_unset=True,
    name="Find resources related to the specified WorkflowExecution",
    description=(
        "Finds `DataObject`s, `Biosample`s, `Study`s, and other `WorkflowExecution`s "
        "related to the specified `WorkflowExecution`."
        "<br /><br />"  # newlines
        "This endpoint returns a JSON object that contains "
        "(a) the specified `WorkflowExecution`, "
        "(b) all the `DataObject`s that are inputs to — or outputs from — the specified `WorkflowExecution`, "
        "(c) all the `Biosample`s that were inputs to those `DataGeneration`s, "
        "(d) all the `Study`s with which those `Biosample`s are associated, and "
        "(e) all the other `WorkflowExecution`s that are part of the same processing pipeline "
        "as the specified `WorkflowExecution`."
        "<br /><br />"  # newlines
        "**Note:** The data returned by this API endpoint can be up to 24 hours out of date "
        "with respect to the NMDC database. That's because the cache that underlies this API "
        "endpoint gets refreshed to match the NMDC database once every 24 hours."
    ),
)
def find_related_objects_for_workflow_execution(
    workflow_execution_id: Annotated[
        str,
        Path(
            title="Workflow Execution ID",
            description=(
                "The `id` of the `WorkflowExecution` to which you want to find related resources."
                "\n\n"
                "_Example_: `nmdc:wfmgan-11-wdx72h27.1`"
            ),
            examples=["nmdc:wfmgan-11-wdx72h27.1"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """This API endpoint retrieves resources related to the specified WorkflowExecution,
    including DataObjects that are inputs to — or outputs from — it, other WorkflowExecution
    instances that are part of the same pipeline, and related Biosamples and Studies.

    :param workflow_execution_id: id of workflow_execution_set instance for which related objects are to be retrieved
    :param mdb: A PyMongo `Database` instance that can be used to access the MongoDB database
    :return: Dictionary with data_objects, related_workflow_executions, biosamples, and studies lists
    """
    # Get the specified `WorkflowExecution` document from the database.
    workflow_execution = raise404_if_none(
        mdb.workflow_execution_set.find_one({"id": workflow_execution_id}),
        detail="Workflow execution not found",
    )

    # Create empty lists that will contain the related documents we find.
    # Note: The nested helper functions below close over these lists and
    # mutate them in place as the traversal proceeds.
    data_objects = []
    related_workflow_executions = []
    biosamples = []
    studies = []

    # Create empty sets that we'll use to avoid processing a given document multiple times.
    unique_data_object_ids = set()
    unique_workflow_execution_ids = set()
    unique_biosample_ids = set()
    unique_study_ids = set()

    # Add the ID of the specified `WorkflowExecution` document, to the set of unique `WorkflowExecution` IDs.
    unique_workflow_execution_ids.add(workflow_execution_id)

    # Get a `SchemaView` that is bound to the NMDC schema.
    nmdc_view = ViewGetter()
    nmdc_sv = nmdc_view.get_view()
    # Build the list of CURIEs for `DataGeneration` and all its schema
    # descendants, prefixing bare class names with `nmdc:`.
    dg_descendants = [
        (f"nmdc:{t}" if ":" not in t else t)
        for t in nmdc_sv.class_descendants("DataGeneration")
    ]

    def add_data_object(doc_id: str) -> bool:
        r"""
        Helper function that adds the `DataObject` having the specified `id`
        to our list of `DataObjects`, if it isn't already in there.

        :return: True if a new `DataObject` was added; False otherwise.
        """
        # Check if this is a DataObject by looking at the document's type directly
        doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
        if (
            doc
            and doc.get("type") == "nmdc:DataObject"
            and doc_id not in unique_data_object_ids
        ):
            data_obj = mdb.data_object_set.find_one({"id": doc_id})
            if data_obj:
                data_objects.append(strip_oid(data_obj))
                unique_data_object_ids.add(doc_id)
                return True
        return False

    def add_workflow_execution(wfe: dict) -> None:
        r"""
        Helper function that adds the specified `WorkflowExecution`
        to our list of `WorkflowExecution`s, if it isn't already in there;
        and adds its related `DataObjects` to our list of `DataObject`s.
        """
        if wfe["id"] not in unique_workflow_execution_ids:
            related_workflow_executions.append(strip_oid(wfe))
            unique_workflow_execution_ids.add(wfe["id"])

            # Add data objects related to this workflow execution.
            ids_of_inputs = wfe.get("has_input", [])
            ids_of_outputs = wfe.get("has_output", [])
            for doc_id in ids_of_inputs + ids_of_outputs:
                add_data_object(doc_id)

    def add_biosample(biosample_id: str) -> bool:
        r"""
        Helper function that adds the specified `Biosample`
        to our list of `Biosample`s, if it isn't already in there;
        and adds its related `Study`s to our list of `Study`s.

        :return: True if a new `Biosample` was added; False otherwise.
        """
        if biosample_id not in unique_biosample_ids:
            biosample = mdb.biosample_set.find_one({"id": biosample_id})
            if biosample:
                biosamples.append(strip_oid(biosample))
                unique_biosample_ids.add(biosample_id)

                # Add studies related to this biosample.
                for study_id in biosample.get("associated_studies", []):
                    add_study(study_id)
                return True
        return False

    def add_study(study_id: str) -> bool:
        r"""
        Helper function that adds the specified `Study`
        to our list of `Study`s, if it isn't already in there.

        :return: True if a new `Study` was added; False otherwise.
        """
        if study_id not in unique_study_ids:
            study = mdb.study_set.find_one({"id": study_id})
            if study:
                studies.append(strip_oid(study))
                unique_study_ids.add(study_id)
                return True
        return False

    def find_biosamples_recursively(start_id: str) -> None:
        r"""
        Recursive helper function that traverses the database in search of relevant `Biosample`s.

        This function searches for biosamples starting from the "input" to a DataGeneration record by
        traversing the data provenance graph – which is the bipartite graph formed by the
        `has_input` / `has_output` relationships in the schema. It uses the ids asserted on
        `has_input` and `has_output` slots on documents in the `alldocs` collection to tie related documents
        in the chain together.

        Note: The function uses an internal nested recursive function (`process_id()`) to avoid cycles
        in the graph and tracks processed IDs to prevent infinite recursion.

        :param start_id: The ID of the document to start the search from. This will typically
            be the input to a `DataGeneration` record, which may be a `Biosample` directly or a
            `ProcessedSample`.
        """
        # Create an empty set we can use to track the `id`s of documents we've already processed,
        # in order to avoid processing the same documents multiple times (i.e. cycling in the graph).
        processed_ids = set()

        def process_id(current_id: str) -> None:
            r"""
            Recursive helper function that processes a single document ID and follows
            connections to discover related biosamples.

            This function:
            1. Checks if the current ID is already processed to prevent cycles
            2. Directly adds the document if it's a `Biosample`
            3. For non-Biosample documents (type of `PlannedProcess`), it:
               - Processes input (`has_input`) IDs of the current document
               - Finds documents that have the current ID as output (`has_output`) and processes their inputs

            This recursive approach allows traversing the provenance graph in both directions.

            :param current_id: The ID of the document to process in this recursive step
            """
            if current_id in processed_ids:
                return

            processed_ids.add(current_id)

            # If it's a `Biosample`, i.e., "type" == "nmdc:Biosample"
            doc = mdb.alldocs.find_one({"id": current_id}, {"type": 1})
            if doc and doc.get("type") == "nmdc:Biosample":
                add_biosample(current_id)
                return

            # Find the document with this ID to see what it is
            current_doc = mdb.alldocs.find_one({"id": current_id})
            if current_doc:
                # Check if this document has inputs - if so, process them
                for input_id in current_doc.get("has_input", []):
                    if input_id not in processed_ids:
                        process_id(input_id)

            # Also find documents that have this ID as an output
            # This is the key to walking backward through the chain
            for doc in mdb.alldocs.find({"has_output": current_id}):
                # Process all inputs of this document
                for input_id in doc.get("has_input", []):
                    if input_id not in processed_ids:
                        process_id(input_id)

        # Start the recursive search
        process_id(start_id)

    # Get the DataObject `id`s that are inputs (`has_input`) to and
    # outputs (`has_output`) from the user-specified WorkflowExecution.
    input_ids = workflow_execution.get("has_input", [])
    output_ids = workflow_execution.get("has_output", [])

    # Add those DataObjects to our list of DataObjects.
    for doc_id in input_ids + output_ids:
        add_data_object(doc_id)

    # Find WorkflowExecutions whose inputs are outputs of this WorkflowExecution.
    # Add those to our list of related WorkflowExecutions.
    for output_id in output_ids:
        related_wfes = mdb.workflow_execution_set.find({"has_input": output_id})
        for wfe in related_wfes:
            add_workflow_execution(wfe)

    # Find WorkflowExecutions whose outputs are inputs of this WorkflowExecution.
    # Add those, too, to our list of related WorkflowExecutions.
    for input_id in input_ids:
        related_wfes = mdb.workflow_execution_set.find({"has_output": input_id})
        for wfe in related_wfes:
            add_workflow_execution(wfe)

    # Find WorkflowExecutions whose `was_informed_by` list contains that of the user-specified WorkflowExecution.
    # Add those, too, to our list of related WorkflowExecutions.
    if "was_informed_by" in workflow_execution:
        was_informed_by = workflow_execution["was_informed_by"]

        # Note: We added this assertion in an attempt to facilitate debugging
        # the system in the situation where a `WorkflowExecution` document
        # has a `was_informed_by` field whose value is not a list (which
        # would be a violation of NMDC schema 11.9.0).
        assert isinstance(was_informed_by, list), (
            "A WorkflowExecution's `was_informed_by` field contained "
            f"a {type(was_informed_by)} instead of a list."
        )

        # Get all WorkflowExecutions that were informed by any of the
        # things that informed the user-specified WorkflowExecution.
        related_wfes = mdb.workflow_execution_set.find(
            {"was_informed_by": {"$in": was_informed_by}}
        )
        for wfe in related_wfes:
            if wfe["id"] != workflow_execution_id:
                add_workflow_execution(wfe)

        # Get all `DataGeneration`s that informed the user-specified `WorkflowExecution`, then
        # get all `Biosample`s and `Study`s associated with each of those `DataGeneration`s.
        dg_docs = mdb.alldocs.find({"id": {"$in": was_informed_by}})
        for dg_doc in dg_docs:
            if any(t in dg_descendants for t in dg_doc.get("_type_and_ancestors", [])):
                # Get Biosamples from the DataGeneration's `has_input` field by recursively walking up the chain.
                # While we recursively walk up the chain, we'll add those Biosamples to our list of Biosamples.
                for input_id in dg_doc.get("has_input", []):
                    find_biosamples_recursively(input_id)

                # Get Studies associated with the DataGeneration,
                # and add them to our list of Studies.
                for study_id in dg_doc.get("associated_studies", []):
                    add_study(study_id)

                # If the DataGeneration has no associated Studies, but has related Biosamples,
                # add the Studies associated with those Biosamples to our list of Studies.
                if not dg_doc.get("associated_studies") and len(biosamples) > 0:
                    for bs in biosamples:
                        for study_id in bs.get("associated_studies", []):
                            add_study(study_id)

    # For all data objects we collected, check if they have a `was_generated_by` reference
    # This is a supplementary path to find more relationships
    for data_obj in data_objects:
        if "was_generated_by" in data_obj:
            gen_id = data_obj["was_generated_by"]
            dg_doc = mdb.alldocs.find_one({"id": gen_id})

            if dg_doc and any(
                t in dg_descendants for t in dg_doc.get("_type_and_ancestors", [])
            ):
                # Get Studies directly associated with the DataGeneration
                for study_id in dg_doc.get("associated_studies", []):
                    add_study(study_id)

    response = {
        "workflow_execution_id": workflow_execution_id,  # `WorkflowExecution` `id` provided by user
        "workflow_execution": strip_oid(
            workflow_execution
        ),  # the specified `WorkflowExecution`
        "data_objects": data_objects,  # related `DataObject`s
        "related_workflow_executions": related_workflow_executions,  # related `WorkflowExecution`s
        "biosamples": biosamples,  # related `Biosample`s
        "studies": studies,  # related `Study`s
    }

    return response