nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (100)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +55 -4
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +33 -28
  76. nmdc_runtime/site/ops.py +97 -237
  77. nmdc_runtime/site/repair/database_updater.py +8 -0
  78. nmdc_runtime/site/repository.py +7 -117
  79. nmdc_runtime/site/resources.py +4 -4
  80. nmdc_runtime/site/translation/gold_translator.py +22 -21
  81. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  82. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  83. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  84. nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
  85. nmdc_runtime/site/translation/translator.py +63 -1
  86. nmdc_runtime/site/util.py +8 -3
  87. nmdc_runtime/site/validation/util.py +10 -5
  88. nmdc_runtime/util.py +9 -321
  89. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  90. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  91. nmdc_runtime/site/translation/emsl.py +0 -43
  92. nmdc_runtime/site/translation/gold.py +0 -53
  93. nmdc_runtime/site/translation/jgi.py +0 -32
  94. nmdc_runtime/site/translation/util.py +0 -132
  95. nmdc_runtime/site/validation/jgi.py +0 -43
  96. nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
  97. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  98. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  99. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  100. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,794 @@
1
+ from operator import itemgetter
2
+ from typing import List, Annotated
3
+
4
+ from fastapi import APIRouter, Depends, Path, Query
5
+ from jinja2 import Environment, PackageLoader, select_autoescape
6
+ from nmdc_runtime.util import get_nmdc_jsonschema_dict
7
+ from pymongo.database import Database as MongoDatabase
8
+ from starlette.responses import HTMLResponse
9
+ from toolz import merge, assoc_in
10
+
11
+ from nmdc_schema.get_nmdc_view import ViewGetter
12
+ from nmdc_runtime.api.core.util import raise404_if_none
13
+ from nmdc_runtime.api.db.mongo import (
14
+ get_mongo_db,
15
+ activity_collection_names,
16
+ get_planned_process_collection_names,
17
+ get_nonempty_nmdc_schema_collection_names,
18
+ )
19
+ from nmdc_runtime.api.endpoints.util import (
20
+ find_resources,
21
+ strip_oid,
22
+ find_resources_spanning,
23
+ )
24
+ from nmdc_runtime.api.models.metadata import Doc
25
+ from nmdc_runtime.api.models.util import (
26
+ FindResponse,
27
+ FindRequest,
28
+ entity_attributes_to_index,
29
+ )
30
+ from nmdc_runtime.util import get_class_names_from_collection_spec
31
+
32
+ router = APIRouter()
33
+
34
+
35
+ @router.get(
36
+ "/studies",
37
+ response_model=FindResponse,
38
+ response_model_exclude_unset=True,
39
+ )
40
+ def find_studies(
41
+ req: Annotated[FindRequest, Query()],
42
+ mdb: MongoDatabase = Depends(get_mongo_db),
43
+ ):
44
+ """
45
+ The `GET /studies` endpoint is a general purpose way to retrieve NMDC studies based on parameters provided by the user.
46
+ Studies can be filtered and sorted based on the applicable [Study attributes](https://microbiomedata.github.io/nmdc-schema/Study/).
47
+ """
48
+ return find_resources(req, mdb, "study_set")
49
+
50
+
51
+ @router.get(
52
+ "/studies/{study_id}",
53
+ response_model=Doc,
54
+ response_model_exclude_unset=True,
55
+ )
56
+ def find_study_by_id(
57
+ study_id: Annotated[
58
+ str,
59
+ Path(
60
+ title="Study ID",
61
+ description="The `id` of the `Study` you want to find.\n\n_Example_: `nmdc:sty-11-abc123`",
62
+ examples=["nmdc:sty-11-abc123"],
63
+ ),
64
+ ],
65
+ mdb: MongoDatabase = Depends(get_mongo_db),
66
+ ):
67
+ """
68
+ If the study identifier is known, a study can be retrieved directly using the GET /studies/{study_id} endpoint.
69
+ \n Note that only one study can be retrieved at a time using this method.
70
+ """
71
+ return strip_oid(raise404_if_none(mdb["study_set"].find_one({"id": study_id})))
72
+
73
+
74
+ @router.get(
75
+ "/biosamples",
76
+ response_model=FindResponse,
77
+ response_model_exclude_unset=True,
78
+ )
79
+ def find_biosamples(
80
+ req: Annotated[FindRequest, Query()],
81
+ mdb: MongoDatabase = Depends(get_mongo_db),
82
+ ):
83
+ """
84
+ The GET /biosamples endpoint is a general purpose way to retrieve biosample metadata using user-provided filter and sort criteria.
85
+ Please see the applicable [Biosample attributes](https://microbiomedata.github.io/nmdc-schema/Biosample/).
86
+ """
87
+ return find_resources(req, mdb, "biosample_set")
88
+
89
+
90
+ @router.get(
91
+ "/biosamples/{sample_id}",
92
+ response_model=Doc,
93
+ response_model_exclude_unset=True,
94
+ )
95
+ def find_biosample_by_id(
96
+ sample_id: Annotated[
97
+ str,
98
+ Path(
99
+ title="Biosample ID",
100
+ description="The `id` of the `Biosample` you want to find.\n\n_Example_: `nmdc:bsm-11-abc123`",
101
+ examples=["nmdc:bsm-11-abc123"],
102
+ ),
103
+ ],
104
+ mdb: MongoDatabase = Depends(get_mongo_db),
105
+ ):
106
+ """
107
+ If the biosample identifier is known, a biosample can be retrieved directly using the GET /biosamples/{sample_id}.
108
+ \n Note that only one biosample metadata record can be retrieved at a time using this method.
109
+ """
110
+ return strip_oid(raise404_if_none(mdb["biosample_set"].find_one({"id": sample_id})))
111
+
112
+
113
+ @router.get(
114
+ "/data_objects",
115
+ response_model=FindResponse,
116
+ response_model_exclude_unset=True,
117
+ )
118
+ def find_data_objects(
119
+ req: Annotated[FindRequest, Query()],
120
+ mdb: MongoDatabase = Depends(get_mongo_db),
121
+ ):
122
+ """
123
+ To retrieve metadata about NMDC data objects (such as files, records, or omics data) the GET /data_objects endpoint
124
+ may be used along with various parameters. Please see the applicable [Data Object](https://microbiomedata.github.io/nmdc-schema/DataObject/)
125
+ attributes.
126
+ """
127
+ return find_resources(req, mdb, "data_object_set")
128
+
129
+
130
@router.get(
    "/data_objects/study/{study_id}",
    response_model_exclude_unset=True,
    #
    # Customize the name that Swagger UI displays for the API endpoint.
    #
    # Note: By default, FastAPI derives the name of the API endpoint from the name of the decorated function. Here, we
    # are using a custom name that matches the derived one, except that the custom one ends with `(delayed)`.
    #
    # Note: Each word in the name will appear capitalized on Swagger UI.
    #
    name="Find data objects for study (delayed)",
    #
    # Customize the description that Swagger UI displays for the API endpoint.
    #
    # Note: By default, FastAPI derives the description of the API endpoint from the docstring of the decorated
    # function. Here, we are using a custom description that was written for an audience of API consumers,
    # as opposed to the derived description that was written for an audience of `nmdc-runtime` developers.
    #
    description=(
        "Gets all `DataObject`s related to all `Biosample`s related to the specified `Study`."
        "<br /><br />"  # newlines
        "**Note:** The data returned by this API endpoint can be up to 24 hours out of date "
        "with respect to the NMDC database. That's because the cache that underlies this API "
        "endpoint gets refreshed to match the NMDC database once every 24 hours."
    ),
)
def find_data_objects_for_study(
    study_id: Annotated[
        str,
        Path(
            title="Study ID",
            description="""The `id` of the `Study` having `Biosample`s with which you want to find
            associated `DataObject`s.\n\n_Example_: `nmdc:sty-11-abc123`""",
            examples=["nmdc:sty-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """This API endpoint is used to retrieve data objects associated with
    all the biosamples associated with a given study. This endpoint makes
    use of the `alldocs` collection for its implementation.

    :param study_id: NMDC study id for which data objects are to be retrieved
    :param mdb: PyMongo connection, defaults to Depends(get_mongo_db)
    :return: List of dictionaries, each of which has a `biosample_id` entry
        and a `data_objects` entry. The value of the `biosample_id` entry
        is the `Biosample`'s `id`. The value of the `data_objects` entry
        is a list of the `DataObject`s associated with that `Biosample`.
    """
    # Confirm the study exists (we only need its `id`); 404 otherwise.
    study = raise404_if_none(
        mdb.study_set.find_one({"id": study_id}, ["id"]), detail="Study not found"
    )

    # Note: With nmdc-schema v10 (legacy schema), we used the field named `part_of` here.
    # With nmdc-schema v11 (Berkeley schema), we use the field named `associated_studies` here.
    biosamples = mdb.biosample_set.find({"associated_studies": study["id"]}, ["id"])
    biosample_ids = [biosample["id"] for biosample in biosamples]

    # SchemaView interface to NMDC Schema. CURIE-ify the class names so they
    # match the `_type_and_ancestors` values stored in the `alldocs` collection.
    nmdc_view = ViewGetter()
    nmdc_sv = nmdc_view.get_view()
    dg_descendants = [
        (f"nmdc:{t}" if ":" not in t else t)
        for t in nmdc_sv.class_descendants("DataGeneration")
    ]

    def collect_data_objects(doc_ids, collected_objects, unique_ids):
        """Helper function to collect data objects from `has_input` and `has_output` references."""
        for doc_id in doc_ids:
            # Check if this is a DataObject by looking at the document's type directly
            doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
            if (
                doc
                and doc.get("type") == "nmdc:DataObject"
                and doc_id not in unique_ids
            ):
                data_obj = mdb.data_object_set.find_one({"id": doc_id})
                if data_obj:
                    collected_objects.append(strip_oid(data_obj))
                    unique_ids.add(doc_id)

    # Another way in which DataObjects can be related to Biosamples is through the
    # `was_informed_by` key/slot. We need to link records from the `workflow_execution_set`
    # collection that are "informed" by the same DataGeneration records that created
    # the outputs above. Then we need to get additional DataObject records that are
    # created by this linkage.
    def process_informed_by_docs(doc, collected_objects, unique_ids):
        """Process documents linked by `was_informed_by` and collect relevant data objects."""
        # Note: As of nmdc-schema 11.9.0, the `was_informed_by` field, if defined,
        #       will contain a list of strings. In MongoDB, the `{k: v}` filter
        #       can be used to check whether either (a) the value of field `f` is
        #       an array containing `v` as one of its elements, or (b) the value
        #       of field `f` is exactly equal to `v`. We rely on behavior (a) here.
        informed_by_docs = mdb.workflow_execution_set.find(
            {"was_informed_by": doc["id"]}
        )
        for informed_doc in informed_by_docs:
            collect_data_objects(
                informed_doc.get("has_input", []), collected_objects, unique_ids
            )
            collect_data_objects(
                informed_doc.get("has_output", []), collected_objects, unique_ids
            )

    biosample_data_objects = []

    for biosample_id in biosample_ids:
        current_ids = [biosample_id]
        collected_data_objects = []
        unique_ids = set()
        # Guard against cycles in the provenance graph: without this, a
        # `has_input`/`has_output` loop between documents would make the
        # `while` loop below run forever. Revisiting an id could only repeat
        # work (collection is de-duplicated via `unique_ids`), so skipping
        # already-visited ids does not change the result.
        visited_ids = set()

        # Iterate over records in the `alldocs` collection. Look for
        # records that have the given biosample_id as value on the
        # `has_input` key/slot. The retrieved documents might also have a
        # `has_output` key/slot associated with them. Get the value of the
        # `has_output` key and check if it's type is `nmdc:DataObject`. If
        # it's not, repeat the process till it is.
        while current_ids:
            new_current_ids = []
            for current_id in current_ids:
                if current_id in visited_ids:
                    continue
                visited_ids.add(current_id)

                # Query to find all documents with current_id as the value on
                # `has_input` slot
                for doc in mdb.alldocs.find({"has_input": current_id}):
                    has_output = doc.get("has_output", [])

                    # Process `DataGeneration` type documents linked by `was_informed_by`
                    if not has_output and any(
                        t in dg_descendants for t in doc.get("_type_and_ancestors", [])
                    ):
                        process_informed_by_docs(
                            doc, collected_data_objects, unique_ids
                        )
                        continue

                    collect_data_objects(has_output, collected_data_objects, unique_ids)
                    # Add non-DataObject outputs (de-duplicated) to continue the chain.
                    # DataObjects are terminal here: they are collected above and are
                    # never used to extend `current_ids`.
                    for op in has_output:
                        doc_check = mdb.alldocs.find_one({"id": op}, {"type": 1})
                        if (
                            doc_check
                            and doc_check.get("type") != "nmdc:DataObject"
                            and op not in new_current_ids
                        ):
                            new_current_ids.append(op)

                    if any(
                        t in dg_descendants for t in doc.get("_type_and_ancestors", [])
                    ):
                        process_informed_by_docs(
                            doc, collected_data_objects, unique_ids
                        )

            current_ids = new_current_ids

        # Only include biosamples that actually have associated data objects.
        if collected_data_objects:
            result = {
                "biosample_id": biosample_id,
                "data_objects": collected_data_objects,
            }
            biosample_data_objects.append(result)

    return biosample_data_objects
309
+
310
+
311
@router.get(
    "/data_objects/{data_object_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_data_object_by_id(
    data_object_id: Annotated[
        str,
        Path(
            title="DataObject ID",
            description="The `id` of the `DataObject` you want to find.\n\n_Example_: `nmdc:dobj-11-abc123`",
            examples=["nmdc:dobj-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    If the data object identifier is known, the metadata can be retrieved using the GET /data_objects/{data_object_id} endpoint.
    \n Note that only one data object metadata record may be retrieved at a time using this method.
    """
    # Look the data object up by its `id`, respond with 404 if it is absent,
    # and strip MongoDB's internal `_id` field before returning it.
    data_object_doc = mdb["data_object_set"].find_one({"id": data_object_id})
    return strip_oid(raise404_if_none(data_object_doc))
334
+
335
+
336
@router.get(
    "/planned_processes",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_planned_processes(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    The GET /planned_processes endpoint is a general way to fetch metadata about various planned processes (e.g.
    workflow execution, material processing, etc.). Any "slot" (a.k.a. attribute) for
    [`PlannedProcess`](https://w3id.org/nmdc/PlannedProcess) may be used in the filter
    and sort parameters, including attributes of subclasses of *PlannedProcess*.

    For example, attributes used in subclasses such as [`Extraction`](https://w3id.org/nmdc/Extraction)
    (subclass of *PlannedProcess*), can be used as input criteria for the filter and sort parameters of this endpoint.
    """
    # Search only the collections that both can hold PlannedProcess documents
    # and are currently non-empty (set intersection of the two name sets).
    searchable_collection_names = (
        get_planned_process_collection_names()
        & get_nonempty_nmdc_schema_collection_names(mdb)
    )
    return find_resources_spanning(req, mdb, searchable_collection_names)
360
+
361
+
362
@router.get(
    "/planned_processes/{planned_process_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_planned_process_by_id(
    planned_process_id: Annotated[
        str,
        Path(
            title="PlannedProcess ID",
            description="The `id` of the document that represents an instance of "
            "the `PlannedProcess` class or any of its subclasses",
            examples=[r"nmdc:wfmag-11-00jn7876.1"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Returns the document that has the specified `id` and represents an instance of the `PlannedProcess` class
    or any of its subclasses. If no such document exists, returns an HTTP 404 response.
    """
    # Note: We exclude empty collections as a performance optimization
    # (we already know they don't contain the document).
    searchable_collection_names = (
        get_planned_process_collection_names()
        & get_nonempty_nmdc_schema_collection_names(mdb)
    )

    # Search each eligible collection, returning the first match we find
    # (with MongoDB's internal `_id` field stripped out).
    for collection_name in searchable_collection_names:
        document = mdb[collection_name].find_one({"id": planned_process_id})
        if document is not None:
            return strip_oid(document)

    # No collection contained the document, so respond with HTTP 404.
    return raise404_if_none(None)
400
+
401
+
402
+ @router.get(
403
+ "/workflow_executions/{workflow_execution_id}/related_resources",
404
+ response_model_exclude_unset=True,
405
+ name="Find resources related to the specified WorkflowExecution",
406
+ description=(
407
+ "Finds `DataObject`s, `Biosample`s, `Study`s, and other `WorkflowExecution`s "
408
+ "related to the specified `WorkflowExecution`."
409
+ "<br /><br />" # newlines
410
+ "This endpoint returns a JSON object that contains "
411
+ "(a) the specified `WorkflowExecution`, "
412
+ "(b) all the `DataObject`s that are inputs to — or outputs from — the specified `WorkflowExecution`, "
413
+ "(c) all the `Biosample`s that were inputs to those `DataGeneration`s, "
414
+ "(d) all the `Study`s with which those `Biosample`s are associated, and "
415
+ "(e) all the other `WorkflowExecution`s that are part of the same processing pipeline "
416
+ "as the specified `WorkflowExecution`."
417
+ "<br /><br />" # newlines
418
+ "**Note:** The data returned by this API endpoint can be up to 24 hours out of date "
419
+ "with respect to the NMDC database. That's because the cache that underlies this API "
420
+ "endpoint gets refreshed to match the NMDC database once every 24 hours."
421
+ ),
422
+ )
423
+ def find_related_objects_for_workflow_execution(
424
+ workflow_execution_id: Annotated[
425
+ str,
426
+ Path(
427
+ title="Workflow Execution ID",
428
+ description=(
429
+ "The `id` of the `WorkflowExecution` to which you want to find related resources."
430
+ "\n\n"
431
+ "_Example_: `nmdc:wfmgan-11-wdx72h27.1`"
432
+ ),
433
+ examples=["nmdc:wfmgan-11-wdx72h27.1"],
434
+ ),
435
+ ],
436
+ mdb: MongoDatabase = Depends(get_mongo_db),
437
+ ):
438
+ """This API endpoint retrieves resources related to the specified WorkflowExecution,
439
+ including DataObjects that are inputs to — or outputs from — it, other WorkflowExecution
440
+ instances that are part of the same pipeline, and related Biosamples and Studies.
441
+
442
+ :param workflow_execution_id: id of workflow_execution_set instance for which related objects are to be retrieved
443
+ :param mdb: A PyMongo `Database` instance that can be used to access the MongoDB database
444
+ :return: Dictionary with data_objects, related_workflow_executions, biosamples, and studies lists
445
+ """
446
+ # Get the specified `WorkflowExecution` document from the database.
447
+ workflow_execution = raise404_if_none(
448
+ mdb.workflow_execution_set.find_one({"id": workflow_execution_id}),
449
+ detail="Workflow execution not found",
450
+ )
451
+
452
+ # Create empty lists that will contain the related documents we find.
453
+ data_objects = []
454
+ related_workflow_executions = []
455
+ biosamples = []
456
+ studies = []
457
+
458
+ # Create empty sets that we'll use to avoid processing a given document multiple times.
459
+ unique_data_object_ids = set()
460
+ unique_workflow_execution_ids = set()
461
+ unique_biosample_ids = set()
462
+ unique_study_ids = set()
463
+
464
+ # Add the ID of the specified `WorkflowExecution` document, to the set of unique `WorkflowExecution` IDs.
465
+ unique_workflow_execution_ids.add(workflow_execution_id)
466
+
467
+ # Get a `SchemaView` that is bound to the NMDC schema.
468
+ nmdc_view = ViewGetter()
469
+ nmdc_sv = nmdc_view.get_view()
470
+ dg_descendants = [
471
+ (f"nmdc:{t}" if ":" not in t else t)
472
+ for t in nmdc_sv.class_descendants("DataGeneration")
473
+ ]
474
+
475
+ def add_data_object(doc_id: str) -> bool:
476
+ r"""
477
+ Helper function that adds the `DataObject` having the specified `id`
478
+ to our list of `DataObjects`, if it isn't already in there.
479
+ """
480
+ # Check if this is a DataObject by looking at the document's type directly
481
+ doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
482
+ if (
483
+ doc
484
+ and doc.get("type") == "nmdc:DataObject"
485
+ and doc_id not in unique_data_object_ids
486
+ ):
487
+ data_obj = mdb.data_object_set.find_one({"id": doc_id})
488
+ if data_obj:
489
+ data_objects.append(strip_oid(data_obj))
490
+ unique_data_object_ids.add(doc_id)
491
+ return True
492
+ return False
493
+
494
+ def add_workflow_execution(wfe: dict) -> None:
495
+ r"""
496
+ Helper function that adds the specified `WorkflowExecution`
497
+ to our list of `WorkflowExecution`s, if it isn't already in there;
498
+ and adds its related `DataObjects` to our list of `DataObject`s.
499
+ """
500
+ if wfe["id"] not in unique_workflow_execution_ids:
501
+ related_workflow_executions.append(strip_oid(wfe))
502
+ unique_workflow_execution_ids.add(wfe["id"])
503
+
504
+ # Add data objects related to this workflow execution.
505
+ ids_of_inputs = wfe.get("has_input", [])
506
+ ids_of_outputs = wfe.get("has_output", [])
507
+ for doc_id in ids_of_inputs + ids_of_outputs:
508
+ add_data_object(doc_id)
509
+
510
+ def add_biosample(biosample_id: str) -> bool:
511
+ r"""
512
+ Helper function that adds the specified `Biosample`
513
+ to our list of `Biosample`s, if it isn't already in there;
514
+ and adds its related `Study`s to our list of `Study`s.
515
+ """
516
+ if biosample_id not in unique_biosample_ids:
517
+ biosample = mdb.biosample_set.find_one({"id": biosample_id})
518
+ if biosample:
519
+ biosamples.append(strip_oid(biosample))
520
+ unique_biosample_ids.add(biosample_id)
521
+
522
+ # Add studies related to this biosample.
523
+ for study_id in biosample.get("associated_studies", []):
524
+ add_study(study_id)
525
+ return True
526
+ return False
527
+
528
+ def add_study(study_id: str) -> bool:
529
+ r"""
530
+ Helper function that adds the specified `Study`
531
+ to our list of `Study`s, if it isn't already in there.
532
+ """
533
+ if study_id not in unique_study_ids:
534
+ study = mdb.study_set.find_one({"id": study_id})
535
+ if study:
536
+ studies.append(strip_oid(study))
537
+ unique_study_ids.add(study_id)
538
+ return True
539
+ return False
540
+
541
+ def find_biosamples_recursively(start_id: str) -> None:
542
+ r"""
543
+ Recursive helper function that traverses the database in search of relevant `Biosample`s.
544
+
545
+ This function searches for biosamples starting from the "input" to a DataGeneration record by
546
+ traversing the data provenance graph – which is the bipartite graph formed by the
547
+ `has_input` / `has_output` relationships in the schema. It uses the ids asserted on
548
+ `has_input` and `has_output` slots on documents in the `alldocs` collection to tie related documents
549
+ in the chain together.
550
+
551
+ Note: The function uses an internal nested recursive function (`process_id()`) to avoid cycles
552
+ in the graph and tracks processed IDs to prevent infinite recursion.
553
+
554
+ :param start_id: The ID of the document to start the search from. This will typically
555
+ be the input to a `DataGeneration` record, which may be a `Biosample` directly or a
556
+ `ProcessedSample`.
557
+ """
558
+ # Create an empty set we can use to track the `id`s of documents we've already processed,
559
+ # in order to avoid processing the same documents multiple times (i.e. cycling in the graph).
560
+ processed_ids = set()
561
+
562
+ def process_id(current_id):
563
+ r"""
564
+ Recursive helper function that processes a single document ID and follows
565
+ connections to discover related biosamples.
566
+
567
+ This function:
568
+ 1. Checks if the current ID is already processed to prevent cycles
569
+ 2. Directly adds the document if it's a `Biosample`
570
+ 3. For non-Biosample documents (type of `PlannedProcess`), it:
571
+ - Processes input (`has_input`) IDs of the current document
572
+ - Finds documents that have the current ID as output (`has_output`) and processes their inputs
573
+
574
+ This recursive approach allows traversing the provenance graph in both directions.
575
+
576
+ :param current_id: The ID of the document to process in this recursive step
577
+ """
578
+ if current_id in processed_ids:
579
+ return
580
+
581
+ processed_ids.add(current_id)
582
+
583
+ # If it's a `Biosample`, i.e., "type" == "nmdc:Biosample"
584
+ doc = mdb.alldocs.find_one({"id": current_id}, {"type": 1})
585
+ if doc and doc.get("type") == "nmdc:Biosample":
586
+ add_biosample(current_id)
587
+ return
588
+
589
+ # Find the document with this ID to see what it is
590
+ current_doc = mdb.alldocs.find_one({"id": current_id})
591
+ if current_doc:
592
+ # Check if this document has inputs - if so, process them
593
+ for input_id in current_doc.get("has_input", []):
594
+ if input_id not in processed_ids:
595
+ process_id(input_id)
596
+
597
+ # Also find documents that have this ID as an output
598
+ # This is the key to walking backward through the chain
599
+ for doc in mdb.alldocs.find({"has_output": current_id}):
600
+ # Process all inputs of this document
601
+ for input_id in doc.get("has_input", []):
602
+ if input_id not in processed_ids:
603
+ process_id(input_id)
604
+
605
+ # Start the recursive search
606
+ process_id(start_id)
607
+
608
+ # Get the DataObject `id`s that are inputs (`has_input`) to and
609
+ # outputs (`has_output`) from the user-specified WorkflowExecution.
610
+ input_ids = workflow_execution.get("has_input", [])
611
+ output_ids = workflow_execution.get("has_output", [])
612
+
613
+ # Add those DataObjects to our list of DataObjects.
614
+ for doc_id in input_ids + output_ids:
615
+ add_data_object(doc_id)
616
+
617
+ # Find WorkflowExecutions whose inputs are outputs of this WorkflowExecution.
618
+ # Add those to our list of related WorkflowExecutions.
619
+ for output_id in output_ids:
620
+ related_wfes = mdb.workflow_execution_set.find({"has_input": output_id})
621
+ for wfe in related_wfes:
622
+ add_workflow_execution(wfe)
623
+
624
+ # Find WorkflowExecutions whose outputs are inputs of this WorkflowExecution.
625
+ # Add those, too, to our list of related WorkflowExecutions.
626
+ for input_id in input_ids:
627
+ related_wfes = mdb.workflow_execution_set.find({"has_output": input_id})
628
+ for wfe in related_wfes:
629
+ add_workflow_execution(wfe)
630
+
631
+ # Find WorkflowExecutions whose `was_informed_by` list contains that of the user-specified WorkflowExecution.
632
+ # Add those, too, to our list of related WorkflowExecutions.
633
+ if "was_informed_by" in workflow_execution:
634
+ was_informed_by = workflow_execution["was_informed_by"]
635
+
636
+ # Note: We added this assertion in an attempt to facilitate debugging
637
+ # the system in the situation where a `WorkflowExecution` document
638
+ # has a `was_informed_by` field whose value is not a list (which
639
+ # would be a violation of NMDC schema 11.9.0).
640
+ assert isinstance(was_informed_by, list), (
641
+ "A WorkflowExecution's `was_informed_by` field contained "
642
+ f"a {type(was_informed_by)} instead of a list."
643
+ )
644
+
645
+ # Get all WorkflowExecutions that were informed by any of the
646
+ # things that informed the user-specified WorkflowExecution.
647
+ related_wfes = mdb.workflow_execution_set.find(
648
+ {"was_informed_by": {"$in": was_informed_by}}
649
+ )
650
+ for wfe in related_wfes:
651
+ if wfe["id"] != workflow_execution_id:
652
+ add_workflow_execution(wfe)
653
+
654
+ # Get all `DataGeneration`s that informed the user-specified `WorkflowExecution`, then
655
+ # get all `Biosample`s and `Study`s associated with each of those `DataGeneration`s.
656
+ dg_docs = mdb.alldocs.find({"id": {"$in": was_informed_by}})
657
+ for dg_doc in dg_docs:
658
+ if any(t in dg_descendants for t in dg_doc.get("_type_and_ancestors", [])):
659
+ # Get Biosamples from the DataGeneration's `has_input` field by recursively walking up the chain.
660
+ # While we recursively walk up the chain, we'll add those Biosamples to our list of Biosamples.
661
+ for input_id in dg_doc.get("has_input", []):
662
+ find_biosamples_recursively(input_id)
663
+
664
+ # Get Studies associated with the DataGeneration,
665
+ # and add them to our list of Studies.
666
+ for study_id in dg_doc.get("associated_studies", []):
667
+ add_study(study_id)
668
+
669
+ # If the DataGeneration has no associated Studies, but has related Biosamples,
670
+ # add the Studies associated with those Biosamples to our list of Studies.
671
+ if not dg_doc.get("associated_studies") and len(biosamples) > 0:
672
+ for bs in biosamples:
673
+ for study_id in bs.get("associated_studies", []):
674
+ add_study(study_id)
675
+
676
+ # For all data objects we collected, check if they have a `was_generated_by` reference
677
+ # This is a supplementary path to find more relationships
678
+ for data_obj in data_objects:
679
+ if "was_generated_by" in data_obj:
680
+ gen_id = data_obj["was_generated_by"]
681
+ dg_doc = mdb.alldocs.find_one({"id": gen_id})
682
+
683
+ if dg_doc and any(
684
+ t in dg_descendants for t in dg_doc.get("_type_and_ancestors", [])
685
+ ):
686
+ # Get Studies directly associated with the DataGeneration
687
+ for study_id in dg_doc.get("associated_studies", []):
688
+ add_study(study_id)
689
+
690
+ response = {
691
+ "workflow_execution_id": workflow_execution_id, # `WorkflowExecution` `id` provided by user
692
+ "workflow_execution": strip_oid(
693
+ workflow_execution
694
+ ), # the specified `WorkflowExecution`
695
+ "data_objects": data_objects, # related `DataObject`s
696
+ "related_workflow_executions": related_workflow_executions, # related `WorkflowExecution`s
697
+ "biosamples": biosamples, # related `Biosample`s
698
+ "studies": studies, # related `Study`s
699
+ }
700
+
701
+ return response
702
+
703
+
704
# Jinja environment used to render the HTML templates bundled with the
# `nmdc_runtime` package (e.g. `search.html`, rendered by `search_page` below).
# `select_autoescape()` turns on auto-escaping for HTML/XML templates.
jinja_env = Environment(
    loader=PackageLoader("nmdc_runtime"), autoescape=select_autoescape()
)
707
+
708
+
709
def attr_index_sort_key(attr):
    """Sort key that makes the `id` attribute sort before all other attributes.

    `"_"` precedes every ASCII lowercase letter, so mapping `id` to `"_"`
    floats it to the front of an otherwise alphabetical attribute listing.
    """
    if attr == "id":
        return "_"
    return attr
711
+
712
+
713
def documentation_links(jsonschema_dict, collection_names) -> dict:
    """Construct a hierarchical catalog of (links to) schema classes and their slots.

    The returned dictionary `doc_links` is used as input to the Jinja template
    `nmdc_runtime/templates/search.html` in order to support user experience
    for `GET /search`.

    :param jsonschema_dict: NMDC JSON Schema as a dict; must contain
        `$defs.Database.properties` keyed by collection name, and a `$defs`
        entry (with `properties`) for each class referenced by a collection.
    :param collection_names: Iterable of collection names to catalog.
    :return: Nested dict mapping each collection name (either top-level, or
        nested under an `activity_set` umbrella key) to a list of class
        descriptors, each with `collection_name`, `entity_url`, `entity_name`,
        and an alphabetically sorted `entity_attrs` list.
    """

    # Note: All documentation URLs generated within this function will begin with this.
    base_url = r"https://w3id.org/nmdc"

    # Initialize dictionary in which to associate key/value pairs via the following for loop.
    doc_links = {}

    for collection_name in collection_names:
        # Since a given collection can be associated with multiple classes, the `doc_links`
        # dictionary will have a _list_ of values for each collection.
        class_descriptors = []

        # If the collection name is one that the `search.html` page has a dedicated section for,
        # give it a top-level key; otherwise, nest it under `activity_set`.
        key_hierarchy: List[str] = ["activity_set", collection_name]
        if collection_name in ("biosample_set", "study_set", "data_object_set"):
            key_hierarchy = [collection_name]

        # Process the name of each class that the schema associates with this collection.
        collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][
            collection_name
        ]
        class_names = get_class_names_from_collection_spec(collection_spec)
        # Fix: the original used `enumerate` here but never used the index.
        for class_name in class_names:
            # Make a list of dictionaries, each of which describes one attribute of this class.
            entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"])
            entity_attr_descriptors = [
                {"url": f"{base_url}/{attr_name}", "attr_name": attr_name}
                for attr_name in entity_attrs
            ]

            # Make a dictionary describing this class.
            class_descriptor = {
                "collection_name": collection_name,
                "entity_url": f"{base_url}/{class_name}",
                "entity_name": class_name,
                "entity_attrs": sorted(
                    entity_attr_descriptors, key=itemgetter("attr_name")
                ),
            }

            # Add that descriptor to this collection's list of class descriptors.
            class_descriptors.append(class_descriptor)

        # Add a key/value pair describing this collection to the `doc_links` dictionary.
        # Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in
        doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors)

    return doc_links
768
+
769
+
770
@router.get("/search", response_class=HTMLResponse, include_in_schema=False)
def search_page(
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """Render the interactive HTML page served at `GET /search`.

    Combines the set of activity collections (searchable by `id`) with the
    explicitly configured per-collection attribute indexes, builds the schema
    documentation catalog, and feeds both to the `search.html` template.
    """
    template = jinja_env.get_template("search.html")

    # Every activity collection is searchable by `id`; the remaining collections
    # expose their configured attribute sets (always including `id`, sorted first).
    per_activity_attrs = {name: {"id"} for name in activity_collection_names(mdb)}
    per_entity_attrs = {
        collection: sorted(attr_set | {"id"}, key=attr_index_sort_key)
        for collection, attr_set in entity_attributes_to_index.items()
    }
    indexed_entity_attributes = merge(per_activity_attrs, per_entity_attrs)

    # The documentation catalog covers all activity collections plus the three
    # collections that have dedicated sections on the page.
    catalog_collections = list(activity_collection_names(mdb)) + [
        "biosample_set",
        "study_set",
        "data_object_set",
    ]
    doc_links = documentation_links(get_nmdc_jsonschema_dict(), catalog_collections)

    html_content = template.render(
        activity_collection_names=sorted(activity_collection_names(mdb)),
        indexed_entity_attributes=indexed_entity_attributes,
        doc_links=doc_links,
    )
    return HTMLResponse(content=html_content, status_code=200)