nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nmdc-runtime might be problematic.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +22 -2
- nmdc_runtime/api/core/idgen.py +36 -6
- nmdc_runtime/api/db/mongo.py +0 -12
- nmdc_runtime/api/endpoints/find.py +65 -225
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
- nmdc_runtime/api/endpoints/objects.py +4 -11
- nmdc_runtime/api/endpoints/operations.py +0 -27
- nmdc_runtime/api/endpoints/queries.py +22 -0
- nmdc_runtime/api/endpoints/sites.py +0 -24
- nmdc_runtime/api/endpoints/util.py +57 -35
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +84 -60
- nmdc_runtime/api/models/util.py +12 -5
- nmdc_runtime/api/openapi.py +116 -180
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/minter/adapters/repository.py +21 -0
- nmdc_runtime/minter/domain/model.py +20 -0
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +632 -11
- nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
- nmdc_runtime/site/graphs.py +7 -0
- nmdc_runtime/site/ops.py +92 -34
- nmdc_runtime/site/repository.py +2 -0
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +87 -1
- nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
- nmdc_runtime/api/endpoints/ids.py +0 -192
- nmdc_runtime/client/__init__.py +0 -0
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/__init__.py +0 -0
- nmdc_runtime/core/db/Database.py +0 -13
- nmdc_runtime/core/db/__init__.py +0 -0
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/__init__.py +0 -0
- nmdc_runtime/domain/users/__init__.py +0 -0
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/models/user.py +0 -1
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -33
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -825
- nmdc_runtime/lib/nmdc_etl_class.py +0 -396
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/__init__.py +0 -0
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
- nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/api/endpoints/find.py

@@ -1,21 +1,17 @@
-
-from typing import
+import logging
+from typing import Annotated
 
 from fastapi import APIRouter, Depends, Path, Query
-from jinja2 import Environment, PackageLoader, select_autoescape
-from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from pymongo.database import Database as MongoDatabase
-from starlette.responses import HTMLResponse
-from toolz import merge, assoc_in
 
 from nmdc_schema.get_nmdc_view import ViewGetter
 from nmdc_runtime.api.core.util import raise404_if_none
 from nmdc_runtime.api.db.mongo import (
     get_mongo_db,
-    activity_collection_names,
     get_planned_process_collection_names,
     get_nonempty_nmdc_schema_collection_names,
 )
+from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances
 from nmdc_runtime.api.endpoints.util import (
     find_resources,
     strip_oid,

@@ -25,9 +21,8 @@ from nmdc_runtime.api.models.metadata import Doc
 from nmdc_runtime.api.models.util import (
     FindResponse,
     FindRequest,
-    entity_attributes_to_index,
 )
-
+
 
 router = APIRouter()
 
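The import changes above preview the endpoint rewrite: `logging` backs the debug message added in the next hunk, `Annotated` supports FastAPI's annotated parameter style, and `get_linked_instances` replaces the Jinja/`toolz` machinery that served the now-removed `/search` page. As a refresher, a minimal sketch of the `Annotated` parameter pattern (the route and description here are illustrative; the actual signature is not part of this diff):

    from typing import Annotated

    from fastapi import APIRouter, Path

    router = APIRouter()

    @router.get("/data_objects/study/{study_id}")  # illustrative route
    def find_data_objects_for_study(
        study_id: Annotated[str, Path(description="The `id` of the `Study`")],
    ):
        # `Annotated` attaches the `Path(...)` metadata to the plain `str` type,
        # so FastAPI can validate and document the parameter.
        return []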
@@ -178,133 +173,71 @@ def find_data_objects_for_study(
     is a list of the `DataObject`s associated with that `Biosample`.
     """
     biosample_data_objects = []
-    study = raise404_if_none(
-        mdb.study_set.find_one({"id": study_id}, ["id"]), detail="Study not found"
-    )
-
-    # Note: With nmdc-schema v10 (legacy schema), we used the field named `part_of` here.
-    # With nmdc-schema v11 (Berkeley schema), we use the field named `associated_studies` here.
-    biosamples = mdb.biosample_set.find({"associated_studies": study["id"]}, ["id"])
-    biosample_ids = [biosample["id"] for biosample in biosamples]
-
-    # SchemaView interface to NMDC Schema
-    nmdc_view = ViewGetter()
-    nmdc_sv = nmdc_view.get_view()
-    dg_descendants = [
-        (f"nmdc:{t}" if ":" not in t else t)
-        for t in nmdc_sv.class_descendants("DataGeneration")
-    ]
-
-    def collect_data_objects(doc_ids, collected_objects, unique_ids):
-        """Helper function to collect data objects from `has_input` and `has_output` references."""
-        for doc_id in doc_ids:
-            # Check if this is a DataObject by looking at the document's type directly
-            doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
-            if (
-                doc
-                and doc.get("type") == "nmdc:DataObject"
-                and doc_id not in unique_ids
-            ):
-                data_obj = mdb.data_object_set.find_one({"id": doc_id})
-                if data_obj:
-                    collected_objects.append(strip_oid(data_obj))
-                    unique_ids.add(doc_id)
-
-    # Another way in which DataObjects can be related to Biosamples is through the
-    # `was_informed_by` key/slot. We need to link records from the `workflow_execution_set`
-    # collection that are "informed" by the same DataGeneration records that created
-    # the outputs above. Then we need to get additional DataObject records that are
-    # created by this linkage.
-    def process_informed_by_docs(doc, collected_objects, unique_ids):
-        """Process documents linked by `was_informed_by` and collect relevant data objects."""
-        # Note: As of nmdc-schema 11.9.0, the `was_informed_by` field, if defined,
-        # will contain a list of strings. In MongoDB, the `{k: v}` filter
-        # can be used to check whether either (a) the value of field `f` is
-        # an array containing `v` as one of its elements, or (b) the value
-        # of field `f` is exactly equal to `v`. We rely on behavior (a) here.
-        informed_by_docs = mdb.workflow_execution_set.find(
-            {"was_informed_by": doc["id"]}
-        )
-        for informed_doc in informed_by_docs:
-            collect_data_objects(
-                informed_doc.get("has_input", []), collected_objects, unique_ids
-            )
-            collect_data_objects(
-                informed_doc.get("has_output", []), collected_objects, unique_ids
-            )
 
-
+    # Respond with an error if the specified `Study` does not exist.
+    # Note: We project only the `_id` field, to minimize data transfer.
+    raise404_if_none(
+        mdb["study_set"].find_one({"id": study_id}, projection={"_id": 1}),
+        detail="Study not found",
+    )
 
-    [... old lines 238-290 (53 removed lines) are collapsed in the source diff view ...]
-                has_output, collected_data_objects, unique_ids
-            )
-            # Add non-DataObject outputs to continue the chain
-            for op in has_output:
-                doc_check = mdb.alldocs.find_one({"id": op}, {"type": 1})
-                if doc_check and doc_check.get("type") != "nmdc:DataObject":
-                    new_current_ids.append(op)
-
-        current_ids = new_current_ids
-
-    if collected_data_objects:
-        result = {
+    # Use the `get_linked_instances` function—which is the function that
+    # underlies the `/nmdcschema/linked_instances` API endpoint—to get all
+    # the `Biosample`s that are downstream of the specified `Study`.
+    #
+    # Note: The `get_linked_instances` function requires that a `max_page_size`
+    #       integer argument be passed in. In our case, we want to get _all_ of
+    #       the instances. Python has no "infinity" integer; and, even if it did,
+    #       if we were to specify too large of an integer, we'd get this error:
+    #       > "OverflowError: MongoDB can only handle up to 8-byte ints"
+    #       So, as a workaround, we pass in a number that is large enough that we
+    #       think it will account for all cases in practice (e.g., a study having
+    #       a trillion biosamples or a trillion data objects).
+    #
+    # TODO: Update the `get_linked_instances` function to optionally impose _no_ limit.
+    #
+    large_max_page_size: int = 1_000_000_000_000
+    linked_biosamples_result: dict = get_linked_instances(
+        ids=[study_id],
+        types=["nmdc:Biosample"],
+        hydrate=False,  # we'll only use their `id` values
+        page_token=None,
+        max_page_size=large_max_page_size,
+        mdb=mdb,
+    )
+    biosample_ids = [d["id"] for d in linked_biosamples_result.get("resources", [])]
+    logging.debug(f"Found {len(biosample_ids)} Biosamples for Study {study_id}")
+
+    # Get all the `DataObject`s that are downstream from any of those `Biosample`s.
+    data_objects_by_biosample_id = {}
+    linked_data_objects_result: dict = get_linked_instances(
+        ids=biosample_ids,
+        types=["nmdc:DataObject"],
+        hydrate=True,  # we want the full `DataObject` documents
+        page_token=None,
+        max_page_size=large_max_page_size,
+        mdb=mdb,
+    )
+    for data_object in linked_data_objects_result.get("resources", []):
+        upstream_biosample_id = data_object["_downstream_of"][0]
+        if upstream_biosample_id not in data_objects_by_biosample_id.keys():
+            data_objects_by_biosample_id[upstream_biosample_id] = []
+
+        # Strip away the metadata fields injected by `get_linked_instances()`.
+        data_object.pop("_upstream_of", None)
+        data_object.pop("_downstream_of", None)
+        data_objects_by_biosample_id[upstream_biosample_id].append(data_object)
+
+    # Convert the `data_objects_by_biosample_id` dictionary into a list of dicts;
+    # i.e., into the format returned by the initial version of this API endpoint,
+    # which did not use the `get_linked_instances` function under the hood.
+    for biosample_id, data_objects in data_objects_by_biosample_id.items():
+        biosample_data_objects.append(
+            {
                 "biosample_id": biosample_id,
-                "data_objects":
+                "data_objects": data_objects,
             }
-
-
+        )
     return biosample_data_objects
 
 
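The rewrite replaces the hand-rolled `has_input`/`has_output`/`was_informed_by` traversal with two hops through `get_linked_instances`: first the `Biosample`s downstream of the `Study`, then the `DataObject`s downstream of those `Biosample`s. A client-side sketch of the same two hops against the `/nmdcschema/linked_instances` endpoint (a hypothetical sketch: the base URL and study id are placeholders, and the query-parameter names are assumed to mirror the `ids`/`types`/`hydrate` arguments above):

    import requests

    API_BASE = "https://api.microbiomedata.org"  # assumed deployment URL

    def linked_instances(ids: list[str], types: list[str], hydrate: bool = False) -> list[dict]:
        """Fetch the first page of instances linked to `ids`, filtered to `types`."""
        resp = requests.get(
            f"{API_BASE}/nmdcschema/linked_instances",
            params={"ids": ids, "types": types, "hydrate": hydrate},
        )
        resp.raise_for_status()
        return resp.json().get("resources", [])

    # Hop 1: ids of Biosamples downstream of a (placeholder) Study.
    biosample_ids = [
        b["id"] for b in linked_instances(["nmdc:sty-00-000001"], ["nmdc:Biosample"])
    ]

    # Hop 2: full DataObject documents downstream of those Biosamples.
    data_objects = linked_instances(biosample_ids, ["nmdc:DataObject"], hydrate=True)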
@@ -699,96 +632,3 @@ def find_related_objects_for_workflow_execution(
     }
 
     return response
-
-
-jinja_env = Environment(
-    loader=PackageLoader("nmdc_runtime"), autoescape=select_autoescape()
-)
-
-
-def attr_index_sort_key(attr):
-    return "_" if attr == "id" else attr
-
-
-def documentation_links(jsonschema_dict, collection_names) -> dict:
-    """This function constructs a hierarchical catalog of (links to) schema classes and their slots.
-
-    The returned dictionary `doc_links` is used as input to the Jinja template `nmdc_runtime/templates/search.html`
-    in order to support user experience for `GET /search`.
-    """
-
-    # Note: All documentation URLs generated within this function will begin with this.
-    base_url = r"https://w3id.org/nmdc"
-
-    # Initialize dictionary in which to associate key/value pairs via the following for loop.
-    doc_links = {}
-
-    for collection_name in collection_names:
-        # Since a given collection can be associated with multiple classes, the `doc_links` dictionary
-        # will have a _list_ of values for each collection.
-        class_descriptors = []
-
-        # If the collection name is one that the `search.html` page has a dedicated section for,
-        # give it a top-level key; otherwise, nest it under `activity_set`.
-        key_hierarchy: List[str] = ["activity_set", collection_name]
-        if collection_name in ("biosample_set", "study_set", "data_object_set"):
-            key_hierarchy = [collection_name]
-
-        # Process the name of each class that the schema associates with this collection.
-        collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][
-            collection_name
-        ]
-        class_names = get_class_names_from_collection_spec(collection_spec)
-        for idx, class_name in enumerate(class_names):
-            # Make a list of dictionaries, each of which describes one attribute of this class.
-            entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"])
-            entity_attr_descriptors = [
-                {"url": f"{base_url}/{attr_name}", "attr_name": attr_name}
-                for attr_name in entity_attrs
-            ]
-
-            # Make a dictionary describing this class.
-            class_descriptor = {
-                "collection_name": collection_name,
-                "entity_url": f"{base_url}/{class_name}",
-                "entity_name": class_name,
-                "entity_attrs": sorted(
-                    entity_attr_descriptors, key=itemgetter("attr_name")
-                ),
-            }
-
-            # Add that descriptor to this collection's list of class descriptors.
-            class_descriptors.append(class_descriptor)
-
-        # Add a key/value pair describing this collection to the `doc_links` dictionary.
-        # Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in
-        doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors)
-
-    return doc_links
-
-
-@router.get("/search", response_class=HTMLResponse, include_in_schema=False)
-def search_page(
-    mdb: MongoDatabase = Depends(get_mongo_db),
-):
-    template = jinja_env.get_template("search.html")
-    indexed_entity_attributes = merge(
-        {n: {"id"} for n in activity_collection_names(mdb)},
-        {
-            coll: sorted(attrs | {"id"}, key=attr_index_sort_key)
-            for coll, attrs in entity_attributes_to_index.items()
-        },
-    )
-    doc_links = documentation_links(
-        get_nmdc_jsonschema_dict(),
-        (
-            list(activity_collection_names(mdb))
-            + ["biosample_set", "study_set", "data_object_set"]
-        ),
-    )
-    html_content = template.render(
-        activity_collection_names=sorted(activity_collection_names(mdb)),
-        indexed_entity_attributes=indexed_entity_attributes,
-        doc_links=doc_links,
-    )
-    return HTMLResponse(content=html_content, status_code=200)
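This hunk removes the server-rendered `GET /search` page and its helpers, which explains the imports dropped in the first hunk (`jinja2`, `HTMLResponse`, `toolz`, `entity_attributes_to_index`). The one non-obvious primitive in the removed code is `toolz.assoc_in`, which the catalog builder used to write a value at a nested key path, creating intermediate dicts as needed. A minimal sketch (the descriptor shown is simplified relative to the removed code):

    from toolz import assoc_in

    # Mirrors the removed `documentation_links` bookkeeping: descriptors for a
    # collection without a dedicated page section get nested under "activity_set".
    doc_links = assoc_in(
        {},
        keys=["activity_set", "workflow_execution_set"],
        value=[{"entity_name": "WorkflowExecution"}],
    )
    assert doc_links == {
        "activity_set": {
            "workflow_execution_set": [{"entity_name": "WorkflowExecution"}]
        }
    }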
nmdc_runtime/api/endpoints/lib/linked_instances.py (new file)

@@ -0,0 +1,180 @@
+"""
+
+This module houses logic for the `GET /nmdcschema/linked_instances` endpoint, defined as
+`nmdc_runtime.api.endpoints.nmdcschema.linked_instances`, to avoid (further) bloating the
+`nmdc_runtime.api.endpoints.nmdcschema` module.
+
+"""
+
+from typing import Literal, Any
+
+from bson import ObjectId
+from pymongo.collection import Collection as MongoCollection
+from pymongo.database import Database as MongoDatabase
+from toolz import merge
+
+from nmdc_runtime.api.core.util import hash_from_str
+from nmdc_runtime.util import get_class_name_to_collection_names_map, nmdc_schema_view
+
+
+def hash_from_ids_and_types(ids: list[str], types: list[str]) -> str:
+    """A quick hash as a function of `ids` and `types`.
+
+    This will serve as part of a temporary mongo collection name.
+    Because it will only be "part of" the name, avoiding hash collisions isn't a priority.
+
+    Returns a hex digest truncated to 8 characters, so 16**8 ≈ 4.3 billion possible values.
+    """
+    return hash_from_str(
+        ",".join(sorted(ids)) + "." + ",".join(sorted(types)), algo="md5"
+    )[:8]
+
+
+def temp_linked_instances_collection_name(ids: list[str], types: list[str]) -> str:
+    """A name for a temporary mongo collection to store linked instances in service of an API request."""
+    return f"_runtime.tmp.linked_instances.{hash_from_ids_and_types(ids=ids, types=types)}.{ObjectId()}"
+
+
+def gather_linked_instances(
+    alldocs_collection: MongoCollection,
+    ids: list[str],
+    types: list[str],
+) -> str:
+    """Collect linked instances and store them in a new temporary collection.
+
+    Run an aggregation pipeline over `alldocs_collection` that collects ∈`types` instances linked to `ids`.
+    The pipeline is run twice, once for each of the {"downstream", "upstream"} directions.
+    """
+    merge_into_collection_name = temp_linked_instances_collection_name(
+        ids=ids, types=types
+    )
+    for direction in ["downstream", "upstream"]:
+        _ = list(
+            alldocs_collection.aggregate(
+                pipeline_for_direction(
+                    ids=ids,
+                    types=types,
+                    direction=direction,
+                    merge_into_collection_name=merge_into_collection_name,
+                ),
+                allowDiskUse=True,
+            )
+        )
+    return merge_into_collection_name
+
+
+def pipeline_for_direction(
+    ids: list[str],
+    types: list[str],
+    direction: Literal["downstream", "upstream"],
+    merge_into_collection_name: str,
+    alldocs_collection_name: str = "alldocs",
+) -> list:
+    """A pure function that returns the aggregation pipeline for `direction`.
+
+    The pipeline
+    - collects ∈`types` instances linked to `ids` along `direction`,
+    - retains only those document fields essential to the caller, and
+    - ensures the collected instances are present, and properly updated if applicable, in a merge-target collection.
+    """
+    return pipeline_for_instances_linked_to_ids_by_direction(
+        ids=ids,
+        types=types,
+        direction=direction,
+        alldocs_collection_name=alldocs_collection_name,
+    ) + [
+        {"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}},
+        pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+            merge_into_collection_name=merge_into_collection_name, direction=direction
+        ),
+    ]
+
+
+def pipeline_for_instances_linked_to_ids_by_direction(
+    ids: list[str],
+    types: list[str],
+    direction: Literal["downstream", "upstream"],
+    alldocs_collection_name: str = "alldocs",
+    slim: bool = True,
+) -> list[dict[str, Any]]:
+    """
+    Returns an aggregation pipeline that:
+    - traverses the graph of documents in the alldocs collection, following `direction`-specific relationships
+      to discover documents linked to the documents given by `ids`,
+    - `$unwind`s the collected (via `$graphLookup`) docs,
+    - filters them by the given `types` of interest,
+    - adds bookkeeping information about `direction`ality, and
+    - (optionally) projects only essential fields to reduce response latency and size.
+    """
+    return [
+        {"$match": {"id": {"$in": ids}}},
+        {
+            "$graphLookup": {
+                "from": alldocs_collection_name,
+                "startWith": f"$_{direction}.id",
+                "connectFromField": f"_{direction}.id",
+                "connectToField": "id",
+                "as": f"{direction}_docs",
+            }
+        },
+        {"$unwind": {"path": f"${direction}_docs"}},
+        {"$match": {f"{direction}_docs._type_and_ancestors": {"$in": types}}},
+        {"$addFields": {f"{direction}_docs._{direction}_of": ["$id"]}},
+        {"$replaceRoot": {"newRoot": f"${direction}_docs"}},
+    ] + ([{"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}}] if slim else [])
+
+
+def pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+    merge_into_collection_name: str,
+    direction: Literal["downstream", "upstream"],
+) -> dict[str, Any]:
+    """
+    Returns an aggregation-pipeline stage that merges its input document stream into a collection dedicated to
+    serving the caller in a manner amenable to pagination across multiple HTTP requests.
+    """
+    return {
+        "$merge": {
+            "into": merge_into_collection_name,
+            "on": "_id",
+            "whenMatched": [
+                {
+                    "$set": {
+                        f"_{direction}_of": {
+                            "$setUnion": [
+                                f"$_{direction}_of",
+                                f"$$new._{direction}_of",
+                            ]
+                        }
+                    }
+                }
+            ],
+            "whenNotMatched": "insert",
+        }
+    }
+
+
+def hydrated(resources: list[dict], mdb: MongoDatabase) -> list[dict]:
+    """Replace each `dict` in `resources` with a hydrated version.
+
+    Instead of returning the retrieved "full" documents as is, we merge each one with (a copy of) the corresponding
+    original document in `resources`, which includes additional fields, e.g. `_upstream_of` and `_downstream_of`.
+    """
+    class_name_to_collection_names_map = get_class_name_to_collection_names_map(
+        nmdc_schema_view()
+    )
+    types_of_resources = {r["type"] for r in resources}
+    full_docs_by_id = {}
+
+    for type in types_of_resources:
+        resource_ids_of_type = [d["id"] for d in resources if d["type"] == type]
+        schema_collection = mdb.get_collection(
+            # Note: We are assuming that documents of a given type are only allowed (by the schema) to reside in one
+            # collection. Based on that assumption, we will query only the _first_ collection whose name we get from
+            # the map. This assumption is continuously verified prior to code deployment via
+            # `test_get_class_name_to_collection_names_map_has_one_and_only_one_collection_name_per_class_name`.
+            class_name_to_collection_names_map[type.removeprefix("nmdc:")][0]
+        )
+        for doc in schema_collection.find({"id": {"$in": resource_ids_of_type}}):
+            full_docs_by_id[doc["id"]] = doc
+
+    return [merge(r, full_docs_by_id[r["id"]]) for r in resources]
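For concreteness, here is what `pipeline_for_direction` evaluates to for a single downstream pass, written out by hand from the functions above (the seed id and temp-collection name are placeholders). Note that the slim `$project` stage appears twice, once via the `slim=True` default and once appended by `pipeline_for_direction`; the repetition is redundant but harmless.

    # pipeline_for_direction(
    #     ids=["nmdc:sty-00-000001"],  # placeholder seed id
    #     types=["nmdc:Biosample"],
    #     direction="downstream",
    #     merge_into_collection_name="_runtime.tmp.linked_instances.xxxxxxxx.<ObjectId>",
    # ) evaluates to:
    pipeline = [
        # Seed the traversal at the requested documents.
        {"$match": {"id": {"$in": ["nmdc:sty-00-000001"]}}},
        # Transitively follow `_downstream.id` edges through `alldocs`.
        {
            "$graphLookup": {
                "from": "alldocs",
                "startWith": "$_downstream.id",
                "connectFromField": "_downstream.id",
                "connectToField": "id",
                "as": "downstream_docs",
            }
        },
        # One output document per discovered instance.
        {"$unwind": {"path": "$downstream_docs"}},
        # Keep only instances whose type (or ancestor type) was requested.
        {"$match": {"downstream_docs._type_and_ancestors": {"$in": ["nmdc:Biosample"]}}},
        # Record which seed each instance was reached from.
        {"$addFields": {"downstream_docs._downstream_of": ["$id"]}},
        {"$replaceRoot": {"newRoot": "$downstream_docs"}},
        # Slim projection (emitted twice; see note above).
        {"$project": {"id": 1, "type": 1, "_downstream_of": 1}},
        {"$project": {"id": 1, "type": 1, "_downstream_of": 1}},
        # Upsert into the temp collection; `$setUnion` unions the `_downstream_of`
        # provenance lists when the same instance is reached from multiple seeds.
        {
            "$merge": {
                "into": "_runtime.tmp.linked_instances.xxxxxxxx.<ObjectId>",
                "on": "_id",
                "whenMatched": [
                    {
                        "$set": {
                            "_downstream_of": {
                                "$setUnion": ["$_downstream_of", "$$new._downstream_of"]
                            }
                        }
                    }
                ],
                "whenNotMatched": "insert",
            }
        },
    ]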