nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nmdc-runtime might be problematic.

Files changed (77)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED
@@ -275,6 +275,120 @@ def load_mappings(url):
     return attribute_mappings, slot_range_mappings
 
 
+def check_pooling_for_biosamples(
+    material_processing_set: Collection, biosamples_list: List[Dict[str, Any]]
+) -> Dict[str, Dict[str, Any]]:
+    """Check which biosamples are part of pooling processes and return pooling information.
+
+    A biosample is considered part of a Pooling process when its id is asserted
+    on the `has_input` slot/key of an `nmdc:Pooling` process instance.
+
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of all biosamples to check
+    :return: dictionary mapping biosample_id to pooling information (empty dict if not pooled)
+    """
+    result = {}
+    # build a lookup from biosample id to biosample document
+    biosample_lookup = {bs["id"]: bs for bs in biosamples_list}
+
+    # get list of all pooling processes
+    pooling_processes = list(material_processing_set.find({"type": "nmdc:Pooling"}))
+
+    # initialize all biosamples as not pooled
+    for biosample in biosamples_list:
+        result[biosample["id"]] = {}
+
+    # process each pooling process
+    for pooling_process in pooling_processes:
+        pooled_biosample_ids = pooling_process.get("has_input", [])
+
+        # get the processed sample output from the pooling process
+        has_output = pooling_process.get("has_output", [])
+        processed_sample_id = None
+
+        for output_id in has_output:
+            if get_classname_from_typecode(output_id) == "ProcessedSample":
+                processed_sample_id = output_id
+                break
+
+        # aggregate the `collection_date` and `depth` values asserted on each of
+        # the biosamples that are part of a given pooling process, e.g.:
+        #   aggregated `collection_date`: 2017-06-05T16:50Z/2017-06-05T17:47Z
+        #   aggregated `depth`: 0-10 m
+        collection_dates = []
+        depths = []
+
+        for bs_id in pooled_biosample_ids:
+            biosample = biosample_lookup.get(bs_id)
+            if not biosample:
+                continue
+
+            if "collection_date" in biosample:
+                collection_date = biosample["collection_date"]
+                if (
+                    isinstance(collection_date, dict)
+                    and "has_raw_value" in collection_date
+                ):
+                    collection_dates.append(collection_date["has_raw_value"])
+                elif isinstance(collection_date, str):
+                    collection_dates.append(collection_date)
+
+            if "depth" in biosample:
+                depth = biosample["depth"]
+                if isinstance(depth, dict):
+                    if "has_numeric_value" in depth:
+                        depths.append(depth["has_numeric_value"])
+                    elif (
+                        "has_minimum_numeric_value" in depth
+                        and "has_maximum_numeric_value" in depth
+                    ):
+                        depths.extend(
+                            [
+                                depth["has_minimum_numeric_value"],
+                                depth["has_maximum_numeric_value"],
+                            ]
+                        )
+                elif isinstance(depth, (int, float)):
+                    depths.append(depth)
+
+        # create aggregated (forward-slash-separated) value for `collection_date`
+        aggregated_collection_date = None
+        if collection_dates:
+            sorted_dates = sorted(collection_dates)
+            if len(sorted_dates) > 1:
+                aggregated_collection_date = f"{sorted_dates[0]}/{sorted_dates[-1]}"
+            else:
+                aggregated_collection_date = sorted_dates[0]
+
+        # create aggregated (hyphen-separated) value for `depth`
+        aggregated_depth = None
+        if depths:
+            min_depth = min(depths)
+            max_depth = max(depths)
+            if min_depth != max_depth:
+                aggregated_depth = f"{min_depth}-{max_depth} m"
+            else:
+                aggregated_depth = f"{min_depth} m"
+
+        # update all biosamples that are part of this pooling process
+        pooling_info = {
+            "processed_sample_id": processed_sample_id,
+            "pooling_process_id": pooling_process.get("id"),
+            "pooled_biosample_ids": pooled_biosample_ids,
+            "aggregated_collection_date": aggregated_collection_date,
+            "aggregated_depth": aggregated_depth,
+        }
+
+        for bs_id in pooled_biosample_ids:
+            if bs_id in result:
+                result[bs_id] = pooling_info
+
+    return result
+
+
 def validate_xml(xml, xsd_url):
     response = requests.get(xsd_url)
     response.raise_for_status()
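For a concrete sense of the aggregation semantics, here is a minimal sketch of calling the new helper. All ids and values are invented, the pymongo collection is stubbed out, and it is assumed that nmdc-runtime is importable and that `get_classname_from_typecode` resolves the `has_output` id below to "ProcessedSample".

    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples

    class FakePoolingCollection:
        # Stand-in for the material_processing_set pymongo collection.
        def find(self, query):
            return [
                {
                    "id": "nmdc:poolp-11-ddd444",
                    "type": "nmdc:Pooling",
                    "has_input": ["nmdc:bsm-11-aaa111", "nmdc:bsm-11-bbb222"],
                    "has_output": ["nmdc:procsm-11-ccc333"],
                }
            ]

    biosamples = [
        {
            "id": "nmdc:bsm-11-aaa111",
            "collection_date": "2017-06-05T16:50Z",
            "depth": {"has_numeric_value": 0},
        },
        {
            "id": "nmdc:bsm-11-bbb222",
            "collection_date": "2017-06-05T17:47Z",
            "depth": {"has_numeric_value": 10},
        },
    ]

    info = check_pooling_for_biosamples(FakePoolingCollection(), biosamples)
    # Both pooled biosample ids map to the same pooling_info, e.g.:
    #   info["nmdc:bsm-11-aaa111"]["aggregated_collection_date"]
    #       == "2017-06-05T16:50Z/2017-06-05T17:47Z"
    #   info["nmdc:bsm-11-aaa111"]["aggregated_depth"] == "0-10 m"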
nmdc_runtime/site/graphs.py CHANGED
@@ -53,6 +53,7 @@ from nmdc_runtime.site.ops import (
     get_data_objects_from_biosamples,
     get_nucleotide_sequencing_from_biosamples,
     get_library_preparation_from_biosamples,
+    get_aggregated_pooled_biosamples,
     get_all_instruments,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
@@ -173,6 +174,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
@@ -193,6 +195,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
 
     validate_metadata(database)
@@ -213,6 +216,7 @@ def ingest_metadata_submission():
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
@@ -233,6 +237,7 @@ def ingest_metadata_submission():
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
 
     log_database_ids(database)
@@ -472,6 +477,7 @@ def nmdc_study_to_ncbi_submission_export():
     )
     data_object_records = get_data_objects_from_biosamples(biosamples)
     library_preparation_records = get_library_preparation_from_biosamples(biosamples)
+    pooled_biosamples_data = get_aggregated_pooled_biosamples(biosamples)
     all_instruments = get_all_instruments()
     xml_data = ncbi_submission_xml_from_nmdc_study(
         nmdc_study,
@@ -481,6 +487,7 @@ def nmdc_study_to_ncbi_submission_export():
         data_object_records,
         library_preparation_records,
         all_instruments,
+        pooled_biosamples_data,
     )
     ncbi_submission_xml_asset(xml_data)
 
nmdc_runtime/site/ops.py CHANGED
@@ -1,7 +1,6 @@
 import csv
 import json
 import logging
-import mimetypes
 import os
 import subprocess
 from collections import defaultdict
@@ -16,6 +15,7 @@ from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests
 from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
+from toolz import dissoc
 
 from bson import ObjectId, json_util
 from dagster import (
@@ -73,7 +73,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
 )
-from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
     GoldApiClient,
@@ -95,15 +94,12 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
 )
 from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
 from nmdc_runtime.site.util import (
-    run_and_log,
     schema_collection_has_index_on_id,
     nmdc_study_id_to_filename,
     get_instruments_by_id,
 )
 from nmdc_runtime.util import (
-    drs_object_in_for,
     pluralize,
-    put_object,
     specialize_activity_set_docs,
     collection_name_to_class_names,
     nmdc_schema_view,
@@ -112,7 +108,7 @@ from nmdc_runtime.util import (
 from nmdc_schema import nmdc
 from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
-from starlette import status
+from pymongo.collection import Collection as MongoCollection
 from toolz import get_in, valfilter, identity
 
 
@@ -373,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
 
 @op(required_resource_keys={"runtime_api_site_client"})
 def get_json_in(context):
+    """
+    TODO: Document this function.
+    """
     object_id = context.op_config.get("object_id")
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     rv = client.get_object_bytes(object_id)
@@ -385,6 +384,9 @@ def get_json_in(context):
 
 @op(required_resource_keys={"runtime_api_site_client", "mongo"})
 def perform_mongo_updates(context, json_in):
+    """
+    TODO: Document this function.
+    """
     mongo = context.resources.mongo
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     op_id = context.op_config.get("operation_id")
@@ -414,6 +416,9 @@ def perform_mongo_updates(context, json_in):
 def _add_schema_docs_with_or_without_replacement(
     mongo: MongoDBResource, docs: Dict[str, list]
 ):
+    """
+    TODO: Document this function.
+    """
     coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
     if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
@@ -437,7 +442,13 @@ def _add_schema_docs_with_or_without_replacement(
             f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
         )
     op_result = mongo.add_docs(docs, validate=False, replace=replace)
-    return mongo_add_docs_result_as_dict(op_result)
+
+    # Translate the operation result into a dictionary in which each item's key is a collection name
+    # and each item's value is the corresponding bulk API result (excluding the "upserted" field).
+    return {
+        collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
+        for collection_name, bulk_write_result in op_result.items()
+    }
 
 
 @op(required_resource_keys={"mongo"})
@@ -545,27 +556,39 @@ def nmdc_schema_database_from_gold_study(
 
 
 @op(
+    required_resource_keys={"mongo"},
     out={
         "submission_id": Out(),
         "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
         "data_object_mapping_file_url": Out(Optional[str]),
         "biosample_extras_file_url": Out(Optional[str]),
         "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
+        "study_id": Out(Optional[str]),
     },
 )
 def get_submission_portal_pipeline_inputs(
+    context: OpExecutionContext,
     submission_id: str,
     nucleotide_sequencing_mapping_file_url: Optional[str],
     data_object_mapping_file_url: Optional[str],
    biosample_extras_file_url: Optional[str],
    biosample_extras_slot_mapping_file_url: Optional[str],
-) -> Tuple[str, str | None, str | None, str | None, str | None]:
+    study_id: Optional[str],
+) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
+    # query for studies matching the ID to see if it exists
+    if study_id:
+        mdb = context.resources.mongo.db
+        result = mdb.study_set.find_one({"id": study_id})
+        if not result:
+            raise Exception(f"Study id: {study_id} does not exist in Mongo.")
+
     return (
         submission_id,
         nucleotide_sequencing_mapping_file_url,
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     )
 
 
@@ -590,6 +613,7 @@ def translate_portal_submission_to_nmdc_schema_database(
     study_pi_image_url: Optional[str],
     biosample_extras: Optional[list[dict]],
     biosample_extras_slot_mapping: Optional[list[dict]],
+    study_id: Optional[str],
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
 
@@ -607,6 +631,7 @@ def translate_portal_submission_to_nmdc_schema_database(
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         illumina_instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
     database = translator.get_database()
     return database
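Since `mongo_add_docs_result_as_dict` was removed along with the drsobjects module, the inlined dict comprehension in `_add_schema_docs_with_or_without_replacement` above now performs that translation directly. For reference, `toolz.dissoc` returns a copy of a mapping without the named keys; the sketch below (the result shape is illustrative, not an exact pymongo payload) shows the intended effect of dropping the potentially large "upserted" array:

    from toolz import dissoc

    # pymongo's BulkWriteResult.bulk_api_result carries counters plus an
    # "upserted" list of {"index": ..., "_id": ...} entries.
    bulk_api_result = {
        "nInserted": 0,
        "nUpserted": 2,
        "nMatched": 0,
        "upserted": [
            {"index": 0, "_id": "nmdc:bsm-11-aaa111"},
            {"index": 1, "_id": "nmdc:bsm-11-bbb222"},
        ],
    }
    summary = dissoc(bulk_api_result, "upserted")
    # {"nInserted": 0, "nUpserted": 2, "nMatched": 0}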
@@ -947,7 +972,9 @@ def load_ontology(context: OpExecutionContext):
 
 
 def _add_linked_instances_to_alldocs(
-    temp_collection, context, document_reference_ranged_slots_by_type
+    temp_collection: MongoCollection,
+    context: OpExecutionContext,
+    document_reference_ranged_slots_by_type: dict,
 ) -> None:
     """
     Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
@@ -983,16 +1010,13 @@ def _add_linked_instances_to_alldocs(
         # Store the full type with prefix intact
        doc_type = doc["type"]
         # For looking up reference slots, we still need the type without prefix
-        # FIXME `document_reference_ranged_slots_by_type` should key on `doc_type`
         doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
 
         # Record ID to type mapping - preserve the original type with prefix
         id_to_type_map[doc_id] = doc_type
 
         # Find all document references from this document
-        reference_slots = document_reference_ranged_slots_by_type.get(
-            doc_type_no_prefix, []
-        )
+        reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
         for slot in reference_slots:
             if slot in doc:
                 # Handle both single-value and array references
@@ -1116,7 +1140,7 @@ def _add_linked_instances_to_alldocs(
 # Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
 #
 @op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
-def materialize_alldocs(context) -> int:
+def materialize_alldocs(context: OpExecutionContext) -> int:
     """
     This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
 
@@ -1167,17 +1191,16 @@ def materialize_alldocs(context) -> int:
         )
     )
 
-    # FIXME rename to `document_reference_ranged_slots_by_type`
-    # FIXME key on CURIE, e.g. `nmdc:Study`
-    # (here, not upstream in `cls_slot_map`/`document_referenceable_ranges`, b/c `schema_view` used directly in those)
-    document_reference_ranged_slots = defaultdict(list)
+    document_reference_ranged_slots_by_type = defaultdict(list)
     for cls_name, slot_map in cls_slot_map.items():
         for slot_name, slot in slot_map.items():
             if (
                 set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
                 & document_referenceable_ranges
             ):
-                document_reference_ranged_slots[cls_name].append(slot_name)
+                document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
+                    slot_name
+                )
 
     # Build `alldocs` to a temporary collection for atomic replacement
     # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
@@ -1194,25 +1217,19 @@ def materialize_alldocs(context) -> int:
                 # Keep the full type with prefix for document
                 doc_type_full = doc["type"]
                 # Remove prefix for slot lookup and ancestor lookup
-                doc_type = (
-                    doc_type_full[5:]
-                    if doc_type_full.startswith("nmdc:")
-                    else doc_type_full
-                )
+                doc_type = doc_type_full.removeprefix("nmdc:")
            except KeyError:
                 raise Exception(
                     f"doc {doc['id']} in collection {coll_name} has no 'type'!"
                 )
-            slots_to_include = ["id", "type"] + document_reference_ranged_slots[
-                doc_type
+            slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
+                doc_type_full
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
 
-            new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
             # Get ancestors without the prefix, but add prefix to each one in the output
-            ancestors = schema_view.class_ancestors(doc_type)
             new_doc["_type_and_ancestors"] = [
-                "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
+                f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
             ]
             # InsertOne is a pymongo representation of a mongo command.
             write_operations.append(InsertOne(new_doc))
@@ -1221,7 +1238,7 @@ def materialize_alldocs(context) -> int:
             write_operations.clear()
             documents_processed_counter += BULK_WRITE_BATCH_SIZE
     if len(write_operations) > 0:
-        # here bulk_write is a method on the py-mongo db Client class
+        # here bulk_write is a method on the pymongo db Collection class
         _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
         documents_processed_counter += len(write_operations)
     context.log.info(
@@ -1238,15 +1255,18 @@ def materialize_alldocs(context) -> int:
     # so that `temp_alldocs_collection` will be "good to go" on renaming.
     temp_alldocs_collection.create_index("id", unique=True)
     # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
-    # TODO add indexes on each of `set(document_reference_ranged_slots.values())`.
-    slots_to_index = ["has_input", "has_output", "was_informed_by"]
+    slots_to_index = {"_type_and_ancestors"} | {
+        slot
+        for slots in document_reference_ranged_slots_by_type.values()
+        for slot in slots
+    }
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
-    context.log.info(f"created indexes on id, {slots_to_index}.")
+    context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
 
     # Add related-ids fields to enable efficient relationship traversal
     context.log.info("Adding fields for related ids to documents...")
     _add_linked_instances_to_alldocs(
-        temp_alldocs_collection, context, document_reference_ranged_slots
+        temp_alldocs_collection, context, document_reference_ranged_slots_by_type
     )
     context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
     temp_alldocs_collection.create_index("_upstream.id")
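With every `_type_and_ancestors` entry now a `nmdc:`-prefixed CURIE, and with indexes on that field plus every document-reference-ranged slot, subtype-aware lookups against `alldocs` can stay index-backed. A hypothetical query sketch (the class name and id are assumptions, and `mdb` stands for the pymongo database handle):

    # Find alldocs entries that are (subtypes of) nmdc:PlannedProcess
    # and that reference a given biosample as an input.
    docs = mdb.alldocs.find(
        {
            "_type_and_ancestors": "nmdc:PlannedProcess",
            "has_input": "nmdc:bsm-11-aaa111",
        }
    )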
@@ -1350,6 +1370,42 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep
 
 
+@op(required_resource_keys={"mongo"})
+def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
+    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
+
+    mdb = context.resources.mongo.db
+    material_processing_set = mdb["material_processing_set"]
+    pooled_biosamples_data = check_pooling_for_biosamples(
+        material_processing_set, biosamples
+    )
+
+    # Fetch ProcessedSample names from database
+    processed_sample_ids = set()
+    for biosample_id, pooling_info in pooled_biosamples_data.items():
+        if pooling_info and pooling_info.get("processed_sample_id"):
+            processed_sample_ids.add(pooling_info["processed_sample_id"])
+
+    # Query database for ProcessedSample names
+    if processed_sample_ids:
+        processed_sample_set = mdb["processed_sample_set"]
+        cursor = processed_sample_set.find(
+            {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
+        )
+        processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
+
+        # Update pooled_biosamples_data with ProcessedSample names
+        for biosample_id, pooling_info in pooled_biosamples_data.items():
+            if pooling_info and pooling_info.get("processed_sample_id"):
+                processed_sample_id = pooling_info["processed_sample_id"]
+                if processed_sample_id in processed_samples:
+                    pooling_info["processed_sample_name"] = processed_samples[
+                        processed_sample_id
+                    ]
+
+    return pooled_biosamples_data
+
+
 @op(required_resource_keys={"mongo"})
 def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
     mdb = context.resources.mongo.db
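After the name lookup in `get_aggregated_pooled_biosamples` above, each pooled biosample id maps to its pooling info enriched with a `processed_sample_name`, while biosamples that are not part of any Pooling process keep an empty dict. An illustrative return value (all ids and the name are invented):

    {
        "nmdc:bsm-11-aaa111": {
            "processed_sample_id": "nmdc:procsm-11-ccc333",
            "pooling_process_id": "nmdc:poolp-11-ddd444",
            "pooled_biosample_ids": ["nmdc:bsm-11-aaa111", "nmdc:bsm-11-bbb222"],
            "aggregated_collection_date": "2017-06-05T16:50Z/2017-06-05T17:47Z",
            "aggregated_depth": "0-10 m",
            "processed_sample_name": "Pooled sample ABC",
        },
        "nmdc:bsm-11-zzz999": {},  # not part of any pooling process
    }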
@@ -1383,6 +1439,7 @@ def ncbi_submission_xml_from_nmdc_study(
     data_object_records: list,
     library_preparation_records: list,
     all_instruments: dict,
+    pooled_biosamples_data: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1391,6 +1448,7 @@ def ncbi_submission_xml_from_nmdc_study(
         data_object_records,
         library_preparation_records,
         all_instruments,
+        pooled_biosamples_data,
     )
     return ncbi_xml
 
nmdc_runtime/site/repository.py CHANGED
@@ -502,6 +502,7 @@ def biosample_submission_ingest():
                         "data_object_mapping_file_url": None,
                         "biosample_extras_file_url": None,
                         "biosample_extras_slot_mapping_file_url": None,
+                        "study_id": None,
                     }
                 },
                 "translate_portal_submission_to_nmdc_schema_database": {
@@ -538,6 +539,7 @@ def biosample_submission_ingest():
                         "data_object_mapping_file_url": None,
                         "biosample_extras_file_url": None,
                         "biosample_extras_slot_mapping_file_url": None,
+                        "study_id": None,
                     }
                 },
                 "translate_portal_submission_to_nmdc_schema_database": {
nmdc_runtime/site/resources.py CHANGED
@@ -520,11 +520,24 @@ class MongoDB:
         self.db = self.client[dbname]
 
     def add_docs(self, docs, validate=True, replace=True):
+        """
+        TODO: Document this function.
+        """
         try:
             if validate:
                 nmdc_jsonschema_validator_noidpatterns(docs)
             rv = {}
-            for collection_name, docs in docs.items():
+            for collection_name, collection_docs in docs.items():
+                # If `collection_docs` is empty, abort this iteration.
+                #
+                # Note: We do this because the `bulk_write` method called below will raise
+                # an `InvalidOperation` exception if it is passed 0 operations.
+                #
+                # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
+                #
+                if len(collection_docs) == 0:
+                    continue
+
                 rv[collection_name] = self.db[collection_name].bulk_write(
                     [
                         (
@@ -532,7 +545,7 @@ class MongoDB:
                             if replace
                             else InsertOne(d)
                         )
-                        for d in docs
+                        for d in collection_docs
                     ]
                 )
                 now = datetime.now(timezone.utc)
@@ -544,7 +557,7 @@ class MongoDB:
                             "ts": now,
                             # "dtl": {},
                         }
-                        for d in docs
+                        for d in collection_docs
                     ]
                 )
             return rv
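The empty-batch guard matters because pymongo raises when asked to execute zero operations. A minimal demonstration (assumes pymongo is installed; the check happens client side, so a live server should not be strictly required to trigger the exception):

    from pymongo import MongoClient
    from pymongo.errors import InvalidOperation

    coll = MongoClient()["test"]["demo"]
    try:
        coll.bulk_write([])  # zero operations
    except InvalidOperation as exc:
        print(exc)  # e.g. "No operations to execute"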