PyPI - nmdc-runtime - Versions diffs - 1.7.0__py3-none-any.whl → 1.8.0__py3-none-any.whl - Mend

nmdc-runtime 1.7.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (11) hide show

nmdc_runtime/site/export/ncbi_xml.py CHANGED Viewed

@@ -159,7 +159,6 @@ class NCBISubmissionXML:
         org,
         bioproject_id,
         nmdc_biosamples,
-        nmdc_omics_processing,
     ):
         attribute_mappings, slot_range_mappings = load_mappings(
             self.nmdc_ncbi_attribute_mapping_file_url
@@ -278,22 +277,41 @@ class NCBISubmissionXML:
         biosample_data_objects: list,
         bioproject_id: str,
         org: str,
+        nmdc_omics_processing: list,
+        nmdc_biosamples: list,
     ):
+        bsm_id_name_dict = {
+            biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
+        }
         for entry in biosample_data_objects:
             fastq_files = []
             biosample_ids = []
+            omics_processing_ids = {}
+            instrument_name = ""
+            omics_type = ""
+            library_name = ""
             for biosample_id, data_objects in entry.items():
                 biosample_ids.append(biosample_id)
                 for data_object in data_objects:
                     if "url" in data_object:
                         url = urlparse(data_object["url"])
-                        file_path = os.path.join(
-                            os.path.basename(os.path.dirname(url.path)),
-                            os.path.basename(url.path),
-                        )
+                        file_path = os.path.basename(url.path)
                         fastq_files.append(file_path)
+                for omprc_dict in nmdc_omics_processing:
+                    if biosample_id in omprc_dict:
+                        for omprc in omprc_dict[biosample_id]:
+                            omics_processing_ids[biosample_id] = omprc.get("id", "")
+                            instrument_name = omprc.get("instrument_name", "")
+                            omics_type = (
+                                omprc.get("omics_type", {})
+                                .get("has_raw_value", "")
+                                .lower()
+                            )
+                            library_name = bsm_id_name_dict.get(biosample_id, "")
             if fastq_files:
                 files_elements = [
                     self.set_element(
@@ -344,29 +362,106 @@ class NCBISubmissionXML:
                         )
                     )
-                identifier_element = self.set_element(
-                    "Identifier",
-                    children=[
+                sra_attributes = []
+                if instrument_name.lower().startswith("illumina"):
+                    sra_attributes.append(
+                        self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
+                    )
+                    if "nextseq550" in instrument_name.lower():
+                        sra_attributes.append(
+                            self.set_element(
+                                "Attribute", "NextSeq 550", {"name": "instrument_model"}
+                            )
+                        )
+                if omics_type == "metagenome":
+                    sra_attributes.append(
                         self.set_element(
-                            "SPUID", bioproject_id, {"spuid_namespace": org}
+                            "Attribute", "WGS", {"name": "library_strategy"}
                         )
-                    ],
-                )
+                    )
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", "METAGENOMIC", {"name": "library_source"}
+                        )
+                    )
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", "RANDOM", {"name": "library_selection"}
+                        )
+                    )
-                action = self.set_element(
-                    "Action",
-                    children=[
+                if omics_type == "metatranscriptome":
+                    sra_attributes.append(
                         self.set_element(
-                            "AddFiles",
-                            attrib={"target_db": "SRA"},
-                            children=files_elements
-                            + attribute_elements
-                            + [identifier_element],
-                        ),
-                    ],
+                            "Attribute",
+                            "METATRANSCRIPTOMIC",
+                            {"name": "library_source"},
+                        )
+                    )
+                has_paired_reads = any(
+                    data_object.get("data_object_type", "").lower()
+                    == "metagenome raw reads"
+                    for data_object in data_objects
+                ) or (
+                    any(
+                        data_object.get("data_object_type", "").lower()
+                        == "metagenome raw read 1"
+                        for data_object in data_objects
+                    )
+                    and any(
+                        data_object.get("data_object_type", "").lower()
+                        == "metagenome raw read 2"
+                        for data_object in data_objects
+                    )
                 )
-                self.root.append(action)
+                if has_paired_reads:
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", "paired", {"name": "library_layout"}
+                        )
+                    )
+                else:
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", "single", {"name": "library_layout"}
+                        )
+                    )
+                if library_name:
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", library_name, {"name": "library_name"}
+                        )
+                    )
+                for biosample_id, omics_processing_id in omics_processing_ids.items():
+                    identifier_element = self.set_element(
+                        "Identifier",
+                        children=[
+                            self.set_element(
+                                "SPUID", omics_processing_id, {"spuid_namespace": org}
+                            )
+                        ],
+                    )
+                    action = self.set_element(
+                        "Action",
+                        children=[
+                            self.set_element(
+                                "AddFiles",
+                                attrib={"target_db": "SRA"},
+                                children=files_elements
+                                + attribute_elements
+                                + sra_attributes
+                                + [identifier_element],
+                            ),
+                        ],
+                    )
+                    self.root.append(action)
     def get_submission_xml(
         self,
@@ -407,13 +502,14 @@ class NCBISubmissionXML:
             org=self.ncbi_submission_metadata.get("organization", ""),
             bioproject_id=ncbi_project_id,
             nmdc_biosamples=biosamples_list,
-            nmdc_omics_processing=biosample_omics_processing_list,
         )
         self.set_fastq(
             biosample_data_objects=biosample_data_objects_list,
             bioproject_id=ncbi_project_id,
             org=self.ncbi_submission_metadata.get("organization", ""),
+            nmdc_omics_processing=biosample_omics_processing_list,
+            nmdc_biosamples=biosamples_list,
         )
         rough_string = ET.tostring(self.root, "unicode")

nmdc_runtime/site/graphs.py CHANGED Viewed

@@ -48,6 +48,7 @@ from nmdc_runtime.site.ops import (
     get_neon_pipeline_inputs,
     get_df_from_url,
     site_code_mapping,
+    materialize_alldocs,
     get_ncbi_export_pipeline_study,
     get_data_objects_from_biosamples,
     get_omics_processing_from_biosamples,
@@ -98,6 +99,11 @@ def housekeeping():
     delete_operations(list_operations(filter_ops_undone_expired()))
+@graph
+def ensure_alldocs():
+    materialize_alldocs()
 @graph
 def ensure_jobs():
     jobs = construct_jobs()

nmdc_runtime/site/ops.py CHANGED Viewed

@@ -13,6 +13,7 @@ from zipfile import ZipFile
 import pandas as pd
 import requests
 from bson import ObjectId, json_util
 from dagster import (
     Any,
@@ -92,8 +93,12 @@ from nmdc_runtime.util import (
     put_object,
     validate_json,
     specialize_activity_set_docs,
+    collection_name_to_class_names,
+    class_hierarchy_as_list,
+    populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
+from nmdc_schema.nmdc import Database as NMDCDatabase
 from pydantic import BaseModel
 from pymongo.database import Database as MongoDatabase
 from starlette import status
@@ -973,6 +978,61 @@ def site_code_mapping() -> dict:
         )
+@op(required_resource_keys={"mongo"})
+def materialize_alldocs(context) -> int:
+    mdb = context.resources.mongo.db
+    collection_names = populated_schema_collection_names_with_id_field(mdb)
+    for name in collection_names:
+        assert (
+            len(collection_name_to_class_names[name]) == 1
+        ), f"{name} collection has class name of {collection_name_to_class_names[name]} and len {len(collection_name_to_class_names[name])}"
+    context.log.info(f"{collection_names=}")
+    # Drop any existing `alldocs` collection (e.g. from previous use of this op).
+    mdb.alldocs.drop()
+    # Build alldocs
+    context.log.info("constructing `alldocs` collection")
+    for collection in collection_names:
+        # Calculate class_hierarchy_as_list once per collection, using the first document in list
+        try:
+            nmdcdb = NMDCDatabase(
+                **{collection: [dissoc(mdb[collection].find_one(), "_id")]}
+            )
+            exemplar = getattr(nmdcdb, collection)[0]
+            newdoc_type: list[str] = class_hierarchy_as_list(exemplar)
+        except ValueError as e:
+            context.log.info(f"Collection {collection} does not exist.")
+            raise e
+        context.log.info(
+            f"Found {mdb[collection].estimated_document_count()} estimated documents for {collection=}."
+        )
+        # For each document in this collection, replace the value of the `type` field with
+        # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
+        # and insert the resulting document into the `alldocs` collection.
+        inserted_many_result = mdb.alldocs.insert_many(
+            [
+                assoc(dissoc(doc, "type", "_id"), "type", newdoc_type)
+                for doc in mdb[collection].find()
+            ]
+        )
+        context.log.info(
+            f"Inserted {len(inserted_many_result.inserted_ids)} documents for {collection=}."
+        )
+    # Re-idx for `alldocs` collection
+    mdb.alldocs.create_index("id", unique=True)
+    context.log.info(
+        f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
+    )
+    return mdb.alldocs.estimated_document_count()
 @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
 def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
     nmdc_study = find_study_by_id(

nmdc_runtime/site/repository.py CHANGED Viewed

@@ -42,6 +42,7 @@ from nmdc_runtime.site.graphs import (
     ingest_neon_soil_metadata,
     ingest_neon_benthic_metadata,
     ingest_neon_surface_water_metadata,
+    ensure_alldocs,
     nmdc_study_to_ncbi_submission_export,
 )
 from nmdc_runtime.site.resources import (
@@ -450,6 +451,7 @@ def repo():
         ensure_jobs.to_job(**preset_normal),
         apply_metadata_in.to_job(**preset_normal),
         export_study_biosamples_metadata.to_job(**preset_normal),
+        ensure_alldocs.to_job(**preset_normal),
     ]
     schedules = [housekeeping_weekly]
     sensors = [

nmdc_runtime/util.py CHANGED Viewed

@@ -376,6 +376,24 @@ collection_name_to_class_names: Dict[str, List[str]] = {
 }
+def class_hierarchy_as_list(obj) -> list[str]:
+    """
+    get list of inherited classes for each concrete class
+    """
+    rv = []
+    current_class = obj.__class__
+    def recurse_through_bases(cls):
+        if cls.__name__ == "YAMLRoot":
+            return rv
+        rv.append(cls.__name__)
+        for base in cls.__bases__:
+            recurse_through_bases(base)
+        return rv
+    return recurse_through_bases(current_class)
 @lru_cache
 def schema_collection_names_with_id_field() -> Set[str]:
     """
@@ -393,6 +411,11 @@ def schema_collection_names_with_id_field() -> Set[str]:
     return target_collection_names
+def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[str]:
+    collection_names = sorted(schema_collection_names_with_id_field())
+    return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]
 def ensure_unique_id_indexes(mdb: MongoDatabase):
     """Ensure that any collections with an "id" field have an index on "id"."""
     candidate_names = (

{nmdc_runtime-1.7.0.dist-info → nmdc_runtime-1.8.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nmdc_runtime
-Version: 1.7.0
+Version: 1.8.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston

{nmdc_runtime-1.7.0.dist-info → nmdc_runtime-1.8.0.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
 nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/util.py,sha256=3mHVEUdMOv73XgT6NTuzMuMCL5Gs6NJ4Mk0bkgQQaQU,19844
+nmdc_runtime/util.py,sha256=nfj1MjZzVaxs9pKrHo6A98yGAzL-jHQ0zTGs_sOkBnM,20531
 nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -35,9 +35,9 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
 nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/graphs.py,sha256=_vCyQnICis4OQGH91i1ZwpvHYcXOG6Nfg04f5DVdy2M,12040
-nmdc_runtime/site/ops.py,sha256=G6X3YgSmDNxOnsMEByLUMfB0peY4o21o0_Ig3V7v6M4,35835
-nmdc_runtime/site/repository.py,sha256=-dOk9BEnLSrmAN6bZoIu_WnFSqriIpO0c5P76PuHW1M,37472
+nmdc_runtime/site/graphs.py,sha256=QdmNvdtDLCgpJyKviLUj-IIF1gPS_vYzl1Kzv2mSF4g,12122
+nmdc_runtime/site/ops.py,sha256=btdgcGBwNOFnVCzAa-vO4Gs1lMxgnjcRFd8B28X0who,38222
+nmdc_runtime/site/repository.py,sha256=xTHAfokzbZVqlRFG65VuHxTfZfhyKZskOaCSGyrW_hw,37540
 nmdc_runtime/site/resources.py,sha256=ZSH1yvA-li0R7Abc22_v0XLbjBYf5igETr2G01J3hnc,17557
 nmdc_runtime/site/util.py,sha256=6hyVPpb6ZkWEG8Nm7uQxnZ-QmuPOG9hgWvl0mUBr5JU,1303
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,7 +50,7 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
 nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
 nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/export/ncbi_xml.py,sha256=Z2qsaGIBvY2OdOkf8kJEZl1T_8R_YzhAlXxJ1gMQwnk,16946
+nmdc_runtime/site/export/ncbi_xml.py,sha256=-GflgZO_Q4Y2rm53QIkI7vYY6pWwCf_l7tolGgTXiBg,21026
 nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=CqrtjwzmUbZXEW8aD-KpnCV_PlXVH-Gqp309nw3vbeo,6464
 nmdc_runtime/site/export/study_metadata.py,sha256=WRU0F1ksWfNX3k9LD91Pn2DuLA-IOpGvYPJd6DnguEs,4819
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,9 +72,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-1.7.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-1.7.0.dist-info/METADATA,sha256=FnoXHNgR6o5PEe6XhqRGdqOjbIX_ry-SKY5uMtZJQXY,7302
-nmdc_runtime-1.7.0.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
-nmdc_runtime-1.7.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
-nmdc_runtime-1.7.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-1.7.0.dist-info/RECORD,,
+nmdc_runtime-1.8.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-1.8.0.dist-info/METADATA,sha256=lBQzzEEXtwobBObmYmDogAdFKQMLvSJn3wmjG8lHQ5I,7302
+nmdc_runtime-1.8.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+nmdc_runtime-1.8.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-1.8.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-1.8.0.dist-info/RECORD,,

{nmdc_runtime-1.7.0.dist-info → nmdc_runtime-1.8.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (70.1.1)
+Generator: setuptools (72.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nmdc_runtime-1.7.0.dist-info → nmdc_runtime-1.8.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{nmdc_runtime-1.7.0.dist-info → nmdc_runtime-1.8.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nmdc_runtime-1.7.0.dist-info → nmdc_runtime-1.8.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

nmdc-runtime 1.7.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 1.7.0__py3-none-any.whl → 1.8.0__py3-none-any.whl