nmdc-runtime 1.6.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/site/export/ncbi_xml.py +433 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +206 -0
- nmdc_runtime/site/export/study_metadata.py +24 -4
- nmdc_runtime/site/graphs.py +24 -12
- nmdc_runtime/site/ops.py +120 -44
- nmdc_runtime/site/repository.py +56 -6
- nmdc_runtime/site/resources.py +30 -40
- nmdc_runtime/site/translation/submission_portal_translator.py +16 -9
- nmdc_runtime/util.py +1 -1
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.7.0.dist-info}/METADATA +4 -7
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.7.0.dist-info}/RECORD +15 -17
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.7.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.7.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/site/terminusdb/__init__.py +0 -0
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.7.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.7.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import datetime
|
|
3
|
+
import xml.etree.ElementTree as ET
|
|
4
|
+
import xml.dom.minidom
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
9
|
+
handle_controlled_identified_term_value,
|
|
10
|
+
handle_controlled_term_value,
|
|
11
|
+
handle_geolocation_value,
|
|
12
|
+
handle_quantity_value,
|
|
13
|
+
handle_text_value,
|
|
14
|
+
handle_timestamp_value,
|
|
15
|
+
handle_float_value,
|
|
16
|
+
handle_string_value,
|
|
17
|
+
load_mappings,
|
|
18
|
+
validate_xml,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class NCBISubmissionXML:
    """Builder for an NCBI Submission XML document (BioProject, BioSample,
    and SRA file actions) assembled from an NMDC study and its metadata."""

    def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict):
        """
        :param nmdc_study: NMDC study document (dict-like) supplying id, title,
            description, INSDC bioproject identifiers, and PI contact info.
        :param ncbi_submission_metadata: configuration dict carrying the
            NMDC->NCBI attribute mapping file URL plus submission-level and
            biosample-level metadata sub-dicts.
        """
        self.root = ET.Element("Submission")

        self.nmdc_study_id = nmdc_study.get("id")
        self.nmdc_study_title = nmdc_study.get("title")
        self.nmdc_study_description = nmdc_study.get("description")
        self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")
        self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
        # Fix: a study without a PI name previously raised AttributeError
        # (None.split()) and an empty name raised IndexError on [0]. Guard
        # both; first/last name fall back to None as last_name already did.
        nmdc_study_pi_name = (
            nmdc_study.get("principal_investigator", {}).get("name") or ""
        ).split()
        self.first_name = nmdc_study_pi_name[0] if nmdc_study_pi_name else None
        self.last_name = nmdc_study_pi_name[1] if len(nmdc_study_pi_name) > 1 else None

        self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get(
            "nmdc_ncbi_attribute_mapping_file_url"
        )
        self.ncbi_submission_metadata = ncbi_submission_metadata.get(
            "ncbi_submission_metadata", {}
        )
        self.ncbi_biosample_metadata = ncbi_submission_metadata.get(
            "ncbi_biosample_metadata", {}
        )

        # dispatcher dictionary capturing handlers for NMDC object to NCBI
        # flat Attribute type handlers (keyed by NMDC schema slot range name)
        self.type_handlers = {
            "QuantityValue": handle_quantity_value,
            "TextValue": handle_text_value,
            "TimestampValue": handle_timestamp_value,
            "ControlledTermValue": handle_controlled_term_value,
            "ControlledIdentifiedTermValue": handle_controlled_identified_term_value,
            "GeolocationValue": handle_geolocation_value,
            "float": handle_float_value,
            "string": handle_string_value,
        }
|
60
|
+
def set_element(self, tag, text="", attrib=None, children=None):
    """Build an ``ET.Element`` with optional text, attributes, and children."""
    node = ET.Element(tag, attrib=attrib if attrib else {})
    node.text = text
    for subelement in children or []:
        node.append(subelement)
    return node
69
|
+
def set_description(self, email, user, first, last, org, date=None):
    """Append the submission <Description> block to the root: a comment naming
    the NMDC study, the submitter, the owning organization with PI contact,
    and a <Hold> release date (today when ``date`` is not given)."""
    release_date = date or datetime.datetime.now().strftime("%Y-%m-%d")

    contact_name = self.set_element(
        "Name",
        children=[
            self.set_element("First", first),
            self.set_element("Last", last),
        ],
    )
    contact = self.set_element(
        "Contact", attrib={"email": email}, children=[contact_name]
    )
    organization = self.set_element(
        "Organization",
        attrib={"role": "owner", "type": "center"},
        children=[self.set_element("Name", org), contact],
    )

    description = self.set_element(
        "Description",
        children=[
            self.set_element("Comment", f"NMDC Submission for {self.nmdc_study_id}"),
            self.set_element("Submitter", attrib={"user_name": user}),
            organization,
            self.set_element("Hold", attrib={"release_date": release_date}),
        ],
    )
    self.root.append(description)
|
103
|
+
def set_descriptor(self, title, description):
    """Return the BioProject descriptor elements: a <Title> plus a
    <Description> wrapping the text in a <p> element."""
    title_el = self.set_element("Title", title)
    description_el = self.set_element(
        "Description", children=[self.set_element("p", description)]
    )
    return [title_el, description_el]
114
|
+
def set_bioproject(self, title, project_id, description, data_type, org):
    """Append an <Action> that registers a new BioProject carrying the study
    title, description, and intended data type, identified by SPUID."""
    spuid = self.set_element("SPUID", project_id, {"spuid_namespace": org})
    project_id_element = self.set_element("ProjectID", children=[spuid])

    # "sample_scope" is an enumeration field. Docs:
    # https://www.ncbi.nlm.nih.gov/data_specs/schema/other/bioproject/Core.xsd
    # Scope is "eEnvironment" when "Content of species in a sample is not
    # known, i.e. microbiome, metagenome, etc."
    project_type = self.set_element(
        "ProjectType",
        children=[
            self.set_element(
                "ProjectTypeSubmission",
                attrib={"sample_scope": "eEnvironment"},
                children=[
                    self.set_element(
                        "IntendedDataTypeSet",
                        children=[self.set_element("DataType", data_type)],
                    )
                ],
            )
        ],
    )

    project = self.set_element("Project", attrib={"schema_version": "2.0"})
    project.extend(
        [project_id_element] + self.set_descriptor(title, description) + [project_type]
    )

    add_data = self.set_element(
        "AddData",
        attrib={"target_db": "BioProject"},
        children=[
            self.set_element(
                "Data",
                attrib={"content_type": "XML"},
                children=[self.set_element("XmlContent", children=[project])],
            ),
            self.set_element(
                "Identifier",
                children=[
                    self.set_element("SPUID", project_id, {"spuid_namespace": org})
                ],
            ),
        ],
    )

    self.root.append(self.set_element("Action", children=[add_data]))
+
|
|
156
|
+
def set_biosample(
    self,
    organism_name,
    org,
    bioproject_id,
    nmdc_biosamples,
    nmdc_omics_processing,
):
    """Append one BioSample <Action> per NMDC biosample.

    Scalar slots of each biosample are flattened into NCBI <Attribute>
    elements using the attribute-name and slot-range mappings downloaded
    from ``self.nmdc_ncbi_attribute_mapping_file_url``.

    NOTE(review): ``nmdc_omics_processing`` is accepted but never used in
    this method — confirm whether it can be dropped from the signature.
    """
    attribute_mappings, slot_range_mappings = load_mappings(
        self.nmdc_ncbi_attribute_mapping_file_url
    )

    for biosample in nmdc_biosamples:
        attributes = {}
        sample_id_value = None
        env_package = None

        for json_key, value in biosample.items():
            if isinstance(value, list):
                continue  # Skip processing for list values

            if json_key == "env_package":
                # MIMS environmental package label, e.g. "MIMS.me.soil.6.0"
                env_package = f"MIMS.me.{handle_text_value(value)}.6.0"

            # Special handling for NMDC Biosample "id"
            if json_key == "id":
                sample_id_value = value
                continue

            # Slots without a mapping row are not exported.
            if json_key not in attribute_mappings:
                continue

            # Translate NMDC slot name -> NCBI attribute name, and pick the
            # formatter matching the slot's schema range (string fallback).
            xml_key = attribute_mappings[json_key]
            value_type = slot_range_mappings.get(json_key, "string")
            handler = self.type_handlers.get(value_type, handle_string_value)

            formatted_value = handler(value)
            attributes[xml_key] = formatted_value

        # Child elements of the <BioSample> record, in schema order.
        biosample_elements = [
            self.set_element(
                "SampleId",
                children=[
                    self.set_element(
                        "SPUID", sample_id_value, {"spuid_namespace": org}
                    )
                ],
            ),
            self.set_element(
                "Descriptor",
                children=[
                    self.set_element(
                        "Title",
                        f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study",
                    ),
                ],
            ),
            self.set_element(
                "Organism",
                children=[self.set_element("OrganismName", organism_name)],
            ),
            self.set_element(
                "BioProject",
                children=[
                    self.set_element(
                        "PrimaryId", bioproject_id, {"db": "BioProject"}
                    )
                ],
            ),
            self.set_element("Package", env_package),
            self.set_element(
                "Attributes",
                children=[
                    self.set_element(
                        "Attribute", attributes[key], {"attribute_name": key}
                    )
                    for key in sorted(attributes)
                ],
            ),
        ]

        # Wrap the record in Action > AddData > Data > XmlContent, plus an
        # <Identifier> repeating the sample SPUID.
        action = self.set_element(
            "Action",
            children=[
                self.set_element(
                    "AddData",
                    attrib={"target_db": "BioSample"},
                    children=[
                        self.set_element(
                            "Data",
                            attrib={"content_type": "XML"},
                            children=[
                                self.set_element(
                                    "XmlContent",
                                    children=[
                                        self.set_element(
                                            "BioSample",
                                            attrib={"schema_version": "2.0"},
                                            children=biosample_elements,
                                        ),
                                    ],
                                ),
                            ],
                        ),
                        self.set_element(
                            "Identifier",
                            children=[
                                self.set_element(
                                    "SPUID",
                                    sample_id_value,
                                    {"spuid_namespace": org},
                                ),
                            ],
                        ),
                    ],
                ),
            ],
        )
        self.root.append(action)
+
|
|
276
|
+
def set_fastq(
    self,
    biosample_data_objects: list,
    bioproject_id: str,
    org: str,
):
    """Append one SRA <AddFiles> action per {biosample_id: [data_object, ...]}
    entry, listing each data object's file path and referencing the owning
    BioProject and BioSample(s) by SPUID."""
    for entry in biosample_data_objects:
        fastq_files = []
        biosample_ids = []

        for biosample_id, data_objects in entry.items():
            biosample_ids.append(biosample_id)
            for data_object in data_objects:
                if "url" in data_object:
                    url = urlparse(data_object["url"])
                    # Keep only "<parent_dir>/<filename>" from the URL path,
                    # e.g. ".../runs/abc/x.fastq.gz" -> "abc/x.fastq.gz"
                    file_path = os.path.join(
                        os.path.basename(os.path.dirname(url.path)),
                        os.path.basename(url.path),
                    )
                    fastq_files.append(file_path)

        # Entries with no file URLs produce no action.
        if fastq_files:
            files_elements = [
                self.set_element(
                    "File",
                    "",
                    {"file_path": f},
                    [self.set_element("DataType", "generic-data")],
                )
                for f in fastq_files
            ]

            # BioProject reference first, then one BioSample reference per id.
            attribute_elements = [
                self.set_element(
                    "AttributeRefId",
                    attrib={"name": "BioProject"},
                    children=[
                        self.set_element(
                            "RefId",
                            children=[
                                self.set_element(
                                    "SPUID",
                                    bioproject_id,
                                    {"spuid_namespace": org},
                                )
                            ],
                        )
                    ],
                )
            ]

            for biosample_id in biosample_ids:
                attribute_elements.append(
                    self.set_element(
                        "AttributeRefId",
                        attrib={"name": "BioSample"},
                        children=[
                            self.set_element(
                                "RefId",
                                children=[
                                    self.set_element(
                                        "SPUID",
                                        biosample_id,
                                        {"spuid_namespace": org},
                                    )
                                ],
                            )
                        ],
                    )
                )

            identifier_element = self.set_element(
                "Identifier",
                children=[
                    self.set_element(
                        "SPUID", bioproject_id, {"spuid_namespace": org}
                    )
                ],
            )

            action = self.set_element(
                "Action",
                children=[
                    self.set_element(
                        "AddFiles",
                        attrib={"target_db": "SRA"},
                        children=files_elements
                        + attribute_elements
                        + [identifier_element],
                    ),
                ],
            )

            self.root.append(action)
+
|
|
371
|
+
def get_submission_xml(
    self,
    biosamples_list: list,
    biosample_omics_processing_list: list,
    biosample_data_objects_list: list,
):
    """Assemble the full NCBI submission XML and return it as a
    pretty-printed string.

    Builds, in order: the <Description> block, an optional BioProject
    registration, one BioSample action per biosample, and the SRA
    file-upload actions.
    """
    data_type = None
    ncbi_project_id = None
    # Scan omics-processing records for the omics type (used as the
    # BioProject data type) and any pre-existing NCBI project name;
    # the last value seen wins.
    for bsm_omprc in biosample_omics_processing_list:
        for _, omprc_list in bsm_omprc.items():
            for omprc in omprc_list:
                if "omics_type" in omprc:
                    data_type = handle_text_value(omprc["omics_type"]).capitalize()

                if "ncbi_project_name" in omprc:
                    ncbi_project_id = omprc["ncbi_project_name"]

    self.set_description(
        email=self.nmdc_pi_email,
        user="National Microbiome Data Collaborative (NMDC)",
        first=self.first_name,
        last=self.last_name,
        org=self.ncbi_submission_metadata.get("organization", ""),
    )

    # Only register a new BioProject when none was named by an
    # omics-processing record.
    # NOTE(review): inside this branch ncbi_project_id is falsy (typically
    # None), so the BioProject is registered with project_id=None — confirm
    # whether a new SPUID should be minted here instead.
    if not ncbi_project_id:
        self.set_bioproject(
            title=self.nmdc_study_title,
            project_id=ncbi_project_id,
            description=self.nmdc_study_description,
            data_type=data_type,
            org=self.ncbi_submission_metadata.get("organization", ""),
        )

    self.set_biosample(
        organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
        org=self.ncbi_submission_metadata.get("organization", ""),
        bioproject_id=ncbi_project_id,
        nmdc_biosamples=biosamples_list,
        nmdc_omics_processing=biosample_omics_processing_list,
    )

    self.set_fastq(
        biosample_data_objects=biosample_data_objects_list,
        bioproject_id=ncbi_project_id,
        org=self.ncbi_submission_metadata.get("organization", ""),
    )

    rough_string = ET.tostring(self.root, "unicode")
    reparsed = xml.dom.minidom.parseString(rough_string)
    submission_xml = reparsed.toprettyxml(indent="    ", newl="\n")

    # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ #
    # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co"
    # validate_xml(submission_xml, submission_xsd_url)

    # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/bioproject/bioproject.xsd?view=co"
    # validate_xml(submission_xml, bioproject_xsd_url)

    # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co"
    # validate_xml(submission_xml, biosample_xsd_url)

    return submission_xml
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
from io import BytesIO, StringIO
|
|
2
|
+
from nmdc_runtime.minter.config import typecodes
|
|
3
|
+
from lxml import etree
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _build_class_map(class_map_data):
|
|
10
|
+
return {
|
|
11
|
+
entry["name"]: entry["schema_class"].split(":")[1] for entry in class_map_data
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_classname_from_typecode(doc_id):
    """Resolve the schema class name for an NMDC identifier such as
    "nmdc:bsm-11-abc123" by looking up its typecode ("bsm"); returns None
    for unknown typecodes."""
    class_map = _build_class_map(typecodes())
    local_part = doc_id.split(":")[1]
    typecode = local_part.split("-")[0]
    return class_map.get(typecode)
|
22
|
+
|
|
23
|
+
def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
    """Walk the has_input/has_output chain downstream from each biosample and
    collect every reachable DataObject document.

    Returns a list of ``{biosample_id: [data_object_doc, ...]}`` dicts, one
    entry per biosample with at least one reachable DataObject.
    """
    results = []

    for biosample in biosamples_list:
        frontier = [biosample["id"]]
        collected = []

        # Breadth-first traversal: non-DataObject outputs become the next
        # frontier of ids to follow.
        while frontier:
            next_frontier = []
            for node_id in frontier:
                process_doc = all_docs_collection.find_one({"has_input": node_id})
                if not process_doc:
                    continue

                for output_id in process_doc.get("has_output") or []:
                    if get_classname_from_typecode(output_id) == "DataObject":
                        data_object_doc = all_docs_collection.find_one(
                            {"id": output_id}
                        )
                        if data_object_doc:
                            collected.append(data_object_doc)
                    else:
                        next_frontier.append(output_id)

            frontier = next_frontier

        if collected:
            results.append({biosample["id"]: collected})

    return results
|
60
|
+
|
|
61
|
+
def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list):
    """Walk the has_input/has_output chain downstream from each biosample and
    collect the processing documents whose outputs are DataObjects.

    Returns a list of ``{biosample_id: [processing_doc, ...]}`` dicts, one
    entry per biosample with at least one such processing document.
    """
    biosample_data_objects = []

    for biosample in biosamples_list:
        current_ids = [biosample["id"]]
        collected_data_objects = []

        # Breadth-first traversal: non-DataObject outputs become the next
        # set of ids to follow.
        while current_ids:
            new_current_ids = []
            for current_id in current_ids:
                query = {"has_input": current_id}
                document = all_docs_collection.find_one(query)

                if not document:
                    continue

                has_output = document.get("has_output")
                if not has_output:
                    continue

                for output_id in has_output:
                    if get_classname_from_typecode(output_id) == "DataObject":
                        # NOTE(review): this re-fetches `document` by its own
                        # id (apparently redundant) and appends the same
                        # processing doc once per DataObject output — confirm
                        # whether duplicates are intended.
                        omics_processing_doc = all_docs_collection.find_one(
                            {"id": document["id"]}
                        )
                        if omics_processing_doc:
                            collected_data_objects.append(omics_processing_doc)
                    else:
                        new_current_ids.append(output_id)

            current_ids = new_current_ids

        if collected_data_objects:
            biosample_data_objects.append({biosample["id"]: collected_data_objects})

    return biosample_data_objects
|
|
98
|
+
|
|
99
|
+
def handle_quantity_value(slot_value):
    """Render an NMDC QuantityValue as a flat string.

    Preference order: "<numeric> <unit>", then "<max - min> <unit>" for
    min/max ranges, then the raw value, else "Unknown format".
    """
    if "has_numeric_value" in slot_value and "has_unit" in slot_value:
        return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}"

    range_keys = (
        "has_maximum_numeric_value",
        "has_minimum_numeric_value",
        "has_unit",
    )
    if all(key in slot_value for key in range_keys):
        spread = (
            slot_value["has_maximum_numeric_value"]
            - slot_value["has_minimum_numeric_value"]
        )
        return f"{spread} {slot_value['has_unit']}"

    if "has_raw_value" in slot_value:
        return slot_value["has_raw_value"]

    return "Unknown format"
+
|
|
116
|
+
|
|
117
|
+
def handle_text_value(slot_value):
    """Return the TextValue's raw string, or "Unknown format" if absent."""
    try:
        return slot_value["has_raw_value"]
    except KeyError:
        return "Unknown format"
+
|
|
120
|
+
|
|
121
|
+
def handle_timestamp_value(slot_value):
    """Return the TimestampValue's raw string, or "Unknown format" if absent."""
    return (
        slot_value["has_raw_value"]
        if "has_raw_value" in slot_value
        else "Unknown format"
    )
|
|
124
|
+
|
|
125
|
+
def handle_controlled_term_value(slot_value):
    """Format an NMDC ControlledTermValue as "name [id]", falling back to the
    id alone, the name alone, or the raw value — else "Unknown format"."""
    if "term" not in slot_value:
        if "has_raw_value" in slot_value:
            return slot_value["has_raw_value"]
        return "Unknown format"

    term = slot_value["term"]
    if "name" in term and "id" in term:
        return f"{term['name']} [{term['id']}]"
    if "id" in term:
        return term["id"]
    if "name" in term:
        return term["name"]
    return "Unknown format"
+
|
|
138
|
+
|
|
139
|
+
def handle_controlled_identified_term_value(slot_value):
    """Format an NMDC ControlledIdentifiedTermValue as "name [id]" (or just
    the id when no name); without a term, fall back to the raw value, else
    "Unknown format"."""
    if "term" not in slot_value:
        if "has_raw_value" in slot_value:
            return slot_value["has_raw_value"]
        return "Unknown format"

    term = slot_value["term"]
    if "id" not in term:
        return "Unknown format"
    if "name" in term:
        return f"{term['name']} [{term['id']}]"
    return term["id"]
+
|
|
150
|
+
|
|
151
|
+
def handle_geolocation_value(slot_value):
    """Render an NMDC GeolocationValue as "<latitude> <longitude>", falling
    back to the raw value, then "Unknown format"."""
    if "latitude" in slot_value and "longitude" in slot_value:
        lat = slot_value["latitude"]
        lon = slot_value["longitude"]
        return f"{lat} {lon}"
    if "has_raw_value" in slot_value:
        return slot_value["has_raw_value"]
    return "Unknown format"
+
|
|
158
|
+
|
|
159
|
+
def handle_float_value(slot_value):
    """Format a numeric value with exactly two decimal places."""
    return format(slot_value, ".2f")
|
|
162
|
+
|
|
163
|
+
def handle_string_value(slot_value):
    """Coerce any value to its default-format string representation."""
    return "{}".format(slot_value)
+
|
|
166
|
+
|
|
167
|
+
def load_mappings(url):
    """Download the NMDC->NCBI attribute mapping TSV from ``url`` and return
    two dicts: slot -> NCBI attribute name (slot name when blank), and
    slot -> schema slot range ("default" when blank)."""
    response = requests.get(url)
    response.raise_for_status()

    attribute_mappings = {}
    slot_range_mappings = {}
    for row in csv.DictReader(StringIO(response.text), delimiter="\t"):
        # Rows explicitly marked in the "ignore" column are skipped.
        if row["ignore"].strip():
            continue

        slot = row["nmdc_schema_slot"]
        # attribute mappings
        attribute_mappings[slot] = row["ncbi_biosample_attribute_name"] or slot
        # slot range mappings
        slot_range_mappings[slot] = row["nmdc_schema_slot_range"] or "default"

    return attribute_mappings, slot_range_mappings
+
|
|
192
|
+
|
|
193
|
+
def validate_xml(xml, xsd_url):
    """Validate an XML string against the XSD fetched from ``xsd_url``.

    Raises ValueError when validation fails; returns True on success.
    """
    response = requests.get(xsd_url)
    response.raise_for_status()

    schema_doc = etree.parse(BytesIO(response.text.encode("utf-8")))
    schema = etree.XMLSchema(schema_doc)

    document = etree.parse(BytesIO(xml.encode("utf-8")))

    if not schema.validate(document):
        raise ValueError(f"There were errors while validating against: {xsd_url}")

    return True
|
|
@@ -5,7 +5,6 @@ Get NMDC study-associated metadata from search api
|
|
|
5
5
|
import csv
|
|
6
6
|
from io import StringIO
|
|
7
7
|
|
|
8
|
-
import requests
|
|
9
8
|
from dagster import (
|
|
10
9
|
op,
|
|
11
10
|
get_dagster_logger,
|
|
@@ -26,13 +25,27 @@ def get_all_docs(client, collection, filter_):
|
|
|
26
25
|
per_page = 200
|
|
27
26
|
url_base = f"/{collection}?filter={filter_}&per_page={per_page}"
|
|
28
27
|
results = []
|
|
29
|
-
|
|
28
|
+
response = client.request("GET", url_base)
|
|
29
|
+
if response.status_code != 200:
|
|
30
|
+
raise Exception(
|
|
31
|
+
f"Runtime API request failed with status {response.status_code}."
|
|
32
|
+
f" Check URL: {url_base}"
|
|
33
|
+
)
|
|
34
|
+
rv = response.json()
|
|
30
35
|
results.extend(rv.get("results", []))
|
|
31
36
|
page, count = rv["meta"]["page"], rv["meta"]["count"]
|
|
32
37
|
assert count <= 10_000
|
|
33
38
|
while page * per_page < count:
|
|
34
|
-
|
|
35
|
-
|
|
39
|
+
page += 1
|
|
40
|
+
url = f"{url_base}&page={page}"
|
|
41
|
+
response = client.request("GET", url)
|
|
42
|
+
if response.status_code != 200:
|
|
43
|
+
raise Exception(
|
|
44
|
+
f"Runtime API request failed with status {response.status_code}."
|
|
45
|
+
f" Check URL: {url}"
|
|
46
|
+
)
|
|
47
|
+
rv = response.json()
|
|
48
|
+
results.extend(rv.get("results", []))
|
|
36
49
|
return results
|
|
37
50
|
|
|
38
51
|
|
|
@@ -115,3 +128,10 @@ def export_study_biosamples_as_csv(context: OpExecutionContext, study_export_inf
|
|
|
115
128
|
def export_study_biosamples_metadata():
|
|
116
129
|
outputs = export_study_biosamples_as_csv(get_study_biosamples_metadata())
|
|
117
130
|
add_output_run_event(outputs)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@op(required_resource_keys={"runtime_api_site_client"})
def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict):
    """Fetch every biosample document that is part_of the given NMDC study."""
    api_client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
    return get_all_docs(api_client, "biosamples", f"part_of:{nmdc_study['id']}")
|