PyPI - nmdc-runtime - Versions diffs - 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl - Mend

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (143) hide show

nmdc_runtime/Dockerfile +177 -0
nmdc_runtime/api/analytics.py +90 -0
nmdc_runtime/api/boot/capabilities.py +9 -0
nmdc_runtime/api/boot/object_types.py +126 -0
nmdc_runtime/api/boot/triggers.py +84 -0
nmdc_runtime/api/boot/workflows.py +116 -0
nmdc_runtime/api/core/auth.py +212 -0
nmdc_runtime/api/core/idgen.py +200 -0
nmdc_runtime/api/core/metadata.py +777 -0
nmdc_runtime/api/core/util.py +114 -0
nmdc_runtime/api/db/mongo.py +436 -0
nmdc_runtime/api/db/s3.py +37 -0
nmdc_runtime/api/endpoints/capabilities.py +25 -0
nmdc_runtime/api/endpoints/find.py +634 -0
nmdc_runtime/api/endpoints/jobs.py +206 -0
nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
nmdc_runtime/api/endpoints/metadata.py +260 -0
nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
nmdc_runtime/api/endpoints/object_types.py +38 -0
nmdc_runtime/api/endpoints/objects.py +277 -0
nmdc_runtime/api/endpoints/operations.py +78 -0
nmdc_runtime/api/endpoints/queries.py +701 -0
nmdc_runtime/api/endpoints/runs.py +98 -0
nmdc_runtime/api/endpoints/search.py +38 -0
nmdc_runtime/api/endpoints/sites.py +205 -0
nmdc_runtime/api/endpoints/triggers.py +25 -0
nmdc_runtime/api/endpoints/users.py +214 -0
nmdc_runtime/api/endpoints/util.py +817 -0
nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
nmdc_runtime/api/endpoints/workflows.py +353 -0
nmdc_runtime/api/entrypoint.sh +7 -0
nmdc_runtime/api/main.py +495 -0
nmdc_runtime/api/middleware.py +43 -0
nmdc_runtime/api/models/capability.py +14 -0
nmdc_runtime/api/models/id.py +92 -0
nmdc_runtime/api/models/job.py +57 -0
nmdc_runtime/api/models/lib/helpers.py +78 -0
nmdc_runtime/api/models/metadata.py +11 -0
nmdc_runtime/api/models/nmdc_schema.py +146 -0
nmdc_runtime/api/models/object.py +180 -0
nmdc_runtime/api/models/object_type.py +20 -0
nmdc_runtime/api/models/operation.py +66 -0
nmdc_runtime/api/models/query.py +246 -0
nmdc_runtime/api/models/query_continuation.py +111 -0
nmdc_runtime/api/models/run.py +161 -0
nmdc_runtime/api/models/site.py +87 -0
nmdc_runtime/api/models/trigger.py +13 -0
nmdc_runtime/api/models/user.py +207 -0
nmdc_runtime/api/models/util.py +260 -0
nmdc_runtime/api/models/wfe_file_stages.py +122 -0
nmdc_runtime/api/models/workflow.py +15 -0
nmdc_runtime/api/openapi.py +178 -0
nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
nmdc_runtime/config.py +56 -0
nmdc_runtime/minter/adapters/repository.py +22 -2
nmdc_runtime/minter/config.py +30 -4
nmdc_runtime/minter/domain/model.py +55 -1
nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
nmdc_runtime/mongo_util.py +89 -0
nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
nmdc_runtime/site/dagster.yaml +53 -0
nmdc_runtime/site/entrypoint-daemon.sh +29 -0
nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
nmdc_runtime/site/entrypoint-dagit.sh +29 -0
nmdc_runtime/site/export/ncbi_xml.py +1331 -0
nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
nmdc_runtime/site/export/study_metadata.py +27 -4
nmdc_runtime/site/graphs.py +294 -45
nmdc_runtime/site/ops.py +1008 -230
nmdc_runtime/site/repair/database_updater.py +451 -0
nmdc_runtime/site/repository.py +368 -133
nmdc_runtime/site/resources.py +154 -80
nmdc_runtime/site/translation/gold_translator.py +235 -83
nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
nmdc_runtime/site/translation/neon_utils.py +24 -7
nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
nmdc_runtime/site/translation/translator.py +73 -3
nmdc_runtime/site/util.py +26 -7
nmdc_runtime/site/validation/emsl.py +1 -0
nmdc_runtime/site/validation/gold.py +1 -0
nmdc_runtime/site/validation/util.py +16 -12
nmdc_runtime/site/workspace.yaml +13 -0
nmdc_runtime/static/NMDC_logo.svg +1073 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
nmdc_runtime/static/README.md +5 -0
nmdc_runtime/static/favicon.ico +0 -0
nmdc_runtime/util.py +236 -192
nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
nmdc_runtime/containers.py +0 -14
nmdc_runtime/core/db/Database.py +0 -15
nmdc_runtime/core/exceptions/__init__.py +0 -23
nmdc_runtime/core/exceptions/base.py +0 -47
nmdc_runtime/core/exceptions/token.py +0 -13
nmdc_runtime/domain/users/queriesInterface.py +0 -18
nmdc_runtime/domain/users/userSchema.py +0 -37
nmdc_runtime/domain/users/userService.py +0 -14
nmdc_runtime/infrastructure/database/db.py +0 -3
nmdc_runtime/infrastructure/database/models/user.py +0 -10
nmdc_runtime/lib/__init__.py +0 -1
nmdc_runtime/lib/extract_nmdc_data.py +0 -41
nmdc_runtime/lib/load_nmdc_data.py +0 -121
nmdc_runtime/lib/nmdc_dataframes.py +0 -829
nmdc_runtime/lib/nmdc_etl_class.py +0 -402
nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
nmdc_runtime/site/drsobjects/ingest.py +0 -93
nmdc_runtime/site/drsobjects/registration.py +0 -131
nmdc_runtime/site/terminusdb/generate.py +0 -198
nmdc_runtime/site/terminusdb/ingest.py +0 -44
nmdc_runtime/site/terminusdb/schema.py +0 -1671
nmdc_runtime/site/translation/emsl.py +0 -42
nmdc_runtime/site/translation/gold.py +0 -53
nmdc_runtime/site/translation/jgi.py +0 -31
nmdc_runtime/site/translation/util.py +0 -132
nmdc_runtime/site/validation/jgi.py +0 -42
nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
/nmdc_runtime/{client → api}/__init__.py +0 -0
/nmdc_runtime/{core → api/boot}/__init__.py +0 -0
/nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
/nmdc_runtime/{domain → api/db}/__init__.py +0 -0
/nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
/nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
/nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
/nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0

nmdc_runtime/site/resources.py CHANGED Viewed

@@ -14,12 +14,10 @@ from dagster import (
     StringSource,
     InitResourceContext,
 )
-from fastjsonschema import JsonSchemaValueException
 from frozendict import frozendict
 from linkml_runtime.dumpers import json_dumper
 from pydantic import BaseModel, AnyUrl
 from pymongo import MongoClient, ReplaceOne, InsertOne
-from terminusdb_client import WOQLClient
 from toolz import get_in
 from toolz import merge
@@ -28,7 +26,7 @@ from nmdc_runtime.api.models.object import DrsObject, AccessURL, DrsObjectIn
 from nmdc_runtime.api.models.operation import ListOperationsResponse
 from nmdc_runtime.api.models.util import ListRequest
 from nmdc_runtime.site.normalization.gold import normalize_gold_id
-from nmdc_runtime.util import unfreeze, nmdc_jsonschema_validator_noidpatterns
+from nmdc_runtime.util import unfreeze, get_nmdc_schema_validator
 from nmdc_schema import nmdc
@@ -110,7 +108,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["cursor"]["batch"]
     def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
         gold_project_id = normalize_gold_id(gold_project_id)
@@ -127,19 +125,39 @@ class RuntimeApiUserClient(RuntimeApiClient):
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["cursor"]["batch"]
     def get_biosamples_for_study(self, study_id: str):
+        # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
+        # The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
+        # but the tradeoff there is that we would need to make multiple requests to step through the
+        # each of the pages. By picking a large number for max_page_size, we can get all the results
+        # in a single request.
+        # This method previously used the /queries:run endpoint but the problem with that was that
+        # it used to truncate the number of results returned to 100.
         response = self.request(
-            "POST",
-            f"/queries:run",
+            "GET",
+            f"/nmdcschema/biosample_set",
             {
-                "find": "biosample_set",
-                "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}},
+                "filter": json.dumps({"associated_studies": study_id}),
+                "max_page_size": 10000,
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["resources"]
+    def get_data_generation_records_for_study(self, study_id: str):
+        # TODO: same as above, we are using a large max_page_size to avoid pagination.
+        response = self.request(
+            "GET",
+            f"/nmdcschema/data_generation_set",
+            {
+                "filter": json.dumps({"associated_studies": study_id}),
+                "max_page_size": 10000,
+            },
+        )
+        response.raise_for_status()
+        return response.json()["resources"]
     def get_omics_processing_by_name(self, name: str):
         response = self.request(
@@ -151,7 +169,19 @@ class RuntimeApiUserClient(RuntimeApiClient):
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["cursor"]["batch"]
+    def get_study(self, study_id: str):
+        response = self.request(
+            "POST",
+            f"/queries:run",
+            {
+                "find": "study_set",
+                "filter": {"id": study_id},
+            },
+        )
+        response.raise_for_status()
+        return response.json()["cursor"]["batch"]
 class RuntimeApiSiteClient(RuntimeApiClient):
@@ -332,9 +362,26 @@ class GoldApiClient(BasicAuthClient):
         """
         return id.replace("gold:", "")
-    def fetch_biosamples_by_study(self, study_id: str) -> List[Dict[str, Any]]:
+    def fetch_biosamples_by_study(
+        self, study_id: str, include_project=True
+    ) -> List[Dict[str, Any]]:
         id = self._normalize_id(study_id)
         results = self.request("/biosamples", params={"studyGoldId": id})
+        if include_project:
+            projects = self.fetch_projects_by_study(id)
+            biosamples_by_id = {
+                biosample["biosampleGoldId"]: biosample for biosample in results
+            }
+            for project in projects:
+                sample_id = project.get("biosampleGoldId")
+                if not sample_id:
+                    continue
+                if sample_id not in biosamples_by_id:
+                    continue
+                biosample = biosamples_by_id[sample_id]
+                if "projects" not in biosample:
+                    biosample["projects"] = []
+                biosample["projects"].append(project)
         return results
     def fetch_projects_by_study(self, study_id: str) -> List[Dict[str, Any]]:
@@ -354,6 +401,18 @@ class GoldApiClient(BasicAuthClient):
             return None
         return results[0]
+    def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/projects", params={"biosampleGoldId": id})
+        return results
+    def fetch_biosample_by_biosample_id(
+        self, biosample_id: str
+    ) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/biosamples", params={"biosampleGoldId": id})
+        return results
 @resource(
     config_schema={
@@ -372,15 +431,47 @@ def gold_api_client_resource(context: InitResourceContext):
 @dataclass
 class NmdcPortalApiClient:
     base_url: str
-    # Using a cookie for authentication is not ideal and should be replaced
-    # when this API has an another authentication method
-    session_cookie: str
+    refresh_token: str
+    access_token: Optional[str] = None
+    access_token_expires_at: Optional[datetime] = None
+    def _request(self, method: str, endpoint: str, **kwargs):
+        r"""
+        Submits a request to the specified API endpoint;
+        after refreshing the access token, if necessary.
+        """
+        if self.access_token is None or datetime.now() > self.access_token_expires_at:
+            refresh_response = requests.post(
+                f"{self.base_url}/auth/refresh",
+                json={"refresh_token": self.refresh_token},
+            )
+            refresh_response.raise_for_status()
+            refresh_body = refresh_response.json()
+            self.access_token_expires_at = datetime.now() + timedelta(
+                seconds=refresh_body["expires_in"]
+            )
+            self.access_token = refresh_body["access_token"]
+        headers = kwargs.get("headers", {})
+        headers["Authorization"] = f"Bearer {self.access_token}"
+        return requests.request(
+            method, f"{self.base_url}{endpoint}", **kwargs, headers=headers
+        )
     def fetch_metadata_submission(self, id: str) -> Dict[str, Any]:
-        response = requests.get(
-            f"{self.base_url}/api/metadata_submission/{id}",
-            cookies={"session": self.session_cookie},
+        response = self._request("GET", f"/api/metadata_submission/{id}")
+        response.raise_for_status()
+        return response.json()
+    def make_submission_images_public(
+        self, submission_id: str, *, study_id: str
+    ) -> Dict[str, Any]:
+        response = self._request(
+            "POST",
+            f"/api/metadata_submission/{submission_id}/image/make_public",
+            json={"study_id": study_id},
         )
         response.raise_for_status()
         return response.json()
@@ -389,13 +480,13 @@ class NmdcPortalApiClient:
 @resource(
     config_schema={
         "base_url": StringSource,
-        "session_cookie": StringSource,
+        "refresh_token": StringSource,
     }
 )
 def nmdc_portal_api_client_resource(context: InitResourceContext):
     return NmdcPortalApiClient(
         base_url=context.resource_config["base_url"],
-        session_cookie=context.resource_config["session_cookie"],
+        refresh_token=context.resource_config["refresh_token"],
     )
@@ -439,36 +530,49 @@ class MongoDB:
         self.db = self.client[dbname]
     def add_docs(self, docs, validate=True, replace=True):
-        try:
-            if validate:
-                nmdc_jsonschema_validator_noidpatterns(docs)
-            rv = {}
-            for collection_name, docs in docs.items():
-                rv[collection_name] = self.db[collection_name].bulk_write(
-                    [
-                        (
-                            ReplaceOne({"id": d["id"]}, d, upsert=True)
-                            if replace
-                            else InsertOne(d)
-                        )
-                        for d in docs
-                    ]
-                )
-                now = datetime.now(timezone.utc)
-                self.db.txn_log.insert_many(
-                    [
-                        {
-                            "tgt": {"id": d.get("id"), "c": collection_name},
-                            "type": "upsert",
-                            "ts": now,
-                            # "dtl": {},
-                        }
-                        for d in docs
-                    ]
-                )
-            return rv
-        except JsonSchemaValueException as e:
-            raise ValueError(e.message)
+        """
+        TODO: Document this function.
+        """
+        if validate:
+            validator = get_nmdc_schema_validator()
+            # Fail fast on first validation error.
+            for result in validator.iter_results(docs, target_class="Database"):
+                raise ValueError(result.message)
+        rv = {}
+        for collection_name, collection_docs in docs.items():
+            # If `collection_docs` is empty, abort this iteration.
+            #
+            # Note: We do this because the `bulk_write` method called below will raise
+            #       an `InvalidOperation` exception if it is passed 0 operations.
+            #
+            # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
+            #
+            if len(collection_docs) == 0:
+                continue
+            rv[collection_name] = self.db[collection_name].bulk_write(
+                [
+                    (
+                        ReplaceOne({"id": d["id"]}, d, upsert=True)
+                        if replace
+                        else InsertOne(d)
+                    )
+                    for d in collection_docs
+                ]
+            )
+            now = datetime.now(timezone.utc)
+            self.db.txn_log.insert_many(
+                [
+                    {
+                        "tgt": {"id": d.get("id"), "c": collection_name},
+                        "type": "upsert",
+                        "ts": now,
+                        # "dtl": {},
+                    }
+                    for d in collection_docs
+                ]
+            )
+        return rv
 @resource(
@@ -512,33 +616,3 @@ def get_mongo(run_config: frozendict):
         )
     )
     return mongo_resource(resource_context)
-class TerminusDB:
-    def __init__(self, server_url, user, key, account, dbid):
-        self.client = WOQLClient(server_url=server_url)
-        self.client.connect(user=user, key=key, account=account)
-        db_info = self.client.get_database(dbid=dbid, account=account)
-        if db_info is None:
-            self.client.create_database(dbid=dbid, accountid=account, label=dbid)
-            self.client.create_graph(graph_type="inference", graph_id="main")
-        self.client.connect(user=user, key=key, account=account, db=dbid)
-@resource(
-    config_schema={
-        "server_url": StringSource,
-        "user": StringSource,
-        "key": StringSource,
-        "account": StringSource,
-        "dbid": StringSource,
-    }
-)
-def terminus_resource(context):
-    return TerminusDB(
-        server_url=context.resource_config["server_url"],
-        user=context.resource_config["user"],
-        key=context.resource_config["key"],
-        account=context.resource_config["account"],
-        dbid=context.resource_config["dbid"],
-    )

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl