nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
1
3
|
from abc import ABC, abstractmethod
|
|
2
|
-
from
|
|
4
|
+
from decimal import Decimal
|
|
5
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
3
6
|
from nmdc_schema import nmdc
|
|
4
7
|
|
|
5
8
|
JSON_OBJECT = Dict[str, Any]
|
|
6
9
|
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
7
12
|
|
|
8
13
|
class Translator(ABC):
|
|
9
14
|
def __init__(
|
|
@@ -14,9 +19,74 @@ class Translator(ABC):
|
|
|
14
19
|
def _index_by_id(self, collection, id):
|
|
15
20
|
return {item[id]: item for item in collection}
|
|
16
21
|
|
|
17
|
-
|
|
18
|
-
|
|
22
|
+
@staticmethod
|
|
23
|
+
def _ensure_curie(identifier: str, *, default_prefix: str) -> str:
|
|
24
|
+
identifier_parts = identifier.split(":", 1)
|
|
25
|
+
|
|
26
|
+
# Don't add prefix if identifier is already a CURIE
|
|
27
|
+
if len(identifier_parts) == 2:
|
|
28
|
+
return identifier
|
|
29
|
+
|
|
30
|
+
return f"{default_prefix}:{identifier_parts[0]}"
|
|
19
31
|
|
|
20
32
|
@abstractmethod
def get_database(self) -> nmdc.Database:
    """Produce the translated ``nmdc.Database``.

    Declared abstract: concrete translator subclasses supply the
    implementation. This base definition has no body of its own.
    """
|
|
35
|
+
|
|
36
|
+
def _parse_quantity_value(
|
|
37
|
+
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
38
|
+
) -> Union[nmdc.QuantityValue, None]:
|
|
39
|
+
"""Construct a nmdc:QuantityValue from a raw value string
|
|
40
|
+
|
|
41
|
+
The regex pattern minimally matches on a single numeric value (possibly
|
|
42
|
+
floating point). The pattern can also identify a range represented by
|
|
43
|
+
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
44
|
+
characters at the end of the string which are interpreted as a unit. A unit
|
|
45
|
+
may also be explicitly provided as an argument to this function. If parsing
|
|
46
|
+
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
47
|
+
If the pattern is not matched at all None is returned.
|
|
48
|
+
|
|
49
|
+
:param raw_value: string to parse
|
|
50
|
+
:param unit: optional unit, defaults to None. If None, the unit is extracted from the
|
|
51
|
+
raw_value. If a unit is provided, it will override the unit extracted from the
|
|
52
|
+
raw_value.
|
|
53
|
+
:return: nmdc:QuantityValue
|
|
54
|
+
"""
|
|
55
|
+
if raw_value is None:
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
match = re.fullmatch(
|
|
59
|
+
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
60
|
+
raw_value,
|
|
61
|
+
)
|
|
62
|
+
if not match:
|
|
63
|
+
return None
|
|
64
|
+
|
|
65
|
+
quantity_value_kwargs = {
|
|
66
|
+
"has_raw_value": raw_value,
|
|
67
|
+
"type": "nmdc:QuantityValue",
|
|
68
|
+
}
|
|
69
|
+
if match.group(2):
|
|
70
|
+
# having group 2 means the value is a range like "0 - 1". Either
|
|
71
|
+
# group 1 or group 2 might be the minimum especially when handling
|
|
72
|
+
# negative ranges like "0 - -1"
|
|
73
|
+
num_1 = Decimal(match.group(1))
|
|
74
|
+
num_2 = Decimal(match.group(2))
|
|
75
|
+
quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
|
|
76
|
+
quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
|
|
77
|
+
else:
|
|
78
|
+
# otherwise we just have a single numeric value
|
|
79
|
+
quantity_value_kwargs["has_numeric_value"] = Decimal(match.group(1))
|
|
80
|
+
|
|
81
|
+
if unit:
|
|
82
|
+
# a unit was manually specified
|
|
83
|
+
if match.group(3) and unit != match.group(3):
|
|
84
|
+
# a unit was also found in the raw string; issue a warning
|
|
85
|
+
# if they don't agree, but keep the manually specified one
|
|
86
|
+
logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
87
|
+
quantity_value_kwargs["has_unit"] = unit
|
|
88
|
+
elif match.group(3):
|
|
89
|
+
# a unit was found in the raw string
|
|
90
|
+
quantity_value_kwargs["has_unit"] = match.group(3)
|
|
91
|
+
|
|
92
|
+
return nmdc.QuantityValue(**quantity_value_kwargs)
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from functools import lru_cache
|
|
3
|
-
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
4
2
|
|
|
3
|
+
from functools import lru_cache
|
|
5
4
|
from pymongo.database import Database as MongoDatabase
|
|
5
|
+
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
|
+
from refscan.lib.helpers import get_collection_names_from_schema
|
|
6
7
|
|
|
7
8
|
from nmdc_runtime.site.resources import mongo_resource
|
|
9
|
+
from nmdc_runtime.util import nmdc_schema_view
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
mode_test = {
|
|
10
13
|
"resource_defs": {"mongo": mongo_resource}
|
|
@@ -34,14 +37,30 @@ def run_and_log(shell_cmd, context):
|
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
@lru_cache
def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
    """Report which schema-defined collections have an ``id_1`` index in ``mdb``.

    For every collection name that ``get_collection_names_from_schema``
    derives from the NMDC schema view, the result holds ``True`` only when a
    collection of that name currently exists in ``mdb`` and its index
    information contains an entry named ``"id_1"``; otherwise ``False``.

    The function is wrapped in ``lru_cache``, so the result is memoized per
    ``mdb`` argument and later index or collection changes are not reflected
    in subsequent calls with the same argument.

    :param mdb: database to inspect
    :return: dict mapping collection name to a bool
    """
    schema_view = nmdc_schema_view()
    existing_names = set(mdb.list_collection_names())

    has_id_index = {}
    for collection_name in get_collection_names_from_schema(schema_view):
        # Only query index information for collections that actually exist.
        has_id_index[collection_name] = (
            collection_name in existing_names
            and "id_1" in mdb[collection_name].index_information()
        )
    return has_id_index
|
|
44
52
|
|
|
45
53
|
|
|
46
54
|
def get_basename(filename: str) -> str:
    """Return the final path component of ``filename`` via ``os.path.basename``.

    :param filename: a file path
    :return: everything after the last path separator (empty if the path
        ends with a separator)
    """
    basename = os.path.basename(filename)
    return basename
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
    """Turn a study id into a filename-friendly string.

    Every colon and every hyphen in ``nmdc_study_id`` is replaced with an
    underscore; all other characters are left as they are.

    :param nmdc_study_id: study identifier
    :return: the identifier with ``:`` and ``-`` replaced by ``_``
    """
    to_underscores = str.maketrans(":-", "__")
    return nmdc_study_id.translate(to_underscores)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_instruments_by_id(mdb: MongoDatabase) -> dict[str, dict]:
    """Get all documents from the instrument_set collection in a dict keyed by id.

    :param mdb: database holding the ``instrument_set`` collection
    :return: dict mapping each document's ``"id"`` value to the document; if
        two documents share an id, the one returned later by the query is kept
    """
    instruments_by_id = {}
    for document in mdb["instrument_set"].find({}):
        instruments_by_id[document["id"]] = document
    return instruments_by_id
|
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
from dagster import op, AssetMaterialization, AssetKey,
|
|
2
|
-
from
|
|
3
|
-
from nmdc_runtime.util import get_nmdc_jsonschema_dict
|
|
1
|
+
from dagster import op, AssetMaterialization, AssetKey, MetadataValue
|
|
2
|
+
from nmdc_runtime.util import get_nmdc_schema_validator
|
|
4
3
|
from toolz import dissoc
|
|
5
4
|
|
|
6
5
|
from nmdc_runtime.site.resources import mongo_resource
|
|
@@ -61,19 +60,19 @@ def validate_mongo_collection(context, collection_name: str):
|
|
|
61
60
|
collection = mongo_db[collection_name] # get mongo collection
|
|
62
61
|
db_set = collection_name.split(".")[0]
|
|
63
62
|
|
|
64
|
-
validator =
|
|
63
|
+
validator = get_nmdc_schema_validator()
|
|
65
64
|
validation_errors = []
|
|
66
65
|
|
|
67
66
|
for count, doc in enumerate(collection.find()):
|
|
68
67
|
# add logging for progress?
|
|
69
68
|
# e.g.: if count % 1000 == 0: context.log.info(“done X of Y")
|
|
70
69
|
doc = dissoc(doc, "_id") # dissoc _id
|
|
71
|
-
|
|
72
|
-
if len(
|
|
70
|
+
report = validator.validate({f"{db_set}": [doc]}, target_class="Database")
|
|
71
|
+
if len(report.results) > 0:
|
|
73
72
|
if "id" in doc.keys():
|
|
74
|
-
errors = {doc["id"]: [
|
|
73
|
+
errors = {doc["id"]: [r.message for r in report.results]}
|
|
75
74
|
else:
|
|
76
|
-
errors = {f"missing id ({count})": [
|
|
75
|
+
errors = {f"missing id ({count})": [r.message for r in report.results]}
|
|
77
76
|
validation_errors.append(errors)
|
|
78
77
|
|
|
79
78
|
return {"collection_name": collection_name, "errors": validation_errors}
|
|
@@ -92,10 +91,15 @@ def announce_validation_report(context, report, api_object):
|
|
|
92
91
|
asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
|
|
93
92
|
description=f"{collection_name} translation validation",
|
|
94
93
|
metadata={
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
94
|
+
# Note: When this code was originally written, it used Dagster's `EventMetadata` class,
|
|
95
|
+
# which has since been replaced by Dagster's `MetadataValue` class.
|
|
96
|
+
#
|
|
97
|
+
# Reference:
|
|
98
|
+
# - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
|
|
99
|
+
# - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
|
|
100
|
+
#
|
|
101
|
+
"n_errors": MetadataValue.int(len(report["errors"])),
|
|
102
|
+
"object_id": MetadataValue.text(api_object["id"]),
|
|
99
103
|
},
|
|
100
104
|
)
|
|
101
105
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
load_from:
|
|
2
|
+
- python_package:
|
|
3
|
+
package_name: nmdc_runtime.site.repository
|
|
4
|
+
attribute: repo
|
|
5
|
+
- python_package:
|
|
6
|
+
package_name: nmdc_runtime.site.repository
|
|
7
|
+
attribute: biosample_submission_ingest
|
|
8
|
+
- python_package:
|
|
9
|
+
package_name: nmdc_runtime.site.repository
|
|
10
|
+
attribute: biosample_export
|
|
11
|
+
- python_package:
|
|
12
|
+
package_name: nmdc_runtime.site.repository
|
|
13
|
+
attribute: database_records_stitching
|