nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from datetime import datetime, timezone
|
|
3
|
-
|
|
4
|
-
from toolz import dissoc
|
|
5
|
-
|
|
6
|
-
from nmdc_runtime.api.models.job import JobOperationMetadata
|
|
7
|
-
from nmdc_runtime.api.models.operation import Operation
|
|
8
|
-
from nmdc_runtime.api.models.operation import UpdateOperationRequest
|
|
9
|
-
from nmdc_runtime.api.models.util import ListRequest
|
|
10
|
-
from nmdc_runtime.api.models.util import ResultT
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def load_local_json(url, prefixes_url_to_local=None):
|
|
14
|
-
"""Useful for large files cached on local filesystem.
|
|
15
|
-
|
|
16
|
-
You may, for example, `cp --parents ` many files on a remote filesystem to a staging
|
|
17
|
-
folder on that remote filesystem, gzip that folder, scp it to your local machine, and then
|
|
18
|
-
extract to your local machine.
|
|
19
|
-
|
|
20
|
-
Example:
|
|
21
|
-
prefixes_url_to_local = {
|
|
22
|
-
"https://data.microbiomedata.org/data/": "/Users/dwinston/nmdc_files/2021-09-scanon-meta/ficus/pipeline_products/",
|
|
23
|
-
"https://portal.nersc.gov/project/m3408/": "/Users/dwinston/nmdc_files/2021-09-scanon-meta/www/",
|
|
24
|
-
}
|
|
25
|
-
"""
|
|
26
|
-
path = url
|
|
27
|
-
for before, after in prefixes_url_to_local.items():
|
|
28
|
-
path = path.replace(before, after)
|
|
29
|
-
with open(path) as f:
|
|
30
|
-
return json.load(f)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def claim_metadata_ingest_jobs(
|
|
34
|
-
client, drs_object_ids_to_ingest, wf_id, max_page_size=1000
|
|
35
|
-
):
|
|
36
|
-
lr = ListRequest(
|
|
37
|
-
filter=json.dumps(
|
|
38
|
-
{
|
|
39
|
-
"workflow.id": wf_id,
|
|
40
|
-
"config.object_id": {"$in": drs_object_ids_to_ingest},
|
|
41
|
-
}
|
|
42
|
-
),
|
|
43
|
-
max_page_size=max_page_size,
|
|
44
|
-
)
|
|
45
|
-
jobs = []
|
|
46
|
-
while True:
|
|
47
|
-
rv = client.list_jobs(lr.model_dump()).json()
|
|
48
|
-
jobs.extend(rv["resources"])
|
|
49
|
-
if "next_page_token" not in rv:
|
|
50
|
-
break
|
|
51
|
-
else:
|
|
52
|
-
lr.page_token = rv["next_page_token"]
|
|
53
|
-
|
|
54
|
-
# safety escape
|
|
55
|
-
if len(jobs) == len(drs_object_ids_to_ingest):
|
|
56
|
-
break
|
|
57
|
-
|
|
58
|
-
job_claim_responses = [client.claim_job(j["id"]) for j in jobs]
|
|
59
|
-
|
|
60
|
-
return job_claim_responses
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def mongo_add_docs_result_as_dict(rv):
|
|
64
|
-
return {
|
|
65
|
-
collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
|
|
66
|
-
for collection_name, bulk_write_result in rv.items()
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def get_metadata_ingest_job_ops(mongo, wf_id, drs_object_ids_to_ingest):
|
|
71
|
-
return list(
|
|
72
|
-
mongo.db.operations.find(
|
|
73
|
-
{
|
|
74
|
-
"metadata.job.workflow.id": wf_id,
|
|
75
|
-
"metadata.job.config.object_id": {"$in": drs_object_ids_to_ingest},
|
|
76
|
-
"done": False,
|
|
77
|
-
}
|
|
78
|
-
)
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def do_metadata_ingest_job(client, mongo, job_op_doc):
|
|
83
|
-
op = Operation[ResultT, JobOperationMetadata](**job_op_doc)
|
|
84
|
-
object_info = client.get_object_info(op.metadata.job.config["object_id"]).json()
|
|
85
|
-
url = object_info["access_methods"][0]["access_url"]["url"]
|
|
86
|
-
docs = load_local_json(url)
|
|
87
|
-
op_result = mongo.add_docs(docs, validate=False, replace=False)
|
|
88
|
-
op_patch = UpdateOperationRequest(
|
|
89
|
-
done=True,
|
|
90
|
-
result=mongo_add_docs_result_as_dict(op_result),
|
|
91
|
-
metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
|
|
92
|
-
)
|
|
93
|
-
return client.update_operation(op.id, op_patch)
|
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import re
|
|
4
|
-
from datetime import datetime, timezone, timedelta
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from tempfile import TemporaryDirectory
|
|
7
|
-
|
|
8
|
-
import requests
|
|
9
|
-
from bs4 import BeautifulSoup
|
|
10
|
-
|
|
11
|
-
from nmdc_runtime.api.models.object import DrsObjectIn
|
|
12
|
-
from nmdc_runtime.util import (
|
|
13
|
-
drs_metadata_for,
|
|
14
|
-
nmdc_jsonschema_validator,
|
|
15
|
-
specialize_activity_set_docs,
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
pattern = re.compile(r"https?://(?P<domain>[^/]+)/(?P<path>.+)")
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def url_to_name(url):
|
|
22
|
-
m = pattern.match(url)
|
|
23
|
-
return (
|
|
24
|
-
f"{'.'.join(reversed(m.group('domain').split('.')))}"
|
|
25
|
-
f"__{m.group('path').replace('/', '.')}"
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def fetch_url(url, timeout=30):
|
|
30
|
-
return requests.get(url, timeout=timeout)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class HttpResponseNotOk(Exception):
|
|
34
|
-
pass
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class HttpResponseNotJson(Exception):
|
|
38
|
-
pass
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def response_to_json(response):
|
|
42
|
-
if response.status_code != 200:
|
|
43
|
-
raise HttpResponseNotOk()
|
|
44
|
-
try:
|
|
45
|
-
json_data = response.json()
|
|
46
|
-
except ValueError:
|
|
47
|
-
raise HttpResponseNotJson()
|
|
48
|
-
return json_data
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def json_data_from_url_to_file(json_data, url, save_dir):
|
|
52
|
-
filepath = os.path.join(save_dir, url_to_name(url))
|
|
53
|
-
with open(filepath, "w") as f:
|
|
54
|
-
json.dump(json_data, f)
|
|
55
|
-
return filepath
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def json_clean(d, model, exclude_unset=False):
|
|
59
|
-
return json.loads(model(**d).json(exclude_unset=exclude_unset))
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def drs_object_in_for(url):
|
|
63
|
-
with TemporaryDirectory() as save_dir:
|
|
64
|
-
response = fetch_url(url)
|
|
65
|
-
try:
|
|
66
|
-
json_data = response_to_json(response)
|
|
67
|
-
except HttpResponseNotOk:
|
|
68
|
-
return {"error": "HttpResponseNotOk"}
|
|
69
|
-
|
|
70
|
-
except HttpResponseNotJson:
|
|
71
|
-
return {"error": "HttpResponseNotJson"}
|
|
72
|
-
|
|
73
|
-
filepath = json_data_from_url_to_file(json_data, url, save_dir)
|
|
74
|
-
drs_object_in = DrsObjectIn(
|
|
75
|
-
**drs_metadata_for(
|
|
76
|
-
filepath,
|
|
77
|
-
{
|
|
78
|
-
"access_methods": [{"access_url": {"url": url}}],
|
|
79
|
-
"name": Path(filepath).name.replace(":", "-"),
|
|
80
|
-
},
|
|
81
|
-
)
|
|
82
|
-
)
|
|
83
|
-
return {"result": drs_object_in}
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def create_drs_object_for(url, drs_object_in, client):
|
|
87
|
-
rv = client.create_object(json.loads(drs_object_in.json(exclude_unset=True)))
|
|
88
|
-
return {"url": url, "response": rv}
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def validate_as_metadata_and_ensure_tags_for(
|
|
92
|
-
drs_id, client, tags=("schema#/definitions/Database", "metadata-in")
|
|
93
|
-
):
|
|
94
|
-
docs = client.get_object_bytes(drs_id).json()
|
|
95
|
-
docs, _ = specialize_activity_set_docs(docs)
|
|
96
|
-
_ = nmdc_jsonschema_validator(docs)
|
|
97
|
-
return {tag: client.ensure_object_tag(drs_id, tag) for tag in tags}
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def recent_metadata_urls(
|
|
101
|
-
urlpath="https://portal.nersc.gov/project/m3408/meta/anno2/",
|
|
102
|
-
urlpath_extra="?C=M;O=D",
|
|
103
|
-
since="2021-09",
|
|
104
|
-
):
|
|
105
|
-
"""Scrapes recent URLs from Apache/2.4.38 (Debian) Server listing.
|
|
106
|
-
|
|
107
|
-
Designed with urlpath.startwsith("https://portal.nersc.gov/project/m3408/") in mind.
|
|
108
|
-
"""
|
|
109
|
-
if since is None:
|
|
110
|
-
now = datetime.now(timezone.utc)
|
|
111
|
-
recent_enuf = now - timedelta(days=30)
|
|
112
|
-
since = f"{recent_enuf.year}-{recent_enuf.month}"
|
|
113
|
-
|
|
114
|
-
rv = requests.get(f"{urlpath}{urlpath_extra}")
|
|
115
|
-
|
|
116
|
-
soup = BeautifulSoup(rv.text, "html.parser")
|
|
117
|
-
|
|
118
|
-
urls = []
|
|
119
|
-
|
|
120
|
-
for tr in soup.find_all("tr"):
|
|
121
|
-
tds = tr.find_all("td")
|
|
122
|
-
if len(tds) != 5:
|
|
123
|
-
continue
|
|
124
|
-
|
|
125
|
-
_, td_name, td_last_modified, td_size, _ = tds
|
|
126
|
-
if td_last_modified.text.startswith(since):
|
|
127
|
-
name = td_name.a.text
|
|
128
|
-
if name.endswith(".json"):
|
|
129
|
-
urls.append(f"{urlpath}{name}")
|
|
130
|
-
|
|
131
|
-
return urls
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Translates EMSL data into JSON conformant with the NMDC JSON schema
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from dagster import op, graph
|
|
6
|
-
|
|
7
|
-
from nmdc_runtime.lib.nmdc_etl_class import NMDC_ETL
|
|
8
|
-
from nmdc_runtime.site.translation.util import (
|
|
9
|
-
load_nmdc_etl_class,
|
|
10
|
-
load_mongo_collection,
|
|
11
|
-
preset_prod,
|
|
12
|
-
preset_test,
|
|
13
|
-
schema_validate,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@op
|
|
18
|
-
def transform_emsl_omics_processing(_context, nmdc_etl: NMDC_ETL) -> tuple:
|
|
19
|
-
return ("emsl.omics_processing_set", nmdc_etl.transform_emsl_omics_processing())
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@op
|
|
23
|
-
def transform_emsl_data_object(_context, nmdc_etl: NMDC_ETL) -> tuple:
|
|
24
|
-
return ("emsl.data_object_set", nmdc_etl.transform_emsl_data_object())
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@graph
|
|
28
|
-
def emsl():
|
|
29
|
-
# load_merged_data_source()
|
|
30
|
-
nmdc_etl = load_nmdc_etl_class()
|
|
31
|
-
emsl_omics_processing = transform_emsl_omics_processing(nmdc_etl)
|
|
32
|
-
emsl_omics_processing_validated = schema_validate(emsl_omics_processing)
|
|
33
|
-
|
|
34
|
-
emsl_data_object = transform_emsl_data_object(nmdc_etl)
|
|
35
|
-
emsl_data_object_validated = schema_validate(emsl_data_object)
|
|
36
|
-
|
|
37
|
-
# load data into mongo
|
|
38
|
-
load_mongo_collection(emsl_omics_processing_validated)
|
|
39
|
-
load_mongo_collection(emsl_data_object_validated)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
emsl_job = emsl.to_job(**preset_prod)
|
|
43
|
-
test_emsl_job = emsl.to_job(name="test_emsl", **preset_test)
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Translate an export of the JGI GOLD [1] study, project, and biosample data into JSON conformant with the NMDC JSON schema.
|
|
3
|
-
[1] Genomes OnLine Database (GOLD) <https://gold.jgi.doe.gov/>.
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
from dagster import op, graph
|
|
7
|
-
|
|
8
|
-
from nmdc_runtime.lib.nmdc_etl_class import NMDC_ETL
|
|
9
|
-
from nmdc_runtime.site.translation.util import (
|
|
10
|
-
load_nmdc_etl_class,
|
|
11
|
-
load_mongo_collection,
|
|
12
|
-
preset_prod,
|
|
13
|
-
preset_test,
|
|
14
|
-
schema_validate,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@op
|
|
19
|
-
def transform_study(_context, nmdc_etl: NMDC_ETL) -> tuple:
|
|
20
|
-
# return {"study_set": nmdc_etl.transform_study()}
|
|
21
|
-
return ("gold.study_set", nmdc_etl.transform_study())
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@op
|
|
25
|
-
def transform_gold_omics_processing(_context, nmdc_etl: NMDC_ETL) -> tuple:
|
|
26
|
-
return ("gold.omics_processing_set", nmdc_etl.transform_omics_processing())
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@op
|
|
30
|
-
def transform_biosample(_context, nmdc_etl: NMDC_ETL) -> tuple:
|
|
31
|
-
return ("gold.biosample_set", nmdc_etl.transform_biosample())
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
@graph
|
|
35
|
-
def gold():
|
|
36
|
-
nmdc_etl = load_nmdc_etl_class()
|
|
37
|
-
gold_study = transform_study(nmdc_etl)
|
|
38
|
-
gold_study_validated = schema_validate(gold_study)
|
|
39
|
-
|
|
40
|
-
gold_omics_processing = transform_gold_omics_processing(nmdc_etl)
|
|
41
|
-
gold_omics_processing_validated = schema_validate(gold_omics_processing)
|
|
42
|
-
|
|
43
|
-
gold_biosample = transform_biosample(nmdc_etl)
|
|
44
|
-
gold_biosample_validated = schema_validate(gold_biosample)
|
|
45
|
-
|
|
46
|
-
# load data into mongo
|
|
47
|
-
load_mongo_collection(gold_study_validated)
|
|
48
|
-
load_mongo_collection(gold_omics_processing_validated)
|
|
49
|
-
load_mongo_collection(gold_biosample_validated)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
gold_job = gold.to_job(**preset_prod)
|
|
53
|
-
test_gold_job = gold.to_job(name="test_gold", **preset_test)
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Translates EMSL data into JSON conformant with the NMDC JSON schema
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from dagster import op, graph
|
|
6
|
-
|
|
7
|
-
from nmdc_runtime.lib.nmdc_etl_class import NMDC_ETL
|
|
8
|
-
from nmdc_runtime.site.translation.util import (
|
|
9
|
-
load_nmdc_etl_class,
|
|
10
|
-
load_mongo_collection,
|
|
11
|
-
preset_prod,
|
|
12
|
-
preset_test,
|
|
13
|
-
schema_validate,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@op
|
|
18
|
-
def transform_jgi_data_object(_context, nmdc_etl: NMDC_ETL) -> tuple:
|
|
19
|
-
# return "jgi.data_object_set", [{"foo": "bar"}] # used for testing failure
|
|
20
|
-
return "jgi.data_object_set", nmdc_etl.transform_jgi_data_object()
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@graph
|
|
24
|
-
def jgi():
|
|
25
|
-
nmdc_etl = load_nmdc_etl_class()
|
|
26
|
-
jgi_data_object = transform_jgi_data_object(nmdc_etl)
|
|
27
|
-
jgi_data_object_validated = schema_validate(jgi_data_object)
|
|
28
|
-
load_mongo_collection(jgi_data_object_validated)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
jgi_job = jgi.to_job(**preset_prod)
|
|
32
|
-
test_jgi_job = jgi.to_job(name="test_jgi", **preset_test)
|
|
@@ -1,132 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
from dagster import op, Failure, AssetMaterialization
|
|
4
|
-
from dagster.core.definitions.events import AssetKey, Output
|
|
5
|
-
from fastjsonschema import JsonSchemaValueException
|
|
6
|
-
|
|
7
|
-
from nmdc_runtime.lib.nmdc_etl_class import NMDC_ETL
|
|
8
|
-
from nmdc_runtime.site.resources import mongo_resource
|
|
9
|
-
from nmdc_runtime.util import nmdc_jsonschema_validator
|
|
10
|
-
|
|
11
|
-
mode_prod = {"resource_defs": {"mongo": mongo_resource}}
|
|
12
|
-
mode_dev = {
|
|
13
|
-
"resource_defs": {"mongo": mongo_resource}
|
|
14
|
-
} # Connect to a real MongoDB instance for development.
|
|
15
|
-
mode_test = {
|
|
16
|
-
"resource_defs": {"mongo": mongo_resource}
|
|
17
|
-
} # Connect to a real MongoDB instance for testing.
|
|
18
|
-
|
|
19
|
-
config_prod = {
|
|
20
|
-
"resources": {
|
|
21
|
-
"mongo": {
|
|
22
|
-
"config": {
|
|
23
|
-
"host": {"env": "MONGO_HOST"},
|
|
24
|
-
"username": {"env": "MONGO_USERNAME"},
|
|
25
|
-
"password": {"env": "MONGO_PASSWORD"},
|
|
26
|
-
"dbname": "nmdc_etl_staging",
|
|
27
|
-
},
|
|
28
|
-
}
|
|
29
|
-
},
|
|
30
|
-
"ops": {
|
|
31
|
-
"load_nmdc_etl_class": {
|
|
32
|
-
"config": {
|
|
33
|
-
"data_file": str(
|
|
34
|
-
Path(__file__).parent.parent.parent.parent.joinpath(
|
|
35
|
-
"metadata-translation/src/data/nmdc_merged_data.tsv.zip"
|
|
36
|
-
)
|
|
37
|
-
),
|
|
38
|
-
"sssom_map_file": "",
|
|
39
|
-
"spec_file": str(
|
|
40
|
-
Path(__file__).parent.parent.parent.parent.joinpath(
|
|
41
|
-
"nmdc_runtime/lib/nmdc_data_source.yaml"
|
|
42
|
-
)
|
|
43
|
-
),
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
},
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
config_test = {
|
|
50
|
-
"resources": {
|
|
51
|
-
"mongo": {
|
|
52
|
-
"config": {
|
|
53
|
-
# local docker container via docker-compose.yml
|
|
54
|
-
"host": "mongo",
|
|
55
|
-
"username": "admin",
|
|
56
|
-
"password": "root",
|
|
57
|
-
"dbname": "nmdc_etl_staging",
|
|
58
|
-
},
|
|
59
|
-
}
|
|
60
|
-
},
|
|
61
|
-
"ops": {
|
|
62
|
-
"load_nmdc_etl_class": {
|
|
63
|
-
"config": {
|
|
64
|
-
"data_file": str(
|
|
65
|
-
Path(__file__).parent.parent.parent.parent.joinpath(
|
|
66
|
-
"metadata-translation/src/data/nmdc_merged_data.tsv.zip"
|
|
67
|
-
)
|
|
68
|
-
),
|
|
69
|
-
"sssom_map_file": "",
|
|
70
|
-
"spec_file": str(
|
|
71
|
-
Path(__file__).parent.parent.parent.parent.joinpath(
|
|
72
|
-
"nmdc_runtime/lib/nmdc_data_source.yaml"
|
|
73
|
-
)
|
|
74
|
-
),
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
},
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
preset_prod = dict(**mode_prod, config=config_prod)
|
|
81
|
-
preset_test = dict(**mode_test, config=config_test)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
@op
|
|
85
|
-
def load_nmdc_etl_class(context) -> NMDC_ETL:
|
|
86
|
-
# build instance of NMDC_ETL class
|
|
87
|
-
etl = NMDC_ETL(
|
|
88
|
-
merged_data_file=context.op_config["data_file"],
|
|
89
|
-
data_source_spec_file=context.op_config["spec_file"],
|
|
90
|
-
sssom_file="",
|
|
91
|
-
)
|
|
92
|
-
return etl
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
@op(required_resource_keys={"mongo"})
|
|
96
|
-
def load_mongo_collection(context, data: tuple):
|
|
97
|
-
mongo_db = context.resources.mongo.db
|
|
98
|
-
collection_name, documents = data
|
|
99
|
-
collection = mongo_db[collection_name] # get mongo collection
|
|
100
|
-
|
|
101
|
-
# drop collection if exists
|
|
102
|
-
collection.drop()
|
|
103
|
-
|
|
104
|
-
# insert data
|
|
105
|
-
collection.insert(documents)
|
|
106
|
-
context.log.info(f"inserted {len(documents)} documents into {collection.name}")
|
|
107
|
-
return collection_name
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
@op()
|
|
111
|
-
def schema_validate(context, data: tuple):
|
|
112
|
-
def schema_validate_asset(collection_name, status, errors):
|
|
113
|
-
return AssetMaterialization(
|
|
114
|
-
asset_key=AssetKey(["translation", f"{collection_name}_translation"]),
|
|
115
|
-
description=f"{collection_name} translation validation",
|
|
116
|
-
metadata={"status": status, "errors": errors},
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
collection_name, documents = data
|
|
120
|
-
_, schema_collection_name = collection_name.split(".")
|
|
121
|
-
try:
|
|
122
|
-
nmdc_jsonschema_validator({schema_collection_name: documents})
|
|
123
|
-
context.log.info(f"data for {collection_name} is valid")
|
|
124
|
-
yield schema_validate_asset(collection_name, "valid", "none")
|
|
125
|
-
return data # do I need a return statement and an Output?
|
|
126
|
-
except JsonSchemaValueException as e:
|
|
127
|
-
context.log.error(f"validation failed for {schema_collection_name} " + str(e))
|
|
128
|
-
context.log.error(f"documents: {documents}")
|
|
129
|
-
yield schema_validate_asset(collection_name, "not valid", str(e))
|
|
130
|
-
raise Failure(str(e))
|
|
131
|
-
finally:
|
|
132
|
-
yield Output(data)
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Validates data in the JGI collection in the nmdc_etl_staging database.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from dagster import op, graph
|
|
6
|
-
|
|
7
|
-
from nmdc_runtime.site.ops import local_file_to_api_object
|
|
8
|
-
from nmdc_runtime.site.validation.util import (
|
|
9
|
-
preset_prod,
|
|
10
|
-
preset_test,
|
|
11
|
-
validate_mongo_collection,
|
|
12
|
-
write_to_local_file,
|
|
13
|
-
announce_validation_report,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@op
|
|
18
|
-
def jgi_data_object_set_collection_name():
|
|
19
|
-
return "jgi.data_object_set"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@graph()
|
|
23
|
-
def jgi():
|
|
24
|
-
report = validate_mongo_collection(jgi_data_object_set_collection_name())
|
|
25
|
-
# the below could also be a @graph and loaded as a "subgraph" by e.g. the jgi graph job.
|
|
26
|
-
local_path = write_to_local_file(report)
|
|
27
|
-
obj = local_file_to_api_object(local_path)
|
|
28
|
-
announce_validation_report(report, obj)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
# passing the collecton name via the config
|
|
32
|
-
# problem: not sure if this best when multiple sets need to be validated
|
|
33
|
-
# from toolz import assoc_in
|
|
34
|
-
# config_ops = {
|
|
35
|
-
# "validate_mongo_collection": {"config": {"collection_name": "jgi.data_object_set"}}
|
|
36
|
-
# }
|
|
37
|
-
# validate_jgi_job = jgi.to_job(**assoc_in(preset_prod, ["config", "ops"], config_ops))
|
|
38
|
-
# test_validate_jgi_job = jgi.to_job(
|
|
39
|
-
# **assoc_in(preset_test, ["config", "ops"], config_ops)
|
|
40
|
-
# )
|
|
41
|
-
|
|
42
|
-
validate_jgi_job = jgi.to_job(**preset_prod)
|
|
43
|
-
test_validate_jgi_job = jgi.to_job(**preset_test)
|