nmdc-runtime 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/site/export/ncbi_xml.py +529 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +206 -0
- nmdc_runtime/site/export/study_metadata.py +24 -4
- nmdc_runtime/site/graphs.py +29 -11
- nmdc_runtime/site/ops.py +180 -44
- nmdc_runtime/site/repository.py +58 -6
- nmdc_runtime/site/resources.py +30 -40
- nmdc_runtime/site/translation/submission_portal_translator.py +16 -9
- nmdc_runtime/util.py +24 -1
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.8.0.dist-info}/METADATA +4 -7
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.8.0.dist-info}/RECORD +15 -17
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.8.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.8.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/site/terminusdb/__init__.py +0 -0
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.8.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-1.6.0.dist-info → nmdc_runtime-1.8.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/export/ncbi_xml_utils.py
ADDED
@@ -0,0 +1,206 @@
+from io import BytesIO, StringIO
+from nmdc_runtime.minter.config import typecodes
+from lxml import etree
+
+import csv
+import requests
+
+
+def _build_class_map(class_map_data):
+    return {
+        entry["name"]: entry["schema_class"].split(":")[1] for entry in class_map_data
+    }
+
+
+def get_classname_from_typecode(doc_id):
+    class_map_data = typecodes()
+    class_map = _build_class_map(class_map_data)
+
+    typecode = doc_id.split(":")[1].split("-")[0]
+    return class_map.get(typecode)
+
+
+def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
+    biosample_data_objects = []
+
+    for biosample in biosamples_list:
+        current_ids = [biosample["id"]]
+        collected_data_objects = []
+
+        while current_ids:
+            new_current_ids = []
+            for current_id in current_ids:
+                query = {"has_input": current_id}
+                document = all_docs_collection.find_one(query)
+
+                if not document:
+                    continue
+
+                has_output = document.get("has_output")
+                if not has_output:
+                    continue
+
+                for output_id in has_output:
+                    if get_classname_from_typecode(output_id) == "DataObject":
+                        data_object_doc = all_docs_collection.find_one(
+                            {"id": output_id}
+                        )
+                        if data_object_doc:
+                            collected_data_objects.append(data_object_doc)
+                    else:
+                        new_current_ids.append(output_id)
+
+            current_ids = new_current_ids
+
+        if collected_data_objects:
+            biosample_data_objects.append({biosample["id"]: collected_data_objects})
+
+    return biosample_data_objects
+
+
+def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list):
+    biosample_data_objects = []
+
+    for biosample in biosamples_list:
+        current_ids = [biosample["id"]]
+        collected_data_objects = []
+
+        while current_ids:
+            new_current_ids = []
+            for current_id in current_ids:
+                query = {"has_input": current_id}
+                document = all_docs_collection.find_one(query)
+
+                if not document:
+                    continue
+
+                has_output = document.get("has_output")
+                if not has_output:
+                    continue
+
+                for output_id in has_output:
+                    if get_classname_from_typecode(output_id) == "DataObject":
+                        omics_processing_doc = all_docs_collection.find_one(
+                            {"id": document["id"]}
+                        )
+                        if omics_processing_doc:
+                            collected_data_objects.append(omics_processing_doc)
+                    else:
+                        new_current_ids.append(output_id)
+
+            current_ids = new_current_ids
+
+        if collected_data_objects:
+            biosample_data_objects.append({biosample["id"]: collected_data_objects})
+
+    return biosample_data_objects
+
+
+def handle_quantity_value(slot_value):
+    if "has_numeric_value" in slot_value and "has_unit" in slot_value:
+        return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}"
+    elif (
+        "has_maximum_numeric_value" in slot_value
+        and "has_minimum_numeric_value" in slot_value
+        and "has_unit" in slot_value
+    ):
+        range_value = (
+            slot_value["has_maximum_numeric_value"]
+            - slot_value["has_minimum_numeric_value"]
+        )
+        return f"{range_value} {slot_value['has_unit']}"
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_text_value(slot_value):
+    return slot_value.get("has_raw_value", "Unknown format")
+
+
+def handle_timestamp_value(slot_value):
+    return slot_value.get("has_raw_value", "Unknown format")
+
+
+def handle_controlled_term_value(slot_value):
+    if "term" in slot_value:
+        term = slot_value["term"]
+        if "name" in term and "id" in term:
+            return f"{term['name']} [{term['id']}]"
+        elif "id" in term:
+            return term["id"]
+        elif "name" in term:
+            return term["name"]
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_controlled_identified_term_value(slot_value):
+    if "term" in slot_value:
+        term = slot_value["term"]
+        if "name" in term and "id" in term:
+            return f"{term['name']} [{term['id']}]"
+        elif "id" in term:
+            return term["id"]
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_geolocation_value(slot_value):
+    if "latitude" in slot_value and "longitude" in slot_value:
+        return f"{slot_value['latitude']} {slot_value['longitude']}"
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_float_value(slot_value):
+    return f"{slot_value:.2f}"
+
+
+def handle_string_value(slot_value):
+    return f"{slot_value}"
+
+
+def load_mappings(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    file_content = response.text
+
+    attribute_mappings = {}
+    slot_range_mappings = {}
+    reader = csv.DictReader(StringIO(file_content), delimiter="\t")
+    for row in reader:
+        if row["ignore"].strip():
+            continue
+
+        json_key = row["nmdc_schema_slot"]
+        # attribute mappings
+        xml_attribute_name = row["ncbi_biosample_attribute_name"]
+        attribute_mappings[json_key] = (
+            xml_attribute_name if xml_attribute_name else json_key
+        )
+
+        # slot range mappings
+        data_type = row["nmdc_schema_slot_range"]
+        slot_range_mappings[json_key] = data_type if data_type else "default"
+
+    return attribute_mappings, slot_range_mappings
+
+
+def validate_xml(xml, xsd_url):
+    response = requests.get(xsd_url)
+    response.raise_for_status()
+    xsd_content = response.text
+
+    xml_schema_doc = etree.parse(BytesIO(xsd_content.encode("utf-8")))
+    xml_schema = etree.XMLSchema(xml_schema_doc)
+
+    xml_doc = etree.parse(BytesIO(xml.encode("utf-8")))
+
+    if not xml_schema.validate(xml_doc):
+        raise ValueError(f"There were errors while validating against: {xsd_url}")
+
+    return True
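
Note on the helpers above (not part of the diff): each handle_* function collapses one NMDC slot value into the flat string that an NCBI BioSample attribute expects. A minimal sketch of their behavior on made-up inputs, assuming the package is installed; the sample values are illustrative, not taken from this release:

from nmdc_runtime.site.export.ncbi_xml_utils import (
    handle_quantity_value,
    handle_controlled_term_value,
    handle_geolocation_value,
)

# QuantityValue with an explicit number and unit -> "0.5 m"
print(handle_quantity_value({"has_numeric_value": 0.5, "has_unit": "m"}))

# QuantityValue given only as a min/max range -> width of the range, "10 cm"
print(handle_quantity_value(
    {"has_minimum_numeric_value": 20, "has_maximum_numeric_value": 30, "has_unit": "cm"}
))

# ControlledTermValue with both name and id -> "forest soil [ENVO:00002261]"
print(handle_controlled_term_value(
    {"term": {"id": "ENVO:00002261", "name": "forest soil"}}
))

# GeolocationValue -> "lat lon" as one string
print(handle_geolocation_value({"latitude": 63.875088, "longitude": -149.210438}))
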
nmdc_runtime/site/export/study_metadata.py
CHANGED
@@ -5,7 +5,6 @@ Get NMDC study-associated metadata from search api
 import csv
 from io import StringIO
 
-import requests
 from dagster import (
     op,
     get_dagster_logger,
@@ -26,13 +25,27 @@ def get_all_docs(client, collection, filter_):
     per_page = 200
     url_base = f"/{collection}?filter={filter_}&per_page={per_page}"
     results = []
-
+    response = client.request("GET", url_base)
+    if response.status_code != 200:
+        raise Exception(
+            f"Runtime API request failed with status {response.status_code}."
+            f" Check URL: {url_base}"
+        )
+    rv = response.json()
     results.extend(rv.get("results", []))
     page, count = rv["meta"]["page"], rv["meta"]["count"]
     assert count <= 10_000
     while page * per_page < count:
-
-
+        page += 1
+        url = f"{url_base}&page={page}"
+        response = client.request("GET", url)
+        if response.status_code != 200:
+            raise Exception(
+                f"Runtime API request failed with status {response.status_code}."
+                f" Check URL: {url}"
+            )
+        rv = response.json()
+        results.extend(rv.get("results", []))
     return results
 
 
@@ -115,3 +128,10 @@ def export_study_biosamples_as_csv(context: OpExecutionContext, study_export_inf
 def export_study_biosamples_metadata():
     outputs = export_study_biosamples_as_csv(get_study_biosamples_metadata())
     add_output_run_event(outputs)
+
+
+@op(required_resource_keys={"runtime_api_site_client"})
+def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict):
+    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
+    biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study['id']}")
+    return biosamples
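
A side note on the reworked get_all_docs (not part of the diff): it now fails fast on non-200 responses and pages explicitly until page * per_page reaches the reported count. A self-contained sketch of that loop against a stubbed client; FakeClient, its canned payloads, and the 450-document count are invented for illustration:

# Stub standing in for the runtime API site client.
class FakeResponse:
    def __init__(self, payload):
        self.status_code = 200
        self._payload = payload

    def json(self):
        return self._payload


class FakeClient:
    docs = [{"id": f"nmdc:bsm-11-{i:04d}"} for i in range(450)]

    def request(self, method, url):
        # Page 1 unless the caller appended "&page=N".
        page = int(url.split("&page=")[1]) if "&page=" in url else 1
        start = (page - 1) * 200
        return FakeResponse(
            {
                "results": self.docs[start : start + 200],
                "meta": {"page": page, "count": len(self.docs)},
            }
        )


# Same shape as the loop above: fetch page 1, then request "&page=N"
# until page * per_page covers the reported count.
client, per_page = FakeClient(), 200
url_base = "/biosamples?filter=part_of:nmdc:sty-11-example&per_page=200"
rv = client.request("GET", url_base).json()
results = list(rv["results"])
page, count = rv["meta"]["page"], rv["meta"]["count"]
while page * per_page < count:
    page += 1
    rv = client.request("GET", f"{url_base}&page={page}").json()
    results.extend(rv["results"])
assert len(results) == 450
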
nmdc_runtime/site/graphs.py
CHANGED
@@ -22,7 +22,6 @@ from nmdc_runtime.site.ops import (
     hello,
     mongo_stats,
     submit_metadata_to_db,
-    update_schema,
     filter_ops_undone_expired,
     construct_jobs,
     maybe_post_jobs,
@@ -49,7 +48,15 @@ from nmdc_runtime.site.ops import (
     get_neon_pipeline_inputs,
     get_df_from_url,
     site_code_mapping,
+    materialize_alldocs,
+    get_ncbi_export_pipeline_study,
+    get_data_objects_from_biosamples,
+    get_omics_processing_from_biosamples,
+    get_ncbi_export_pipeline_inputs,
+    ncbi_submission_xml_from_nmdc_study,
+    ncbi_submission_xml_asset,
 )
+from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
 
 @graph
@@ -88,19 +95,13 @@ def hello_mongo():
 
 
 @graph
-def
-
-    A pipeline definition. This example pipeline has a single solid.
-
-    For more hints on writing Dagster pipelines, see our documentation overview on Pipelines:
-    https://docs.dagster.io/overview/solids-pipelines/pipelines
-    """
-    update_schema()
+def housekeeping():
+    delete_operations(list_operations(filter_ops_undone_expired()))
 
 
 @graph
-def
-
+def ensure_alldocs():
+    materialize_alldocs()
 
 
 @graph
@@ -381,3 +382,20 @@ def ingest_neon_surface_water_metadata():
     )
     run_id = submit_metadata_to_db(database)
     poll_for_run_completion(run_id)
+
+
+@graph
+def nmdc_study_to_ncbi_submission_export():
+    nmdc_study = get_ncbi_export_pipeline_study()
+    ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
+    biosamples = get_biosamples_by_study_id(nmdc_study)
+    omics_processing_records = get_omics_processing_from_biosamples(biosamples)
+    data_objects = get_data_objects_from_biosamples(biosamples)
+    xml_data = ncbi_submission_xml_from_nmdc_study(
+        nmdc_study,
+        ncbi_submission_metadata,
+        biosamples,
+        omics_processing_records,
+        data_objects,
+    )
+    ncbi_submission_xml_asset(xml_data)
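
For context (not part of the diff): the new nmdc_study_to_ncbi_submission_export graph is driven entirely by op config. A hedged sketch of a Dagster run config that satisfies the config_schema declared in ops.py below; the study ID, TSV URL, organization, and organism name are placeholders, and the job registration lives in nmdc_runtime/site/repository.py, whose hunks are not shown in this section:

run_config = {
    "ops": {
        "get_ncbi_export_pipeline_study": {
            "config": {"nmdc_study_id": "nmdc:sty-11-example"}
        },
        "get_ncbi_export_pipeline_inputs": {
            "config": {
                "nmdc_ncbi_attribute_mapping_file_url": "https://example.org/ncbi_mappings.tsv",
                "ncbi_submission_metadata": {"organization": "Example Organization"},
                "ncbi_biosample_metadata": {"organism_name": "soil metagenome"},
            }
        },
    }
}
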
nmdc_runtime/site/ops.py
CHANGED
@@ -9,9 +9,11 @@ from datetime import datetime, timezone
 from io import BytesIO, StringIO
 from typing import Tuple
 from zipfile import ZipFile
+
 import pandas as pd
 import requests
 
+
 from bson import ObjectId, json_util
 from dagster import (
     Any,
@@ -29,10 +31,14 @@ from dagster import (
     String,
     op,
     Optional,
+    Field,
+    Permissive,
+    Bool,
 )
 from gridfs import GridFS
 from linkml_runtime.dumpers import json_dumper
 from linkml_runtime.utils.yamlutils import YAMLRoot
+from nmdc_runtime.api.db.mongo import get_mongo_db
 from nmdc_runtime.api.core.idgen import generate_one_id
 from nmdc_runtime.api.core.metadata import (
     _validate_changesheet,
@@ -42,6 +48,7 @@ from nmdc_runtime.api.core.metadata import (
 )
 from nmdc_runtime.api.core.util import dotted_path_for, hash_from_str, json_clean, now
 from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object
+from nmdc_runtime.api.endpoints.find import find_study_by_id
 from nmdc_runtime.api.models.job import Job, JobOperationMetadata
 from nmdc_runtime.api.models.metadata import ChangesheetIn
 from nmdc_runtime.api.models.operation import (
@@ -55,6 +62,11 @@ from nmdc_runtime.api.models.run import (
     _add_run_complete_event,
 )
 from nmdc_runtime.api.models.util import ResultT
+from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
+from nmdc_runtime.site.export.ncbi_xml_utils import (
+    fetch_data_objects_from_biosamples,
+    fetch_omics_processing_from_biosamples,
+)
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
@@ -81,12 +93,15 @@ from nmdc_runtime.util import (
     put_object,
     validate_json,
     specialize_activity_set_docs,
+    collection_name_to_class_names,
+    class_hierarchy_as_list,
+    populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
+from nmdc_schema.nmdc import Database as NMDCDatabase
 from pydantic import BaseModel
 from pymongo.database import Database as MongoDatabase
 from starlette import status
-from terminusdb_client.woqlquery import WOQLQuery as WQ
 from toolz import assoc, dissoc, get_in, valfilter, identity
 
 
@@ -111,14 +126,6 @@ def log_env(context):
     context.log.info("\n".join(out))
 
 
-@op(required_resource_keys={"terminus"})
-def list_databases(context) -> List[String]:
-    client = context.resources.terminus.client
-    list_ = client.list_databases()
-    context.log.info(f"databases: {list_}")
-    return list_
-
-
 @op(required_resource_keys={"mongo"})
 def mongo_stats(context) -> List[str]:
     db = context.resources.mongo.db
@@ -127,41 +134,6 @@ def mongo_stats(context) -> List[str]:
     return collection_names
 
 
-@op(required_resource_keys={"terminus"})
-def update_schema(context):
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        try:
-            context.log.info("shallow-cloning nmdc-schema repo")
-            subprocess.check_output(
-                "git clone https://github.com/microbiomedata/nmdc-schema.git"
-                f" --branch main --single-branch {tmpdirname}/nmdc-schema",
-                shell=True,
-            )
-            context.log.info("generating TerminusDB JSON-LD from NMDC LinkML")
-            subprocess.check_output(
-                f"gen-terminusdb {tmpdirname}/nmdc-schema/src/schema/nmdc.yaml"
-                f" > {tmpdirname}/nmdc.terminus.json",
-                shell=True,
-            )
-        except subprocess.CalledProcessError as e:
-            if e.stdout:
-                context.log.debug(e.stdout.decode())
-            if e.stderr:
-                context.log.error(e.stderr.decode())
-            context.log.debug(str(e.returncode))
-            raise e
-
-        with open(f"{tmpdirname}/nmdc.terminus.json") as f:
-            woql_dict = json.load(f)
-
-        context.log.info("Updating terminus schema via WOQLQuery")
-        rv = WQ(query=woql_dict).execute(
-            context.resources.terminus.client, "update schema via WOQL"
-        )
-        context.log.info(str(rv))
-        return rv
-
-
 @op(
     required_resource_keys={"mongo", "runtime_api_site_client"},
     retry_policy=RetryPolicy(max_retries=2),
@@ -768,6 +740,33 @@ def export_json_to_drs(
     return ["/objects/" + drs_object["id"]]
 
 
+@op(
+    description="NCBI Submission XML file rendered in a Dagster Asset",
+    out=Out(description="XML content rendered through Dagit UI"),
+)
+def ncbi_submission_xml_asset(context: OpExecutionContext, data: str):
+    filename = "ncbi_submission.xml"
+    file_path = os.path.join(context.instance.storage_directory(), filename)
+
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    with open(file_path, "w") as f:
+        f.write(data)
+
+    context.log_event(
+        AssetMaterialization(
+            asset_key="ncbi_submission_xml",
+            description="NCBI Submission XML Data",
+            metadata={
+                "file_path": MetadataValue.path(file_path),
+                "xml": MetadataValue.text(data),
+            },
+        )
+    )
+
+    return Output(data)
+
+
 def unique_field_values(docs: List[Dict[str, Any]], field: str):
     return {doc[field] for doc in docs if field in doc}
 
@@ -977,3 +976,140 @@ def site_code_mapping() -> dict:
         raise Exception(
             f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}"
         )
+
+
+@op(required_resource_keys={"mongo"})
+def materialize_alldocs(context) -> int:
+    mdb = context.resources.mongo.db
+    collection_names = populated_schema_collection_names_with_id_field(mdb)
+
+    for name in collection_names:
+        assert (
+            len(collection_name_to_class_names[name]) == 1
+        ), f"{name} collection has class name of {collection_name_to_class_names[name]} and len {len(collection_name_to_class_names[name])}"
+
+    context.log.info(f"{collection_names=}")
+
+    # Drop any existing `alldocs` collection (e.g. from previous use of this op).
+    mdb.alldocs.drop()
+
+    # Build alldocs
+    context.log.info("constructing `alldocs` collection")
+
+    for collection in collection_names:
+        # Calculate class_hierarchy_as_list once per collection, using the first document in list
+        try:
+            nmdcdb = NMDCDatabase(
+                **{collection: [dissoc(mdb[collection].find_one(), "_id")]}
+            )
+            exemplar = getattr(nmdcdb, collection)[0]
+            newdoc_type: list[str] = class_hierarchy_as_list(exemplar)
+        except ValueError as e:
+            context.log.info(f"Collection {collection} does not exist.")
+            raise e
+
+        context.log.info(
+            f"Found {mdb[collection].estimated_document_count()} estimated documents for {collection=}."
+        )
+        # For each document in this collection, replace the value of the `type` field with
+        # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
+        # and insert the resulting document into the `alldocs` collection.
+
+        inserted_many_result = mdb.alldocs.insert_many(
+            [
+                assoc(dissoc(doc, "type", "_id"), "type", newdoc_type)
+                for doc in mdb[collection].find()
+            ]
+        )
+        context.log.info(
+            f"Inserted {len(inserted_many_result.inserted_ids)} documents for {collection=}."
+        )
+
+    # Re-idx for `alldocs` collection
+    mdb.alldocs.create_index("id", unique=True)
+    context.log.info(
+        f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
+    )
+    return mdb.alldocs.estimated_document_count()
+
+
+@op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
+def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
+    nmdc_study = find_study_by_id(
+        context.op_config["nmdc_study_id"], context.resources.mongo.db
+    )
+    return nmdc_study
+
+
+@op(
+    config_schema={
+        "nmdc_ncbi_attribute_mapping_file_url": str,
+        "ncbi_submission_metadata": Field(
+            Permissive(
+                {
+                    "organization": String,
+                }
+            ),
+            is_required=True,
+            description="General metadata about the NCBI submission.",
+        ),
+        "ncbi_biosample_metadata": Field(
+            Permissive(
+                {
+                    "organism_name": String,
+                }
+            ),
+            is_required=True,
+            description="Metadata for one or many NCBI BioSample in the Submission.",
+        ),
+    },
+    out=Out(Dict),
+)
+def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
+    nmdc_ncbi_attribute_mapping_file_url = context.op_config[
+        "nmdc_ncbi_attribute_mapping_file_url"
+    ]
+    ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {})
+    ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {})
+
+    return {
+        "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url,
+        "ncbi_submission_metadata": ncbi_submission_metadata,
+        "ncbi_biosample_metadata": ncbi_biosample_metadata,
+    }
+
+
+@op(required_resource_keys={"mongo"})
+def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
+    mdb = context.resources.mongo.db
+    alldocs_collection = mdb["alldocs"]
+    biosample_data_objects = fetch_data_objects_from_biosamples(
+        alldocs_collection, biosamples
+    )
+    return biosample_data_objects
+
+
+@op(required_resource_keys={"mongo"})
+def get_omics_processing_from_biosamples(context: OpExecutionContext, biosamples: list):
+    mdb = context.resources.mongo.db
+    alldocs_collection = mdb["alldocs"]
+    biosample_omics_processing = fetch_omics_processing_from_biosamples(
+        alldocs_collection, biosamples
+    )
+    return biosample_omics_processing
+
+
+@op
+def ncbi_submission_xml_from_nmdc_study(
+    context: OpExecutionContext,
+    nmdc_study: Any,
+    ncbi_exporter_metadata: dict,
+    biosamples: list,
+    omics_processing_records: list,
+    data_objects: list,
+) -> str:
+    ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
+    ncbi_xml = ncbi_exporter.get_submission_xml(
+        biosamples, omics_processing_records, data_objects
+    )
+    return ncbi_xml