nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/minter/config.py +59 -3
- nmdc_runtime/site/export/ncbi_xml.py +29 -25
- nmdc_runtime/site/export/ncbi_xml_utils.py +5 -5
- nmdc_runtime/site/export/study_metadata.py +3 -1
- nmdc_runtime/site/graphs.py +71 -15
- nmdc_runtime/site/ops.py +135 -42
- nmdc_runtime/site/repository.py +16 -4
- nmdc_runtime/site/translation/gold_translator.py +112 -43
- nmdc_runtime/site/translation/neon_benthic_translator.py +59 -34
- nmdc_runtime/site/translation/neon_soil_translator.py +72 -48
- nmdc_runtime/site/translation/neon_surface_water_translator.py +61 -32
- nmdc_runtime/site/translation/neon_utils.py +19 -6
- nmdc_runtime/site/translation/submission_portal_translator.py +67 -36
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/METADATA +1 -1
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/RECORD +19 -19
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/minter/config.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from functools import lru_cache
|
|
3
|
+
from typing import List
|
|
3
4
|
|
|
4
5
|
from nmdc_runtime.util import get_nmdc_jsonschema_dict
|
|
5
6
|
|
|
@@ -11,18 +12,73 @@ def minting_service_id() -> str | None:
|
|
|
11
12
|
return os.getenv("MINTING_SERVICE_ID")
|
|
12
13
|
|
|
13
14
|
|
|
15
|
+
def extract_typecode_from_pattern(pattern: str) -> str:
|
|
16
|
+
r"""
|
|
17
|
+
Returns the typecode portion of the specified string.
|
|
18
|
+
|
|
19
|
+
>>> extract_typecode_from_pattern("foo-123-456$") # original behavior
|
|
20
|
+
'foo'
|
|
21
|
+
>>> extract_typecode_from_pattern("(foo)-123-456$") # returns first and only typecode
|
|
22
|
+
'foo'
|
|
23
|
+
>>> extract_typecode_from_pattern("(foo|bar)-123-456$") # returns first of 2 typecodes
|
|
24
|
+
'foo'
|
|
25
|
+
>>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$") # returns first of > 2 typecodes
|
|
26
|
+
'foo'
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
# Get the portion of the pattern preceding the first hyphen.
|
|
30
|
+
# e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
|
|
31
|
+
typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
|
|
32
|
+
|
|
33
|
+
# If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
|
|
34
|
+
# e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
|
|
35
|
+
if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
|
|
36
|
+
inner_pattern = typecode_sub_pattern[1:-1]
|
|
37
|
+
|
|
38
|
+
# Finally, get everything before the first `|`, if any.
|
|
39
|
+
# e.g. "apple|banana|carrot" → "apple"
|
|
40
|
+
# e.g. "apple" → "apple"
|
|
41
|
+
typecode = inner_pattern.split("|", maxsplit=1)[0]
|
|
42
|
+
else:
|
|
43
|
+
# Note: This is the original behavior, before we added support for multi-typecode patterns.
|
|
44
|
+
# e.g. "apple" → "apple"
|
|
45
|
+
typecode = typecode_sub_pattern
|
|
46
|
+
|
|
47
|
+
return typecode
|
|
48
|
+
|
|
49
|
+
|
|
14
50
|
@lru_cache()
|
|
15
|
-
def typecodes():
|
|
51
|
+
def typecodes() -> List[dict]:
|
|
52
|
+
r"""
|
|
53
|
+
Returns a list of dictionaries containing typecodes and associated information derived from the schema.
|
|
54
|
+
|
|
55
|
+
Preconditions about the schema:
|
|
56
|
+
- The typecode portion of the pattern is between the pattern prefix and the first subsequent hyphen.
|
|
57
|
+
- The typecode portion of the pattern either consists of a single typecode verbatim (e.g. "foo");
|
|
58
|
+
or consists of multiple typecodes in a pipe-delimited list enclosed in parentheses (e.g. "(foo|bar|baz)").
|
|
59
|
+
- The typecode portion of the pattern does not, itself, contain any hyphens.
|
|
60
|
+
|
|
61
|
+
TODO: Get the typecodes in a different way than by extracting them from a larger string, which seems brittle to me.
|
|
62
|
+
Getting them a different way may require schema authors to _define_ them a different way (e.g. defining them
|
|
63
|
+
in a dedicated property of a class; for example, one named `typecode`).
|
|
64
|
+
"""
|
|
65
|
+
id_pattern_prefix = r"^(nmdc):"
|
|
66
|
+
|
|
16
67
|
rv = []
|
|
17
68
|
schema_dict = get_nmdc_jsonschema_dict()
|
|
18
69
|
for cls_name, defn in schema_dict["$defs"].items():
|
|
19
70
|
match defn.get("properties"):
|
|
20
|
-
case {"id": {"pattern": p}} if p.startswith(
|
|
71
|
+
case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
|
|
72
|
+
# Get the portion of the pattern following the prefix.
|
|
73
|
+
# e.g. "^(nmdc):foo-bar-baz" → "foo-bar-baz"
|
|
74
|
+
index_of_first_character_following_prefix = len(id_pattern_prefix)
|
|
75
|
+
pattern_without_prefix = p[index_of_first_character_following_prefix:]
|
|
76
|
+
|
|
21
77
|
rv.append(
|
|
22
78
|
{
|
|
23
79
|
"id": "nmdc:" + cls_name + "_" + "typecode",
|
|
24
80
|
"schema_class": "nmdc:" + cls_name,
|
|
25
|
-
"name":
|
|
81
|
+
"name": extract_typecode_from_pattern(pattern_without_prefix),
|
|
26
82
|
}
|
|
27
83
|
)
|
|
28
84
|
case _:
|
|
@@ -283,7 +283,7 @@ class NCBISubmissionXML:
|
|
|
283
283
|
biosample_data_objects: list,
|
|
284
284
|
bioproject_id: str,
|
|
285
285
|
org: str,
|
|
286
|
-
|
|
286
|
+
nmdc_nucleotide_sequencing: list,
|
|
287
287
|
nmdc_biosamples: list,
|
|
288
288
|
nmdc_library_preparation: list,
|
|
289
289
|
):
|
|
@@ -294,10 +294,10 @@ class NCBISubmissionXML:
|
|
|
294
294
|
for entry in biosample_data_objects:
|
|
295
295
|
fastq_files = []
|
|
296
296
|
biosample_ids = []
|
|
297
|
-
|
|
297
|
+
nucleotide_sequencing_ids = {}
|
|
298
298
|
lib_prep_protocol_names = {}
|
|
299
299
|
instrument_name = ""
|
|
300
|
-
|
|
300
|
+
analyte_category = ""
|
|
301
301
|
library_name = ""
|
|
302
302
|
|
|
303
303
|
for biosample_id, data_objects in entry.items():
|
|
@@ -308,16 +308,16 @@ class NCBISubmissionXML:
|
|
|
308
308
|
file_path = os.path.basename(url.path)
|
|
309
309
|
fastq_files.append(file_path)
|
|
310
310
|
|
|
311
|
-
for
|
|
312
|
-
if biosample_id in
|
|
313
|
-
for
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
omics_type = (
|
|
317
|
-
omprc.get("omics_type", {})
|
|
318
|
-
.get("has_raw_value", "")
|
|
319
|
-
.lower()
|
|
311
|
+
for ntseq_dict in nmdc_nucleotide_sequencing:
|
|
312
|
+
if biosample_id in ntseq_dict:
|
|
313
|
+
for ntseq in ntseq_dict[biosample_id]:
|
|
314
|
+
nucleotide_sequencing_ids[biosample_id] = ntseq.get(
|
|
315
|
+
"id", ""
|
|
320
316
|
)
|
|
317
|
+
# Currently, we are making the assumption that only one instrument
|
|
318
|
+
# is used to sequence a Biosample
|
|
319
|
+
instrument_name = ntseq.get("instrument_used", "")[0]
|
|
320
|
+
analyte_category = ntseq.get("analyte_category", "")
|
|
321
321
|
library_name = bsm_id_name_dict.get(biosample_id, "")
|
|
322
322
|
|
|
323
323
|
for lib_prep_dict in nmdc_library_preparation:
|
|
@@ -395,7 +395,7 @@ class NCBISubmissionXML:
|
|
|
395
395
|
)
|
|
396
396
|
)
|
|
397
397
|
|
|
398
|
-
if
|
|
398
|
+
if analyte_category == "metagenome":
|
|
399
399
|
sra_attributes.append(
|
|
400
400
|
self.set_element(
|
|
401
401
|
"Attribute", "WGS", {"name": "library_strategy"}
|
|
@@ -411,8 +411,7 @@ class NCBISubmissionXML:
|
|
|
411
411
|
"Attribute", "RANDOM", {"name": "library_selection"}
|
|
412
412
|
)
|
|
413
413
|
)
|
|
414
|
-
|
|
415
|
-
if omics_type == "metatranscriptome":
|
|
414
|
+
elif analyte_category == "metatranscriptome":
|
|
416
415
|
sra_attributes.append(
|
|
417
416
|
self.set_element(
|
|
418
417
|
"Attribute",
|
|
@@ -467,7 +466,10 @@ class NCBISubmissionXML:
|
|
|
467
466
|
)
|
|
468
467
|
)
|
|
469
468
|
|
|
470
|
-
for
|
|
469
|
+
for (
|
|
470
|
+
biosample_id,
|
|
471
|
+
omics_processing_id,
|
|
472
|
+
) in nucleotide_sequencing_ids.items():
|
|
471
473
|
identifier_element = self.set_element(
|
|
472
474
|
"Identifier",
|
|
473
475
|
children=[
|
|
@@ -496,20 +498,22 @@ class NCBISubmissionXML:
|
|
|
496
498
|
def get_submission_xml(
|
|
497
499
|
self,
|
|
498
500
|
biosamples_list: list,
|
|
499
|
-
|
|
501
|
+
biosample_nucleotide_sequencing_list: list,
|
|
500
502
|
biosample_data_objects_list: list,
|
|
501
503
|
biosample_library_preparation_list: list,
|
|
502
504
|
):
|
|
503
505
|
data_type = None
|
|
504
506
|
ncbi_project_id = None
|
|
505
|
-
for
|
|
506
|
-
for _,
|
|
507
|
-
for
|
|
508
|
-
if "
|
|
509
|
-
data_type =
|
|
507
|
+
for bsm_ntseq in biosample_nucleotide_sequencing_list:
|
|
508
|
+
for _, ntseq_list in bsm_ntseq.items():
|
|
509
|
+
for ntseq in ntseq_list:
|
|
510
|
+
if "analyte_category" in ntseq:
|
|
511
|
+
data_type = handle_string_value(
|
|
512
|
+
ntseq["analyte_category"]
|
|
513
|
+
).capitalize()
|
|
510
514
|
|
|
511
|
-
if "ncbi_project_name" in
|
|
512
|
-
ncbi_project_id =
|
|
515
|
+
if "ncbi_project_name" in ntseq:
|
|
516
|
+
ncbi_project_id = ntseq["ncbi_project_name"]
|
|
513
517
|
|
|
514
518
|
self.set_description(
|
|
515
519
|
email=self.nmdc_pi_email,
|
|
@@ -538,7 +542,7 @@ class NCBISubmissionXML:
|
|
|
538
542
|
biosample_data_objects=biosample_data_objects_list,
|
|
539
543
|
bioproject_id=ncbi_project_id,
|
|
540
544
|
org=self.ncbi_submission_metadata.get("organization", ""),
|
|
541
|
-
|
|
545
|
+
nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
|
|
542
546
|
nmdc_biosamples=biosamples_list,
|
|
543
547
|
nmdc_library_preparation=biosample_library_preparation_list,
|
|
544
548
|
)
|
|
@@ -58,7 +58,7 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
|
|
|
58
58
|
return biosample_data_objects
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def
|
|
61
|
+
def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
|
|
62
62
|
biosample_data_objects = []
|
|
63
63
|
|
|
64
64
|
for biosample in biosamples_list:
|
|
@@ -80,11 +80,11 @@ def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list)
|
|
|
80
80
|
|
|
81
81
|
for output_id in has_output:
|
|
82
82
|
if get_classname_from_typecode(output_id) == "DataObject":
|
|
83
|
-
|
|
83
|
+
nucleotide_sequencing_doc = all_docs_collection.find_one(
|
|
84
84
|
{"id": document["id"]}
|
|
85
85
|
)
|
|
86
|
-
if
|
|
87
|
-
collected_data_objects.append(
|
|
86
|
+
if nucleotide_sequencing_doc:
|
|
87
|
+
collected_data_objects.append(nucleotide_sequencing_doc)
|
|
88
88
|
else:
|
|
89
89
|
new_current_ids.append(output_id)
|
|
90
90
|
|
|
@@ -117,7 +117,7 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
|
|
|
117
117
|
for output_id in initial_output:
|
|
118
118
|
lib_prep_query = {
|
|
119
119
|
"has_input": output_id,
|
|
120
|
-
"
|
|
120
|
+
"type": {"$in": ["LibraryPreparation"]},
|
|
121
121
|
}
|
|
122
122
|
lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
|
|
123
123
|
|
|
@@ -133,5 +133,7 @@ def export_study_biosamples_metadata():
|
|
|
133
133
|
@op(required_resource_keys={"runtime_api_site_client"})
|
|
134
134
|
def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict):
|
|
135
135
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
136
|
-
biosamples = get_all_docs(
|
|
136
|
+
biosamples = get_all_docs(
|
|
137
|
+
client, "biosamples", f"associated_studies:{nmdc_study['id']}"
|
|
138
|
+
)
|
|
137
139
|
return biosamples
|
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -51,7 +51,7 @@ from nmdc_runtime.site.ops import (
|
|
|
51
51
|
materialize_alldocs,
|
|
52
52
|
get_ncbi_export_pipeline_study,
|
|
53
53
|
get_data_objects_from_biosamples,
|
|
54
|
-
|
|
54
|
+
get_nucleotide_sequencing_from_biosamples,
|
|
55
55
|
get_library_preparation_from_biosamples,
|
|
56
56
|
get_ncbi_export_pipeline_inputs,
|
|
57
57
|
ncbi_submission_xml_from_nmdc_study,
|
|
@@ -126,15 +126,23 @@ def apply_metadata_in():
|
|
|
126
126
|
|
|
127
127
|
@graph
|
|
128
128
|
def gold_study_to_database():
|
|
129
|
-
study_id =
|
|
129
|
+
(study_id, study_type, gold_nmdc_instrument_mapping_file_url) = (
|
|
130
|
+
get_gold_study_pipeline_inputs()
|
|
131
|
+
)
|
|
130
132
|
|
|
131
133
|
projects = gold_projects_by_study(study_id)
|
|
132
134
|
biosamples = gold_biosamples_by_study(study_id)
|
|
133
135
|
analysis_projects = gold_analysis_projects_by_study(study_id)
|
|
134
136
|
study = gold_study(study_id)
|
|
137
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
135
138
|
|
|
136
139
|
database = nmdc_schema_database_from_gold_study(
|
|
137
|
-
study,
|
|
140
|
+
study,
|
|
141
|
+
study_type,
|
|
142
|
+
projects,
|
|
143
|
+
biosamples,
|
|
144
|
+
analysis_projects,
|
|
145
|
+
gold_nmdc_instrument_map_df,
|
|
138
146
|
)
|
|
139
147
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
140
148
|
filename = nmdc_schema_database_export_filename(study)
|
|
@@ -147,14 +155,16 @@ def gold_study_to_database():
|
|
|
147
155
|
def translate_metadata_submission_to_nmdc_schema_database():
|
|
148
156
|
(
|
|
149
157
|
submission_id,
|
|
150
|
-
|
|
158
|
+
nucleotide_sequencing_mapping_file_url,
|
|
151
159
|
data_object_mapping_file_url,
|
|
152
160
|
biosample_extras_file_url,
|
|
153
161
|
biosample_extras_slot_mapping_file_url,
|
|
154
162
|
) = get_submission_portal_pipeline_inputs()
|
|
155
163
|
|
|
156
164
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
157
|
-
|
|
165
|
+
nucleotide_sequencing_mapping = get_csv_rows_from_url(
|
|
166
|
+
nucleotide_sequencing_mapping_file_url
|
|
167
|
+
)
|
|
158
168
|
data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
|
|
159
169
|
biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
|
|
160
170
|
biosample_extras_slot_mapping = get_csv_rows_from_url(
|
|
@@ -163,8 +173,8 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
163
173
|
|
|
164
174
|
database = translate_portal_submission_to_nmdc_schema_database(
|
|
165
175
|
metadata_submission,
|
|
166
|
-
|
|
167
|
-
data_object_mapping,
|
|
176
|
+
nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
|
|
177
|
+
data_object_mapping=data_object_mapping,
|
|
168
178
|
biosample_extras=biosample_extras,
|
|
169
179
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
170
180
|
)
|
|
@@ -181,14 +191,16 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
181
191
|
def ingest_metadata_submission():
|
|
182
192
|
(
|
|
183
193
|
submission_id,
|
|
184
|
-
|
|
194
|
+
nucleotide_sequencing_mapping_file_url,
|
|
185
195
|
data_object_mapping_file_url,
|
|
186
196
|
biosample_extras_file_url,
|
|
187
197
|
biosample_extras_slot_mapping_file_url,
|
|
188
198
|
) = get_submission_portal_pipeline_inputs()
|
|
189
199
|
|
|
190
200
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
191
|
-
|
|
201
|
+
nucleotide_sequencing_mapping = get_csv_rows_from_url(
|
|
202
|
+
nucleotide_sequencing_mapping_file_url
|
|
203
|
+
)
|
|
192
204
|
data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
|
|
193
205
|
biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
|
|
194
206
|
biosample_extras_slot_mapping = get_csv_rows_from_url(
|
|
@@ -197,8 +209,8 @@ def ingest_metadata_submission():
|
|
|
197
209
|
|
|
198
210
|
database = translate_portal_submission_to_nmdc_schema_database(
|
|
199
211
|
metadata_submission,
|
|
200
|
-
|
|
201
|
-
data_object_mapping,
|
|
212
|
+
nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
|
|
213
|
+
data_object_mapping=data_object_mapping,
|
|
202
214
|
biosample_extras=biosample_extras,
|
|
203
215
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
204
216
|
)
|
|
@@ -217,6 +229,7 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
|
|
|
217
229
|
(
|
|
218
230
|
neon_envo_mappings_file_url,
|
|
219
231
|
neon_raw_data_file_mappings_file_url,
|
|
232
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
220
233
|
) = get_neon_pipeline_inputs()
|
|
221
234
|
|
|
222
235
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -225,8 +238,16 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
|
|
|
225
238
|
neon_raw_data_file_mappings_file_url
|
|
226
239
|
)
|
|
227
240
|
|
|
241
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
242
|
+
neon_nmdc_instrument_mapping_file_url
|
|
243
|
+
)
|
|
244
|
+
|
|
228
245
|
database = nmdc_schema_database_from_neon_soil_data(
|
|
229
|
-
mms_data,
|
|
246
|
+
mms_data,
|
|
247
|
+
sls_data,
|
|
248
|
+
neon_envo_mappings_file,
|
|
249
|
+
neon_raw_data_file_mappings_file,
|
|
250
|
+
neon_nmdc_instrument_mapping_file,
|
|
230
251
|
)
|
|
231
252
|
|
|
232
253
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
@@ -247,6 +268,7 @@ def ingest_neon_soil_metadata():
|
|
|
247
268
|
(
|
|
248
269
|
neon_envo_mappings_file_url,
|
|
249
270
|
neon_raw_data_file_mappings_file_url,
|
|
271
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
250
272
|
) = get_neon_pipeline_inputs()
|
|
251
273
|
|
|
252
274
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -255,8 +277,16 @@ def ingest_neon_soil_metadata():
|
|
|
255
277
|
neon_raw_data_file_mappings_file_url
|
|
256
278
|
)
|
|
257
279
|
|
|
280
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
281
|
+
neon_nmdc_instrument_mapping_file_url
|
|
282
|
+
)
|
|
283
|
+
|
|
258
284
|
database = nmdc_schema_database_from_neon_soil_data(
|
|
259
|
-
mms_data,
|
|
285
|
+
mms_data,
|
|
286
|
+
sls_data,
|
|
287
|
+
neon_envo_mappings_file,
|
|
288
|
+
neon_raw_data_file_mappings_file,
|
|
289
|
+
neon_nmdc_instrument_mapping_file,
|
|
260
290
|
)
|
|
261
291
|
run_id = submit_metadata_to_db(database)
|
|
262
292
|
poll_for_run_completion(run_id)
|
|
@@ -267,6 +297,7 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
|
|
|
267
297
|
(
|
|
268
298
|
neon_envo_mappings_file_url,
|
|
269
299
|
neon_raw_data_file_mappings_file_url,
|
|
300
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
270
301
|
) = get_neon_pipeline_inputs()
|
|
271
302
|
|
|
272
303
|
mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
|
|
@@ -280,11 +311,16 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
|
|
|
280
311
|
neon_raw_data_file_mappings_file_url
|
|
281
312
|
)
|
|
282
313
|
|
|
314
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
315
|
+
neon_nmdc_instrument_mapping_file_url
|
|
316
|
+
)
|
|
317
|
+
|
|
283
318
|
database = nmdc_schema_database_from_neon_benthic_data(
|
|
284
319
|
mms_benthic,
|
|
285
320
|
sites_mapping_dict,
|
|
286
321
|
neon_envo_mappings_file,
|
|
287
322
|
neon_raw_data_file_mappings_file,
|
|
323
|
+
neon_nmdc_instrument_mapping_file,
|
|
288
324
|
)
|
|
289
325
|
|
|
290
326
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
@@ -305,6 +341,7 @@ def ingest_neon_benthic_metadata():
|
|
|
305
341
|
(
|
|
306
342
|
neon_envo_mappings_file_url,
|
|
307
343
|
neon_raw_data_file_mappings_file_url,
|
|
344
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
308
345
|
) = get_neon_pipeline_inputs()
|
|
309
346
|
|
|
310
347
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -313,11 +350,16 @@ def ingest_neon_benthic_metadata():
|
|
|
313
350
|
neon_raw_data_file_mappings_file_url
|
|
314
351
|
)
|
|
315
352
|
|
|
353
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
354
|
+
neon_nmdc_instrument_mapping_file_url
|
|
355
|
+
)
|
|
356
|
+
|
|
316
357
|
database = nmdc_schema_database_from_neon_benthic_data(
|
|
317
358
|
mms_benthic,
|
|
318
359
|
sites_mapping_dict,
|
|
319
360
|
neon_envo_mappings_file,
|
|
320
361
|
neon_raw_data_file_mappings_file,
|
|
362
|
+
neon_nmdc_instrument_mapping_file,
|
|
321
363
|
)
|
|
322
364
|
run_id = submit_metadata_to_db(database)
|
|
323
365
|
poll_for_run_completion(run_id)
|
|
@@ -334,6 +376,7 @@ def translate_neon_api_surface_water_metadata_to_nmdc_schema_database():
|
|
|
334
376
|
(
|
|
335
377
|
neon_envo_mappings_file_url,
|
|
336
378
|
neon_raw_data_file_mappings_file_url,
|
|
379
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
337
380
|
) = get_neon_pipeline_inputs()
|
|
338
381
|
|
|
339
382
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -342,11 +385,16 @@ def translate_neon_api_surface_water_metadata_to_nmdc_schema_database():
|
|
|
342
385
|
neon_raw_data_file_mappings_file_url
|
|
343
386
|
)
|
|
344
387
|
|
|
388
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
389
|
+
neon_nmdc_instrument_mapping_file_url
|
|
390
|
+
)
|
|
391
|
+
|
|
345
392
|
database = nmdc_schema_database_from_neon_surface_water_data(
|
|
346
393
|
mms_surface_water,
|
|
347
394
|
sites_mapping_dict,
|
|
348
395
|
neon_envo_mappings_file,
|
|
349
396
|
neon_raw_data_file_mappings_file,
|
|
397
|
+
neon_nmdc_instrument_mapping_file,
|
|
350
398
|
)
|
|
351
399
|
|
|
352
400
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
@@ -367,6 +415,7 @@ def ingest_neon_surface_water_metadata():
|
|
|
367
415
|
(
|
|
368
416
|
neon_envo_mappings_file_url,
|
|
369
417
|
neon_raw_data_file_mappings_file_url,
|
|
418
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
370
419
|
) = get_neon_pipeline_inputs()
|
|
371
420
|
|
|
372
421
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -375,11 +424,16 @@ def ingest_neon_surface_water_metadata():
|
|
|
375
424
|
neon_raw_data_file_mappings_file_url
|
|
376
425
|
)
|
|
377
426
|
|
|
427
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
428
|
+
neon_nmdc_instrument_mapping_file_url
|
|
429
|
+
)
|
|
430
|
+
|
|
378
431
|
database = nmdc_schema_database_from_neon_benthic_data(
|
|
379
432
|
mms_surface_water,
|
|
380
433
|
sites_mapping_dict,
|
|
381
434
|
neon_envo_mappings_file,
|
|
382
435
|
neon_raw_data_file_mappings_file,
|
|
436
|
+
neon_nmdc_instrument_mapping_file,
|
|
383
437
|
)
|
|
384
438
|
run_id = submit_metadata_to_db(database)
|
|
385
439
|
poll_for_run_completion(run_id)
|
|
@@ -390,14 +444,16 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
390
444
|
nmdc_study = get_ncbi_export_pipeline_study()
|
|
391
445
|
ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
|
|
392
446
|
biosamples = get_biosamples_by_study_id(nmdc_study)
|
|
393
|
-
|
|
447
|
+
nucleotide_sequencing_records = get_nucleotide_sequencing_from_biosamples(
|
|
448
|
+
biosamples
|
|
449
|
+
)
|
|
394
450
|
data_object_records = get_data_objects_from_biosamples(biosamples)
|
|
395
451
|
library_preparation_records = get_library_preparation_from_biosamples(biosamples)
|
|
396
452
|
xml_data = ncbi_submission_xml_from_nmdc_study(
|
|
397
453
|
nmdc_study,
|
|
398
454
|
ncbi_submission_metadata,
|
|
399
455
|
biosamples,
|
|
400
|
-
|
|
456
|
+
nucleotide_sequencing_records,
|
|
401
457
|
data_object_records,
|
|
402
458
|
library_preparation_records,
|
|
403
459
|
)
|