nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from dagster import graph
|
|
2
2
|
|
|
3
3
|
from nmdc_runtime.site.ops import (
|
|
4
|
-
|
|
4
|
+
generate_biosample_set_for_nmdc_study_from_gold,
|
|
5
5
|
nmdc_schema_database_export_filename,
|
|
6
6
|
nmdc_schema_database_from_gold_study,
|
|
7
7
|
nmdc_schema_object_to_dict,
|
|
@@ -11,8 +11,6 @@ from nmdc_runtime.site.ops import (
|
|
|
11
11
|
gold_projects_by_study,
|
|
12
12
|
gold_study,
|
|
13
13
|
poll_for_run_completion,
|
|
14
|
-
run_etl,
|
|
15
|
-
local_file_to_api_object,
|
|
16
14
|
get_operation,
|
|
17
15
|
produce_curated_db,
|
|
18
16
|
delete_operations,
|
|
@@ -21,8 +19,8 @@ from nmdc_runtime.site.ops import (
|
|
|
21
19
|
filter_ops_done_object_puts,
|
|
22
20
|
hello,
|
|
23
21
|
mongo_stats,
|
|
22
|
+
run_script_to_update_insdc_biosample_identifiers,
|
|
24
23
|
submit_metadata_to_db,
|
|
25
|
-
update_schema,
|
|
26
24
|
filter_ops_undone_expired,
|
|
27
25
|
construct_jobs,
|
|
28
26
|
maybe_post_jobs,
|
|
@@ -38,34 +36,37 @@ from nmdc_runtime.site.ops import (
|
|
|
38
36
|
neon_data_by_product,
|
|
39
37
|
nmdc_schema_database_from_neon_soil_data,
|
|
40
38
|
nmdc_schema_database_from_neon_benthic_data,
|
|
39
|
+
nmdc_schema_database_from_neon_surface_water_data,
|
|
41
40
|
nmdc_schema_database_export_filename_neon,
|
|
42
41
|
get_neon_pipeline_mms_data_product,
|
|
43
42
|
get_neon_pipeline_sls_data_product,
|
|
43
|
+
get_neon_pipeline_surface_water_data_product,
|
|
44
44
|
get_submission_portal_pipeline_inputs,
|
|
45
45
|
get_csv_rows_from_url,
|
|
46
46
|
get_neon_pipeline_benthic_data_product,
|
|
47
47
|
get_neon_pipeline_inputs,
|
|
48
48
|
get_df_from_url,
|
|
49
49
|
site_code_mapping,
|
|
50
|
+
materialize_alldocs,
|
|
51
|
+
load_ontology,
|
|
52
|
+
get_ncbi_export_pipeline_study,
|
|
53
|
+
get_data_objects_from_biosamples,
|
|
54
|
+
get_nucleotide_sequencing_from_biosamples,
|
|
55
|
+
get_library_preparation_from_biosamples,
|
|
56
|
+
get_aggregated_pooled_biosamples,
|
|
57
|
+
get_all_instruments,
|
|
58
|
+
get_ncbi_export_pipeline_inputs,
|
|
59
|
+
ncbi_submission_xml_from_nmdc_study,
|
|
60
|
+
ncbi_submission_xml_asset,
|
|
61
|
+
render_text,
|
|
62
|
+
get_database_updater_inputs,
|
|
63
|
+
post_submission_portal_biosample_ingest_record_stitching_filename,
|
|
64
|
+
generate_data_generation_set_post_biosample_ingest,
|
|
65
|
+
get_instrument_ids_by_model,
|
|
66
|
+
log_database_ids,
|
|
67
|
+
add_public_image_urls,
|
|
50
68
|
)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@graph
|
|
54
|
-
def gold_translation():
|
|
55
|
-
"""
|
|
56
|
-
Translating an export of the JGI GOLD [1] SQL database to the NMDC database JSON schema.
|
|
57
|
-
|
|
58
|
-
[1] Genomes OnLine Database (GOLD) <https://gold.jgi.doe.gov/>.
|
|
59
|
-
"""
|
|
60
|
-
local_file_to_api_object(run_etl(build_merged_db()))
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
@graph()
|
|
64
|
-
def gold_translation_curation():
|
|
65
|
-
# TODO
|
|
66
|
-
# - have produce_curated_db do actual curation (see notebook), persisting to db.
|
|
67
|
-
# - more steps in pipeline? Or handoff via run_status_sensor on DagsterRunStatus.SUCCESS.
|
|
68
|
-
produce_curated_db(get_operation())
|
|
69
|
+
from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
|
|
69
70
|
|
|
70
71
|
|
|
71
72
|
@graph()
|
|
@@ -86,19 +87,23 @@ def hello_mongo():
|
|
|
86
87
|
|
|
87
88
|
|
|
88
89
|
@graph
|
|
89
|
-
def
|
|
90
|
-
|
|
91
|
-
A pipeline definition. This example pipeline has a single solid.
|
|
90
|
+
def housekeeping():
|
|
91
|
+
delete_operations(list_operations(filter_ops_undone_expired()))
|
|
92
92
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
93
|
+
|
|
94
|
+
@graph
|
|
95
|
+
def ensure_alldocs():
|
|
96
|
+
materialize_alldocs()
|
|
97
97
|
|
|
98
98
|
|
|
99
99
|
@graph
|
|
100
|
-
def
|
|
101
|
-
|
|
100
|
+
def run_ontology_load():
|
|
101
|
+
"""
|
|
102
|
+
A graph for loading ontologies.
|
|
103
|
+
The source_ontology parameter is provided by the job configuration
|
|
104
|
+
and passed to the load_ontology op.
|
|
105
|
+
"""
|
|
106
|
+
load_ontology()
|
|
102
107
|
|
|
103
108
|
|
|
104
109
|
@graph
|
|
@@ -109,28 +114,51 @@ def ensure_jobs():
|
|
|
109
114
|
|
|
110
115
|
@graph
|
|
111
116
|
def apply_changesheet():
|
|
117
|
+
# Note: We use `_` as a "placeholder" variable.
|
|
118
|
+
# It's a variable to whose value we assign no significance. In this case, we use it to
|
|
119
|
+
# tell Dagster that one op depends upon the output of the other (so Dagster runs them
|
|
120
|
+
# in that order), without implying to maintainers that its value is significant to us.
|
|
121
|
+
# Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
|
|
122
|
+
# Reference (`_` variables): https://stackoverflow.com/a/47599668
|
|
112
123
|
sheet_in = get_changesheet_in()
|
|
113
124
|
outputs = perform_changesheet_updates(sheet_in)
|
|
114
|
-
add_output_run_event(outputs)
|
|
125
|
+
_ = add_output_run_event(outputs)
|
|
126
|
+
materialize_alldocs(waits_for=_)
|
|
115
127
|
|
|
116
128
|
|
|
117
129
|
@graph
|
|
118
130
|
def apply_metadata_in():
|
|
131
|
+
# Note: We use `_` as a "placeholder" variable.
|
|
119
132
|
outputs = perform_mongo_updates(get_json_in())
|
|
120
|
-
add_output_run_event(outputs)
|
|
133
|
+
_ = add_output_run_event(outputs)
|
|
134
|
+
materialize_alldocs(waits_for=_)
|
|
121
135
|
|
|
122
136
|
|
|
123
137
|
@graph
|
|
124
138
|
def gold_study_to_database():
|
|
125
|
-
|
|
139
|
+
(
|
|
140
|
+
study_id,
|
|
141
|
+
study_type,
|
|
142
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
143
|
+
include_field_site_info,
|
|
144
|
+
enable_biosample_filtering,
|
|
145
|
+
) = get_gold_study_pipeline_inputs()
|
|
126
146
|
|
|
127
147
|
projects = gold_projects_by_study(study_id)
|
|
128
148
|
biosamples = gold_biosamples_by_study(study_id)
|
|
129
149
|
analysis_projects = gold_analysis_projects_by_study(study_id)
|
|
130
150
|
study = gold_study(study_id)
|
|
151
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
131
152
|
|
|
132
153
|
database = nmdc_schema_database_from_gold_study(
|
|
133
|
-
study,
|
|
154
|
+
study,
|
|
155
|
+
study_type,
|
|
156
|
+
projects,
|
|
157
|
+
biosamples,
|
|
158
|
+
analysis_projects,
|
|
159
|
+
gold_nmdc_instrument_map_df,
|
|
160
|
+
include_field_site_info,
|
|
161
|
+
enable_biosample_filtering,
|
|
134
162
|
)
|
|
135
163
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
136
164
|
filename = nmdc_schema_database_export_filename(study)
|
|
@@ -143,30 +171,38 @@ def gold_study_to_database():
|
|
|
143
171
|
def translate_metadata_submission_to_nmdc_schema_database():
|
|
144
172
|
(
|
|
145
173
|
submission_id,
|
|
146
|
-
|
|
174
|
+
nucleotide_sequencing_mapping_file_url,
|
|
147
175
|
data_object_mapping_file_url,
|
|
148
176
|
biosample_extras_file_url,
|
|
149
177
|
biosample_extras_slot_mapping_file_url,
|
|
178
|
+
study_id,
|
|
150
179
|
) = get_submission_portal_pipeline_inputs()
|
|
151
180
|
|
|
152
181
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
153
|
-
|
|
182
|
+
nucleotide_sequencing_mapping = get_csv_rows_from_url(
|
|
183
|
+
nucleotide_sequencing_mapping_file_url
|
|
184
|
+
)
|
|
154
185
|
data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
|
|
155
186
|
biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
|
|
156
187
|
biosample_extras_slot_mapping = get_csv_rows_from_url(
|
|
157
188
|
biosample_extras_slot_mapping_file_url
|
|
158
189
|
)
|
|
190
|
+
instrument_mapping = get_instrument_ids_by_model()
|
|
159
191
|
|
|
160
192
|
database = translate_portal_submission_to_nmdc_schema_database(
|
|
161
193
|
metadata_submission,
|
|
162
|
-
|
|
163
|
-
data_object_mapping,
|
|
194
|
+
nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
|
|
195
|
+
data_object_mapping=data_object_mapping,
|
|
164
196
|
biosample_extras=biosample_extras,
|
|
165
197
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
198
|
+
instrument_mapping=instrument_mapping,
|
|
199
|
+
study_id=study_id,
|
|
166
200
|
)
|
|
167
201
|
|
|
168
202
|
validate_metadata(database)
|
|
169
203
|
|
|
204
|
+
log_database_ids(database)
|
|
205
|
+
|
|
170
206
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
171
207
|
filename = nmdc_schema_database_export_filename(metadata_submission)
|
|
172
208
|
outputs = export_json_to_drs(database_dict, filename)
|
|
@@ -177,27 +213,37 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
177
213
|
def ingest_metadata_submission():
|
|
178
214
|
(
|
|
179
215
|
submission_id,
|
|
180
|
-
|
|
216
|
+
nucleotide_sequencing_mapping_file_url,
|
|
181
217
|
data_object_mapping_file_url,
|
|
182
218
|
biosample_extras_file_url,
|
|
183
219
|
biosample_extras_slot_mapping_file_url,
|
|
220
|
+
study_id,
|
|
184
221
|
) = get_submission_portal_pipeline_inputs()
|
|
185
222
|
|
|
186
223
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
187
|
-
|
|
224
|
+
nucleotide_sequencing_mapping = get_csv_rows_from_url(
|
|
225
|
+
nucleotide_sequencing_mapping_file_url
|
|
226
|
+
)
|
|
188
227
|
data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
|
|
189
228
|
biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
|
|
190
229
|
biosample_extras_slot_mapping = get_csv_rows_from_url(
|
|
191
230
|
biosample_extras_slot_mapping_file_url
|
|
192
231
|
)
|
|
232
|
+
instrument_mapping = get_instrument_ids_by_model()
|
|
193
233
|
|
|
194
234
|
database = translate_portal_submission_to_nmdc_schema_database(
|
|
195
235
|
metadata_submission,
|
|
196
|
-
|
|
197
|
-
data_object_mapping,
|
|
236
|
+
nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
|
|
237
|
+
data_object_mapping=data_object_mapping,
|
|
198
238
|
biosample_extras=biosample_extras,
|
|
199
239
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
240
|
+
instrument_mapping=instrument_mapping,
|
|
241
|
+
study_id=study_id,
|
|
200
242
|
)
|
|
243
|
+
database = add_public_image_urls(database, submission_id)
|
|
244
|
+
|
|
245
|
+
log_database_ids(database)
|
|
246
|
+
|
|
201
247
|
run_id = submit_metadata_to_db(database)
|
|
202
248
|
poll_for_run_completion(run_id)
|
|
203
249
|
|
|
@@ -213,6 +259,7 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
|
|
|
213
259
|
(
|
|
214
260
|
neon_envo_mappings_file_url,
|
|
215
261
|
neon_raw_data_file_mappings_file_url,
|
|
262
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
216
263
|
) = get_neon_pipeline_inputs()
|
|
217
264
|
|
|
218
265
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -221,8 +268,16 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
|
|
|
221
268
|
neon_raw_data_file_mappings_file_url
|
|
222
269
|
)
|
|
223
270
|
|
|
271
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
272
|
+
neon_nmdc_instrument_mapping_file_url
|
|
273
|
+
)
|
|
274
|
+
|
|
224
275
|
database = nmdc_schema_database_from_neon_soil_data(
|
|
225
|
-
mms_data,
|
|
276
|
+
mms_data,
|
|
277
|
+
sls_data,
|
|
278
|
+
neon_envo_mappings_file,
|
|
279
|
+
neon_raw_data_file_mappings_file,
|
|
280
|
+
neon_nmdc_instrument_mapping_file,
|
|
226
281
|
)
|
|
227
282
|
|
|
228
283
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
@@ -243,6 +298,7 @@ def ingest_neon_soil_metadata():
|
|
|
243
298
|
(
|
|
244
299
|
neon_envo_mappings_file_url,
|
|
245
300
|
neon_raw_data_file_mappings_file_url,
|
|
301
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
246
302
|
) = get_neon_pipeline_inputs()
|
|
247
303
|
|
|
248
304
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -251,8 +307,16 @@ def ingest_neon_soil_metadata():
|
|
|
251
307
|
neon_raw_data_file_mappings_file_url
|
|
252
308
|
)
|
|
253
309
|
|
|
310
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
311
|
+
neon_nmdc_instrument_mapping_file_url
|
|
312
|
+
)
|
|
313
|
+
|
|
254
314
|
database = nmdc_schema_database_from_neon_soil_data(
|
|
255
|
-
mms_data,
|
|
315
|
+
mms_data,
|
|
316
|
+
sls_data,
|
|
317
|
+
neon_envo_mappings_file,
|
|
318
|
+
neon_raw_data_file_mappings_file,
|
|
319
|
+
neon_nmdc_instrument_mapping_file,
|
|
256
320
|
)
|
|
257
321
|
run_id = submit_metadata_to_db(database)
|
|
258
322
|
poll_for_run_completion(run_id)
|
|
@@ -263,6 +327,7 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
|
|
|
263
327
|
(
|
|
264
328
|
neon_envo_mappings_file_url,
|
|
265
329
|
neon_raw_data_file_mappings_file_url,
|
|
330
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
266
331
|
) = get_neon_pipeline_inputs()
|
|
267
332
|
|
|
268
333
|
mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
|
|
@@ -276,11 +341,16 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
|
|
|
276
341
|
neon_raw_data_file_mappings_file_url
|
|
277
342
|
)
|
|
278
343
|
|
|
344
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
345
|
+
neon_nmdc_instrument_mapping_file_url
|
|
346
|
+
)
|
|
347
|
+
|
|
279
348
|
database = nmdc_schema_database_from_neon_benthic_data(
|
|
280
349
|
mms_benthic,
|
|
281
350
|
sites_mapping_dict,
|
|
282
351
|
neon_envo_mappings_file,
|
|
283
352
|
neon_raw_data_file_mappings_file,
|
|
353
|
+
neon_nmdc_instrument_mapping_file,
|
|
284
354
|
)
|
|
285
355
|
|
|
286
356
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
@@ -301,6 +371,7 @@ def ingest_neon_benthic_metadata():
|
|
|
301
371
|
(
|
|
302
372
|
neon_envo_mappings_file_url,
|
|
303
373
|
neon_raw_data_file_mappings_file_url,
|
|
374
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
304
375
|
) = get_neon_pipeline_inputs()
|
|
305
376
|
|
|
306
377
|
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
@@ -309,11 +380,189 @@ def ingest_neon_benthic_metadata():
|
|
|
309
380
|
neon_raw_data_file_mappings_file_url
|
|
310
381
|
)
|
|
311
382
|
|
|
383
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
384
|
+
neon_nmdc_instrument_mapping_file_url
|
|
385
|
+
)
|
|
386
|
+
|
|
312
387
|
database = nmdc_schema_database_from_neon_benthic_data(
|
|
313
388
|
mms_benthic,
|
|
314
389
|
sites_mapping_dict,
|
|
315
390
|
neon_envo_mappings_file,
|
|
316
391
|
neon_raw_data_file_mappings_file,
|
|
392
|
+
neon_nmdc_instrument_mapping_file,
|
|
317
393
|
)
|
|
318
394
|
run_id = submit_metadata_to_db(database)
|
|
319
395
|
poll_for_run_completion(run_id)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
@graph
|
|
399
|
+
def translate_neon_api_surface_water_metadata_to_nmdc_schema_database():
|
|
400
|
+
mms_surface_water_data_product = get_neon_pipeline_surface_water_data_product()
|
|
401
|
+
|
|
402
|
+
mms_surface_water = neon_data_by_product(mms_surface_water_data_product)
|
|
403
|
+
|
|
404
|
+
sites_mapping_dict = site_code_mapping()
|
|
405
|
+
|
|
406
|
+
(
|
|
407
|
+
neon_envo_mappings_file_url,
|
|
408
|
+
neon_raw_data_file_mappings_file_url,
|
|
409
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
410
|
+
) = get_neon_pipeline_inputs()
|
|
411
|
+
|
|
412
|
+
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
413
|
+
|
|
414
|
+
neon_raw_data_file_mappings_file = get_df_from_url(
|
|
415
|
+
neon_raw_data_file_mappings_file_url
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
419
|
+
neon_nmdc_instrument_mapping_file_url
|
|
420
|
+
)
|
|
421
|
+
|
|
422
|
+
database = nmdc_schema_database_from_neon_surface_water_data(
|
|
423
|
+
mms_surface_water,
|
|
424
|
+
sites_mapping_dict,
|
|
425
|
+
neon_envo_mappings_file,
|
|
426
|
+
neon_raw_data_file_mappings_file,
|
|
427
|
+
neon_nmdc_instrument_mapping_file,
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
database_dict = nmdc_schema_object_to_dict(database)
|
|
431
|
+
filename = nmdc_schema_database_export_filename_neon()
|
|
432
|
+
|
|
433
|
+
outputs = export_json_to_drs(database_dict, filename)
|
|
434
|
+
add_output_run_event(outputs)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
@graph
|
|
438
|
+
def ingest_neon_surface_water_metadata():
|
|
439
|
+
mms_surface_water_data_product = get_neon_pipeline_surface_water_data_product()
|
|
440
|
+
|
|
441
|
+
mms_surface_water = neon_data_by_product(mms_surface_water_data_product)
|
|
442
|
+
|
|
443
|
+
sites_mapping_dict = site_code_mapping()
|
|
444
|
+
|
|
445
|
+
(
|
|
446
|
+
neon_envo_mappings_file_url,
|
|
447
|
+
neon_raw_data_file_mappings_file_url,
|
|
448
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
449
|
+
) = get_neon_pipeline_inputs()
|
|
450
|
+
|
|
451
|
+
neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
|
|
452
|
+
|
|
453
|
+
neon_raw_data_file_mappings_file = get_df_from_url(
|
|
454
|
+
neon_raw_data_file_mappings_file_url
|
|
455
|
+
)
|
|
456
|
+
|
|
457
|
+
neon_nmdc_instrument_mapping_file = get_df_from_url(
|
|
458
|
+
neon_nmdc_instrument_mapping_file_url
|
|
459
|
+
)
|
|
460
|
+
|
|
461
|
+
database = nmdc_schema_database_from_neon_benthic_data(
|
|
462
|
+
mms_surface_water,
|
|
463
|
+
sites_mapping_dict,
|
|
464
|
+
neon_envo_mappings_file,
|
|
465
|
+
neon_raw_data_file_mappings_file,
|
|
466
|
+
neon_nmdc_instrument_mapping_file,
|
|
467
|
+
)
|
|
468
|
+
run_id = submit_metadata_to_db(database)
|
|
469
|
+
poll_for_run_completion(run_id)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
@graph
|
|
473
|
+
def nmdc_study_to_ncbi_submission_export():
|
|
474
|
+
nmdc_study = get_ncbi_export_pipeline_study()
|
|
475
|
+
ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
|
|
476
|
+
biosamples = get_biosamples_by_study_id(nmdc_study)
|
|
477
|
+
nucleotide_sequencing_records = get_nucleotide_sequencing_from_biosamples(
|
|
478
|
+
biosamples
|
|
479
|
+
)
|
|
480
|
+
data_object_records = get_data_objects_from_biosamples(biosamples)
|
|
481
|
+
library_preparation_records = get_library_preparation_from_biosamples(biosamples)
|
|
482
|
+
pooled_biosamples_data = get_aggregated_pooled_biosamples(biosamples)
|
|
483
|
+
all_instruments = get_all_instruments()
|
|
484
|
+
xml_data = ncbi_submission_xml_from_nmdc_study(
|
|
485
|
+
nmdc_study,
|
|
486
|
+
ncbi_submission_metadata,
|
|
487
|
+
biosamples,
|
|
488
|
+
nucleotide_sequencing_records,
|
|
489
|
+
data_object_records,
|
|
490
|
+
library_preparation_records,
|
|
491
|
+
all_instruments,
|
|
492
|
+
pooled_biosamples_data,
|
|
493
|
+
)
|
|
494
|
+
ncbi_submission_xml_asset(xml_data)
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
@graph
|
|
498
|
+
def generate_data_generation_set_for_biosamples_in_nmdc_study():
|
|
499
|
+
(
|
|
500
|
+
study_id,
|
|
501
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
502
|
+
include_field_site_info,
|
|
503
|
+
enable_biosample_filtering,
|
|
504
|
+
) = get_database_updater_inputs()
|
|
505
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
506
|
+
|
|
507
|
+
database = generate_data_generation_set_post_biosample_ingest(
|
|
508
|
+
study_id,
|
|
509
|
+
gold_nmdc_instrument_map_df,
|
|
510
|
+
include_field_site_info,
|
|
511
|
+
enable_biosample_filtering,
|
|
512
|
+
)
|
|
513
|
+
|
|
514
|
+
database_dict = nmdc_schema_object_to_dict(database)
|
|
515
|
+
filename = post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
516
|
+
study_id
|
|
517
|
+
)
|
|
518
|
+
outputs = export_json_to_drs(database_dict, filename)
|
|
519
|
+
add_output_run_event(outputs)
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
@graph
|
|
523
|
+
def generate_biosample_set_from_samples_in_gold():
|
|
524
|
+
(
|
|
525
|
+
study_id,
|
|
526
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
527
|
+
include_field_site_info,
|
|
528
|
+
enable_biosample_filtering,
|
|
529
|
+
) = get_database_updater_inputs()
|
|
530
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
531
|
+
|
|
532
|
+
database = generate_biosample_set_for_nmdc_study_from_gold(
|
|
533
|
+
study_id,
|
|
534
|
+
gold_nmdc_instrument_map_df,
|
|
535
|
+
include_field_site_info,
|
|
536
|
+
enable_biosample_filtering,
|
|
537
|
+
)
|
|
538
|
+
database_dict = nmdc_schema_object_to_dict(database)
|
|
539
|
+
filename = post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
540
|
+
study_id
|
|
541
|
+
)
|
|
542
|
+
outputs = export_json_to_drs(database_dict, filename)
|
|
543
|
+
add_output_run_event(outputs)
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
@graph
|
|
547
|
+
def generate_update_script_for_insdc_biosample_identifiers():
|
|
548
|
+
"""Generate a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.
|
|
549
|
+
|
|
550
|
+
This graph fetches the necessary inputs, then calls the run_script_to_update_insdc_biosample_identifiers op
|
|
551
|
+
to generate a script for updating biosample records with INSDC identifiers obtained from GOLD.
|
|
552
|
+
The script is returned as a dictionary that can be executed against MongoDB.
|
|
553
|
+
"""
|
|
554
|
+
(
|
|
555
|
+
study_id,
|
|
556
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
557
|
+
include_field_site_info,
|
|
558
|
+
enable_biosample_filtering,
|
|
559
|
+
) = get_database_updater_inputs()
|
|
560
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
561
|
+
|
|
562
|
+
update_script = run_script_to_update_insdc_biosample_identifiers(
|
|
563
|
+
study_id,
|
|
564
|
+
gold_nmdc_instrument_map_df,
|
|
565
|
+
include_field_site_info,
|
|
566
|
+
enable_biosample_filtering,
|
|
567
|
+
)
|
|
568
|
+
render_text(update_script)
|