nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,7 +1,7 @@
1
1
  from dagster import graph
2
2
 
3
3
  from nmdc_runtime.site.ops import (
4
- build_merged_db,
4
+ generate_biosample_set_for_nmdc_study_from_gold,
5
5
  nmdc_schema_database_export_filename,
6
6
  nmdc_schema_database_from_gold_study,
7
7
  nmdc_schema_object_to_dict,
@@ -11,8 +11,6 @@ from nmdc_runtime.site.ops import (
11
11
  gold_projects_by_study,
12
12
  gold_study,
13
13
  poll_for_run_completion,
14
- run_etl,
15
- local_file_to_api_object,
16
14
  get_operation,
17
15
  produce_curated_db,
18
16
  delete_operations,
@@ -21,8 +19,8 @@ from nmdc_runtime.site.ops import (
21
19
  filter_ops_done_object_puts,
22
20
  hello,
23
21
  mongo_stats,
22
+ run_script_to_update_insdc_biosample_identifiers,
24
23
  submit_metadata_to_db,
25
- update_schema,
26
24
  filter_ops_undone_expired,
27
25
  construct_jobs,
28
26
  maybe_post_jobs,
@@ -38,34 +36,37 @@ from nmdc_runtime.site.ops import (
38
36
  neon_data_by_product,
39
37
  nmdc_schema_database_from_neon_soil_data,
40
38
  nmdc_schema_database_from_neon_benthic_data,
39
+ nmdc_schema_database_from_neon_surface_water_data,
41
40
  nmdc_schema_database_export_filename_neon,
42
41
  get_neon_pipeline_mms_data_product,
43
42
  get_neon_pipeline_sls_data_product,
43
+ get_neon_pipeline_surface_water_data_product,
44
44
  get_submission_portal_pipeline_inputs,
45
45
  get_csv_rows_from_url,
46
46
  get_neon_pipeline_benthic_data_product,
47
47
  get_neon_pipeline_inputs,
48
48
  get_df_from_url,
49
49
  site_code_mapping,
50
+ materialize_alldocs,
51
+ load_ontology,
52
+ get_ncbi_export_pipeline_study,
53
+ get_data_objects_from_biosamples,
54
+ get_nucleotide_sequencing_from_biosamples,
55
+ get_library_preparation_from_biosamples,
56
+ get_aggregated_pooled_biosamples,
57
+ get_all_instruments,
58
+ get_ncbi_export_pipeline_inputs,
59
+ ncbi_submission_xml_from_nmdc_study,
60
+ ncbi_submission_xml_asset,
61
+ render_text,
62
+ get_database_updater_inputs,
63
+ post_submission_portal_biosample_ingest_record_stitching_filename,
64
+ generate_data_generation_set_post_biosample_ingest,
65
+ get_instrument_ids_by_model,
66
+ log_database_ids,
67
+ add_public_image_urls,
50
68
  )
51
-
52
-
53
- @graph
54
- def gold_translation():
55
- """
56
- Translating an export of the JGI GOLD [1] SQL database to the NMDC database JSON schema.
57
-
58
- [1] Genomes OnLine Database (GOLD) <https://gold.jgi.doe.gov/>.
59
- """
60
- local_file_to_api_object(run_etl(build_merged_db()))
61
-
62
-
63
- @graph()
64
- def gold_translation_curation():
65
- # TODO
66
- # - have produce_curated_db do actual curation (see notebook), persisting to db.
67
- # - more steps in pipeline? Or handoff via run_status_sensor on DagsterRunStatus.SUCCESS.
68
- produce_curated_db(get_operation())
69
+ from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
69
70
 
70
71
 
71
72
  @graph()
@@ -86,19 +87,23 @@ def hello_mongo():
86
87
 
87
88
 
88
89
  @graph
89
- def update_terminus():
90
- """
91
- A pipeline definition. This example pipeline has a single solid.
90
+ def housekeeping():
91
+ delete_operations(list_operations(filter_ops_undone_expired()))
92
92
 
93
- For more hints on writing Dagster pipelines, see our documentation overview on Pipelines:
94
- https://docs.dagster.io/overview/solids-pipelines/pipelines
95
- """
96
- update_schema()
93
+
@graph
def ensure_alldocs():
    """Graph consisting of a single invocation of the `materialize_alldocs` op."""
    # No inputs are wired in here; the op is invoked on its own.
    materialize_alldocs()
97
97
 
98
98
 
99
99
  @graph
100
- def housekeeping():
101
- delete_operations(list_operations(filter_ops_undone_expired()))
100
+ def run_ontology_load():
101
+ """
102
+ A graph for loading ontologies.
103
+ The source_ontology parameter is provided by the job configuration
104
+ and passed to the load_ontology op.
105
+ """
106
+ load_ontology()
102
107
 
103
108
 
104
109
  @graph
@@ -109,28 +114,51 @@ def ensure_jobs():
109
114
 
110
115
  @graph
111
116
  def apply_changesheet():
117
+ # Note: We use `_` as a "placeholder" variable.
118
+ # It's a variable to whose value we assign no significance. In this case, we use it to
119
+ # tell Dagster that one op depends upon the output of the other (so Dagster runs them
120
+ # in that order), without implying to maintainers that its value is significant to us.
121
+ # Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
122
+ # Reference (`_` variables): https://stackoverflow.com/a/47599668
112
123
  sheet_in = get_changesheet_in()
113
124
  outputs = perform_changesheet_updates(sheet_in)
114
- add_output_run_event(outputs)
125
+ _ = add_output_run_event(outputs)
126
+ materialize_alldocs(waits_for=_)
115
127
 
116
128
 
117
129
  @graph
118
130
  def apply_metadata_in():
131
+ # Note: We use `_` as a "placeholder" variable.
119
132
  outputs = perform_mongo_updates(get_json_in())
120
- add_output_run_event(outputs)
133
+ _ = add_output_run_event(outputs)
134
+ materialize_alldocs(waits_for=_)
121
135
 
122
136
 
123
137
  @graph
124
138
  def gold_study_to_database():
125
- study_id = get_gold_study_pipeline_inputs()
139
+ (
140
+ study_id,
141
+ study_type,
142
+ gold_nmdc_instrument_mapping_file_url,
143
+ include_field_site_info,
144
+ enable_biosample_filtering,
145
+ ) = get_gold_study_pipeline_inputs()
126
146
 
127
147
  projects = gold_projects_by_study(study_id)
128
148
  biosamples = gold_biosamples_by_study(study_id)
129
149
  analysis_projects = gold_analysis_projects_by_study(study_id)
130
150
  study = gold_study(study_id)
151
+ gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
131
152
 
132
153
  database = nmdc_schema_database_from_gold_study(
133
- study, projects, biosamples, analysis_projects
154
+ study,
155
+ study_type,
156
+ projects,
157
+ biosamples,
158
+ analysis_projects,
159
+ gold_nmdc_instrument_map_df,
160
+ include_field_site_info,
161
+ enable_biosample_filtering,
134
162
  )
135
163
  database_dict = nmdc_schema_object_to_dict(database)
136
164
  filename = nmdc_schema_database_export_filename(study)
@@ -143,30 +171,38 @@ def gold_study_to_database():
143
171
  def translate_metadata_submission_to_nmdc_schema_database():
144
172
  (
145
173
  submission_id,
146
- omics_processing_mapping_file_url,
174
+ nucleotide_sequencing_mapping_file_url,
147
175
  data_object_mapping_file_url,
148
176
  biosample_extras_file_url,
149
177
  biosample_extras_slot_mapping_file_url,
178
+ study_id,
150
179
  ) = get_submission_portal_pipeline_inputs()
151
180
 
152
181
  metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
153
- omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
182
+ nucleotide_sequencing_mapping = get_csv_rows_from_url(
183
+ nucleotide_sequencing_mapping_file_url
184
+ )
154
185
  data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
155
186
  biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
156
187
  biosample_extras_slot_mapping = get_csv_rows_from_url(
157
188
  biosample_extras_slot_mapping_file_url
158
189
  )
190
+ instrument_mapping = get_instrument_ids_by_model()
159
191
 
160
192
  database = translate_portal_submission_to_nmdc_schema_database(
161
193
  metadata_submission,
162
- omics_processing_mapping,
163
- data_object_mapping,
194
+ nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
195
+ data_object_mapping=data_object_mapping,
164
196
  biosample_extras=biosample_extras,
165
197
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
198
+ instrument_mapping=instrument_mapping,
199
+ study_id=study_id,
166
200
  )
167
201
 
168
202
  validate_metadata(database)
169
203
 
204
+ log_database_ids(database)
205
+
170
206
  database_dict = nmdc_schema_object_to_dict(database)
171
207
  filename = nmdc_schema_database_export_filename(metadata_submission)
172
208
  outputs = export_json_to_drs(database_dict, filename)
@@ -177,27 +213,37 @@ def translate_metadata_submission_to_nmdc_schema_database():
177
213
  def ingest_metadata_submission():
178
214
  (
179
215
  submission_id,
180
- omics_processing_mapping_file_url,
216
+ nucleotide_sequencing_mapping_file_url,
181
217
  data_object_mapping_file_url,
182
218
  biosample_extras_file_url,
183
219
  biosample_extras_slot_mapping_file_url,
220
+ study_id,
184
221
  ) = get_submission_portal_pipeline_inputs()
185
222
 
186
223
  metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
187
- omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
224
+ nucleotide_sequencing_mapping = get_csv_rows_from_url(
225
+ nucleotide_sequencing_mapping_file_url
226
+ )
188
227
  data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
189
228
  biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
190
229
  biosample_extras_slot_mapping = get_csv_rows_from_url(
191
230
  biosample_extras_slot_mapping_file_url
192
231
  )
232
+ instrument_mapping = get_instrument_ids_by_model()
193
233
 
194
234
  database = translate_portal_submission_to_nmdc_schema_database(
195
235
  metadata_submission,
196
- omics_processing_mapping,
197
- data_object_mapping,
236
+ nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
237
+ data_object_mapping=data_object_mapping,
198
238
  biosample_extras=biosample_extras,
199
239
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
240
+ instrument_mapping=instrument_mapping,
241
+ study_id=study_id,
200
242
  )
243
+ database = add_public_image_urls(database, submission_id)
244
+
245
+ log_database_ids(database)
246
+
201
247
  run_id = submit_metadata_to_db(database)
202
248
  poll_for_run_completion(run_id)
203
249
 
@@ -213,6 +259,7 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
213
259
  (
214
260
  neon_envo_mappings_file_url,
215
261
  neon_raw_data_file_mappings_file_url,
262
+ neon_nmdc_instrument_mapping_file_url,
216
263
  ) = get_neon_pipeline_inputs()
217
264
 
218
265
  neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -221,8 +268,16 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
221
268
  neon_raw_data_file_mappings_file_url
222
269
  )
223
270
 
271
+ neon_nmdc_instrument_mapping_file = get_df_from_url(
272
+ neon_nmdc_instrument_mapping_file_url
273
+ )
274
+
224
275
  database = nmdc_schema_database_from_neon_soil_data(
225
- mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
276
+ mms_data,
277
+ sls_data,
278
+ neon_envo_mappings_file,
279
+ neon_raw_data_file_mappings_file,
280
+ neon_nmdc_instrument_mapping_file,
226
281
  )
227
282
 
228
283
  database_dict = nmdc_schema_object_to_dict(database)
@@ -243,6 +298,7 @@ def ingest_neon_soil_metadata():
243
298
  (
244
299
  neon_envo_mappings_file_url,
245
300
  neon_raw_data_file_mappings_file_url,
301
+ neon_nmdc_instrument_mapping_file_url,
246
302
  ) = get_neon_pipeline_inputs()
247
303
 
248
304
  neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -251,8 +307,16 @@ def ingest_neon_soil_metadata():
251
307
  neon_raw_data_file_mappings_file_url
252
308
  )
253
309
 
310
+ neon_nmdc_instrument_mapping_file = get_df_from_url(
311
+ neon_nmdc_instrument_mapping_file_url
312
+ )
313
+
254
314
  database = nmdc_schema_database_from_neon_soil_data(
255
- mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
315
+ mms_data,
316
+ sls_data,
317
+ neon_envo_mappings_file,
318
+ neon_raw_data_file_mappings_file,
319
+ neon_nmdc_instrument_mapping_file,
256
320
  )
257
321
  run_id = submit_metadata_to_db(database)
258
322
  poll_for_run_completion(run_id)
@@ -263,6 +327,7 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
263
327
  (
264
328
  neon_envo_mappings_file_url,
265
329
  neon_raw_data_file_mappings_file_url,
330
+ neon_nmdc_instrument_mapping_file_url,
266
331
  ) = get_neon_pipeline_inputs()
267
332
 
268
333
  mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
@@ -276,11 +341,16 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
276
341
  neon_raw_data_file_mappings_file_url
277
342
  )
278
343
 
344
+ neon_nmdc_instrument_mapping_file = get_df_from_url(
345
+ neon_nmdc_instrument_mapping_file_url
346
+ )
347
+
279
348
  database = nmdc_schema_database_from_neon_benthic_data(
280
349
  mms_benthic,
281
350
  sites_mapping_dict,
282
351
  neon_envo_mappings_file,
283
352
  neon_raw_data_file_mappings_file,
353
+ neon_nmdc_instrument_mapping_file,
284
354
  )
285
355
 
286
356
  database_dict = nmdc_schema_object_to_dict(database)
@@ -301,6 +371,7 @@ def ingest_neon_benthic_metadata():
301
371
  (
302
372
  neon_envo_mappings_file_url,
303
373
  neon_raw_data_file_mappings_file_url,
374
+ neon_nmdc_instrument_mapping_file_url,
304
375
  ) = get_neon_pipeline_inputs()
305
376
 
306
377
  neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -309,11 +380,189 @@ def ingest_neon_benthic_metadata():
309
380
  neon_raw_data_file_mappings_file_url
310
381
  )
311
382
 
383
+ neon_nmdc_instrument_mapping_file = get_df_from_url(
384
+ neon_nmdc_instrument_mapping_file_url
385
+ )
386
+
312
387
  database = nmdc_schema_database_from_neon_benthic_data(
313
388
  mms_benthic,
314
389
  sites_mapping_dict,
315
390
  neon_envo_mappings_file,
316
391
  neon_raw_data_file_mappings_file,
392
+ neon_nmdc_instrument_mapping_file,
317
393
  )
318
394
  run_id = submit_metadata_to_db(database)
319
395
  poll_for_run_completion(run_id)
396
+
397
+
@graph
def translate_neon_api_surface_water_metadata_to_nmdc_schema_database():
    """Wire the ops that translate NEON surface water data and export the result.

    The graph feeds the surface water data product, the site code mapping and
    three mapping files (loaded via `get_df_from_url`) into
    `nmdc_schema_database_from_neon_surface_water_data`, converts the resulting
    database with `nmdc_schema_object_to_dict`, hands it to
    `export_json_to_drs`, and records the outputs with `add_output_run_event`.
    """
    surface_water_data = neon_data_by_product(
        get_neon_pipeline_surface_water_data_product()
    )
    site_codes = site_code_mapping()

    # The inputs op yields three URLs, one per mapping file.
    envo_url, raw_data_files_url, instrument_url = get_neon_pipeline_inputs()

    # Keep these three invocations in this order: Dagster aliases repeated
    # invocations of the same op by the order in which they appear.
    envo_mappings = get_df_from_url(envo_url)
    raw_data_file_mappings = get_df_from_url(raw_data_files_url)
    instrument_mappings = get_df_from_url(instrument_url)

    nmdc_database = nmdc_schema_database_from_neon_surface_water_data(
        surface_water_data,
        site_codes,
        envo_mappings,
        raw_data_file_mappings,
        instrument_mappings,
    )

    exported = export_json_to_drs(
        nmdc_schema_object_to_dict(nmdc_database),
        nmdc_schema_database_export_filename_neon(),
    )
    add_output_run_event(exported)
435
+
436
+
@graph
def ingest_neon_surface_water_metadata():
    """Wire the ops that translate NEON surface water data and submit it.

    The graph feeds the surface water data product, the site code mapping and
    three mapping files (loaded via `get_df_from_url`) into the surface water
    translation op, submits the resulting database with
    `submit_metadata_to_db`, and then waits on the returned run id with
    `poll_for_run_completion`.
    """
    mms_surface_water_data_product = get_neon_pipeline_surface_water_data_product()

    mms_surface_water = neon_data_by_product(mms_surface_water_data_product)

    sites_mapping_dict = site_code_mapping()

    (
        neon_envo_mappings_file_url,
        neon_raw_data_file_mappings_file_url,
        neon_nmdc_instrument_mapping_file_url,
    ) = get_neon_pipeline_inputs()

    neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)

    neon_raw_data_file_mappings_file = get_df_from_url(
        neon_raw_data_file_mappings_file_url
    )

    neon_nmdc_instrument_mapping_file = get_df_from_url(
        neon_nmdc_instrument_mapping_file_url
    )

    # Fix: this graph previously invoked the *benthic* translation op on
    # surface water data. Use the surface water op, matching the sibling
    # `translate_neon_api_surface_water_metadata_to_nmdc_schema_database` graph.
    database = nmdc_schema_database_from_neon_surface_water_data(
        mms_surface_water,
        sites_mapping_dict,
        neon_envo_mappings_file,
        neon_raw_data_file_mappings_file,
        neon_nmdc_instrument_mapping_file,
    )
    run_id = submit_metadata_to_db(database)
    poll_for_run_completion(run_id)
470
+
471
+
@graph
def nmdc_study_to_ncbi_submission_export():
    """Wire the ops that build an NCBI submission XML for an NMDC study.

    The study and the submission metadata come from the pipeline input ops.
    Biosamples are looked up from the study, and the sequencing, data object,
    library preparation and pooled-biosample records are all derived from
    those biosamples. Everything is passed to
    `ncbi_submission_xml_from_nmdc_study`, whose output goes to
    `ncbi_submission_xml_asset`.
    """
    study = get_ncbi_export_pipeline_study()
    submission_metadata = get_ncbi_export_pipeline_inputs()
    study_biosamples = get_biosamples_by_study_id(study)

    # Records derived from the study's biosamples.
    sequencing_records = get_nucleotide_sequencing_from_biosamples(study_biosamples)
    data_objects = get_data_objects_from_biosamples(study_biosamples)
    library_preparations = get_library_preparation_from_biosamples(study_biosamples)
    pooled_biosamples = get_aggregated_pooled_biosamples(study_biosamples)

    instruments = get_all_instruments()

    # Positional order matters: instruments precede the pooled-biosample data.
    ncbi_submission_xml_asset(
        ncbi_submission_xml_from_nmdc_study(
            study,
            submission_metadata,
            study_biosamples,
            sequencing_records,
            data_objects,
            library_preparations,
            instruments,
            pooled_biosamples,
        )
    )
495
+
496
+
@graph
def generate_data_generation_set_for_biosamples_in_nmdc_study():
    """Wire the ops that build and export a database for a study.

    Inputs come from `get_database_updater_inputs`. The instrument mapping file
    is loaded via `get_df_from_url`, and the database is produced by
    `generate_data_generation_set_post_biosample_ingest`. The database is then
    converted with `nmdc_schema_object_to_dict`, handed to `export_json_to_drs`
    under a filename derived from the study id, and the outputs are recorded
    with `add_output_run_event`.
    """
    (
        study_id,
        instrument_mapping_url,
        include_field_site_info,
        enable_biosample_filtering,
    ) = get_database_updater_inputs()

    instrument_map_df = get_df_from_url(instrument_mapping_url)

    nmdc_database = generate_data_generation_set_post_biosample_ingest(
        study_id,
        instrument_map_df,
        include_field_site_info,
        enable_biosample_filtering,
    )

    exported = export_json_to_drs(
        nmdc_schema_object_to_dict(nmdc_database),
        post_submission_portal_biosample_ingest_record_stitching_filename(study_id),
    )
    add_output_run_event(exported)
520
+
521
+
@graph
def generate_biosample_set_from_samples_in_gold():
    """Wire the ops that build and export a database for a study from GOLD.

    Inputs come from `get_database_updater_inputs`. The instrument mapping file
    is loaded via `get_df_from_url`, and the database is produced by
    `generate_biosample_set_for_nmdc_study_from_gold`. The database is then
    converted with `nmdc_schema_object_to_dict`, handed to `export_json_to_drs`
    under a filename derived from the study id, and the outputs are recorded
    with `add_output_run_event`.
    """
    (
        study_id,
        instrument_mapping_url,
        include_field_site_info,
        enable_biosample_filtering,
    ) = get_database_updater_inputs()

    instrument_map_df = get_df_from_url(instrument_mapping_url)

    nmdc_database = generate_biosample_set_for_nmdc_study_from_gold(
        study_id,
        instrument_map_df,
        include_field_site_info,
        enable_biosample_filtering,
    )

    exported = export_json_to_drs(
        nmdc_schema_object_to_dict(nmdc_database),
        post_submission_portal_biosample_ingest_record_stitching_filename(study_id),
    )
    add_output_run_event(exported)
544
+
545
+
@graph
def generate_update_script_for_insdc_biosample_identifiers():
    """Wire the ops that produce an update script for INSDC biosample identifiers.

    Inputs come from `get_database_updater_inputs`, and the instrument mapping
    file is loaded via `get_df_from_url`. The
    `run_script_to_update_insdc_biosample_identifiers` op receives the study id,
    the loaded mapping and the two flags, and its output is passed to
    `render_text`.
    """
    (
        study_id,
        instrument_mapping_url,
        include_field_site_info,
        enable_biosample_filtering,
    ) = get_database_updater_inputs()

    instrument_map_df = get_df_from_url(instrument_mapping_url)

    render_text(
        run_script_to_update_insdc_biosample_identifiers(
            study_id,
            instrument_map_df,
            include_field_site_info,
            enable_biosample_filtering,
        )
    )