nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,6 +1,5 @@
1
1
  import json
2
2
 
3
- from typing import Any
4
3
 
5
4
  from dagster import (
6
5
  repository,
@@ -25,11 +24,10 @@ from nmdc_runtime.api.models.run import _add_run_fail_event
25
24
  from nmdc_runtime.api.models.trigger import Trigger
26
25
  from nmdc_runtime.site.export.study_metadata import export_study_biosamples_metadata
27
26
  from nmdc_runtime.site.graphs import (
27
+ generate_biosample_set_from_samples_in_gold,
28
28
  translate_metadata_submission_to_nmdc_schema_database,
29
29
  ingest_metadata_submission,
30
30
  gold_study_to_database,
31
- gold_translation,
32
- gold_translation_curation,
33
31
  create_objects_from_site_object_puts,
34
32
  housekeeping,
35
33
  ensure_jobs,
@@ -38,8 +36,15 @@ from nmdc_runtime.site.graphs import (
38
36
  hello_graph,
39
37
  translate_neon_api_soil_metadata_to_nmdc_schema_database,
40
38
  translate_neon_api_benthic_metadata_to_nmdc_schema_database,
39
+ translate_neon_api_surface_water_metadata_to_nmdc_schema_database,
41
40
  ingest_neon_soil_metadata,
42
41
  ingest_neon_benthic_metadata,
42
+ ingest_neon_surface_water_metadata,
43
+ ensure_alldocs,
44
+ run_ontology_load,
45
+ nmdc_study_to_ncbi_submission_export,
46
+ generate_data_generation_set_for_biosamples_in_nmdc_study,
47
+ generate_update_script_for_insdc_biosample_identifiers,
43
48
  )
44
49
  from nmdc_runtime.site.resources import (
45
50
  get_mongo,
@@ -48,15 +53,11 @@ from nmdc_runtime.site.resources import (
48
53
  nmdc_portal_api_client_resource,
49
54
  gold_api_client_resource,
50
55
  neon_api_client_resource,
51
- terminus_resource,
52
56
  mongo_resource,
53
57
  )
54
58
  from nmdc_runtime.site.resources import (
55
59
  get_runtime_api_site_client,
56
60
  )
57
- from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
58
- from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
59
- from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
60
61
  from nmdc_runtime.util import freeze
61
62
  from nmdc_runtime.util import unfreeze
62
63
 
@@ -66,7 +67,6 @@ resource_defs = {
66
67
  "nmdc_portal_api_client": nmdc_portal_api_client_resource,
67
68
  "gold_api_client": gold_api_client_resource,
68
69
  "neon_api_client": neon_api_client_resource,
69
- "terminus": terminus_resource,
70
70
  "mongo": mongo_resource,
71
71
  }
72
72
 
@@ -111,6 +111,62 @@ housekeeping_weekly = ScheduleDefinition(
111
111
  job=housekeeping.to_job(**preset_normal),
112
112
  )
113
113
 
114
+ ensure_alldocs_daily = ScheduleDefinition(
115
+ name="daily_ensure_alldocs",
116
+ cron_schedule="0 3 * * *",
117
+ execution_timezone="America/New_York",
118
+ job=ensure_alldocs.to_job(**preset_normal),
119
+ )
120
+
121
+
122
+ load_envo_ontology_weekly = ScheduleDefinition(
123
+ name="weekly_load_envo_ontology",
124
+ cron_schedule="0 7 * * 1",
125
+ execution_timezone="America/New_York",
126
+ job=run_ontology_load.to_job(
127
+ name="scheduled_envo_ontology_load",
128
+ config=unfreeze(
129
+ merge(
130
+ run_config_frozen__normal_env,
131
+ {"ops": {"load_ontology": {"config": {"source_ontology": "envo"}}}},
132
+ )
133
+ ),
134
+ resource_defs=resource_defs,
135
+ ),
136
+ )
137
+
138
+ load_uberon_ontology_weekly = ScheduleDefinition(
139
+ name="weekly_load_uberon_ontology",
140
+ cron_schedule="0 8 * * 1",
141
+ execution_timezone="America/New_York",
142
+ job=run_ontology_load.to_job(
143
+ name="scheduled_uberon_ontology_load",
144
+ config=unfreeze(
145
+ merge(
146
+ run_config_frozen__normal_env,
147
+ {"ops": {"load_ontology": {"config": {"source_ontology": "uberon"}}}},
148
+ )
149
+ ),
150
+ resource_defs=resource_defs,
151
+ ),
152
+ )
153
+
154
+ load_po_ontology_weekly = ScheduleDefinition(
155
+ name="weekly_load_po_ontology",
156
+ cron_schedule="0 9 * * 1",
157
+ execution_timezone="America/New_York",
158
+ job=run_ontology_load.to_job(
159
+ name="scheduled_po_ontology_load",
160
+ config=unfreeze(
161
+ merge(
162
+ run_config_frozen__normal_env,
163
+ {"ops": {"load_ontology": {"config": {"source_ontology": "po"}}}},
164
+ )
165
+ ),
166
+ resource_defs=resource_defs,
167
+ ),
168
+ )
169
+
114
170
 
115
171
  def asset_materialization_metadata(asset_event, key):
116
172
  """Get metadata from an asset materialization event.
@@ -186,82 +242,6 @@ def process_workflow_job_triggers(_context):
186
242
  yield SkipReason("No new jobs required")
187
243
 
188
244
 
189
- @asset_sensor(
190
- asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
191
- job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
192
- )
193
- def ensure_gold_translation_job(_context, asset_event):
194
- mdb = get_mongo(run_config_frozen__normal_env).db
195
- gold_etl_latest = mdb.objects.find_one(
196
- {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
197
- )
198
- sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
199
- if gold_etl_latest is None:
200
- yield SkipReason("can't find sensed asset object_id in database")
201
- return
202
- elif gold_etl_latest["id"] != sensed_object_id:
203
- yield SkipReason("later object than sensed materialization")
204
- return
205
-
206
- run_config = merge(
207
- run_config_frozen__normal_env,
208
- {
209
- "solids": {
210
- "construct_jobs": {
211
- "config": {
212
- "base_jobs": [
213
- {
214
- "workflow": {"id": "gold-translation-1.0.0"},
215
- "config": {"object_id": gold_etl_latest["id"]},
216
- }
217
- ]
218
- }
219
- }
220
- }
221
- },
222
- )
223
- yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
224
-
225
-
226
- @asset_sensor(
227
- asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
228
- job=gold_translation_curation.to_job(**preset_normal),
229
- )
230
- def claim_and_run_gold_translation_curation(_context, asset_event):
231
- client = get_runtime_api_site_client(run_config_frozen__normal_env)
232
- mdb = get_mongo(run_config_frozen__normal_env).db
233
- object_id_latest = asset_materialization_metadata(
234
- asset_event, "object_id_latest"
235
- ).text
236
- job = mdb.jobs.find_one(
237
- {
238
- "workflow.id": "gold-translation-1.0.0",
239
- "config.object_id_latest": object_id_latest,
240
- }
241
- )
242
- if job is not None:
243
- rv = client.claim_job(job["id"])
244
- if rv.status_code == status.HTTP_200_OK:
245
- operation = rv.json()
246
- run_config = merge(
247
- run_config_frozen__normal_env,
248
- {
249
- "ops": {
250
- "get_operation": {
251
- "config": {
252
- "operation_id": operation["id"],
253
- }
254
- }
255
- }
256
- },
257
- )
258
- yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
259
- else:
260
- yield SkipReason("Job found, but already claimed by this site")
261
- else:
262
- yield SkipReason("No job found")
263
-
264
-
265
245
  @sensor(
266
246
  job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
267
247
  default_status=DefaultSensorStatus.RUNNING,
@@ -400,11 +380,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
400
380
  yield SkipReason("; ".join(skip_notes))
401
381
 
402
382
 
403
- # TODO ensure data_object_type values from file_type_enum
404
- # see /metadata-translation/notebooks/202106_curation_updates.ipynb
405
- # for details ("Create file_type_enum collection" section).
406
-
407
-
408
383
  @sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
409
384
  def done_object_put_ops(_context):
410
385
  client = get_runtime_api_site_client(run_config_frozen__normal_env)
@@ -444,17 +419,21 @@ def on_run_fail(context: RunStatusSensorContext):
444
419
  @repository
445
420
  def repo():
446
421
  graph_jobs = [
447
- gold_translation.to_job(**preset_normal),
448
422
  hello_graph.to_job(name="hello_job"),
449
423
  ensure_jobs.to_job(**preset_normal),
450
424
  apply_metadata_in.to_job(**preset_normal),
451
425
  export_study_biosamples_metadata.to_job(**preset_normal),
426
+ ensure_alldocs.to_job(**preset_normal),
427
+ ]
428
+ schedules = [
429
+ housekeeping_weekly,
430
+ ensure_alldocs_daily,
431
+ load_envo_ontology_weekly,
432
+ load_uberon_ontology_weekly,
433
+ load_po_ontology_weekly,
452
434
  ]
453
- schedules = [housekeeping_weekly]
454
435
  sensors = [
455
436
  done_object_put_ops,
456
- ensure_gold_translation_job,
457
- claim_and_run_gold_translation_curation,
458
437
  process_workflow_job_triggers,
459
438
  claim_and_run_apply_changesheet_jobs,
460
439
  claim_and_run_metadata_in_jobs,
@@ -464,20 +443,6 @@ def repo():
464
443
  return graph_jobs + schedules + sensors
465
444
 
466
445
 
467
- @repository
468
- def translation():
469
- graph_jobs = [jgi_job, gold_job, emsl_job]
470
-
471
- return graph_jobs
472
-
473
-
474
- @repository
475
- def test_translation():
476
- graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
477
-
478
- return graph_jobs
479
-
480
-
481
446
  @repository
482
447
  def biosample_submission_ingest():
483
448
  normal_resources = run_config_frozen__normal_env["resources"]
@@ -498,7 +463,15 @@ def biosample_submission_ingest():
498
463
  },
499
464
  ),
500
465
  "ops": {
501
- "get_gold_study_pipeline_inputs": {"config": {"study_id": ""}},
466
+ "get_gold_study_pipeline_inputs": {
467
+ "config": {
468
+ "study_id": "",
469
+ "study_type": "research_study",
470
+ "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
471
+ "include_field_site_info": False,
472
+ "enable_biosample_filtering": True,
473
+ },
474
+ },
502
475
  "export_json_to_drs": {"config": {"username": ""}},
503
476
  },
504
477
  },
@@ -513,8 +486,8 @@ def biosample_submission_ingest():
513
486
  "nmdc_portal_api_client": {
514
487
  "config": {
515
488
  "base_url": {"env": "NMDC_PORTAL_API_BASE_URL"},
516
- "session_cookie": {
517
- "env": "NMDC_PORTAL_API_SESSION_COOKIE"
489
+ "refresh_token": {
490
+ "env": "NMDC_PORTAL_API_REFRESH_TOKEN"
518
491
  },
519
492
  }
520
493
  }
@@ -525,18 +498,16 @@ def biosample_submission_ingest():
525
498
  "get_submission_portal_pipeline_inputs": {
526
499
  "inputs": {
527
500
  "submission_id": "",
528
- "omics_processing_mapping_file_url": None,
501
+ "nucleotide_sequencing_mapping_file_url": None,
529
502
  "data_object_mapping_file_url": None,
530
503
  "biosample_extras_file_url": None,
531
504
  "biosample_extras_slot_mapping_file_url": None,
505
+ "study_id": None,
532
506
  }
533
507
  },
534
508
  "translate_portal_submission_to_nmdc_schema_database": {
535
509
  "inputs": {
536
- "study_category": None,
537
- "study_doi_category": None,
538
- "study_doi_provider": None,
539
- "study_funding_sources": None,
510
+ "study_category": "research_study",
540
511
  "study_pi_image_url": None,
541
512
  }
542
513
  },
@@ -553,8 +524,8 @@ def biosample_submission_ingest():
553
524
  "nmdc_portal_api_client": {
554
525
  "config": {
555
526
  "base_url": {"env": "NMDC_PORTAL_API_BASE_URL"},
556
- "session_cookie": {
557
- "env": "NMDC_PORTAL_API_SESSION_COOKIE"
527
+ "refresh_token": {
528
+ "env": "NMDC_PORTAL_API_REFRESH_TOKEN"
558
529
  },
559
530
  }
560
531
  }
@@ -564,18 +535,16 @@ def biosample_submission_ingest():
564
535
  "get_submission_portal_pipeline_inputs": {
565
536
  "inputs": {
566
537
  "submission_id": "",
567
- "omics_processing_mapping_file_url": None,
538
+ "nucleotide_sequencing_mapping_file_url": None,
568
539
  "data_object_mapping_file_url": None,
569
540
  "biosample_extras_file_url": None,
570
541
  "biosample_extras_slot_mapping_file_url": None,
542
+ "study_id": None,
571
543
  }
572
544
  },
573
545
  "translate_portal_submission_to_nmdc_schema_database": {
574
546
  "inputs": {
575
547
  "study_category": None,
576
- "study_doi_category": None,
577
- "study_doi_provider": None,
578
- "study_funding_sources": None,
579
548
  "study_pi_image_url": None,
580
549
  }
581
550
  },
@@ -635,6 +604,7 @@ def biosample_submission_ingest():
635
604
  "inputs": {
636
605
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
637
606
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
607
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
638
608
  }
639
609
  },
640
610
  },
@@ -676,6 +646,7 @@ def biosample_submission_ingest():
676
646
  "inputs": {
677
647
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
678
648
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
649
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
679
650
  }
680
651
  },
681
652
  },
@@ -718,13 +689,14 @@ def biosample_submission_ingest():
718
689
  "inputs": {
719
690
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
720
691
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
692
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
721
693
  }
722
694
  },
723
695
  "get_neon_pipeline_benthic_data_product": {
724
696
  "config": {
725
697
  "benthic_data_product": {
726
698
  "product_id": "DP1.20279.001",
727
- "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
699
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
728
700
  }
729
701
  }
730
702
  },
@@ -751,7 +723,7 @@ def biosample_submission_ingest():
751
723
  "config": {
752
724
  "benthic_data_product": {
753
725
  "product_id": "DP1.20279.001",
754
- "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
726
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
755
727
  }
756
728
  }
757
729
  },
@@ -759,6 +731,92 @@ def biosample_submission_ingest():
759
731
  "inputs": {
760
732
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
761
733
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
734
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
735
+ }
736
+ },
737
+ },
738
+ },
739
+ ),
740
+ translate_neon_api_surface_water_metadata_to_nmdc_schema_database.to_job(
741
+ description="This job fetches the metadata associated with a given NEON data product code and translates it into an equivalent nmdc:Database object. The object is serialized to JSON and stored in DRS. This can be considered a dry-run for the `ingest_neon_metadata` job.",
742
+ resource_defs=resource_defs,
743
+ config={
744
+ "resources": merge(
745
+ unfreeze(normal_resources),
746
+ {
747
+ "neon_api_client": {
748
+ "config": {
749
+ "base_url": {"env": "NEON_API_BASE_URL"},
750
+ "api_token": {"env": "NEON_API_TOKEN"},
751
+ },
752
+ },
753
+ "mongo": {
754
+ "config": {
755
+ "dbname": {"env": "MONGO_DBNAME"},
756
+ "host": {"env": "MONGO_HOST"},
757
+ "password": {"env": "MONGO_PASSWORD"},
758
+ "username": {"env": "MONGO_USERNAME"},
759
+ },
760
+ },
761
+ "runtime_api_site_client": {
762
+ "config": {
763
+ "base_url": {"env": "API_HOST"},
764
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
765
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
766
+ "site_id": {"env": "API_SITE_ID"},
767
+ },
768
+ },
769
+ },
770
+ ),
771
+ "ops": {
772
+ "export_json_to_drs": {"config": {"username": "..."}},
773
+ "get_neon_pipeline_inputs": {
774
+ "inputs": {
775
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
776
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
777
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
778
+ }
779
+ },
780
+ "get_neon_pipeline_surface_water_data_product": {
781
+ "config": {
782
+ "surface_water_data_product": {
783
+ "product_id": "DP1.20281.001",
784
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
785
+ }
786
+ }
787
+ },
788
+ },
789
+ },
790
+ ),
791
+ ingest_neon_surface_water_metadata.to_job(
792
+ description="",
793
+ resource_defs=resource_defs,
794
+ config={
795
+ "resources": merge(
796
+ unfreeze(normal_resources),
797
+ {
798
+ "neon_api_client": {
799
+ "config": {
800
+ "base_url": {"env": "NEON_API_BASE_URL"},
801
+ "api_token": {"env": "NEON_API_TOKEN"},
802
+ },
803
+ }
804
+ },
805
+ ),
806
+ "ops": {
807
+ "get_neon_pipeline_surface_water_data_product": {
808
+ "config": {
809
+ "surface_water_data_product": {
810
+ "product_id": "DP1.20281.001",
811
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
812
+ }
813
+ }
814
+ },
815
+ "get_neon_pipeline_inputs": {
816
+ "inputs": {
817
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
818
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
819
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
762
820
  }
763
821
  },
764
822
  },
@@ -767,13 +825,190 @@ def biosample_submission_ingest():
767
825
  ]
768
826
 
769
827
 
770
- # @repository
771
- # def validation():
772
- # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
773
- # return graph_jobs
774
- #
775
- #
776
- # @repository
777
- # def test_validation():
778
- # graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
779
- # return graph_jobs
828
+ @repository
829
+ def biosample_export():
830
+ normal_resources = run_config_frozen__normal_env["resources"]
831
+ return [
832
+ nmdc_study_to_ncbi_submission_export.to_job(
833
+ resource_defs=resource_defs,
834
+ config={
835
+ "resources": merge(
836
+ unfreeze(normal_resources),
837
+ {
838
+ "mongo": {
839
+ "config": {
840
+ "host": {"env": "MONGO_HOST"},
841
+ "username": {"env": "MONGO_USERNAME"},
842
+ "password": {"env": "MONGO_PASSWORD"},
843
+ "dbname": {"env": "MONGO_DBNAME"},
844
+ },
845
+ },
846
+ "runtime_api_site_client": {
847
+ "config": {
848
+ "base_url": {"env": "API_HOST"},
849
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
850
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
851
+ "site_id": {"env": "API_SITE_ID"},
852
+ },
853
+ },
854
+ },
855
+ ),
856
+ "ops": {
857
+ "get_ncbi_export_pipeline_study": {
858
+ "config": {
859
+ "nmdc_study_id": "",
860
+ }
861
+ },
862
+ "get_ncbi_export_pipeline_inputs": {
863
+ "config": {
864
+ "nmdc_ncbi_attribute_mapping_file_url": "",
865
+ "ncbi_submission_metadata": {
866
+ "organization": "",
867
+ },
868
+ "ncbi_biosample_metadata": {
869
+ "organism_name": "",
870
+ },
871
+ }
872
+ },
873
+ },
874
+ },
875
+ ),
876
+ ]
877
+
878
+
879
+ @repository
880
+ def database_records_stitching():
881
+ normal_resources = run_config_frozen__normal_env["resources"]
882
+ return [
883
+ generate_data_generation_set_for_biosamples_in_nmdc_study.to_job(
884
+ description="This job can be used to create a data_generation_set JSON for biosamples that are already present in the NMDC database.",
885
+ resource_defs=resource_defs,
886
+ config={
887
+ "resources": merge(
888
+ unfreeze(normal_resources),
889
+ {
890
+ "runtime_api_user_client": {
891
+ "config": {
892
+ "base_url": {"env": "API_HOST"},
893
+ "username": {"env": "API_ADMIN_USER"},
894
+ "password": {"env": "API_ADMIN_PASS"},
895
+ },
896
+ },
897
+ "runtime_api_site_client": {
898
+ "config": {
899
+ "base_url": {"env": "API_HOST"},
900
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
901
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
902
+ "site_id": {"env": "API_SITE_ID"},
903
+ },
904
+ },
905
+ "gold_api_client": {
906
+ "config": {
907
+ "base_url": {"env": "GOLD_API_BASE_URL"},
908
+ "username": {"env": "GOLD_API_USERNAME"},
909
+ "password": {"env": "GOLD_API_PASSWORD"},
910
+ },
911
+ },
912
+ },
913
+ ),
914
+ "ops": {
915
+ "get_database_updater_inputs": {
916
+ "config": {
917
+ "nmdc_study_id": "",
918
+ "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
919
+ "include_field_site_info": False,
920
+ "enable_biosample_filtering": True,
921
+ }
922
+ },
923
+ "export_json_to_drs": {"config": {"username": ""}},
924
+ },
925
+ },
926
+ ),
927
+ generate_biosample_set_from_samples_in_gold.to_job(
928
+ description="This job can be used to create a biosample_set JSON from samples in GOLD for a given study in NMDC.",
929
+ resource_defs=resource_defs,
930
+ config={
931
+ "resources": merge(
932
+ unfreeze(normal_resources),
933
+ {
934
+ "runtime_api_user_client": {
935
+ "config": {
936
+ "base_url": {"env": "API_HOST"},
937
+ "username": {"env": "API_ADMIN_USER"},
938
+ "password": {"env": "API_ADMIN_PASS"},
939
+ },
940
+ },
941
+ "runtime_api_site_client": {
942
+ "config": {
943
+ "base_url": {"env": "API_HOST"},
944
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
945
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
946
+ "site_id": {"env": "API_SITE_ID"},
947
+ },
948
+ },
949
+ "gold_api_client": {
950
+ "config": {
951
+ "base_url": {"env": "GOLD_API_BASE_URL"},
952
+ "username": {"env": "GOLD_API_USERNAME"},
953
+ "password": {"env": "GOLD_API_PASSWORD"},
954
+ },
955
+ },
956
+ },
957
+ ),
958
+ "ops": {
959
+ "get_database_updater_inputs": {
960
+ "config": {
961
+ "nmdc_study_id": "",
962
+ "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
963
+ "include_field_site_info": False,
964
+ "enable_biosample_filtering": True,
965
+ }
966
+ },
967
+ "export_json_to_drs": {"config": {"username": ""}},
968
+ },
969
+ },
970
+ ),
971
+ generate_update_script_for_insdc_biosample_identifiers.to_job(
972
+ description="This job generates a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.",
973
+ resource_defs=resource_defs,
974
+ config={
975
+ "resources": merge(
976
+ unfreeze(normal_resources),
977
+ {
978
+ "runtime_api_user_client": {
979
+ "config": {
980
+ "base_url": {"env": "API_HOST"},
981
+ "username": {"env": "API_ADMIN_USER"},
982
+ "password": {"env": "API_ADMIN_PASS"},
983
+ },
984
+ },
985
+ "runtime_api_site_client": {
986
+ "config": {
987
+ "base_url": {"env": "API_HOST"},
988
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
989
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
990
+ "site_id": {"env": "API_SITE_ID"},
991
+ },
992
+ },
993
+ "gold_api_client": {
994
+ "config": {
995
+ "base_url": {"env": "GOLD_API_BASE_URL"},
996
+ "username": {"env": "GOLD_API_USERNAME"},
997
+ "password": {"env": "GOLD_API_PASSWORD"},
998
+ },
999
+ },
1000
+ },
1001
+ ),
1002
+ "ops": {
1003
+ "get_database_updater_inputs": {
1004
+ "config": {
1005
+ "nmdc_study_id": "",
1006
+ "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1007
+ "include_field_site_info": False,
1008
+ "enable_biosample_filtering": True,
1009
+ }
1010
+ },
1011
+ },
1012
+ },
1013
+ ),
1014
+ ]