nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,5 @@
1
1
  from io import BytesIO, StringIO
2
- from typing import Any, Dict, List, Union
2
+ from typing import Any, Dict, List
3
3
 
4
4
  from nmdc_runtime.api.endpoints.util import strip_oid
5
5
  from nmdc_runtime.minter.config import typecodes
@@ -99,31 +99,36 @@ def fetch_nucleotide_sequencing_from_biosamples(
99
99
  for biosample in biosamples_list:
100
100
  current_ids = [biosample["id"]]
101
101
  collected_ntseq_objects = []
102
+ processed_ids = set() # Track already processed nucleotide sequencing IDs
102
103
 
103
104
  while current_ids:
104
105
  new_current_ids = []
105
106
  for current_id in current_ids:
106
- query = {"has_input": current_id}
107
- document = all_docs_collection.find_one(query)
108
-
109
- if not document:
110
- continue
111
-
112
- has_output = document.get("has_output")
113
- if not has_output:
114
- continue
115
-
116
- for output_id in has_output:
117
- if get_classname_from_typecode(output_id) == "DataObject":
118
- nucleotide_sequencing_doc = data_generation_set.find_one(
119
- {"id": document["id"]}
120
- )
121
- if nucleotide_sequencing_doc:
122
- collected_ntseq_objects.append(
123
- strip_oid(nucleotide_sequencing_doc)
124
- )
125
- else:
126
- new_current_ids.append(output_id)
107
+ # Find all documents with current_id as input instead of just one
108
+ for document in all_docs_collection.find({"has_input": current_id}):
109
+ has_output = document.get("has_output")
110
+ if not has_output:
111
+ continue
112
+
113
+ for output_id in has_output:
114
+ if get_classname_from_typecode(output_id) == "DataObject":
115
+ # Only process if we haven't seen this document ID before
116
+ if document["id"] not in processed_ids:
117
+ nucleotide_sequencing_doc = (
118
+ data_generation_set.find_one(
119
+ {
120
+ "id": document["id"],
121
+ "type": "nmdc:NucleotideSequencing",
122
+ }
123
+ )
124
+ )
125
+ if nucleotide_sequencing_doc:
126
+ collected_ntseq_objects.append(
127
+ strip_oid(nucleotide_sequencing_doc)
128
+ )
129
+ processed_ids.add(document["id"])
130
+ else:
131
+ new_current_ids.append(output_id)
127
132
 
128
133
  current_ids = new_current_ids
129
134
 
@@ -187,10 +192,7 @@ def handle_quantity_value(slot_value):
187
192
  and "has_minimum_numeric_value" in slot_value
188
193
  and "has_unit" in slot_value
189
194
  ):
190
- range_value = (
191
- slot_value["has_maximum_numeric_value"]
192
- - slot_value["has_minimum_numeric_value"]
193
- )
195
+ range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
194
196
  return f"{range_value} {slot_value['has_unit']}"
195
197
  elif "has_raw_value" in slot_value:
196
198
  return slot_value["has_raw_value"]
@@ -273,6 +275,120 @@ def load_mappings(url):
273
275
  return attribute_mappings, slot_range_mappings
274
276
 
275
277
 
278
+ def check_pooling_for_biosamples(
279
+ material_processing_set: Collection, biosamples_list: List[Dict[str, Any]]
280
+ ) -> Dict[str, Dict[str, Any]]:
281
+ """Check which biosamples are part of pooling processes and return pooling information.
282
+
283
+ The way in which we check if a biosample is part of a Pooling process is by checking if
284
+ the biosample id has been asserted on the `has_input` slot/key of an `nmdc:Pooling` process
285
+ instance.
286
+
287
+ :param material_processing_set: reference to the material_processing_set collection
288
+ :param biosamples_list: list of all biosamples to check
289
+ :return: dictionary mapping biosample_id to pooling information (empty dict if not pooled)
290
+ """
291
+ result = {}
292
+ # get list of all biosample IDs that are part of a given study
293
+ biosample_lookup = {bs["id"]: bs for bs in biosamples_list}
294
+
295
+ # get list of all pooling processes
296
+ pooling_processes = list(material_processing_set.find({"type": "nmdc:Pooling"}))
297
+
298
+ # initialize all biosamples as not pooled
299
+ for biosample in biosamples_list:
300
+ result[biosample["id"]] = {}
301
+
302
+ # process each pooling process
303
+ for pooling_process in pooling_processes:
304
+ pooled_biosample_ids = pooling_process.get("has_input", [])
305
+
306
+ # get the processed sample output from the pooling process
307
+ has_output = pooling_process.get("has_output", [])
308
+ processed_sample_id = None
309
+
310
+ for output_id in has_output:
311
+ if get_classname_from_typecode(output_id) == "ProcessedSample":
312
+ processed_sample_id = output_id
313
+ break
314
+
315
+ # aggregate the values on `collection_date` and `depth` slots
316
+ # here, we are collecting the `collection_date` and `depth` values
317
+ # asserted on each of the biosamples that are part of a given pooling
318
+ # process in the following way:
319
+ # example of aggregated `collection_date`: 2017-06-05T16:50Z/2017-06-05T17:47Z
320
+ # example of aggregated `depth`: 0-10 m
321
+ collection_dates = []
322
+ depths = []
323
+
324
+ for bs_id in pooled_biosample_ids:
325
+ biosample = biosample_lookup.get(bs_id)
326
+ if not biosample:
327
+ continue
328
+
329
+ if "collection_date" in biosample:
330
+ collection_date = biosample["collection_date"]
331
+ if (
332
+ isinstance(collection_date, dict)
333
+ and "has_raw_value" in collection_date
334
+ ):
335
+ collection_dates.append(collection_date["has_raw_value"])
336
+ elif isinstance(collection_date, str):
337
+ collection_dates.append(collection_date)
338
+
339
+ if "depth" in biosample:
340
+ depth = biosample["depth"]
341
+ if isinstance(depth, dict):
342
+ if "has_numeric_value" in depth:
343
+ depths.append(depth["has_numeric_value"])
344
+ elif (
345
+ "has_minimum_numeric_value" in depth
346
+ and "has_maximum_numeric_value" in depth
347
+ ):
348
+ depths.extend(
349
+ [
350
+ depth["has_minimum_numeric_value"],
351
+ depth["has_maximum_numeric_value"],
352
+ ]
353
+ )
354
+ elif isinstance(depth, (int, float)):
355
+ depths.append(depth)
356
+
357
+ # create aggregated (forward slash separated) value for `collection_date`
358
+ aggregated_collection_date = None
359
+ if collection_dates:
360
+ sorted_dates = sorted(collection_dates)
361
+ if len(sorted_dates) > 1:
362
+ aggregated_collection_date = f"{sorted_dates[0]}/{sorted_dates[-1]}"
363
+ else:
364
+ aggregated_collection_date = sorted_dates[0]
365
+
366
+ # create aggregated (hyphen separated) value for `depth`
367
+ aggregated_depth = None
368
+ if depths:
369
+ min_depth = min(depths)
370
+ max_depth = max(depths)
371
+ if min_depth != max_depth:
372
+ aggregated_depth = f"{min_depth}-{max_depth} m"
373
+ else:
374
+ aggregated_depth = f"{min_depth} m"
375
+
376
+ # update all biosamples that are part of this pooling process
377
+ pooling_info = {
378
+ "processed_sample_id": processed_sample_id,
379
+ "pooling_process_id": pooling_process.get("id"),
380
+ "pooled_biosample_ids": pooled_biosample_ids,
381
+ "aggregated_collection_date": aggregated_collection_date,
382
+ "aggregated_depth": aggregated_depth,
383
+ }
384
+
385
+ for bs_id in pooled_biosample_ids:
386
+ if bs_id in result:
387
+ result[bs_id] = pooling_info
388
+
389
+ return result
390
+
391
+
276
392
  def validate_xml(xml, xsd_url):
277
393
  response = requests.get(xsd_url)
278
394
  response.raise_for_status()
@@ -1,7 +1,6 @@
1
1
  from dagster import graph
2
2
 
3
3
  from nmdc_runtime.site.ops import (
4
- build_merged_db,
5
4
  generate_biosample_set_for_nmdc_study_from_gold,
6
5
  nmdc_schema_database_export_filename,
7
6
  nmdc_schema_database_from_gold_study,
@@ -12,8 +11,6 @@ from nmdc_runtime.site.ops import (
12
11
  gold_projects_by_study,
13
12
  gold_study,
14
13
  poll_for_run_completion,
15
- run_etl,
16
- local_file_to_api_object,
17
14
  get_operation,
18
15
  produce_curated_db,
19
16
  delete_operations,
@@ -22,6 +19,7 @@ from nmdc_runtime.site.ops import (
22
19
  filter_ops_done_object_puts,
23
20
  hello,
24
21
  mongo_stats,
22
+ run_script_to_update_insdc_biosample_identifiers,
25
23
  submit_metadata_to_db,
26
24
  filter_ops_undone_expired,
27
25
  construct_jobs,
@@ -50,41 +48,27 @@ from nmdc_runtime.site.ops import (
50
48
  get_df_from_url,
51
49
  site_code_mapping,
52
50
  materialize_alldocs,
51
+ load_ontology,
53
52
  get_ncbi_export_pipeline_study,
54
53
  get_data_objects_from_biosamples,
55
54
  get_nucleotide_sequencing_from_biosamples,
56
55
  get_library_preparation_from_biosamples,
56
+ get_aggregated_pooled_biosamples,
57
57
  get_all_instruments,
58
58
  get_ncbi_export_pipeline_inputs,
59
59
  ncbi_submission_xml_from_nmdc_study,
60
60
  ncbi_submission_xml_asset,
61
+ render_text,
61
62
  get_database_updater_inputs,
62
63
  post_submission_portal_biosample_ingest_record_stitching_filename,
63
64
  generate_data_generation_set_post_biosample_ingest,
64
65
  get_instrument_ids_by_model,
65
66
  log_database_ids,
67
+ add_public_image_urls,
66
68
  )
67
69
  from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
68
70
 
69
71
 
70
- @graph
71
- def gold_translation():
72
- """
73
- Translating an export of the JGI GOLD [1] SQL database to the NMDC database JSON schema.
74
-
75
- [1] Genomes OnLine Database (GOLD) <https://gold.jgi.doe.gov/>.
76
- """
77
- local_file_to_api_object(run_etl(build_merged_db()))
78
-
79
-
80
- @graph()
81
- def gold_translation_curation():
82
- # TODO
83
- # - have produce_curated_db do actual curation (see notebook), persisting to db.
84
- # - more steps in pipeline? Or handoff via run_status_sensor on DagsterRunStatus.SUCCESS.
85
- produce_curated_db(get_operation())
86
-
87
-
88
72
  @graph()
89
73
  def create_objects_from_site_object_puts():
90
74
  delete_operations(
@@ -112,6 +96,16 @@ def ensure_alldocs():
112
96
  materialize_alldocs()
113
97
 
114
98
 
99
+ @graph
100
+ def run_ontology_load():
101
+ """
102
+ A graph for loading ontologies.
103
+ The source_ontology parameter is provided by the job configuration
104
+ and passed to the load_ontology op.
105
+ """
106
+ load_ontology()
107
+
108
+
115
109
  @graph
116
110
  def ensure_jobs():
117
111
  jobs = construct_jobs()
@@ -120,17 +114,24 @@ def ensure_jobs():
120
114
 
121
115
  @graph
122
116
  def apply_changesheet():
117
+ # Note: We use `_` as a "placeholder" variable.
118
+ # It's a variable to whose value we assign no significance. In this case, we use it to
119
+ # tell Dagster that one op depends upon the output of the other (so Dagster runs them
120
+ # in that order), without implying to maintainers that its value is significant to us.
121
+ # Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
122
+ # Reference (`_` variables): https://stackoverflow.com/a/47599668
123
123
  sheet_in = get_changesheet_in()
124
124
  outputs = perform_changesheet_updates(sheet_in)
125
- add_output_run_event(outputs)
126
- materialize_alldocs()
125
+ _ = add_output_run_event(outputs)
126
+ materialize_alldocs(waits_for=_)
127
127
 
128
128
 
129
129
  @graph
130
130
  def apply_metadata_in():
131
+ # Note: We use `_` as a "placeholder" variable.
131
132
  outputs = perform_mongo_updates(get_json_in())
132
- add_output_run_event(outputs)
133
- materialize_alldocs()
133
+ _ = add_output_run_event(outputs)
134
+ materialize_alldocs(waits_for=_)
134
135
 
135
136
 
136
137
  @graph
@@ -140,6 +141,7 @@ def gold_study_to_database():
140
141
  study_type,
141
142
  gold_nmdc_instrument_mapping_file_url,
142
143
  include_field_site_info,
144
+ enable_biosample_filtering,
143
145
  ) = get_gold_study_pipeline_inputs()
144
146
 
145
147
  projects = gold_projects_by_study(study_id)
@@ -156,6 +158,7 @@ def gold_study_to_database():
156
158
  analysis_projects,
157
159
  gold_nmdc_instrument_map_df,
158
160
  include_field_site_info,
161
+ enable_biosample_filtering,
159
162
  )
160
163
  database_dict = nmdc_schema_object_to_dict(database)
161
164
  filename = nmdc_schema_database_export_filename(study)
@@ -172,6 +175,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
172
175
  data_object_mapping_file_url,
173
176
  biosample_extras_file_url,
174
177
  biosample_extras_slot_mapping_file_url,
178
+ study_id,
175
179
  ) = get_submission_portal_pipeline_inputs()
176
180
 
177
181
  metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
@@ -192,6 +196,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
192
196
  biosample_extras=biosample_extras,
193
197
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
194
198
  instrument_mapping=instrument_mapping,
199
+ study_id=study_id,
195
200
  )
196
201
 
197
202
  validate_metadata(database)
@@ -212,6 +217,7 @@ def ingest_metadata_submission():
212
217
  data_object_mapping_file_url,
213
218
  biosample_extras_file_url,
214
219
  biosample_extras_slot_mapping_file_url,
220
+ study_id,
215
221
  ) = get_submission_portal_pipeline_inputs()
216
222
 
217
223
  metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
@@ -232,7 +238,9 @@ def ingest_metadata_submission():
232
238
  biosample_extras=biosample_extras,
233
239
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
234
240
  instrument_mapping=instrument_mapping,
241
+ study_id=study_id,
235
242
  )
243
+ database = add_public_image_urls(database, submission_id)
236
244
 
237
245
  log_database_ids(database)
238
246
 
@@ -471,6 +479,7 @@ def nmdc_study_to_ncbi_submission_export():
471
479
  )
472
480
  data_object_records = get_data_objects_from_biosamples(biosamples)
473
481
  library_preparation_records = get_library_preparation_from_biosamples(biosamples)
482
+ pooled_biosamples_data = get_aggregated_pooled_biosamples(biosamples)
474
483
  all_instruments = get_all_instruments()
475
484
  xml_data = ncbi_submission_xml_from_nmdc_study(
476
485
  nmdc_study,
@@ -480,17 +489,26 @@ def nmdc_study_to_ncbi_submission_export():
480
489
  data_object_records,
481
490
  library_preparation_records,
482
491
  all_instruments,
492
+ pooled_biosamples_data,
483
493
  )
484
494
  ncbi_submission_xml_asset(xml_data)
485
495
 
486
496
 
487
497
  @graph
488
498
  def generate_data_generation_set_for_biosamples_in_nmdc_study():
489
- (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
499
+ (
500
+ study_id,
501
+ gold_nmdc_instrument_mapping_file_url,
502
+ include_field_site_info,
503
+ enable_biosample_filtering,
504
+ ) = get_database_updater_inputs()
490
505
  gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
491
506
 
492
507
  database = generate_data_generation_set_post_biosample_ingest(
493
- study_id, gold_nmdc_instrument_map_df
508
+ study_id,
509
+ gold_nmdc_instrument_map_df,
510
+ include_field_site_info,
511
+ enable_biosample_filtering,
494
512
  )
495
513
 
496
514
  database_dict = nmdc_schema_object_to_dict(database)
@@ -503,11 +521,19 @@ def generate_data_generation_set_for_biosamples_in_nmdc_study():
503
521
 
504
522
  @graph
505
523
  def generate_biosample_set_from_samples_in_gold():
506
- (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
524
+ (
525
+ study_id,
526
+ gold_nmdc_instrument_mapping_file_url,
527
+ include_field_site_info,
528
+ enable_biosample_filtering,
529
+ ) = get_database_updater_inputs()
507
530
  gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
508
531
 
509
532
  database = generate_biosample_set_for_nmdc_study_from_gold(
510
- study_id, gold_nmdc_instrument_map_df
533
+ study_id,
534
+ gold_nmdc_instrument_map_df,
535
+ include_field_site_info,
536
+ enable_biosample_filtering,
511
537
  )
512
538
  database_dict = nmdc_schema_object_to_dict(database)
513
539
  filename = post_submission_portal_biosample_ingest_record_stitching_filename(
@@ -515,3 +541,28 @@ def generate_biosample_set_from_samples_in_gold():
515
541
  )
516
542
  outputs = export_json_to_drs(database_dict, filename)
517
543
  add_output_run_event(outputs)
544
+
545
+
546
+ @graph
547
+ def generate_update_script_for_insdc_biosample_identifiers():
548
+ """Generate a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.
549
+
550
+ This graph fetches the necessary inputs, then calls the run_script_to_update_insdc_biosample_identifiers op
551
+ to generate a script for updating biosample records with INSDC identifiers obtained from GOLD.
552
+ The script is returned as a dictionary that can be executed against MongoDB.
553
+ """
554
+ (
555
+ study_id,
556
+ gold_nmdc_instrument_mapping_file_url,
557
+ include_field_site_info,
558
+ enable_biosample_filtering,
559
+ ) = get_database_updater_inputs()
560
+ gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
561
+
562
+ update_script = run_script_to_update_insdc_biosample_identifiers(
563
+ study_id,
564
+ gold_nmdc_instrument_map_df,
565
+ include_field_site_info,
566
+ enable_biosample_filtering,
567
+ )
568
+ render_text(update_script)