nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of nmdc-runtime might be problematic; consult the package registry's advisory page for more details.

@@ -1,5 +1,5 @@
1
1
  from functools import lru_cache
2
- from typing import Any, Dict, List
2
+ from typing import Any, Dict, List, Union
3
3
  import pandas as pd
4
4
  from nmdc_runtime.site.resources import (
5
5
  RuntimeApiUserClient,
@@ -18,6 +18,8 @@ class DatabaseUpdater:
18
18
  gold_api_client: GoldApiClient,
19
19
  study_id: str,
20
20
  gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
21
+ include_field_site_info: bool = False,
22
+ enable_biosample_filtering: bool = True,
21
23
  ):
22
24
  """This class serves as an API for repairing connections in the database by
23
25
  adding records that are essentially missing "links"/"connections". As we identify
@@ -39,6 +41,8 @@ class DatabaseUpdater:
39
41
  self.gold_api_client = gold_api_client
40
42
  self.study_id = study_id
41
43
  self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
44
+ self.include_field_site_info = include_field_site_info
45
+ self.enable_biosample_filtering = enable_biosample_filtering
42
46
 
43
47
  @lru_cache
44
48
  def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
@@ -95,6 +99,8 @@ class DatabaseUpdater:
95
99
  biosamples=all_gold_biosamples,
96
100
  projects=all_gold_projects,
97
101
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
102
+ include_field_site_info=self.include_field_site_info,
103
+ enable_biosample_filtering=self.enable_biosample_filtering,
98
104
  )
99
105
 
100
106
  # The GoldStudyTranslator class has some pre-processing logic which filters out
@@ -214,6 +220,8 @@ class DatabaseUpdater:
214
220
  projects=gold_sequencing_projects_for_study,
215
221
  analysis_projects=gold_analysis_projects_for_study,
216
222
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
223
+ include_field_site_info=self.include_field_site_info,
224
+ enable_biosample_filtering=self.enable_biosample_filtering,
217
225
  )
218
226
 
219
227
  translated_biosamples = gold_study_translator.biosamples
@@ -240,3 +248,204 @@ class DatabaseUpdater:
240
248
  ]
241
249
 
242
250
  return database
251
+
252
+ def queries_run_script_to_update_insdc_identifiers(
253
+ self,
254
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
255
+ """This method creates a `/queries:run` API endpoint compatible update script that can be run
256
+ using that API endpoint to update/add information on the `insdc_biosample_identifiers` field
257
+ of biosample_set records and the `insdc_bioproject_identifiers` field on data_generation_set records.
258
+
259
+ The information to be asserted is retrieved from the `ncbiBioSampleAccession` and
260
+ `ncbiBioProjectAccession` fields on the GOLD `/projects` API endpoint.
261
+
262
+ :return: A `/queries:run` update query compatible script serialized as a dictionary/JSON.
263
+ """
264
+ # Fetch all biosamples associated with the study
265
+ biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
266
+ self.study_id
267
+ )
268
+
269
+ # Fetch all data_generation records associated with the study
270
+ data_generation_set = (
271
+ self.runtime_api_user_client.get_data_generation_records_for_study(
272
+ self.study_id
273
+ )
274
+ )
275
+
276
+ biosample_updates = []
277
+ data_generation_updates = []
278
+
279
+ # Dictionary to store gold_project_id -> ncbi_bioproject_accession mapping
280
+ gold_project_to_bioproject = {}
281
+
282
+ # Dictionary to store all project data we gather during biosample processing
283
+ all_processed_projects = {}
284
+
285
+ # Process biosamples for insdc_biosample_identifiers
286
+ for biosample in biosample_set:
287
+ # get the list (usually one) of GOLD biosample identifiers on the gold_biosample_identifiers slot
288
+ gold_biosample_identifiers = biosample.get("gold_biosample_identifiers", [])
289
+ if not gold_biosample_identifiers:
290
+ continue
291
+
292
+ biosample_id = biosample.get("id")
293
+ if not biosample_id:
294
+ continue
295
+
296
+ insdc_biosample_identifiers = []
297
+
298
+ for gold_biosample_id in gold_biosample_identifiers:
299
+ normalized_id = gold_biosample_id.replace("gold:", "")
300
+
301
+ # fetch projects associated with a GOLD biosample from the GOLD `/projects` API endpoint
302
+ gold_projects = self.gold_api_client.fetch_projects_by_biosample(
303
+ normalized_id
304
+ )
305
+
306
+ for project in gold_projects:
307
+ # Store each project for later use
308
+ project_gold_id = project.get("projectGoldId")
309
+ if project_gold_id:
310
+ all_processed_projects[project_gold_id] = project
311
+
312
+ # Collect ncbi_biosample_accession for biosample updates
313
+ ncbi_biosample_accession = project.get("ncbiBioSampleAccession")
314
+ if ncbi_biosample_accession and ncbi_biosample_accession.strip():
315
+ insdc_biosample_identifiers.append(ncbi_biosample_accession)
316
+
317
+ # Collect ncbi_bioproject_accession for data_generation records
318
+ ncbi_bioproject_accession = project.get("ncbiBioProjectAccession")
319
+ if (
320
+ project_gold_id
321
+ and ncbi_bioproject_accession
322
+ and ncbi_bioproject_accession.strip()
323
+ ):
324
+ gold_project_to_bioproject[project_gold_id] = (
325
+ ncbi_bioproject_accession
326
+ )
327
+
328
+ if insdc_biosample_identifiers:
329
+ existing_insdc_biosample_identifiers = biosample.get(
330
+ "insdc_biosample_identifiers", []
331
+ )
332
+ new_insdc_biosample_identifiers = list(
333
+ set(insdc_biosample_identifiers)
334
+ - set(existing_insdc_biosample_identifiers)
335
+ )
336
+
337
+ if new_insdc_biosample_identifiers:
338
+ prefixed_new_biosample_identifiers = [
339
+ f"biosample:{id}" for id in new_insdc_biosample_identifiers
340
+ ]
341
+
342
+ if existing_insdc_biosample_identifiers:
343
+ all_biosample_identifiers = list(
344
+ set(
345
+ existing_insdc_biosample_identifiers
346
+ + prefixed_new_biosample_identifiers
347
+ )
348
+ )
349
+ biosample_updates.append(
350
+ {
351
+ "q": {"id": biosample_id},
352
+ "u": {
353
+ "$set": {
354
+ "insdc_biosample_identifiers": all_biosample_identifiers
355
+ }
356
+ },
357
+ }
358
+ )
359
+ else:
360
+ biosample_updates.append(
361
+ {
362
+ "q": {"id": biosample_id},
363
+ "u": {
364
+ "$set": {
365
+ "insdc_biosample_identifiers": prefixed_new_biosample_identifiers
366
+ }
367
+ },
368
+ }
369
+ )
370
+
371
+ # Process data_generation records for insdc_bioproject_identifiers
372
+ for data_generation in data_generation_set:
373
+ data_generation_id = data_generation.get("id")
374
+ if not data_generation_id:
375
+ continue
376
+
377
+ # Extract existing insdc_bioproject_identifiers
378
+ existing_insdc_bioproject_identifiers = data_generation.get(
379
+ "insdc_bioproject_identifiers", []
380
+ )
381
+
382
+ collected_insdc_bioproject_identifiers = set()
383
+
384
+ # Add any project identifiers already on the record
385
+ if "insdc_bioproject_identifiers" in data_generation:
386
+ for identifier in data_generation["insdc_bioproject_identifiers"]:
387
+ collected_insdc_bioproject_identifiers.add(identifier)
388
+
389
+ # If there are gold_sequencing_project_identifiers, use our pre-collected mapping
390
+ gold_project_identifiers = data_generation.get(
391
+ "gold_sequencing_project_identifiers", []
392
+ )
393
+ for gold_project_id in gold_project_identifiers:
394
+ normalized_id = gold_project_id.replace("gold:", "")
395
+
396
+ # Check if we have a bioproject ID for this GOLD project ID
397
+ if normalized_id in gold_project_to_bioproject:
398
+ ncbi_bioproject_accession = gold_project_to_bioproject[
399
+ normalized_id
400
+ ]
401
+ collected_insdc_bioproject_identifiers.add(
402
+ f"bioproject:{ncbi_bioproject_accession}"
403
+ )
404
+ else:
405
+ # Only if we don't have it in our mapping, try to fetch it
406
+ # Instead of making a direct API request, check if we've already seen this project
407
+ if normalized_id in all_processed_projects:
408
+ project_data = all_processed_projects[normalized_id]
409
+ ncbi_bioproject_accession = project_data.get(
410
+ "ncbiBioProjectAccession"
411
+ )
412
+ if (
413
+ ncbi_bioproject_accession
414
+ and ncbi_bioproject_accession.strip()
415
+ ):
416
+ collected_insdc_bioproject_identifiers.add(
417
+ f"bioproject:{ncbi_bioproject_accession}"
418
+ )
419
+ # Add to our mapping for future reference
420
+ gold_project_to_bioproject[normalized_id] = (
421
+ ncbi_bioproject_accession
422
+ )
423
+
424
+ # Create a list from the set of collected identifiers
425
+ collected_insdc_bioproject_identifiers = list(
426
+ collected_insdc_bioproject_identifiers
427
+ )
428
+
429
+ # Only update if there are identifiers to add
430
+ if collected_insdc_bioproject_identifiers and set(
431
+ collected_insdc_bioproject_identifiers
432
+ ) != set(existing_insdc_bioproject_identifiers):
433
+ data_generation_updates.append(
434
+ {
435
+ "q": {"id": data_generation_id},
436
+ "u": {
437
+ "$set": {
438
+ "insdc_bioproject_identifiers": collected_insdc_bioproject_identifiers
439
+ }
440
+ },
441
+ }
442
+ )
443
+
444
+ # Return updates for both collections
445
+ if data_generation_updates:
446
+ return [
447
+ {"update": "biosample_set", "updates": biosample_updates},
448
+ {"update": "data_generation_set", "updates": data_generation_updates},
449
+ ]
450
+ else:
451
+ return {"update": "biosample_set", "updates": biosample_updates}
@@ -14,6 +14,7 @@ from dagster import (
14
14
  DagsterRunStatus,
15
15
  RunStatusSensorContext,
16
16
  DefaultSensorStatus,
17
+ in_process_executor,
17
18
  )
18
19
  from starlette import status
19
20
  from toolz import merge, get_in
@@ -44,8 +45,10 @@ from nmdc_runtime.site.graphs import (
44
45
  ingest_neon_benthic_metadata,
45
46
  ingest_neon_surface_water_metadata,
46
47
  ensure_alldocs,
48
+ run_ontology_load,
47
49
  nmdc_study_to_ncbi_submission_export,
48
50
  generate_data_generation_set_for_biosamples_in_nmdc_study,
51
+ generate_update_script_for_insdc_biosample_identifiers,
49
52
  )
50
53
  from nmdc_runtime.site.resources import (
51
54
  get_mongo,
@@ -123,6 +126,55 @@ ensure_alldocs_daily = ScheduleDefinition(
123
126
  )
124
127
 
125
128
 
129
+ load_envo_ontology_weekly = ScheduleDefinition(
130
+ name="weekly_load_envo_ontology",
131
+ cron_schedule="0 7 * * 1",
132
+ execution_timezone="America/New_York",
133
+ job=run_ontology_load.to_job(
134
+ name="scheduled_envo_ontology_load",
135
+ config=unfreeze(
136
+ merge(
137
+ run_config_frozen__normal_env,
138
+ {"ops": {"load_ontology": {"config": {"source_ontology": "envo"}}}},
139
+ )
140
+ ),
141
+ resource_defs=resource_defs,
142
+ ),
143
+ )
144
+
145
+ load_uberon_ontology_weekly = ScheduleDefinition(
146
+ name="weekly_load_uberon_ontology",
147
+ cron_schedule="0 8 * * 1",
148
+ execution_timezone="America/New_York",
149
+ job=run_ontology_load.to_job(
150
+ name="scheduled_uberon_ontology_load",
151
+ config=unfreeze(
152
+ merge(
153
+ run_config_frozen__normal_env,
154
+ {"ops": {"load_ontology": {"config": {"source_ontology": "uberon"}}}},
155
+ )
156
+ ),
157
+ resource_defs=resource_defs,
158
+ ),
159
+ )
160
+
161
+ load_po_ontology_weekly = ScheduleDefinition(
162
+ name="weekly_load_po_ontology",
163
+ cron_schedule="0 9 * * 1",
164
+ execution_timezone="America/New_York",
165
+ job=run_ontology_load.to_job(
166
+ name="scheduled_po_ontology_load",
167
+ config=unfreeze(
168
+ merge(
169
+ run_config_frozen__normal_env,
170
+ {"ops": {"load_ontology": {"config": {"source_ontology": "po"}}}},
171
+ )
172
+ ),
173
+ resource_defs=resource_defs,
174
+ ),
175
+ )
176
+
177
+
126
178
  def asset_materialization_metadata(asset_event, key):
127
179
  """Get metadata from an asset materialization event.
128
180
 
@@ -411,11 +463,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
411
463
  yield SkipReason("; ".join(skip_notes))
412
464
 
413
465
 
414
- # TODO ensure data_object_type values from file_type_enum
415
- # see /metadata-translation/notebooks/202106_curation_updates.ipynb
416
- # for details ("Create file_type_enum collection" section).
417
-
418
-
419
466
  @sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
420
467
  def done_object_put_ops(_context):
421
468
  client = get_runtime_api_site_client(run_config_frozen__normal_env)
@@ -462,7 +509,13 @@ def repo():
462
509
  export_study_biosamples_metadata.to_job(**preset_normal),
463
510
  ensure_alldocs.to_job(**preset_normal),
464
511
  ]
465
- schedules = [housekeeping_weekly, ensure_alldocs_daily]
512
+ schedules = [
513
+ housekeeping_weekly,
514
+ ensure_alldocs_daily,
515
+ load_envo_ontology_weekly,
516
+ load_uberon_ontology_weekly,
517
+ load_po_ontology_weekly,
518
+ ]
466
519
  sensors = [
467
520
  done_object_put_ops,
468
521
  ensure_gold_translation_job,
@@ -516,6 +569,7 @@ def biosample_submission_ingest():
516
569
  "study_type": "research_study",
517
570
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
518
571
  "include_field_site_info": False,
572
+ "enable_biosample_filtering": True,
519
573
  },
520
574
  },
521
575
  "export_json_to_drs": {"config": {"username": ""}},
@@ -960,6 +1014,8 @@ def database_records_stitching():
960
1014
  "config": {
961
1015
  "nmdc_study_id": "",
962
1016
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1017
+ "include_field_site_info": False,
1018
+ "enable_biosample_filtering": True,
963
1019
  }
964
1020
  },
965
1021
  "export_json_to_drs": {"config": {"username": ""}},
@@ -1002,12 +1058,57 @@ def database_records_stitching():
1002
1058
  "config": {
1003
1059
  "nmdc_study_id": "",
1004
1060
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1061
+ "include_field_site_info": False,
1062
+ "enable_biosample_filtering": True,
1005
1063
  }
1006
1064
  },
1007
1065
  "export_json_to_drs": {"config": {"username": ""}},
1008
1066
  },
1009
1067
  },
1010
1068
  ),
1069
+ generate_update_script_for_insdc_biosample_identifiers.to_job(
1070
+ description="This job generates a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.",
1071
+ resource_defs=resource_defs,
1072
+ config={
1073
+ "resources": merge(
1074
+ unfreeze(normal_resources),
1075
+ {
1076
+ "runtime_api_user_client": {
1077
+ "config": {
1078
+ "base_url": {"env": "API_HOST"},
1079
+ "username": {"env": "API_ADMIN_USER"},
1080
+ "password": {"env": "API_ADMIN_PASS"},
1081
+ },
1082
+ },
1083
+ "runtime_api_site_client": {
1084
+ "config": {
1085
+ "base_url": {"env": "API_HOST"},
1086
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
1087
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
1088
+ "site_id": {"env": "API_SITE_ID"},
1089
+ },
1090
+ },
1091
+ "gold_api_client": {
1092
+ "config": {
1093
+ "base_url": {"env": "GOLD_API_BASE_URL"},
1094
+ "username": {"env": "GOLD_API_USERNAME"},
1095
+ "password": {"env": "GOLD_API_PASSWORD"},
1096
+ },
1097
+ },
1098
+ },
1099
+ ),
1100
+ "ops": {
1101
+ "get_database_updater_inputs": {
1102
+ "config": {
1103
+ "nmdc_study_id": "",
1104
+ "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1105
+ "include_field_site_info": False,
1106
+ "enable_biosample_filtering": True,
1107
+ }
1108
+ },
1109
+ },
1110
+ },
1111
+ ),
1011
1112
  ]
1012
1113
 
1013
1114
 
@@ -109,7 +109,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
109
109
  },
110
110
  )
111
111
  response.raise_for_status()
112
- return response.json()["cursor"]["firstBatch"]
112
+ return response.json()["cursor"]["batch"]
113
113
 
114
114
  def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
115
115
  gold_project_id = normalize_gold_id(gold_project_id)
@@ -126,7 +126,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
126
126
  },
127
127
  )
128
128
  response.raise_for_status()
129
- return response.json()["cursor"]["firstBatch"]
129
+ return response.json()["cursor"]["batch"]
130
130
 
131
131
  def get_biosamples_for_study(self, study_id: str):
132
132
  # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
@@ -147,6 +147,19 @@ class RuntimeApiUserClient(RuntimeApiClient):
147
147
  response.raise_for_status()
148
148
  return response.json()["resources"]
149
149
 
150
+ def get_data_generation_records_for_study(self, study_id: str):
151
+ # TODO: same as above, we are using a large max_page_size to avoid pagination.
152
+ response = self.request(
153
+ "GET",
154
+ f"/nmdcschema/data_generation_set",
155
+ {
156
+ "filter": json.dumps({"associated_studies": study_id}),
157
+ "max_page_size": 10000,
158
+ },
159
+ )
160
+ response.raise_for_status()
161
+ return response.json()["resources"]
162
+
150
163
  def get_omics_processing_by_name(self, name: str):
151
164
  response = self.request(
152
165
  "POST",
@@ -157,7 +170,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
157
170
  },
158
171
  )
159
172
  response.raise_for_status()
160
- return response.json()["cursor"]["firstBatch"]
173
+ return response.json()["cursor"]["batch"]
161
174
 
162
175
  def get_study(self, study_id: str):
163
176
  response = self.request(
@@ -169,7 +182,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
169
182
  },
170
183
  )
171
184
  response.raise_for_status()
172
- return response.json()["cursor"]["firstBatch"]
185
+ return response.json()["cursor"]["batch"]
173
186
 
174
187
 
175
188
  class RuntimeApiSiteClient(RuntimeApiClient):
@@ -45,6 +45,7 @@ class GoldStudyTranslator(Translator):
45
45
  analysis_projects: List[JSON_OBJECT] = [],
46
46
  gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
47
47
  include_field_site_info: bool = False,
48
+ enable_biosample_filtering: bool = True,
48
49
  *args,
49
50
  **kwargs,
50
51
  ) -> None:
@@ -53,15 +54,20 @@ class GoldStudyTranslator(Translator):
53
54
  self.study = study
54
55
  self.study_type = nmdc.StudyCategoryEnum(study_type)
55
56
  self.include_field_site_info = include_field_site_info
57
+ self.enable_biosample_filtering = enable_biosample_filtering
56
58
  # Filter biosamples to only those with `sequencingStrategy` of
57
- # "Metagenome" or "Metatranscriptome"
58
- self.biosamples = [
59
- biosample
60
- for biosample in biosamples
61
- if any(
62
- _is_valid_project(project) for project in biosample.get("projects", [])
63
- )
64
- ]
59
+ # "Metagenome" or "Metatranscriptome" if filtering is enabled
60
+ if enable_biosample_filtering:
61
+ self.biosamples = [
62
+ biosample
63
+ for biosample in biosamples
64
+ if any(
65
+ _is_valid_project(project)
66
+ for project in biosample.get("projects", [])
67
+ )
68
+ ]
69
+ else:
70
+ self.biosamples = biosamples
65
71
  # Fetch the valid projectGoldIds that are associated with filtered
66
72
  # biosamples on their `projects` field
67
73
  valid_project_ids = {
@@ -116,6 +122,9 @@ class GoldStudyTranslator(Translator):
116
122
  :param gold_entity: GOLD entity object
117
123
  :return: PersonValue corresponding to the first PI in the `contacts` field
118
124
  """
125
+ if "contacts" not in gold_entity:
126
+ return None
127
+
119
128
  pi_dict = next(
120
129
  (
121
130
  contact
@@ -169,7 +178,7 @@ class GoldStudyTranslator(Translator):
169
178
  project["ncbiBioSampleAccession"], default_prefix="biosample"
170
179
  )
171
180
  for project in biosample_projects
172
- if project["ncbiBioSampleAccession"]
181
+ if project.get("ncbiBioSampleAccession")
173
182
  ]
174
183
 
175
184
  def _get_samp_taxon_id(
@@ -349,6 +349,7 @@ class NeonBenthicDataTranslator(Translator):
349
349
  description=f"sequencing results for {basename}",
350
350
  type="nmdc:DataObject",
351
351
  data_object_type=do_type,
352
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
352
353
  in_manifest=manifest_id,
353
354
  )
354
355
 
@@ -264,6 +264,7 @@ class NeonSoilDataTranslator(Translator):
264
264
  description=f"sequencing results for {basename}",
265
265
  type="nmdc:DataObject",
266
266
  md5_checksum=checksum,
267
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
267
268
  data_object_type=do_type,
268
269
  )
269
270
 
@@ -397,6 +397,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
397
397
  description=f"sequencing results for {basename}",
398
398
  type="nmdc:DataObject",
399
399
  data_object_type=do_type,
400
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
400
401
  in_manifest=manifest_id,
401
402
  )
402
403
 
@@ -47,6 +47,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
47
47
  (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
48
48
  }
49
49
 
50
+ UNIT_OVERRIDES: dict[str, dict[str, str]] = {
51
+ "Biosample": {
52
+ "depth": "m",
53
+ }
54
+ }
55
+
50
56
 
51
57
  class EnvironmentPackage(Enum):
52
58
  r"""
@@ -475,6 +481,50 @@ class SubmissionPortalTranslator(Translator):
475
481
 
476
482
  return value
477
483
 
484
+ def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
485
+ """Collect and format DOIs from submission portal schema in nmdc format DOIs
486
+
487
+ If there were no DOIs, None is returned.
488
+
489
+ :param metadata_submission: submission portal entry
490
+ :return: list of nmdc.DOI objects
491
+ """
492
+ data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
493
+ award_dois = self._get_from(
494
+ metadata_submission, ["multiOmicsForm", "awardDois"]
495
+ )
496
+ if data_dois and len(data_dois) > 0:
497
+ updated_data_dois = [
498
+ nmdc.Doi(
499
+ doi_category="dataset_doi",
500
+ doi_provider=doi["provider"],
501
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
502
+ type="nmdc:Doi",
503
+ )
504
+ for doi in data_dois
505
+ ]
506
+ else:
507
+ updated_data_dois = []
508
+
509
+ if award_dois and len(award_dois) > 0:
510
+ updated_award_dois = [
511
+ nmdc.Doi(
512
+ doi_category="award_doi",
513
+ doi_provider=doi["provider"],
514
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
515
+ type="nmdc:Doi",
516
+ )
517
+ for doi in award_dois
518
+ ]
519
+ else:
520
+ updated_award_dois = []
521
+
522
+ return_val = updated_data_dois + updated_award_dois
523
+ if len(return_val) == 0:
524
+ return_val = None
525
+
526
+ return return_val
527
+
478
528
  def _get_data_objects_from_fields(
479
529
  self,
480
530
  sample_data: JSON_OBJECT,
@@ -591,6 +641,7 @@ class SubmissionPortalTranslator(Translator):
591
641
  websites=self._get_from(
592
642
  metadata_submission, ["studyForm", "linkOutWebpage"]
593
643
  ),
644
+ associated_dois=self._get_study_dois(metadata_submission),
594
645
  )
595
646
 
596
647
  def _transform_value_for_slot(
@@ -660,6 +711,17 @@ class SubmissionPortalTranslator(Translator):
660
711
  logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
661
712
  continue
662
713
 
714
+ # This step handles cases where the submission portal/schema instructs a user to
715
+ # provide a value in a specific unit. The unit cannot be parsed out of the raw value
716
+ # in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
717
+ # go away once units are encoded in the schema itself.
718
+ # See: https://github.com/microbiomedata/nmdc-schema/issues/2517
719
+ if class_name in UNIT_OVERRIDES:
720
+ # If the class has unit overrides, check if the slot is in the overrides
721
+ unit_overrides = UNIT_OVERRIDES[class_name]
722
+ if slot_name in unit_overrides:
723
+ unit = unit_overrides[slot_name]
724
+
663
725
  slot_definition = self.schema_view.induced_slot(slot_name, class_name)
664
726
  if slot_definition.multivalued:
665
727
  value_list = value