nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,18 +1,83 @@
1
1
  import logging
2
2
  import re
3
+ from collections import namedtuple
3
4
  from datetime import datetime
5
+ from decimal import Decimal
6
+ from enum import Enum
4
7
  from functools import lru_cache
5
8
  from importlib import resources
6
- from typing import Any, List, Optional, Union
9
+ from typing import Any, List, Optional, Union, Tuple
10
+ from urllib.parse import urlparse
7
11
 
8
12
  from linkml_runtime import SchemaView
9
13
  from linkml_runtime.linkml_model import SlotDefinition
10
14
  from nmdc_schema import nmdc
11
- from toolz import get_in, groupby, concat, valmap, dissoc
15
+ from toolz import concat, dissoc, get_in, groupby, valmap
12
16
 
13
17
  from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
14
18
 
15
19
 
20
+ DataUrlSet = namedtuple("DataUrlSet", ["url", "md5_checksum"])
21
+
22
+ READ_1 = DataUrlSet("read_1_url", "read_1_md5_checksum")
23
+ READ_2 = DataUrlSet("read_2_url", "read_2_md5_checksum")
24
+ INTERLEAVED = DataUrlSet("interleaved_url", "interleaved_md5_checksum")
25
+
26
+ DATA_URL_SETS: list[DataUrlSet] = [READ_1, READ_2, INTERLEAVED]
27
+
28
+ BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
29
+
30
+ TAB_NAME_KEY = "__tab_name"
31
+ METAGENOME = nmdc.NucleotideSequencingEnum(nmdc.NucleotideSequencingEnum.metagenome)
32
+ METATRANSCRIPTOME = nmdc.NucleotideSequencingEnum(
33
+ nmdc.NucleotideSequencingEnum.metatranscriptome
34
+ )
35
+ TAB_NAME_TO_ANALYTE_CATEGORY: dict[str, nmdc.NucleotideSequencingEnum] = {
36
+ "metagenome_sequencing_non_interleaved_data": METAGENOME,
37
+ "metagenome_sequencing_interleaved_data": METAGENOME,
38
+ "metatranscriptome_sequencing_non_interleaved_data": METATRANSCRIPTOME,
39
+ "metatranscriptome_sequencing_interleaved_data": METATRANSCRIPTOME,
40
+ }
41
+
42
+ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str] = {
43
+ (READ_1, str(METAGENOME)): "Metagenome Raw Read 1",
44
+ (READ_2, str(METAGENOME)): "Metagenome Raw Read 2",
45
+ (INTERLEAVED, str(METAGENOME)): "Metagenome Raw Reads",
46
+ (READ_1, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 1",
47
+ (READ_2, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 2",
48
+ (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
49
+ }
50
+
51
+ UNIT_OVERRIDES: dict[str, dict[str, str]] = {
52
+ "Biosample": {
53
+ "depth": "m",
54
+ }
55
+ }
56
+
57
+
58
+ class EnvironmentPackage(Enum):
59
+ r"""
60
+ Enumeration of all possible environmental packages.
61
+
62
+ >>> EnvironmentPackage.AIR.value
63
+ 'air'
64
+ >>> EnvironmentPackage.SEDIMENT.value
65
+ 'sediment'
66
+ """
67
+
68
+ AIR = "air"
69
+ BIOFILM = "microbial mat_biofilm"
70
+ BUILT_ENV = "built environment"
71
+ HCR_CORES = "hydrocarbon resources-cores"
72
+ HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"
73
+ HOST_ASSOCIATED = "host-associated"
74
+ MISC_ENVS = "miscellaneous natural or artificial environment"
75
+ PLANT_ASSOCIATED = "plant-associated"
76
+ SEDIMENT = "sediment"
77
+ SOIL = "soil"
78
+ WATER = "water"
79
+
80
+
16
81
  @lru_cache
17
82
  def _get_schema_view():
18
83
  """Return a SchemaView instance representing the NMDC schema"""
@@ -49,6 +114,18 @@ def group_dicts_by_key(key: str, seq: Optional[list[dict]]) -> Optional[dict]:
49
114
  return grouped
50
115
 
51
116
 
117
+ def split_strip(string: str | None, sep: str) -> list[str] | None:
118
+ """Split a string by a separator and strip whitespace from each part.
119
+
120
+ :param string: string to split
121
+ :param sep: separator to split by
122
+ :return: list of stripped strings
123
+ """
124
+ if string is None:
125
+ return None
126
+ return [s.strip() for s in string.split(sep)]
127
+
128
+
52
129
  class SubmissionPortalTranslator(Translator):
53
130
  """A Translator subclass for handling submission portal entries
54
131
 
@@ -60,17 +137,16 @@ class SubmissionPortalTranslator(Translator):
60
137
 
61
138
  def __init__(
62
139
  self,
63
- metadata_submission: JSON_OBJECT = {},
64
- omics_processing_mapping: Optional[list] = None,
65
- data_object_mapping: Optional[list] = None,
140
+ metadata_submission: Optional[JSON_OBJECT] = None,
66
141
  *args,
142
+ nucleotide_sequencing_mapping: Optional[list] = None,
143
+ data_object_mapping: Optional[list] = None,
144
+ illumina_instrument_mapping: Optional[dict[str, str]] = None,
67
145
  # Additional study-level metadata not captured by the submission portal currently
68
146
  # See: https://github.com/microbiomedata/submission-schema/issues/162
69
- study_doi_category: Optional[str] = None,
70
- study_doi_provider: Optional[str] = None,
71
147
  study_category: Optional[str] = None,
72
148
  study_pi_image_url: Optional[str] = None,
73
- study_funding_sources: Optional[list[str]] = None,
149
+ study_id: Optional[str] = None,
74
150
  # Additional biosample-level metadata with optional column mapping information not captured
75
151
  # by the submission portal currently.
76
152
  # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -80,30 +156,34 @@ class SubmissionPortalTranslator(Translator):
80
156
  ) -> None:
81
157
  super().__init__(*args, **kwargs)
82
158
 
83
- self.metadata_submission = metadata_submission
84
- self.omics_processing_mapping = omics_processing_mapping
159
+ self.metadata_submission: JSON_OBJECT = metadata_submission or {}
160
+ self.nucleotide_sequencing_mapping = nucleotide_sequencing_mapping
85
161
  self.data_object_mapping = data_object_mapping
86
-
87
- self.study_doi_category = (
88
- nmdc.DoiCategoryEnum(study_doi_category)
89
- if study_doi_category
90
- else nmdc.DoiCategoryEnum.dataset_doi
91
- )
92
- self.study_doi_provider = (
93
- nmdc.DoiProviderEnum(study_doi_provider) if study_doi_provider else None
162
+ self.illumina_instrument_mapping: dict[str, str] = (
163
+ illumina_instrument_mapping or {}
94
164
  )
165
+
95
166
  self.study_category = (
96
167
  nmdc.StudyCategoryEnum(study_category) if study_category else None
97
168
  )
98
169
  self.study_pi_image_url = study_pi_image_url
99
- self.study_funding_sources = study_funding_sources
170
+ self.study_id = study_id
100
171
 
101
- self.biosample_extras = group_dicts_by_key("source_mat_id", biosample_extras)
172
+ self.biosample_extras = group_dicts_by_key(
173
+ BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
174
+ )
102
175
  self.biosample_extras_slot_mapping = group_dicts_by_key(
103
176
  "subject_id", biosample_extras_slot_mapping
104
177
  )
105
178
 
106
179
  self.schema_view: SchemaView = _get_schema_view()
180
+ self._material_processing_subclass_names = []
181
+ for class_name in self.schema_view.class_descendants(
182
+ "MaterialProcessing", reflexive=False
183
+ ):
184
+ class_def = self.schema_view.get_class(class_name)
185
+ if not class_def.abstract:
186
+ self._material_processing_subclass_names.append(class_name)
107
187
 
108
188
  def _get_pi(
109
189
  self, metadata_submission: JSON_OBJECT
@@ -122,29 +202,9 @@ class SubmissionPortalTranslator(Translator):
122
202
  email=study_form.get("piEmail"),
123
203
  orcid=study_form.get("piOrcid"),
124
204
  profile_image_url=self.study_pi_image_url,
205
+ type=nmdc.PersonValue.class_class_curie,
125
206
  )
126
207
 
127
- def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[nmdc.Doi], None]:
128
- """Get DOI information from the context form data
129
-
130
- :param metadata_submission: submission portal entry
131
- :return: list of strings or None
132
- """
133
- dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
134
- if not dataset_doi:
135
- return None
136
-
137
- if not dataset_doi.startswith("doi:"):
138
- dataset_doi = f"doi:{dataset_doi}"
139
-
140
- return [
141
- nmdc.Doi(
142
- doi_value=dataset_doi,
143
- doi_provider=self.study_doi_provider,
144
- doi_category=self.study_doi_category,
145
- )
146
- ]
147
-
148
208
  def _get_has_credit_associations(
149
209
  self, metadata_submission: JSON_OBJECT
150
210
  ) -> Union[List[nmdc.CreditAssociation], None]:
@@ -162,8 +222,10 @@ class SubmissionPortalTranslator(Translator):
162
222
  applies_to_person=nmdc.PersonValue(
163
223
  name=contributor.get("name"),
164
224
  orcid=contributor.get("orcid"),
225
+ type="nmdc:PersonValue",
165
226
  ),
166
227
  applied_roles=contributor.get("roles"),
228
+ type="nmdc:CreditAssociation",
167
229
  )
168
230
  for contributor in contributors
169
231
  ]
@@ -171,72 +233,92 @@ class SubmissionPortalTranslator(Translator):
171
233
  def _get_gold_study_identifiers(
172
234
  self, metadata_submission: JSON_OBJECT
173
235
  ) -> Union[List[str], None]:
174
- """Construct a GOLD CURIE from the multiomics from data
236
+ """Construct a GOLD CURIE from the study form data
175
237
 
176
238
  :param metadata_submission: submission portal entry
177
239
  :return: GOLD CURIE
178
240
  """
179
- gold_study_id = get_in(["multiOmicsForm", "GOLDStudyId"], metadata_submission)
241
+ gold_study_id = get_in(["studyForm", "GOLDStudyId"], metadata_submission)
180
242
  if not gold_study_id:
181
243
  return None
182
244
 
183
- return [self._get_curie("GOLD", gold_study_id)]
245
+ return [self._ensure_curie(gold_study_id, default_prefix="gold")]
184
246
 
185
- def _get_quantity_value(
186
- self, raw_value: Optional[str], unit: Optional[str] = None
187
- ) -> Union[nmdc.QuantityValue, None]:
188
- """Construct a nmdc:QuantityValue from a raw value string
247
+ def _get_ncbi_bioproject_identifiers(
248
+ self, metadata_submission: JSON_OBJECT
249
+ ) -> Union[List[str], None]:
250
+ """Construct a NCBI Bioproject CURIE from the study form data"""
189
251
 
190
- The regex pattern minimally matches on a single numeric value (possibly
191
- floating point). The pattern can also identify a range represented by
192
- two numeric values separated by a hyphen. It can also identify non-numeric
193
- characters at the end of the string which are interpreted as a unit. A unit
194
- may also be explicitly provided as an argument to this function. If parsing
195
- identifies a unit and a unit argument is provided, the unit argument is used.
196
- If the pattern is not matched at all None is returned.
252
+ ncbi_bioproject_id = get_in(
253
+ ["studyForm", "NCBIBioProjectId"], metadata_submission
254
+ )
255
+ if not ncbi_bioproject_id:
256
+ return None
197
257
 
198
- TODO: currently the parsed unit string is used as-is. In the future we may want
199
- to be stricter about what we accept or coerce into a controlled value set
258
+ return [self._ensure_curie(ncbi_bioproject_id, default_prefix="bioproject")]
200
259
 
201
- :param raw_value: string to parse
202
- :param unit: optional unit, defaults to None
203
- :return: nmdc:QuantityValue
260
+ def _get_jgi_study_identifiers(
261
+ self, metadata_submission: JSON_OBJECT
262
+ ) -> Union[List[str], None]:
263
+ """Construct a JGI proposal CURIE from the multiomics form data
264
+
265
+ :param metadata_submission: submission portal entry
266
+ :return: JGI proposal CURIE
204
267
  """
205
- if raw_value is None:
268
+ jgi_study_id = get_in(["multiOmicsForm", "JGIStudyId"], metadata_submission)
269
+ if not jgi_study_id:
206
270
  return None
207
271
 
208
- match = re.fullmatch(
209
- "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
210
- raw_value,
211
- )
212
- if not match:
272
+ return [self._ensure_curie(jgi_study_id, default_prefix="jgi.proposal")]
273
+
274
+ def _get_emsl_project_identifiers(
275
+ self, metadata_submission: JSON_OBJECT
276
+ ) -> Union[List[str], None]:
277
+ """Construct an EMSL project CURIE from the multiomics form data
278
+
279
+ :param metadata_submission: submission portal entry
280
+ :return: EMSL project CURIE
281
+ """
282
+ emsl_project_id = get_in(["multiOmicsForm", "studyNumber"], metadata_submission)
283
+ if not emsl_project_id:
213
284
  return None
214
285
 
215
- qv = nmdc.QuantityValue(has_raw_value=raw_value)
216
- if match.group(2):
217
- # having group 2 means the value is a range like "0 - 1". Either
218
- # group 1 or group 2 might be the minimum especially when handling
219
- # negative ranges like "0 - -1"
220
- num_1 = float(match.group(1))
221
- num_2 = float(match.group(2))
222
- qv.has_minimum_numeric_value = min(num_1, num_2)
223
- qv.has_maximum_numeric_value = max(num_1, num_2)
224
- else:
225
- # otherwise we just have a single numeric value
226
- qv.has_numeric_value = float(match.group(1))
227
-
228
- if unit:
229
- # a unit was manually specified
230
- if match.group(3) and unit != match.group(3):
231
- # a unit was also found in the raw string; issue a warning
232
- # if they don't agree, but keep the manually specified one
233
- logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
234
- qv.has_unit = unit
235
- elif match.group(3):
236
- # a unit was found in the raw string
237
- qv.has_unit = match.group(3)
238
-
239
- return qv
286
+ return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
287
+
288
+ def _get_quantity_value(
289
+ self,
290
+ raw_value: Optional[str | int | float],
291
+ slot_definition: SlotDefinition,
292
+ unit: Optional[str] = None,
293
+ ) -> Union[nmdc.QuantityValue, None]:
294
+ """Construct a nmdc:QuantityValue from a raw value string"""
295
+
296
+ # If the storage_units annotation is present on the slot and it only contains one unit (i.e.
297
+ # not a pipe-separated list of units) then use that unit.
298
+ if "storage_units" in slot_definition.annotations:
299
+ storage_units = slot_definition.annotations["storage_units"].value
300
+ if storage_units and "|" not in storage_units:
301
+ unit = storage_units
302
+
303
+ # If the raw_value is numeric, directly construct a QuantityValue with the inferred unit.
304
+ if isinstance(raw_value, (int, float)):
305
+ if unit is None:
306
+ raise ValueError(
307
+ f"While processing value for slot {slot_definition.name}, a numeric value was provided but no unit could be inferred."
308
+ )
309
+ # Constructing a Decimal directly from a float will maintain the full precision of the
310
+ # float (i.e. numbers like 0.5 cannot be represented exactly). Converting the float to
311
+ # a string first and then constructing the Decimal from that string will give a more
312
+ # expected value.
313
+ value_as_str = str(raw_value)
314
+ return nmdc.QuantityValue(
315
+ has_raw_value=value_as_str,
316
+ has_numeric_value=Decimal(value_as_str),
317
+ has_unit=unit,
318
+ type="nmdc:QuantityValue",
319
+ )
320
+
321
+ return self._parse_quantity_value(raw_value, unit)
240
322
 
241
323
  def _get_ontology_class(
242
324
  self, raw_value: Optional[str]
@@ -259,6 +341,7 @@ class SubmissionPortalTranslator(Translator):
259
341
  return nmdc.OntologyClass(
260
342
  name=match.group(1).strip(),
261
343
  id=match.group(2).strip(),
344
+ type="nmdc:OntologyClass",
262
345
  )
263
346
 
264
347
  def _get_controlled_identified_term_value(
@@ -280,7 +363,9 @@ class SubmissionPortalTranslator(Translator):
280
363
  return None
281
364
 
282
365
  return nmdc.ControlledIdentifiedTermValue(
283
- has_raw_value=raw_value, term=ontology_class
366
+ has_raw_value=raw_value,
367
+ term=ontology_class,
368
+ type="nmdc:ControlledIdentifiedTermValue",
284
369
  )
285
370
 
286
371
  def _get_controlled_term_value(
@@ -297,7 +382,10 @@ class SubmissionPortalTranslator(Translator):
297
382
  if not raw_value:
298
383
  return None
299
384
 
300
- value = nmdc.ControlledTermValue(has_raw_value=raw_value)
385
+ value = nmdc.ControlledTermValue(
386
+ has_raw_value=raw_value,
387
+ type="nmdc:ControlledTermValue",
388
+ )
301
389
  ontology_class = self._get_ontology_class(raw_value)
302
390
  if ontology_class is not None:
303
391
  value.term = ontology_class
@@ -327,7 +415,10 @@ class SubmissionPortalTranslator(Translator):
327
415
  return None
328
416
 
329
417
  return nmdc.GeolocationValue(
330
- has_raw_value=raw_value, latitude=match.group(1), longitude=match.group(2)
418
+ has_raw_value=raw_value,
419
+ latitude=match.group(1),
420
+ longitude=match.group(2),
421
+ type="nmdc:GeolocationValue",
331
422
  )
332
423
 
333
424
  def _get_float(self, raw_value: Optional[str]) -> Union[float, None]:
@@ -376,6 +467,127 @@ class SubmissionPortalTranslator(Translator):
376
467
 
377
468
  return value
378
469
 
470
+ def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
471
+ """Collect and format DOIs from submission portal schema in nmdc format DOIs
472
+
473
+ If there were no DOIs, None is returned.
474
+
475
+ :param metadata_submission: submission portal entry
476
+ :return: list of nmdc.DOI objects
477
+ """
478
+ data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
479
+ award_dois = self._get_from(
480
+ metadata_submission, ["multiOmicsForm", "awardDois"]
481
+ )
482
+ if data_dois and len(data_dois) > 0:
483
+ updated_data_dois = [
484
+ nmdc.Doi(
485
+ doi_category="dataset_doi",
486
+ doi_provider=doi["provider"],
487
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
488
+ type="nmdc:Doi",
489
+ )
490
+ for doi in data_dois
491
+ ]
492
+ else:
493
+ updated_data_dois = []
494
+
495
+ if award_dois and len(award_dois) > 0:
496
+ updated_award_dois = [
497
+ nmdc.Doi(
498
+ doi_category="award_doi",
499
+ doi_provider=doi["provider"],
500
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
501
+ type="nmdc:Doi",
502
+ )
503
+ for doi in award_dois
504
+ ]
505
+ else:
506
+ updated_award_dois = []
507
+
508
+ return_val = updated_data_dois + updated_award_dois
509
+ if len(return_val) == 0:
510
+ return_val = None
511
+
512
+ return return_val
513
+
514
+ def _get_data_objects_from_fields(
515
+ self,
516
+ sample_data: JSON_OBJECT,
517
+ *,
518
+ url_field_name: str,
519
+ md5_checksum_field_name: str,
520
+ nucleotide_sequencing_id: str,
521
+ data_object_type: nmdc.FileTypeEnum,
522
+ ) -> Tuple[List[nmdc.DataObject], nmdc.Manifest | None]:
523
+ """Get a DataObject instances based on the URLs and MD5 checksums in the given fields.
524
+
525
+ If the field provides multiple URLs, multiple DataObject instances will be created and a
526
+ Manifest will be created and provided in the second return value.
527
+
528
+ :param sample_data: sample data
529
+ :param url_field_name: field name for the URL
530
+ :param md5_checksum_field_name: field name for the MD5 checksum
531
+ :param nucleotide_sequencing_id: ID for the nmdc:NucleotideSequencing object that generated the data object(s)
532
+ :param data_object_type: FileTypeEnum representing the type of the data object
533
+ :return: nmdc.DataObject or None
534
+ """
535
+ data_objects: List[nmdc.DataObject] = []
536
+ urls = split_strip(sample_data.get(url_field_name), ";")
537
+ if not urls:
538
+ return data_objects, None
539
+
540
+ md5_checksums = split_strip(sample_data.get(md5_checksum_field_name), ";")
541
+ if md5_checksums and len(urls) != len(md5_checksums):
542
+ raise ValueError(
543
+ f"{url_field_name} and {md5_checksum_field_name} must have the same number of values"
544
+ )
545
+
546
+ data_object_ids = self._id_minter("nmdc:DataObject", len(urls))
547
+ manifest: nmdc.Manifest | None = None
548
+ if len(urls) > 1:
549
+ manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
550
+ manifest = nmdc.Manifest(
551
+ id=manifest_id,
552
+ manifest_category=nmdc.ManifestCategoryEnum(
553
+ nmdc.ManifestCategoryEnum.poolable_replicates
554
+ ),
555
+ type="nmdc:Manifest",
556
+ )
557
+
558
+ for i, url in enumerate(urls):
559
+ data_object_id = data_object_ids[i]
560
+ parsed_url = urlparse(url)
561
+ possible_filename = parsed_url.path.rsplit("/", 1)[-1]
562
+ data_object_slots = {
563
+ "id": data_object_id,
564
+ "name": possible_filename,
565
+ "description": f"{data_object_type} for {nucleotide_sequencing_id}",
566
+ "type": "nmdc:DataObject",
567
+ "url": url,
568
+ "md5_checksum": md5_checksums[i] if md5_checksums else None,
569
+ "in_manifest": [manifest.id] if manifest else None,
570
+ "data_category": nmdc.DataCategoryEnum(
571
+ nmdc.DataCategoryEnum.instrument_data
572
+ ),
573
+ "data_object_type": data_object_type,
574
+ "was_generated_by": nucleotide_sequencing_id,
575
+ }
576
+ data_object_slots.update(
577
+ self._transform_dict_for_class(sample_data, "DataObject")
578
+ )
579
+ data_objects.append(nmdc.DataObject(**data_object_slots))
580
+
581
+ return data_objects, manifest
582
+
583
+ def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
584
+ """Parse a sample link in the form of `ProcessingName:SampleName,..."""
585
+ pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
586
+ match = re.match(pattern, sample_link)
587
+ if not match:
588
+ return None
589
+ return match.group(1), split_strip(match.group(2), ",")
590
+
379
591
  def _translate_study(
380
592
  self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
381
593
  ) -> nmdc.Study:
@@ -389,20 +601,18 @@ class SubmissionPortalTranslator(Translator):
389
601
  :return: nmdc:Study object
390
602
  """
391
603
  return nmdc.Study(
392
- alternative_identifiers=self._get_from(
393
- metadata_submission, ["multiOmicsForm", "JGIStudyId"]
394
- ),
395
604
  alternative_names=self._get_from(
396
- metadata_submission, ["multiOmicsForm", "alternativeNames"]
605
+ metadata_submission, ["studyForm", "alternativeNames"]
397
606
  ),
398
- associated_dois=self._get_doi(metadata_submission),
399
607
  description=self._get_from(
400
608
  metadata_submission, ["studyForm", "description"]
401
609
  ),
402
- funding_sources=self.study_funding_sources,
403
- # emsl_proposal_identifier=self._get_from(
404
- # metadata_submission, ["multiOmicsForm", "studyNumber"]
405
- # ),
610
+ funding_sources=self._get_from(
611
+ metadata_submission, ["studyForm", "fundingSources"]
612
+ ),
613
+ emsl_project_identifiers=self._get_emsl_project_identifiers(
614
+ metadata_submission
615
+ ),
406
616
  gold_study_identifiers=self._get_gold_study_identifiers(
407
617
  metadata_submission
408
618
  ),
@@ -410,17 +620,22 @@ class SubmissionPortalTranslator(Translator):
410
620
  metadata_submission
411
621
  ),
412
622
  id=nmdc_study_id,
413
- insdc_bioproject_identifiers=self._get_from(
414
- metadata_submission, ["multiOmicsForm", "NCBIBioProjectId"]
623
+ insdc_bioproject_identifiers=self._get_ncbi_bioproject_identifiers(
624
+ metadata_submission
625
+ ),
626
+ jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
627
+ metadata_submission
415
628
  ),
416
629
  name=self._get_from(metadata_submission, ["studyForm", "studyName"]),
417
630
  notes=self._get_from(metadata_submission, ["studyForm", "notes"]),
418
631
  principal_investigator=self._get_pi(metadata_submission),
419
632
  study_category=self.study_category,
420
633
  title=self._get_from(metadata_submission, ["studyForm", "studyName"]),
634
+ type="nmdc:Study",
421
635
  websites=self._get_from(
422
636
  metadata_submission, ["studyForm", "linkOutWebpage"]
423
637
  ),
638
+ associated_dois=self._get_study_dois(metadata_submission),
424
639
  )
425
640
 
426
641
  def _transform_value_for_slot(
@@ -428,15 +643,25 @@ class SubmissionPortalTranslator(Translator):
428
643
  ):
429
644
  transformed_value = None
430
645
  if slot.range == "TextValue":
431
- transformed_value = nmdc.TextValue(has_raw_value=value)
646
+ transformed_value = nmdc.TextValue(
647
+ has_raw_value=value,
648
+ type="nmdc:TextValue",
649
+ )
432
650
  elif slot.range == "QuantityValue":
433
- transformed_value = self._get_quantity_value(value, unit=unit)
651
+ transformed_value = self._get_quantity_value(
652
+ value,
653
+ slot,
654
+ unit=unit,
655
+ )
434
656
  elif slot.range == "ControlledIdentifiedTermValue":
435
657
  transformed_value = self._get_controlled_identified_term_value(value)
436
658
  elif slot.range == "ControlledTermValue":
437
659
  transformed_value = self._get_controlled_term_value(value)
438
660
  elif slot.range == "TimestampValue":
439
- transformed_value = nmdc.TimestampValue(has_raw_value=value)
661
+ transformed_value = nmdc.TimestampValue(
662
+ has_raw_value=value,
663
+ type="nmdc:TimestampValue",
664
+ )
440
665
  elif slot.range == "GeolocationValue":
441
666
  transformed_value = self._get_geolocation_value(value)
442
667
  elif slot.range == "float":
@@ -481,11 +706,22 @@ class SubmissionPortalTranslator(Translator):
481
706
  logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
482
707
  continue
483
708
 
709
+ # This step handles cases where the submission portal/schema instructs a user to
710
+ # provide a value in a specific unit. The unit cannot be parsed out of the raw value
711
+ # in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
712
+ # go away once units are encoded in the schema itself.
713
+ # See: https://github.com/microbiomedata/nmdc-schema/issues/2517
714
+ if class_name in UNIT_OVERRIDES:
715
+ # If the class has unit overrides, check if the slot is in the overrides
716
+ unit_overrides = UNIT_OVERRIDES[class_name]
717
+ if slot_name in unit_overrides:
718
+ unit = unit_overrides[slot_name]
719
+
484
720
  slot_definition = self.schema_view.induced_slot(slot_name, class_name)
485
721
  if slot_definition.multivalued:
486
722
  value_list = value
487
723
  if isinstance(value, str):
488
- value_list = [v.strip() for v in value.split("|")]
724
+ value_list = split_strip(value, "|")
489
725
  transformed_value = [
490
726
  self._transform_value_for_slot(item, slot_definition, unit)
491
727
  for item in value_list
@@ -503,7 +739,6 @@ class SubmissionPortalTranslator(Translator):
503
739
  sample_data: List[JSON_OBJECT],
504
740
  nmdc_biosample_id: str,
505
741
  nmdc_study_id: str,
506
- default_env_package: str,
507
742
  ) -> nmdc.Biosample:
508
743
  """Translate sample data from portal submission into an `nmdc:Biosample` object.
509
744
 
@@ -518,22 +753,30 @@ class SubmissionPortalTranslator(Translator):
518
753
  from each applicable submission portal tab
519
754
  :param nmdc_biosample_id: Minted nmdc:Biosample identifier for the translated object
520
755
  :param nmdc_study_id: Minted nmdc:Study identifier for the related Study
521
- :param default_env_package: Default value for `env_package` slot
522
756
  :return: nmdc:Biosample
523
757
  """
524
- source_mat_id = sample_data[0].get("source_mat_id", "").strip()
758
+ env_idx = next(
759
+ (
760
+ i
761
+ for i, tab in enumerate(sample_data)
762
+ if tab.get("env_package") is not None
763
+ ),
764
+ 0,
765
+ )
766
+ biosample_key = sample_data[env_idx].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
525
767
  slots = {
526
768
  "id": nmdc_biosample_id,
527
- "part_of": nmdc_study_id,
528
- "name": sample_data[0].get("samp_name", "").strip(),
529
- "env_package": nmdc.TextValue(has_raw_value=default_env_package),
769
+ "associated_studies": [nmdc_study_id],
770
+ "type": "nmdc:Biosample",
771
+ "name": sample_data[env_idx].get("samp_name", "").strip(),
772
+ "env_package": sample_data[env_idx].get("env_package"),
530
773
  }
531
774
  for tab in sample_data:
532
775
  transformed_tab = self._transform_dict_for_class(tab, "Biosample")
533
776
  slots.update(transformed_tab)
534
777
 
535
778
  if self.biosample_extras:
536
- raw_extras = self.biosample_extras.get(source_mat_id)
779
+ raw_extras = self.biosample_extras.get(biosample_key)
537
780
  if raw_extras:
538
781
  transformed_extras = self._transform_dict_for_class(
539
782
  raw_extras, "Biosample", self.biosample_extras_slot_mapping
@@ -552,47 +795,217 @@ class SubmissionPortalTranslator(Translator):
552
795
  :return: nmdc:Database object
553
796
  """
554
797
  database = nmdc.Database()
555
-
556
- nmdc_study_id = self._id_minter("nmdc:Study")[0]
557
-
558
798
  metadata_submission_data = self.metadata_submission.get(
559
799
  "metadata_submission", {}
560
800
  )
561
- database.study_set = [
562
- self._translate_study(metadata_submission_data, nmdc_study_id)
563
- ]
564
801
 
802
+ # Generate one Study instance based on the metadata submission, if a study_id wasn't provided
803
+ if self.study_id:
804
+ nmdc_study_id = self.study_id
805
+ else:
806
+ nmdc_study_id = self._id_minter("nmdc:Study")[0]
807
+ database.study_set = [
808
+ self._translate_study(metadata_submission_data, nmdc_study_id)
809
+ ]
810
+
811
+ # Automatically populate the `env_package` field in the sample data based on which
812
+ # environmental data tab the sample data came from.
565
813
  sample_data = metadata_submission_data.get("sampleData", {})
566
- package_name = metadata_submission_data["packageName"]
567
- sample_data_by_id = groupby("source_mat_id", concat(sample_data.values()))
814
+ for key in sample_data.keys():
815
+ env = key.removesuffix("_data").upper()
816
+ try:
817
+ package_name = EnvironmentPackage[env].value
818
+ for sample in sample_data[key]:
819
+ sample["env_package"] = package_name
820
+ except KeyError:
821
+ # This is expected when processing rows from tabs like the JGI/EMSL tabs or external
822
+ # sequencing data tabs.
823
+ pass
824
+
825
+ # Before regrouping the data by sample name, record which tab each object came from
826
+ for tab_name in sample_data.keys():
827
+ for tab in sample_data[tab_name]:
828
+ tab[TAB_NAME_KEY] = tab_name
829
+
830
+ # Reorganize the sample data by sample name and generate a unique NMDC ID for each
831
+ sample_data_by_id = groupby(
832
+ BIOSAMPLE_UNIQUE_KEY_SLOT,
833
+ concat(sample_data.values()),
834
+ )
568
835
  nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
569
836
  sample_data_to_nmdc_biosample_ids = dict(
570
837
  zip(sample_data_by_id.keys(), nmdc_biosample_ids)
571
838
  )
572
839
 
573
- database.biosample_set = [
574
- self._translate_biosample(
575
- sample_data,
576
- nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
577
- nmdc_study_id=nmdc_study_id,
578
- default_env_package=package_name,
579
- )
580
- for sample_data_id, sample_data in sample_data_by_id.items()
581
- if sample_data
582
- ]
840
+ # Translate the sample data into nmdc:Biosample objects
841
+ database.biosample_set = []
842
+ for sample_data_id, sample_data in sample_data_by_id.items():
843
+ # This shouldn't happen, but just in case skip empty sample data
844
+ if not sample_data:
845
+ continue
583
846
 
584
- if self.omics_processing_mapping:
585
- # If there is data from an OmicsProcessing mapping file, process it now. This part
586
- # assumes that there is a column in that file with the header __biosample_source_mat_id
847
+ # Find the first tab that has a sample_link value and attempt to parse it
848
+ sample_link = ""
849
+ for tab in sample_data:
850
+ if tab.get("sample_link"):
851
+ sample_link = tab.get("sample_link")
852
+ break
853
+ parsed_sample_link = self._parse_sample_link(sample_link)
854
+
855
+ # If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
856
+ # format, then create a ProcessedSample and MaterialProcessing instance instead of a
857
+ # Biosample instance. The input samples must be present in the submission for this to
858
+ # work. An exception is raised if any of the referenced input samples are missing.
859
+ if parsed_sample_link is not None:
860
+ processing_type, processing_inputs = parsed_sample_link
861
+ if not all(
862
+ input_id in sample_data_to_nmdc_biosample_ids
863
+ for input_id in processing_inputs
864
+ ):
865
+ raise ValueError(
866
+ f"Could not find all input samples in sample_link '{sample_link}'"
867
+ )
868
+ processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
869
+ database.processed_sample_set.append(
870
+ nmdc.ProcessedSample(
871
+ id=processed_sample_id,
872
+ type="nmdc:ProcessedSample",
873
+ name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
874
+ )
875
+ )
876
+
877
+ processing_class = getattr(nmdc, processing_type)
878
+ material_processing = processing_class(
879
+ id=self._id_minter(f"nmdc:{processing_type}")[0],
880
+ type=f"nmdc:{processing_type}",
881
+ has_input=[
882
+ sample_data_to_nmdc_biosample_ids[input_id]
883
+ for input_id in processing_inputs
884
+ ],
885
+ has_output=[processed_sample_id],
886
+ )
887
+ database.material_processing_set.append(material_processing)
888
+
889
+ # If there was no sample_link or it doesn't follow the expected format, create a
890
+ # Biosample instance as normal.
891
+ else:
892
+ biosample = self._translate_biosample(
893
+ sample_data,
894
+ nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
895
+ nmdc_study_id=nmdc_study_id,
896
+ )
897
+ database.biosample_set.append(biosample)
898
+
899
+ # This section handles the translation of information in the external sequencing tabs into
900
+ # various NMDC objects.
901
+ database.data_generation_set = []
902
+ database.data_object_set = []
903
+ database.instrument_set = []
904
+ database.manifest_set = []
905
+ today = datetime.now().strftime("%Y-%m-%d")
906
+ for sample_data_id, sample_data in sample_data_by_id.items():
907
+ for tab in sample_data:
908
+ tab_name = tab.get(TAB_NAME_KEY)
909
+ analyte_category = TAB_NAME_TO_ANALYTE_CATEGORY.get(tab_name)
910
+ if not analyte_category:
911
+ # If the tab name cannot be mapped to an analyte category, that means we're
912
+ # not in an external sequencing data tabs (e.g. this is an environmental data
913
+ # tab or a JGI/EMSL tab). Skip this tab.
914
+ continue
915
+
916
+ # Start by generating one NucleotideSequencing instance with a has_input
917
+ # relationship to the current Biosample instance.
918
+ nucleotide_sequencing_id = self._id_minter(
919
+ "nmdc:NucleotideSequencing", 1
920
+ )[0]
921
+ nucleotide_sequencing_slots = {
922
+ "id": nucleotide_sequencing_id,
923
+ "has_input": sample_data_to_nmdc_biosample_ids[sample_data_id],
924
+ "has_output": [],
925
+ "associated_studies": [nmdc_study_id],
926
+ "add_date": today,
927
+ "mod_date": today,
928
+ "analyte_category": analyte_category,
929
+ "type": "nmdc:NucleotideSequencing",
930
+ }
931
+ # If the protocol_link column was filled in, expand it into an nmdc:Protocol object
932
+ if "protocol_link" in tab:
933
+ protocol_link = tab.pop("protocol_link")
934
+ nucleotide_sequencing_slots["protocol_link"] = nmdc.Protocol(
935
+ url=protocol_link,
936
+ type="nmdc:Protocol",
937
+ )
938
+ # If model column was filled in, expand it into an nmdc:Instrument object. This is
939
+ # done by first checking the provided instrument mapping to see if the model is
940
+ # already present. If it is not, a new instrument object is created and added to the
941
+ # instrument_set. Currently, we only accept sequencing data in the submission portal
942
+ # that was generated by Illumina instruments, so the vendor is hardcoded here.
943
+ if "model" in tab:
944
+ model = tab.pop("model")
945
+ if model not in self.illumina_instrument_mapping:
946
+ # If the model is not already in the mapping, create a new record for it
947
+ nmdc_instrument_id = self._id_minter("nmdc:Instrument", 1)[0]
948
+ database.instrument_set.append(
949
+ nmdc.Instrument(
950
+ id=nmdc_instrument_id,
951
+ vendor=nmdc.InstrumentVendorEnum(
952
+ nmdc.InstrumentVendorEnum.illumina
953
+ ),
954
+ model=nmdc.InstrumentModelEnum(model),
955
+ type="nmdc:Instrument",
956
+ )
957
+ )
958
+ self.illumina_instrument_mapping[model] = nmdc_instrument_id
959
+ nucleotide_sequencing_slots["instrument_used"] = (
960
+ self.illumina_instrument_mapping[model]
961
+ )
962
+ # Process the remaining columns according to the NucleotideSequencing class
963
+ # definition
964
+ nucleotide_sequencing_slots.update(
965
+ self._transform_dict_for_class(tab, "NucleotideSequencing")
966
+ )
967
+ nucleotide_sequencing = nmdc.NucleotideSequencing(
968
+ **nucleotide_sequencing_slots
969
+ )
970
+ database.data_generation_set.append(nucleotide_sequencing)
971
+
972
+ # Iterate over the columns that contain URLs and MD5 checksums and translate them
973
+ # into DataObject instances. Each of these DataObject instances will be connected
974
+ # to the NucleotideSequencing instance via the has_output/was_generated_by
975
+ # relationships.
976
+ for data_url in DATA_URL_SETS:
977
+ data_object_type = DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE[
978
+ (data_url, str(analyte_category))
979
+ ]
980
+ data_objects, manifest = self._get_data_objects_from_fields(
981
+ tab,
982
+ url_field_name=data_url.url,
983
+ md5_checksum_field_name=data_url.md5_checksum,
984
+ nucleotide_sequencing_id=nucleotide_sequencing_id,
985
+ data_object_type=nmdc.FileTypeEnum(data_object_type),
986
+ )
987
+ if manifest:
988
+ database.manifest_set.append(manifest)
989
+ for data_object in data_objects:
990
+ nucleotide_sequencing.has_output.append(data_object.id)
991
+ database.data_object_set.append(data_object)
992
+
993
+ # This is the older way of handling attaching NucleotideSequencing and DataObject instances
994
+ # to the Biosample instances. This should now mainly be handled by the external sequencing
995
+ # data tabs in the submission portal. This code is being left in place for now in case it is
996
+ # needed in the future.
997
+ if self.nucleotide_sequencing_mapping:
998
+ # If there is data from an NucleotideSequencing mapping file, process it now. This part
999
+ # assumes that there is a column in that file with the header __biosample_samp_name
587
1000
  # that can be used to join with the sample data from the submission portal. The
588
- # biosample identified by that `source_mat_id` will be referenced in the `has_input`
589
- # slot of the OmicsProcessing object. If a DataObject mapping file was also provided,
590
- # those objects will also be generated and referenced in the `has_output` slot of the
591
- # OmicsProcessing object. By keying off of the `source_mat_id` slot of the submission's
592
- # sample data there is an implicit 1:1 relationship between Biosample objects and
593
- # OmicsProcessing objects generated here.
594
- join_key = "__biosample_source_mat_id"
595
- database.omics_processing_set = []
1001
+ # biosample identified by that `samp_name` will be referenced in the `has_input`
1002
+ # slot of the NucleotideSequencing object. If a DataObject mapping file was also
1003
+ # provided, those objects will also be generated and referenced in the `has_output` slot
1004
+ # of the NucleotideSequencing object. By keying off of the `samp_name` slot of the
1005
+ # submission's sample data there is an implicit 1:1 relationship between Biosample
1006
+ # objects and NucleotideSequencing objects generated here.
1007
+ join_key = f"__biosample_{BIOSAMPLE_UNIQUE_KEY_SLOT}"
1008
+ database.data_generation_set = []
596
1009
  database.data_object_set = []
597
1010
  data_objects_by_sample_data_id = {}
598
1011
  today = datetime.now().strftime("%Y-%m-%d")
@@ -608,45 +1021,47 @@ class SubmissionPortalTranslator(Translator):
608
1021
  grouped,
609
1022
  )
610
1023
 
611
- for omics_processing_row in self.omics_processing_mapping:
612
- # For each row in the OmicsProcessing mapping file, first grab the minted Biosample
613
- # id that corresponds to the sample ID from the submission
614
- sample_data_id = omics_processing_row.pop(join_key)
1024
+ for nucleotide_sequencing_row in self.nucleotide_sequencing_mapping:
1025
+ # For each row in the NucleotideSequencing mapping file, first grab the minted
1026
+ # Biosample id that corresponds to the sample ID from the submission
1027
+ sample_data_id = nucleotide_sequencing_row.pop(join_key)
615
1028
  if (
616
1029
  not sample_data_id
617
1030
  or sample_data_id not in sample_data_to_nmdc_biosample_ids
618
1031
  ):
619
1032
  logging.warning(
620
- f"Unrecognized biosample source_mat_id: {sample_data_id}"
1033
+ f"Unrecognized biosample {BIOSAMPLE_UNIQUE_KEY_SLOT}: {sample_data_id}"
621
1034
  )
622
1035
  continue
623
1036
  nmdc_biosample_id = sample_data_to_nmdc_biosample_ids[sample_data_id]
624
1037
 
625
- # Transform the raw row data according to the OmicsProcessing class's slots, and
626
- # generate an instance. A few key slots do not come from the mapping file, but
1038
+ # Transform the raw row data according to the NucleotideSequencing class's slots,
1039
+ # and generate an instance. A few key slots do not come from the mapping file, but
627
1040
  # instead are defined here.
628
- omics_processing_slots = {
629
- "id": self._id_minter("nmdc:OmicsProcessing", 1)[0],
1041
+ nucleotide_sequencing_slots = {
1042
+ "id": self._id_minter("nmdc:NucleotideSequencing", 1)[0],
630
1043
  "has_input": [nmdc_biosample_id],
631
1044
  "has_output": [],
632
- "part_of": nmdc_study_id,
1045
+ "associated_studies": [nmdc_study_id],
633
1046
  "add_date": today,
634
1047
  "mod_date": today,
635
- "type": "nmdc:OmicsProcessing",
1048
+ "type": "nmdc:NucleotideSequencing",
636
1049
  }
637
- omics_processing_slots.update(
1050
+ nucleotide_sequencing_slots.update(
638
1051
  self._transform_dict_for_class(
639
- omics_processing_row, "OmicsProcessing"
1052
+ nucleotide_sequencing_row, "NucleotideSequencing"
640
1053
  )
641
1054
  )
642
- omics_processing = nmdc.OmicsProcessing(**omics_processing_slots)
1055
+ nucleotide_sequencing = nmdc.NucleotideSequencing(
1056
+ **nucleotide_sequencing_slots
1057
+ )
643
1058
 
644
1059
  for data_object_row in data_objects_by_sample_data_id.get(
645
1060
  sample_data_id, []
646
1061
  ):
647
1062
  # For each row in the DataObject mapping file that corresponds to the sample ID,
648
1063
  # transform the raw row data according to the DataObject class's slots, generate
649
- # an instance, and connect that instance's minted ID to the OmicsProcessing
1064
+ # an instance, and connect that instance's minted ID to the NucleotideSequencing
650
1065
  # instance
651
1066
  data_object_id = self._id_minter("nmdc:DataObject", 1)[0]
652
1067
  data_object_slots = {
@@ -658,10 +1073,49 @@ class SubmissionPortalTranslator(Translator):
658
1073
  )
659
1074
  data_object = nmdc.DataObject(**data_object_slots)
660
1075
 
661
- omics_processing.has_output.append(data_object_id)
1076
+ nucleotide_sequencing.has_output.append(data_object_id)
662
1077
 
663
1078
  database.data_object_set.append(data_object)
664
1079
 
665
- database.omics_processing_set.append(omics_processing)
1080
+ database.data_generation_set.append(nucleotide_sequencing)
666
1081
 
667
1082
  return database
1083
+
1084
+ @staticmethod
1085
+ def set_study_images(
1086
+ nmdc_study: nmdc.Study,
1087
+ pi_image_url: Optional[str],
1088
+ primary_study_image_url: Optional[str],
1089
+ study_images_url: Optional[list[str]],
1090
+ ) -> None:
1091
+ """Set images for a study based on provided URLs."""
1092
+
1093
+ if pi_image_url:
1094
+ if not nmdc_study.principal_investigator:
1095
+ nmdc_study.principal_investigator = nmdc.PersonValue(
1096
+ type="nmdc:PersonValue"
1097
+ )
1098
+ nmdc_study.principal_investigator.profile_image_url = pi_image_url
1099
+
1100
+ if primary_study_image_url:
1101
+ if not nmdc_study.study_image:
1102
+ nmdc_study.study_image = []
1103
+ nmdc_study.study_image.append(
1104
+ nmdc.ImageValue(
1105
+ type="nmdc:ImageValue",
1106
+ url=primary_study_image_url,
1107
+ display_order=0,
1108
+ )
1109
+ )
1110
+
1111
+ if study_images_url:
1112
+ if not nmdc_study.study_image:
1113
+ nmdc_study.study_image = []
1114
+ for idx, image_url in enumerate(study_images_url, start=1):
1115
+ nmdc_study.study_image.append(
1116
+ nmdc.ImageValue(
1117
+ type="nmdc:ImageValue",
1118
+ url=image_url,
1119
+ display_order=idx,
1120
+ )
1121
+ )