nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -2,26 +2,94 @@ import collections
2
2
  import re
3
3
  from typing import List, Tuple, Union
4
4
  from nmdc_schema import nmdc
5
+ import pandas as pd
5
6
 
6
7
  from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
7
8
 
9
+ # Dictionary of sequencing strategies from GOLD that we are filtering on
10
+ # based on the kind of samples that are required for NMDC
11
+ SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
12
+
13
+
14
+ def _is_valid_project(project: dict) -> bool:
15
+ """A project is considered valid if:
16
+ 1. `sequencingStrategy` is in {"Metagenome", "Metatranscriptome"}
17
+ 2. if `sequencingCenters` == 'DOE Joint Genome Institute (JGI)' then
18
+ `projectStatus` must be in ("Permanent Draft", "Complete and Published")
19
+ 3. otherwise, no `projectStatus` filter is applied
20
+
21
+ :param project: GOLD project object (structurally similar to response
22
+ from `/projects` endpoint)
23
+ :return: True if the project is valid, False otherwise
24
+ """
25
+ if project.get("sequencingStrategy") not in SEQUENCING_STRATEGIES:
26
+ return False
27
+
28
+ if project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)":
29
+ return project.get("projectStatus") in (
30
+ "Permanent Draft",
31
+ "Complete and Published",
32
+ )
33
+
34
+ return True
35
+
8
36
 
9
37
  class GoldStudyTranslator(Translator):
10
38
  def __init__(
11
39
  self,
12
40
  study: JSON_OBJECT = {},
41
+ study_type: str = "research_study",
13
42
  biosamples: List[JSON_OBJECT] = [],
14
43
  projects: List[JSON_OBJECT] = [],
15
44
  analysis_projects: List[JSON_OBJECT] = [],
45
+ gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
46
+ include_field_site_info: bool = False,
47
+ enable_biosample_filtering: bool = True,
16
48
  *args,
17
49
  **kwargs,
18
50
  ) -> None:
19
51
  super().__init__(*args, **kwargs)
20
52
 
21
53
  self.study = study
22
- self.biosamples = biosamples
23
- self.projects = projects
24
- self.analysis_projects = analysis_projects
54
+ self.study_type = nmdc.StudyCategoryEnum(study_type)
55
+ self.include_field_site_info = include_field_site_info
56
+ self.enable_biosample_filtering = enable_biosample_filtering
57
+ # Filter biosamples to only those with `sequencingStrategy` of
58
+ # "Metagenome" or "Metatranscriptome" if filtering is enabled
59
+ if enable_biosample_filtering:
60
+ self.biosamples = [
61
+ biosample
62
+ for biosample in biosamples
63
+ if any(
64
+ _is_valid_project(project)
65
+ for project in biosample.get("projects", [])
66
+ )
67
+ ]
68
+ else:
69
+ self.biosamples = biosamples
70
+ # Fetch the valid projectGoldIds that are associated with filtered
71
+ # biosamples on their `projects` field
72
+ valid_project_ids = {
73
+ project.get("projectGoldId")
74
+ for project in projects
75
+ if _is_valid_project(project)
76
+ }
77
+ # Filter projects to only those with `projectGoldId` in valid_project_ids
78
+ self.projects = [
79
+ project
80
+ for project in projects
81
+ if project.get("projectGoldId") in valid_project_ids
82
+ ]
83
+ # Filter analysis_projects to only those with all `projects` in valid_project_ids
84
+ self.analysis_projects = [
85
+ analysis_project
86
+ for analysis_project in analysis_projects
87
+ if all(
88
+ project_id in valid_project_ids
89
+ for project_id in analysis_project.get("projects", [])
90
+ )
91
+ ]
92
+ self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
25
93
 
26
94
  self._projects_by_id = self._index_by_id(self.projects, "projectGoldId")
27
95
  self._analysis_projects_by_id = self._index_by_id(
@@ -53,6 +121,9 @@ class GoldStudyTranslator(Translator):
53
121
  :param gold_entity: GOLD entity object
54
122
  :return: PersonValue corresponding to the first PI in the `contacts` field
55
123
  """
124
+ if "contacts" not in gold_entity:
125
+ return None
126
+
56
127
  pi_dict = next(
57
128
  (
58
129
  contact
@@ -69,6 +140,7 @@ class GoldStudyTranslator(Translator):
69
140
  has_raw_value=pi_dict.get("name"),
70
141
  name=pi_dict.get("name"),
71
142
  email=pi_dict.get("email"),
143
+ type="nmdc:PersonValue",
72
144
  )
73
145
 
74
146
  def _get_mod_date(self, gold_entity: JSON_OBJECT) -> Union[str, None]:
@@ -101,29 +173,67 @@ class GoldStudyTranslator(Translator):
101
173
  for id in self._project_ids_by_biosample_id[gold_biosample_id]
102
174
  )
103
175
  return [
104
- self._get_curie("biosample", project["ncbiBioSampleAccession"])
176
+ self._ensure_curie(
177
+ project["ncbiBioSampleAccession"], default_prefix="biosample"
178
+ )
105
179
  for project in biosample_projects
106
- if project["ncbiBioSampleAccession"]
180
+ if project.get("ncbiBioSampleAccession")
107
181
  ]
108
182
 
109
183
  def _get_samp_taxon_id(
110
184
  self, gold_biosample: JSON_OBJECT
111
- ) -> Union[nmdc.TextValue, None]:
112
- """Get a TextValue representing the NCBI taxon for a GOLD biosample
185
+ ) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
186
+ """Get a ControlledIdentifiedTermValue representing the NCBI taxon
187
+ for a GOLD biosample
113
188
 
114
189
  This method gets the `ncbiTaxName` and `ncbiTaxId` from a GOLD biosample object.
115
- If both are not `None`, it constructs a TextValue of the format
190
+ If both are not `None`, it constructs a ControlledIdentifiedTermValue of the format
116
191
  `{ncbiTaxName} [NCBITaxon:{ncbiTaxId}]`. Otherwise, it returns `None`
117
192
 
118
193
  :param gold_biosample: GOLD biosample object
119
- :return: TextValue object
194
+ :return: ControlledIdentifiedTermValue object
120
195
  """
121
196
  ncbi_tax_name = gold_biosample.get("ncbiTaxName")
122
197
  ncbi_tax_id = gold_biosample.get("ncbiTaxId")
123
198
  if ncbi_tax_name is None or ncbi_tax_id is None:
124
199
  return None
125
200
 
126
- return nmdc.TextValue(f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]")
201
+ raw_value = f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]"
202
+
203
+ return nmdc.ControlledIdentifiedTermValue(
204
+ has_raw_value=raw_value,
205
+ term=nmdc.OntologyClass(
206
+ id=f"NCBITaxon:{ncbi_tax_id}",
207
+ name=ncbi_tax_name,
208
+ type="nmdc:OntologyClass",
209
+ ),
210
+ type="nmdc:ControlledIdentifiedTermValue",
211
+ )
212
+
213
+ def _get_host_taxid(
214
+ self, gold_biosample: JSON_OBJECT
215
+ ) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
216
+ """Get a ControlledIdentifiedTermValue representing the NCBI host taxon id
217
+ for a GOLD biosample
218
+
219
+ This method gets the `hostNcbiTaxid` from a GOLD biosample object.
220
+ It constructs a ControlledIdentifiedTermValue of the format
221
+ `[NCBITaxon:{hostNcbiTaxid}]`. Otherwise, it returns `None`
222
+
223
+ :param gold_biosample: GOLD biosample object
224
+ :return: ControlledIdentifiedTermValue object
225
+ """
226
+ host_taxid = gold_biosample.get("hostNcbiTaxid")
227
+ if host_taxid is None:
228
+ return None
229
+ return nmdc.ControlledIdentifiedTermValue(
230
+ has_raw_value=f"NCBITaxon:{host_taxid}",
231
+ term=nmdc.OntologyClass(
232
+ id=f"NCBITaxon:{host_taxid}",
233
+ type="nmdc:OntologyClass",
234
+ ),
235
+ type="nmdc:ControlledIdentifiedTermValue",
236
+ )
127
237
 
128
238
  def _get_samp_name(self, gold_biosample: JSON_OBJECT) -> Union[str, None]:
129
239
  """Get a sample name for a GOLD biosample object
@@ -183,7 +293,9 @@ class GoldStudyTranslator(Translator):
183
293
  date_collected = gold_biosample.get("dateCollected")
184
294
  if date_collected is None:
185
295
  return None
186
- return nmdc.TimestampValue(has_raw_value=date_collected)
296
+ return nmdc.TimestampValue(
297
+ has_raw_value=date_collected, type="nmdc:TimestampValue"
298
+ )
187
299
 
188
300
  def _get_quantity_value(
189
301
  self,
@@ -215,23 +327,21 @@ class GoldStudyTranslator(Translator):
215
327
  has_raw_value=minimum_numeric_value,
216
328
  has_numeric_value=nmdc.Double(minimum_numeric_value),
217
329
  has_unit=unit,
330
+ type="nmdc:QuantityValue",
218
331
  )
219
332
  else:
220
333
  return nmdc.QuantityValue(
221
334
  has_minimum_numeric_value=nmdc.Double(minimum_numeric_value),
222
335
  has_maximum_numeric_value=nmdc.Double(maximum_numeric_value),
223
336
  has_unit=unit,
337
+ type="nmdc:QuantityValue",
224
338
  )
225
339
 
226
340
  field_value = gold_entity.get(gold_field)
227
341
  if field_value is None:
228
342
  return None
229
343
 
230
- return nmdc.QuantityValue(
231
- has_raw_value=field_value,
232
- has_numeric_value=nmdc.Double(field_value),
233
- has_unit=unit,
234
- )
344
+ return self._parse_quantity_value(str(field_value), unit)
235
345
 
236
346
  def _get_text_value(
237
347
  self, gold_entity: JSON_OBJECT, gold_field: str
@@ -249,7 +359,7 @@ class GoldStudyTranslator(Translator):
249
359
  field_value = gold_entity.get(gold_field)
250
360
  if field_value is None:
251
361
  return None
252
- return nmdc.TextValue(has_raw_value=field_value)
362
+ return nmdc.TextValue(has_raw_value=field_value, type="nmdc:TextValue")
253
363
 
254
364
  def _get_controlled_term_value(
255
365
  self, gold_entity: JSON_OBJECT, gold_field: str
@@ -267,7 +377,9 @@ class GoldStudyTranslator(Translator):
267
377
  field_value = gold_entity.get(gold_field)
268
378
  if field_value is None:
269
379
  return None
270
- return nmdc.ControlledTermValue(has_raw_value=field_value)
380
+ return nmdc.ControlledTermValue(
381
+ has_raw_value=field_value, type="nmdc:ControlledTermValue"
382
+ )
271
383
 
272
384
  def _get_env_term_value(
273
385
  self, gold_biosample: JSON_OBJECT, gold_field: str
@@ -277,8 +389,8 @@ class GoldStudyTranslator(Translator):
277
389
  In GOLD entities ENVO terms are represented as a nested object with `id` and `label`
278
390
  fields. This method extracts this type of nested object by the given field name, and
279
391
  returns it as an `nmdc:ControlledIdentifiedTermValue` object. The `id` in the original
280
- GOLD object be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
281
- `ENVO:00005801`). If the value of the given field is `None` or if does not contain
392
+ GOLD object should be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
393
+ `ENVO:00005801`). If the value of the given field is `None` or if it does not contain
282
394
  a nested object with an `id` field, `None` is returned.
283
395
 
284
396
  :param gold_biosample: GOLD biosample object
@@ -292,8 +404,10 @@ class GoldStudyTranslator(Translator):
292
404
  term=nmdc.OntologyClass(
293
405
  id=env_field["id"].replace("_", ":"),
294
406
  name=env_field.get("label"),
407
+ type="nmdc:OntologyClass",
295
408
  ),
296
409
  has_raw_value=env_field["id"],
410
+ type="nmdc:ControlledIdentifiedTermValue",
297
411
  )
298
412
 
299
413
  def _get_lat_lon(
@@ -316,22 +430,40 @@ class GoldStudyTranslator(Translator):
316
430
  has_raw_value=f"{latitude} {longitude}",
317
431
  latitude=nmdc.DecimalDegree(latitude),
318
432
  longitude=nmdc.DecimalDegree(longitude),
433
+ type="nmdc:GeolocationValue",
319
434
  )
320
435
 
321
- def _get_instrument_name(self, gold_project: JSON_OBJECT) -> Union[str, None]:
322
- """Get instrument name used in a GOLD project
436
+ def _get_instrument(self, gold_project: JSON_OBJECT) -> Union[str, None]:
437
+ """Get instrument id referenced in instrument_set collection in Mongo.
438
+ Note: The instrument id is not retrieved by making a call to the database,
439
+ but rather parsed out from a TSV file in the nmdc-schema repo stored at
440
+ self.gold_instrument_set_mapping_file_path.
323
441
 
324
- This method gets the `seqMethod` field from a GOLD project object. If
325
- that value is not `None` it should be a list and the first element of that
326
- list is returned. If the value of the field is `None`, `None` is returned.
442
+ This method gets the seqMethod field from a GOLD project object. If
443
+ that value is not None and is in the self.gold_instrument_set_mapping_file_path
444
+ file's GOLD SeqMethod column, the corresponding instrument id from
445
+ NMDC instrument_set id column is returned. If the value of the field
446
+ is None, None is returned.
327
447
 
328
448
  :param gold_project: GOLD project object
329
- :return: Instrument name
449
+ :return: id corresponding to an Instrument from instrument_set collection
330
450
  """
331
451
  seq_method = gold_project.get("seqMethod")
332
452
  if not seq_method:
333
453
  return None
334
- return seq_method[0]
454
+
455
+ seq_method = seq_method[0].strip()
456
+ df = self.gold_nmdc_instrument_map_df
457
+
458
+ matching_row = df[df["GOLD SeqMethod"] == seq_method]
459
+
460
+ if not matching_row.empty:
461
+ instrument_id = matching_row["NMDC instrument_set id"].values[0]
462
+ return instrument_id
463
+
464
+ raise ValueError(
465
+ f"seqMethod '{seq_method}' could not be found in the GOLD-NMDC instrument mapping TSV file."
466
+ )
335
467
 
336
468
  def _get_processing_institution(
337
469
  self, gold_project: JSON_OBJECT
@@ -401,12 +533,15 @@ class GoldStudyTranslator(Translator):
401
533
  """
402
534
  return nmdc.Study(
403
535
  description=gold_study.get("description"),
404
- gold_study_identifiers=self._get_curie("gold", gold_study["studyGoldId"]),
536
+ gold_study_identifiers=self._ensure_curie(
537
+ gold_study["studyGoldId"], default_prefix="gold"
538
+ ),
405
539
  id=nmdc_study_id,
406
540
  name=gold_study.get("studyName"),
407
541
  principal_investigator=self._get_pi(gold_study),
408
542
  title=gold_study.get("studyName"),
409
543
  type="nmdc:Study",
544
+ study_category=self.study_type,
410
545
  )
411
546
 
412
547
  def _translate_biosample(
@@ -432,13 +567,11 @@ class GoldStudyTranslator(Translator):
432
567
  gold_biosample_id = gold_biosample["biosampleGoldId"]
433
568
  return nmdc.Biosample(
434
569
  add_date=gold_biosample.get("addDate"),
435
- alt=self._get_quantity_value(
436
- gold_biosample, "altitudeInMeters", unit="meters"
437
- ),
570
+ alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
438
571
  collected_from=nmdc_field_site_id,
439
572
  collection_date=self._get_collection_date(gold_biosample),
440
573
  depth=self._get_quantity_value(
441
- gold_biosample, ("depthInMeters", "depthInMeters2"), unit="meters"
574
+ gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
442
575
  ),
443
576
  description=gold_biosample.get("description"),
444
577
  diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
@@ -451,10 +584,12 @@ class GoldStudyTranslator(Translator):
451
584
  env_local_scale=self._get_env_term_value(gold_biosample, "envoLocalScale"),
452
585
  env_medium=self._get_env_term_value(gold_biosample, "envoMedium"),
453
586
  geo_loc_name=self._get_text_value(gold_biosample, "geoLocation"),
454
- gold_biosample_identifiers=self._get_curie("gold", gold_biosample_id),
587
+ gold_biosample_identifiers=self._ensure_curie(
588
+ gold_biosample_id, default_prefix="gold"
589
+ ),
455
590
  habitat=gold_biosample.get("habitat"),
456
591
  host_name=gold_biosample.get("hostName"),
457
- host_taxid=self._get_text_value(gold_biosample, "hostNcbiTaxid"),
592
+ host_taxid=self._get_host_taxid(gold_biosample),
458
593
  id=nmdc_biosample_id,
459
594
  img_identifiers=self._get_img_identifiers(gold_biosample_id),
460
595
  insdc_biosample_identifiers=self._get_insdc_biosample_identifiers(
@@ -466,7 +601,6 @@ class GoldStudyTranslator(Translator):
466
601
  name=gold_biosample.get("biosampleName"),
467
602
  ncbi_taxonomy_name=gold_biosample.get("ncbiTaxName"),
468
603
  nitrite=self._get_quantity_value(gold_biosample, "nitrateConcentration"),
469
- part_of=nmdc_study_id,
470
604
  ph=gold_biosample.get("ph"),
471
605
  pressure=self._get_quantity_value(gold_biosample, "pressure"),
472
606
  samp_name=self._get_samp_name(gold_biosample),
@@ -476,53 +610,67 @@ class GoldStudyTranslator(Translator):
476
610
  ),
477
611
  specific_ecosystem=gold_biosample.get("specificEcosystem"),
478
612
  subsurface_depth=self._get_quantity_value(
479
- gold_biosample, "subsurfaceDepthInMeters", unit="meters"
613
+ gold_biosample, "subsurfaceDepthInMeters", unit="m"
480
614
  ),
481
615
  temp=self._get_quantity_value(
482
616
  gold_biosample, "sampleCollectionTemperature"
483
617
  ),
484
618
  type="nmdc:Biosample",
619
+ associated_studies=[nmdc_study_id],
485
620
  )
486
621
 
487
- def _translate_omics_processing(
622
+ def _translate_nucleotide_sequencing(
488
623
  self,
489
624
  gold_project: JSON_OBJECT,
490
- nmdc_omics_processing_id: str,
625
+ nmdc_nucleotide_sequencing_id: str,
491
626
  nmdc_biosample_id: str,
492
627
  nmdc_study_id: str,
493
- ) -> nmdc.OmicsProcessing:
494
- """Translate a GOLD project object into an `nmdc:OmicsProcessing` object.
628
+ ):
629
+ """Translate a GOLD project object into an `nmdc:NucleotideSequencing` object.
495
630
 
496
- This method translates a GOLD project object into an equivalent `nmdc:OmicsProcessing`
631
+ This method translates a GOLD project object into an equivalent `nmdc:NucleotideSequencing`
497
632
  object. Any minted NMDC IDs must be passed to this method. Internally, each
498
- slot of the `nmdc:OmicsProcessing` is either directly pulled from the GOLD object or
633
+ slot of the `nmdc:NucleotideSequencing` is either directly pulled from the GOLD object or
499
634
  one of the `_get_*` methods is used.
500
635
 
501
636
  :param gold_project: GOLD project object
502
- :param nmdc_omics_processing_id: Minted nmdc:OmicsProcessing identifier for the translated object
637
+ :param nmdc_omics_processing_id: Minted nmdc:NucleotideSequencing identifier for the translated object
503
638
  :param nmdc_biosample_id: Minted nmdc:Biosample identifier for the related Biosample
504
639
  :param nmdc_study_id: Minted nmdc:Study identifier for the related Study
505
- :return: nmdc:OmicsProcessing object
640
+ :return: nmdc:NucleotideSequencing object
506
641
  """
507
642
  gold_project_id = gold_project["projectGoldId"]
508
- return nmdc.OmicsProcessing(
509
- id=nmdc_omics_processing_id,
643
+ ncbi_bioproject_identifier = gold_project.get("ncbiBioProjectAccession")
644
+ insdc_bioproject_identifiers = []
645
+ if ncbi_bioproject_identifier:
646
+ insdc_bioproject_identifiers.append(
647
+ self._ensure_curie(
648
+ ncbi_bioproject_identifier,
649
+ default_prefix="bioproject",
650
+ )
651
+ )
652
+
653
+ return nmdc.NucleotideSequencing(
654
+ id=nmdc_nucleotide_sequencing_id,
510
655
  name=gold_project.get("projectName"),
511
- gold_sequencing_project_identifiers=self._get_curie(
512
- "gold", gold_project_id
656
+ gold_sequencing_project_identifiers=self._ensure_curie(
657
+ gold_project_id, default_prefix="gold"
513
658
  ),
514
659
  ncbi_project_name=gold_project.get("projectName"),
515
- type="nmdc:OmicsProcessing",
660
+ type="nmdc:NucleotideSequencing",
516
661
  has_input=nmdc_biosample_id,
517
- part_of=nmdc_study_id,
518
662
  add_date=gold_project.get("addDate"),
519
663
  mod_date=self._get_mod_date(gold_project),
664
+ insdc_bioproject_identifiers=insdc_bioproject_identifiers,
520
665
  principal_investigator=self._get_pi(gold_project),
521
- omics_type=self._get_controlled_term_value(
522
- gold_project, "sequencingStrategy"
523
- ),
524
- instrument_name=self._get_instrument_name(gold_project),
525
666
  processing_institution=self._get_processing_institution(gold_project),
667
+ instrument_used=self._get_instrument(gold_project),
668
+ analyte_category=(
669
+ gold_project.get("sequencingStrategy").lower()
670
+ if gold_project.get("sequencingStrategy")
671
+ else None
672
+ ),
673
+ associated_studies=[nmdc_study_id],
526
674
  )
527
675
 
528
676
  def get_database(self) -> nmdc.Database:
@@ -546,28 +694,31 @@ class GoldStudyTranslator(Translator):
546
694
  nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(self.biosamples))
547
695
  gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
548
696
 
549
- gold_field_site_names = sorted(
550
- {self._get_field_site_name(biosample) for biosample in self.biosamples}
551
- )
552
- nmdc_field_site_ids = self._id_minter(
553
- "nmdc:FieldResearchSite", len(gold_field_site_names)
554
- )
555
- gold_name_to_nmdc_field_site_ids = dict(
556
- zip(gold_field_site_names, nmdc_field_site_ids)
557
- )
558
- gold_biosample_to_nmdc_field_site_ids = {
559
- biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
560
- self._get_field_site_name(biosample)
561
- ]
562
- for biosample in self.biosamples
563
- }
697
+ if self.include_field_site_info:
698
+ gold_field_site_names = sorted(
699
+ {self._get_field_site_name(biosample) for biosample in self.biosamples}
700
+ )
701
+ nmdc_field_site_ids = self._id_minter(
702
+ "nmdc:FieldResearchSite", len(gold_field_site_names)
703
+ )
704
+ gold_name_to_nmdc_field_site_ids = dict(
705
+ zip(gold_field_site_names, nmdc_field_site_ids)
706
+ )
707
+ gold_biosample_to_nmdc_field_site_ids = {
708
+ biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
709
+ self._get_field_site_name(biosample)
710
+ ]
711
+ for biosample in self.biosamples
712
+ }
713
+ else:
714
+ gold_biosample_to_nmdc_field_site_ids = {}
564
715
 
565
716
  gold_project_ids = [project["projectGoldId"] for project in self.projects]
566
- nmdc_omics_processing_ids = self._id_minter(
567
- "nmdc:OmicsProcessing", len(gold_project_ids)
717
+ nmdc_nucleotide_sequencing_ids = self._id_minter(
718
+ "nmdc:NucleotideSequencing", len(gold_project_ids)
568
719
  )
569
- gold_project_to_nmdc_omics_processing_ids = dict(
570
- zip(gold_project_ids, nmdc_omics_processing_ids)
720
+ gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
721
+ zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
571
722
  )
572
723
 
573
724
  database.study_set = [self._translate_study(self.study, nmdc_study_id)]
@@ -578,20 +729,21 @@ class GoldStudyTranslator(Translator):
578
729
  biosample["biosampleGoldId"]
579
730
  ],
580
731
  nmdc_study_id=nmdc_study_id,
581
- nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids[
582
- biosample["biosampleGoldId"]
583
- ],
732
+ nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids.get(
733
+ biosample["biosampleGoldId"], None
734
+ ),
584
735
  )
585
736
  for biosample in self.biosamples
586
737
  ]
587
- database.field_research_site_set = [
588
- nmdc.FieldResearchSite(id=id, name=name)
589
- for name, id in gold_name_to_nmdc_field_site_ids.items()
590
- ]
591
- database.omics_processing_set = [
592
- self._translate_omics_processing(
738
+ if self.include_field_site_info:
739
+ database.field_research_site_set = [
740
+ nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
741
+ for name, id in gold_name_to_nmdc_field_site_ids.items()
742
+ ]
743
+ database.data_generation_set = [
744
+ self._translate_nucleotide_sequencing(
593
745
  project,
594
- nmdc_omics_processing_id=gold_project_to_nmdc_omics_processing_ids[
746
+ nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
595
747
  project["projectGoldId"]
596
748
  ],
597
749
  nmdc_biosample_id=gold_to_nmdc_biosample_ids[