nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -1,7 +1,9 @@
1
1
  import collections
2
+ import csv
2
3
  import re
3
4
  from typing import List, Tuple, Union
4
5
  from nmdc_schema import nmdc
6
+ import pandas as pd
5
7
 
6
8
  from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
7
9
 
@@ -10,18 +12,22 @@ class GoldStudyTranslator(Translator):
10
12
  def __init__(
11
13
  self,
12
14
  study: JSON_OBJECT = {},
15
+ study_type: str = "research_study",
13
16
  biosamples: List[JSON_OBJECT] = [],
14
17
  projects: List[JSON_OBJECT] = [],
15
18
  analysis_projects: List[JSON_OBJECT] = [],
19
+ gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
16
20
  *args,
17
21
  **kwargs,
18
22
  ) -> None:
19
23
  super().__init__(*args, **kwargs)
20
24
 
21
25
  self.study = study
26
+ self.study_type = nmdc.StudyCategoryEnum(study_type)
22
27
  self.biosamples = biosamples
23
28
  self.projects = projects
24
29
  self.analysis_projects = analysis_projects
30
+ self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
25
31
 
26
32
  self._projects_by_id = self._index_by_id(self.projects, "projectGoldId")
27
33
  self._analysis_projects_by_id = self._index_by_id(
@@ -69,6 +75,7 @@ class GoldStudyTranslator(Translator):
69
75
  has_raw_value=pi_dict.get("name"),
70
76
  name=pi_dict.get("name"),
71
77
  email=pi_dict.get("email"),
78
+ type="nmdc:PersonValue",
72
79
  )
73
80
 
74
81
  def _get_mod_date(self, gold_entity: JSON_OBJECT) -> Union[str, None]:
@@ -108,22 +115,58 @@ class GoldStudyTranslator(Translator):
108
115
 
109
116
  def _get_samp_taxon_id(
110
117
  self, gold_biosample: JSON_OBJECT
111
- ) -> Union[nmdc.TextValue, None]:
112
- """Get a TextValue representing the NCBI taxon for a GOLD biosample
118
+ ) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
119
+ """Get a ControlledIdentifiedTermValue representing the NCBI taxon
120
+ for a GOLD biosample
113
121
 
114
122
  This method gets the `ncbiTaxName` and `ncbiTaxId` from a GOLD biosample object.
115
- If both are not `None`, it constructs a TextValue of the format
123
+ If both are not `None`, it constructs a ControlledIdentifiedTermValue of the format
116
124
  `{ncbiTaxName} [NCBITaxon:{ncbiTaxId}]`. Otherwise, it returns `None`
117
125
 
118
126
  :param gold_biosample: GOLD biosample object
119
- :return: TextValue object
127
+ :return: ControlledIdentifiedTermValue object
120
128
  """
121
129
  ncbi_tax_name = gold_biosample.get("ncbiTaxName")
122
130
  ncbi_tax_id = gold_biosample.get("ncbiTaxId")
123
131
  if ncbi_tax_name is None or ncbi_tax_id is None:
124
132
  return None
125
133
 
126
- return nmdc.TextValue(f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]")
134
+ raw_value = f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]"
135
+
136
+ return nmdc.ControlledIdentifiedTermValue(
137
+ has_raw_value=raw_value,
138
+ term=nmdc.OntologyClass(
139
+ id=f"NCBITaxon:{ncbi_tax_id}",
140
+ name=ncbi_tax_name,
141
+ type="nmdc:OntologyClass",
142
+ ),
143
+ type="nmdc:ControlledIdentifiedTermValue",
144
+ )
145
+
146
+ def _get_host_taxid(
147
+ self, gold_biosample: JSON_OBJECT
148
+ ) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
149
+ """Get a ControlledIdentifiedTermValue representing the NCBI host taxon id
150
+ for a GOLD biosample
151
+
152
+ This method gets the `hostNcbiTaxid` from a GOLD biosample object.
153
+ It constructs a ControlledIdentifiedTermValue of the format
154
+ `[NCBITaxon:{hostNcbiTaxid}]`. Otherwise, it returns `None`
155
+
156
+ :param gold_biosample: GOLD biosample object
157
+ :return: ControlledIdentifiedTermValue object
158
+ """
159
+ host_taxid = gold_biosample.get("hostNcbiTaxid")
160
+ if host_taxid is None:
161
+ return None
162
+ return nmdc.ControlledIdentifiedTermValue(
163
+ has_raw_value=f"NCBITaxon:{host_taxid}",
164
+ term=nmdc.OntologyClass(
165
+ id=f"NCBITaxon:{host_taxid}",
166
+ type="nmdc:OntologyClass",
167
+ ),
168
+ type="nmdc:ControlledIdentifiedTermValue",
169
+ )
127
170
 
128
171
  def _get_samp_name(self, gold_biosample: JSON_OBJECT) -> Union[str, None]:
129
172
  """Get a sample name for a GOLD biosample object
@@ -183,7 +226,9 @@ class GoldStudyTranslator(Translator):
183
226
  date_collected = gold_biosample.get("dateCollected")
184
227
  if date_collected is None:
185
228
  return None
186
- return nmdc.TimestampValue(has_raw_value=date_collected)
229
+ return nmdc.TimestampValue(
230
+ has_raw_value=date_collected, type="nmdc:TimestampValue"
231
+ )
187
232
 
188
233
  def _get_quantity_value(
189
234
  self,
@@ -215,12 +260,14 @@ class GoldStudyTranslator(Translator):
215
260
  has_raw_value=minimum_numeric_value,
216
261
  has_numeric_value=nmdc.Double(minimum_numeric_value),
217
262
  has_unit=unit,
263
+ type="nmdc:QuantityValue",
218
264
  )
219
265
  else:
220
266
  return nmdc.QuantityValue(
221
267
  has_minimum_numeric_value=nmdc.Double(minimum_numeric_value),
222
268
  has_maximum_numeric_value=nmdc.Double(maximum_numeric_value),
223
269
  has_unit=unit,
270
+ type="nmdc:QuantityValue",
224
271
  )
225
272
 
226
273
  field_value = gold_entity.get(gold_field)
@@ -231,6 +278,7 @@ class GoldStudyTranslator(Translator):
231
278
  has_raw_value=field_value,
232
279
  has_numeric_value=nmdc.Double(field_value),
233
280
  has_unit=unit,
281
+ type="nmdc:QuantityValue",
234
282
  )
235
283
 
236
284
  def _get_text_value(
@@ -249,7 +297,7 @@ class GoldStudyTranslator(Translator):
249
297
  field_value = gold_entity.get(gold_field)
250
298
  if field_value is None:
251
299
  return None
252
- return nmdc.TextValue(has_raw_value=field_value)
300
+ return nmdc.TextValue(has_raw_value=field_value, type="nmdc:TextValue")
253
301
 
254
302
  def _get_controlled_term_value(
255
303
  self, gold_entity: JSON_OBJECT, gold_field: str
@@ -267,7 +315,9 @@ class GoldStudyTranslator(Translator):
267
315
  field_value = gold_entity.get(gold_field)
268
316
  if field_value is None:
269
317
  return None
270
- return nmdc.ControlledTermValue(has_raw_value=field_value)
318
+ return nmdc.ControlledTermValue(
319
+ has_raw_value=field_value, type="nmdc:ControlledTermValue"
320
+ )
271
321
 
272
322
  def _get_env_term_value(
273
323
  self, gold_biosample: JSON_OBJECT, gold_field: str
@@ -277,8 +327,8 @@ class GoldStudyTranslator(Translator):
277
327
  In GOLD entities ENVO terms are represented as a nested object with `id` and `label`
278
328
  fields. This method extracts this type of nested object by the given field name, and
279
329
  returns it as an `nmdc:ControlledIdentifiedTermValue` object. The `id` in the original
280
- GOLD object be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
281
- `ENVO:00005801`). If the value of the given field is `None` or if does not contain
330
+ GOLD object should be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
331
+ `ENVO:00005801`). If the value of the given field is `None` or if it does not contain
282
332
  a nested object with an `id` field, `None` is returned.
283
333
 
284
334
  :param gold_biosample: GOLD biosample object
@@ -292,8 +342,10 @@ class GoldStudyTranslator(Translator):
292
342
  term=nmdc.OntologyClass(
293
343
  id=env_field["id"].replace("_", ":"),
294
344
  name=env_field.get("label"),
345
+ type="nmdc:OntologyClass",
295
346
  ),
296
347
  has_raw_value=env_field["id"],
348
+ type="nmdc:ControlledIdentifiedTermValue",
297
349
  )
298
350
 
299
351
  def _get_lat_lon(
@@ -316,22 +368,40 @@ class GoldStudyTranslator(Translator):
316
368
  has_raw_value=f"{latitude} {longitude}",
317
369
  latitude=nmdc.DecimalDegree(latitude),
318
370
  longitude=nmdc.DecimalDegree(longitude),
371
+ type="nmdc:GeolocationValue",
319
372
  )
320
373
 
321
- def _get_instrument_name(self, gold_project: JSON_OBJECT) -> Union[str, None]:
322
- """Get instrument name used in a GOLD project
374
+ def _get_instrument(self, gold_project: JSON_OBJECT) -> Union[str, None]:
375
+ """Get instrument id referenced in instrument_set collection in Mongo.
376
+ Note: The instrument id is not retrieved by making a call to the database,
377
+ but rather parsed out from a TSV file in the nmdc-schema repo stored at
378
+ self.gold_instrument_set_mapping_file_path.
323
379
 
324
- This method gets the `seqMethod` field from a GOLD project object. If
325
- that value is not `None` it should be a list and the first element of that
326
- list is returned. If the value of the field is `None`, `None` is returned.
380
+ This method gets the seqMethod field from a GOLD project object. If
381
+ that value is not None and is in the self.gold_instrument_set_mapping_file_path
382
+ file's GOLD SeqMethod column, the corresponding instrument id from
383
+ NMDC instrument_set id column is returned. If the value of the field
384
+ is None, None is returned.
327
385
 
328
386
  :param gold_project: GOLD project object
329
- :return: Instrument name
387
+ :return: id corresponding to an Instrument from instrument_set collection
330
388
  """
331
389
  seq_method = gold_project.get("seqMethod")
332
390
  if not seq_method:
333
391
  return None
334
- return seq_method[0]
392
+
393
+ seq_method = seq_method[0].strip()
394
+ df = self.gold_nmdc_instrument_map_df
395
+
396
+ matching_row = df[df["GOLD SeqMethod"] == seq_method]
397
+
398
+ if not matching_row.empty:
399
+ instrument_id = matching_row["NMDC instrument_set id"].values[0]
400
+ return instrument_id
401
+
402
+ raise ValueError(
403
+ f"seqMethod '{seq_method}' could not be found in the GOLD-NMDC instrument mapping TSV file."
404
+ )
335
405
 
336
406
  def _get_processing_institution(
337
407
  self, gold_project: JSON_OBJECT
@@ -407,6 +477,7 @@ class GoldStudyTranslator(Translator):
407
477
  principal_investigator=self._get_pi(gold_study),
408
478
  title=gold_study.get("studyName"),
409
479
  type="nmdc:Study",
480
+ study_category=self.study_type,
410
481
  )
411
482
 
412
483
  def _translate_biosample(
@@ -454,7 +525,7 @@ class GoldStudyTranslator(Translator):
454
525
  gold_biosample_identifiers=self._get_curie("gold", gold_biosample_id),
455
526
  habitat=gold_biosample.get("habitat"),
456
527
  host_name=gold_biosample.get("hostName"),
457
- host_taxid=self._get_text_value(gold_biosample, "hostNcbiTaxid"),
528
+ host_taxid=self._get_host_taxid(gold_biosample),
458
529
  id=nmdc_biosample_id,
459
530
  img_identifiers=self._get_img_identifiers(gold_biosample_id),
460
531
  insdc_biosample_identifiers=self._get_insdc_biosample_identifiers(
@@ -466,7 +537,6 @@ class GoldStudyTranslator(Translator):
466
537
  name=gold_biosample.get("biosampleName"),
467
538
  ncbi_taxonomy_name=gold_biosample.get("ncbiTaxName"),
468
539
  nitrite=self._get_quantity_value(gold_biosample, "nitrateConcentration"),
469
- part_of=nmdc_study_id,
470
540
  ph=gold_biosample.get("ph"),
471
541
  pressure=self._get_quantity_value(gold_biosample, "pressure"),
472
542
  samp_name=self._get_samp_name(gold_biosample),
@@ -482,47 +552,46 @@ class GoldStudyTranslator(Translator):
482
552
  gold_biosample, "sampleCollectionTemperature"
483
553
  ),
484
554
  type="nmdc:Biosample",
555
+ associated_studies=[nmdc_study_id],
485
556
  )
486
557
 
487
- def _translate_omics_processing(
558
+ def _translate_nucleotide_sequencing(
488
559
  self,
489
560
  gold_project: JSON_OBJECT,
490
- nmdc_omics_processing_id: str,
561
+ nmdc_nucleotide_sequencing_id: str,
491
562
  nmdc_biosample_id: str,
492
563
  nmdc_study_id: str,
493
- ) -> nmdc.OmicsProcessing:
494
- """Translate a GOLD project object into an `nmdc:OmicsProcessing` object.
564
+ ):
565
+ """Translate a GOLD project object into an `nmdc:NucleotideSequencing` object.
495
566
 
496
- This method translates a GOLD project object into an equivalent `nmdc:OmicsProcessing`
567
+ This method translates a GOLD project object into an equivalent `nmdc:NucleotideSequencing`
497
568
  object. Any minted NMDC IDs must be passed to this method. Internally, each
498
- slot of the `nmdc:OmicsProcessing` is either directly pulled from the GOLD object or
569
+ slot of the `nmdc:NucleotideSequencing` is either directly pulled from the GOLD object or
499
570
  one of the `_get_*` methods is used.
500
571
 
501
572
  :param gold_project: GOLD project object
502
- :param nmdc_omics_processing_id: Minted nmdc:OmicsProcessing identifier for the translated object
573
+ :param nmdc_omics_processing_id: Minted nmdc:NucleotideSequencing identifier for the translated object
503
574
  :param nmdc_biosample_id: Minted nmdc:Biosample identifier for the related Biosample
504
575
  :param nmdc_study_id: Minted nmdc:Study identifier for the related Study
505
- :return: nmdc:OmicsProcessing object
576
+ :return: nmdc:NucleotideSequencing object
506
577
  """
507
578
  gold_project_id = gold_project["projectGoldId"]
508
- return nmdc.OmicsProcessing(
509
- id=nmdc_omics_processing_id,
579
+ return nmdc.NucleotideSequencing(
580
+ id=nmdc_nucleotide_sequencing_id,
510
581
  name=gold_project.get("projectName"),
511
582
  gold_sequencing_project_identifiers=self._get_curie(
512
583
  "gold", gold_project_id
513
584
  ),
514
585
  ncbi_project_name=gold_project.get("projectName"),
515
- type="nmdc:OmicsProcessing",
586
+ type="nmdc:NucleotideSequencing",
516
587
  has_input=nmdc_biosample_id,
517
- part_of=nmdc_study_id,
518
588
  add_date=gold_project.get("addDate"),
519
589
  mod_date=self._get_mod_date(gold_project),
520
590
  principal_investigator=self._get_pi(gold_project),
521
- omics_type=self._get_controlled_term_value(
522
- gold_project, "sequencingStrategy"
523
- ),
524
- instrument_name=self._get_instrument_name(gold_project),
525
591
  processing_institution=self._get_processing_institution(gold_project),
592
+ instrument_used=self._get_instrument(gold_project),
593
+ analyte_category="metagenome",
594
+ associated_studies=[nmdc_study_id],
526
595
  )
527
596
 
528
597
  def get_database(self) -> nmdc.Database:
@@ -563,11 +632,11 @@ class GoldStudyTranslator(Translator):
563
632
  }
564
633
 
565
634
  gold_project_ids = [project["projectGoldId"] for project in self.projects]
566
- nmdc_omics_processing_ids = self._id_minter(
567
- "nmdc:OmicsProcessing", len(gold_project_ids)
635
+ nmdc_nucleotide_sequencing_ids = self._id_minter(
636
+ "nmdc:NucleotideSequencing", len(gold_project_ids)
568
637
  )
569
- gold_project_to_nmdc_omics_processing_ids = dict(
570
- zip(gold_project_ids, nmdc_omics_processing_ids)
638
+ gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
639
+ zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
571
640
  )
572
641
 
573
642
  database.study_set = [self._translate_study(self.study, nmdc_study_id)]
@@ -585,13 +654,13 @@ class GoldStudyTranslator(Translator):
585
654
  for biosample in self.biosamples
586
655
  ]
587
656
  database.field_research_site_set = [
588
- nmdc.FieldResearchSite(id=id, name=name)
657
+ nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
589
658
  for name, id in gold_name_to_nmdc_field_site_ids.items()
590
659
  ]
591
- database.omics_processing_set = [
592
- self._translate_omics_processing(
660
+ database.data_generation_set = [
661
+ self._translate_nucleotide_sequencing(
593
662
  project,
594
- nmdc_omics_processing_id=gold_project_to_nmdc_omics_processing_ids[
663
+ nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
595
664
  project["projectGoldId"]
596
665
  ],
597
666
  nmdc_biosample_id=gold_to_nmdc_biosample_ids[
@@ -1,5 +1,6 @@
1
1
  import re
2
2
  import sqlite3
3
+ from typing import Union
3
4
 
4
5
  import pandas as pd
5
6
  import requests_cache
@@ -47,6 +48,7 @@ class NeonBenthicDataTranslator(Translator):
47
48
  site_code_mapping: dict,
48
49
  neon_envo_mappings_file: pd.DataFrame,
49
50
  neon_raw_data_file_mappings_file: pd.DataFrame,
51
+ neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
50
52
  *args,
51
53
  **kwargs,
52
54
  ) -> None:
@@ -92,13 +94,13 @@ class NeonBenthicDataTranslator(Translator):
92
94
  )
93
95
 
94
96
  self.site_code_mapping = site_code_mapping
97
+ self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
95
98
 
96
99
  def _translate_biosample(
97
100
  self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
98
101
  ) -> nmdc.Biosample:
99
102
  return nmdc.Biosample(
100
103
  id=nmdc_id,
101
- part_of="nmdc:sty-11-pzmd0x14",
102
104
  env_broad_scale=_create_controlled_identified_term_value(
103
105
  BENTHIC_BROAD_SCALE_MAPPINGS.get(
104
106
  biosample_row["aquaticSiteType"].values[0]
@@ -146,8 +148,10 @@ class NeonBenthicDataTranslator(Translator):
146
148
  depth=nmdc.QuantityValue(
147
149
  has_minimum_numeric_value=nmdc.Float("0"),
148
150
  has_maximum_numeric_value=nmdc.Float("1"),
149
- has_unit="meters",
151
+ has_unit="m",
152
+ type="nmdc:QuantityValue",
150
153
  ),
154
+ associated_studies=["nmdc:sty-11-pzmd0x14"],
151
155
  )
152
156
 
153
157
  def _translate_extraction_process(
@@ -185,10 +189,9 @@ class NeonBenthicDataTranslator(Translator):
185
189
  input_mass=_create_quantity_value(
186
190
  _get_value_or_none(extraction_row, "sampleMass"), "g"
187
191
  ),
188
- quality_control_report=nmdc.QualityControlReport(
189
- status=_get_value_or_none(extraction_row, "qaqcStatus")
190
- ),
192
+ qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
191
193
  processing_institution=processing_institution,
194
+ type="nmdc:Extraction",
192
195
  )
193
196
 
194
197
  def _translate_library_preparation(
@@ -201,13 +204,13 @@ class NeonBenthicDataTranslator(Translator):
201
204
  """
202
205
  Create LibraryPreparation process object. The input to LibraryPreparation process
203
206
  is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
204
- process is fed as input to an OmicsProcessing object.
207
+ process is fed as input to an NucleotideSequencing object.
205
208
 
206
209
  :param library_preparation_id: Minted id for LibraryPreparation process.
207
210
  :param library_preparation_input: Input to LibraryPreparation process is output from
208
211
  Extraction process.
209
212
  :param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
210
- is also input to OmicsProcessing.
213
+ is also input to NucleotideSequencing.
211
214
  :param library_preparation_row: Metadata required to populate LibraryPreparation.
212
215
  :return: Object that using LibraryPreparation process model.
213
216
  """
@@ -226,31 +229,47 @@ class NeonBenthicDataTranslator(Translator):
226
229
  start_date=_get_value_or_none(library_preparation_row, "collectDate"),
227
230
  end_date=_get_value_or_none(library_preparation_row, "processedDate"),
228
231
  processing_institution=processing_institution,
232
+ type="nmdc:LibraryPreparation",
229
233
  )
230
234
 
231
- def _translate_omics_processing(
235
+ def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
236
+ if not instrument_model:
237
+ raise ValueError(
238
+ f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
239
+ )
240
+
241
+ df = self.neon_nmdc_instrument_map_df
242
+ matching_row = df[
243
+ df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
244
+ ]
245
+
246
+ if not matching_row.empty:
247
+ nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
248
+ return nmdc_instrument_id
249
+
250
+ def _translate_nucleotide_sequencing(
232
251
  self,
233
- omics_processing_id: str,
252
+ nucleotide_sequencing_id: str,
234
253
  processed_sample_id: str,
235
254
  raw_data_file_data: str,
236
- omics_processing_row: pd.DataFrame,
237
- ) -> nmdc.OmicsProcessing:
238
- """Create nmdc OmicsProcessing object. This class typically models the run of a
239
- Bioinformatics workflow on sequence data from a biosample. The input to an OmicsProcessing
240
- process is the output from a LibraryPreparation process, and the output of OmicsProcessing
255
+ nucleotide_sequencing_row: pd.DataFrame,
256
+ ):
257
+ """Create nmdc NucleotideSequencing object. This class typically models the run of a
258
+ Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
259
+ process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
241
260
  is a DataObject which has the FASTQ sequence file URLs embedded in them.
242
261
 
243
- :param omics_processing_id: Minted id for an OmicsProcessing process.
262
+ :param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
244
263
  :param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
245
264
  :param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
246
265
  files embedded in them.
247
- :param omics_processing_row: DataFrame with metadata for an OmicsProcessing workflow
266
+ :param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
248
267
  process/run.
249
- :return: OmicsProcessing object that models a Bioinformatics workflow process/run.
268
+ :return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
250
269
  """
251
270
  processing_institution = None
252
271
  sequencing_facility = _get_value_or_none(
253
- omics_processing_row, "sequencingFacilityID"
272
+ nucleotide_sequencing_row, "sequencingFacilityID"
254
273
  )
255
274
  if sequencing_facility is not None:
256
275
  if re.search("Battelle", sequencing_facility, re.IGNORECASE):
@@ -258,19 +277,21 @@ class NeonBenthicDataTranslator(Translator):
258
277
  elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
259
278
  processing_institution = "ANL"
260
279
 
261
- return nmdc.OmicsProcessing(
262
- id=omics_processing_id,
280
+ return nmdc.NucleotideSequencing(
281
+ id=nucleotide_sequencing_id,
263
282
  has_input=processed_sample_id,
264
283
  has_output=raw_data_file_data,
265
284
  processing_institution=processing_institution,
266
- ncbi_project_name=_get_value_or_none(omics_processing_row, "ncbiProjectID"),
267
- omics_type=_create_controlled_term_value(
268
- omics_processing_row["investigation_type"].values[0]
285
+ ncbi_project_name=_get_value_or_none(
286
+ nucleotide_sequencing_row, "ncbiProjectID"
287
+ ),
288
+ instrument_used=self._get_instrument_id(
289
+ _get_value_or_none(nucleotide_sequencing_row, "instrument_model")
269
290
  ),
270
- instrument_name=f"{_get_value_or_none(omics_processing_row, 'sequencingMethod')} {_get_value_or_none(omics_processing_row, 'instrument_model')}",
271
- part_of="nmdc:sty-11-34xj1150",
272
- name=f"Terrestrial soil microbial communities - {_get_value_or_none(omics_processing_row, 'dnaSampleID')}",
273
- type="nmdc:OmicsProcessing",
291
+ name=f"Benthic microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
292
+ type="nmdc:NucleotideSequencing",
293
+ associated_studies=["nmdc:sty-11-pzmd0x14"],
294
+ analyte_category="metagenome",
274
295
  )
275
296
 
276
297
  def _translate_processed_sample(
@@ -287,12 +308,14 @@ class NeonBenthicDataTranslator(Translator):
287
308
  :param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
288
309
  :return: ProcessedSample objects to be stored in `processed_sample_set`.
289
310
  """
290
- return nmdc.ProcessedSample(id=processed_sample_id, name=sample_id)
311
+ return nmdc.ProcessedSample(
312
+ id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
313
+ )
291
314
 
292
315
  def _translate_data_object(
293
316
  self, do_id: str, url: str, do_type: str, checksum: str
294
317
  ) -> nmdc.DataObject:
295
- """Create nmdc DataObject which is the output of an OmicsProcessing process. This
318
+ """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
296
319
  object mainly contains information about the sequencing file that was generated as
297
320
  the result of running a Bioinformatics workflow on a certain ProcessedSample, which
298
321
  is the result of a LibraryPreparation process.
@@ -419,7 +442,9 @@ class NeonBenthicDataTranslator(Translator):
419
442
  )
420
443
 
421
444
  neon_omprc_ids = benthic_samples["sampleID"]
422
- nmdc_omprc_ids = self._id_minter("nmdc:OmicsProcessing", len(neon_omprc_ids))
445
+ nmdc_omprc_ids = self._id_minter(
446
+ "nmdc:NucleotideSequencing", len(neon_omprc_ids)
447
+ )
423
448
  neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
424
449
 
425
450
  neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
@@ -445,7 +470,7 @@ class NeonBenthicDataTranslator(Translator):
445
470
  processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
446
471
 
447
472
  if extraction_input is not None and processed_sample_id is not None:
448
- database.extraction_set.append(
473
+ database.material_processing_set.append(
449
474
  self._translate_extraction_process(
450
475
  nmdc_id,
451
476
  extraction_input,
@@ -489,7 +514,7 @@ class NeonBenthicDataTranslator(Translator):
489
514
  processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
490
515
 
491
516
  if lib_prep_input is not None and processed_sample_id is not None:
492
- database.library_preparation_set.append(
517
+ database.material_processing_set.append(
493
518
  self._translate_library_preparation(
494
519
  nmdc_id,
495
520
  lib_prep_input,
@@ -536,8 +561,8 @@ class NeonBenthicDataTranslator(Translator):
536
561
  )
537
562
  )
538
563
 
539
- database.omics_processing_set.append(
540
- self._translate_omics_processing(
564
+ database.data_generation_set.append(
565
+ self._translate_nucleotide_sequencing(
541
566
  neon_to_nmdc_omprc_ids.get(neon_id),
542
567
  processed_sample_id,
543
568
  has_output_do_ids,