nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  import re
2
2
  import sqlite3
3
- from typing import List
3
+ from typing import List, Union
4
4
 
5
5
  import pandas as pd
6
6
 
@@ -26,6 +26,7 @@ class NeonSoilDataTranslator(Translator):
26
26
  sls_data: dict,
27
27
  neon_envo_mappings_file: pd.DataFrame,
28
28
  neon_raw_data_file_mappings_file: pd.DataFrame,
29
+ neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
29
30
  *args,
30
31
  **kwargs,
31
32
  ) -> None:
@@ -99,6 +100,23 @@ class NeonSoilDataTranslator(Translator):
99
100
  "neonRawDataFile", self.conn, if_exists="replace", index=False
100
101
  )
101
102
 
103
+ self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
104
+
105
+ def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
106
+ if not instrument_model:
107
+ raise ValueError(
108
+ f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
109
+ )
110
+
111
+ df = self.neon_nmdc_instrument_map_df
112
+ matching_row = df[
113
+ df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
114
+ ]
115
+
116
+ if not matching_row.empty:
117
+ nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
118
+ return nmdc_instrument_id
119
+
102
120
  def _translate_biosample(
103
121
  self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
104
122
  ) -> nmdc.Biosample:
@@ -116,7 +134,6 @@ class NeonSoilDataTranslator(Translator):
116
134
  """
117
135
  return nmdc.Biosample(
118
136
  id=nmdc_id,
119
- part_of="nmdc:sty-11-34xj1150",
120
137
  env_broad_scale=_create_controlled_identified_term_value(
121
138
  "ENVO:00000446", "terrestrial biome"
122
139
  ),
@@ -145,6 +162,7 @@ class NeonSoilDataTranslator(Translator):
145
162
  biosample_row, "sampleBottomDepth"
146
163
  ),
147
164
  has_unit="m",
165
+ type="nmdc:QuantityValue",
148
166
  ),
149
167
  samp_collec_device=_get_value_or_none(biosample_row, "soilSamplingDevice"),
150
168
  soil_horizon=_get_value_or_none(biosample_row, "horizon"),
@@ -172,6 +190,7 @@ class NeonSoilDataTranslator(Translator):
172
190
  biosample_row["kclNitrateNitriteNConc"].values[0], "mg/L"
173
191
  ),
174
192
  type="nmdc:Biosample",
193
+ associated_studies=["nmdc:sty-11-34xj1150"],
175
194
  )
176
195
 
177
196
  def _translate_pooling_process(
@@ -198,6 +217,7 @@ class NeonSoilDataTranslator(Translator):
198
217
  has_input=bsm_input_values_list,
199
218
  start_date=_get_value_or_none(pooling_row, "startDate"),
200
219
  end_date=_get_value_or_none(pooling_row, "collectDate"),
220
+ type="nmdc:Pooling",
201
221
  )
202
222
 
203
223
  def _translate_processed_sample(
@@ -214,12 +234,14 @@ class NeonSoilDataTranslator(Translator):
214
234
  :param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
215
235
  :return: ProcessedSample objects to be stored in `processed_sample_set`.
216
236
  """
217
- return nmdc.ProcessedSample(id=processed_sample_id, name=sample_id)
237
+ return nmdc.ProcessedSample(
238
+ id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
239
+ )
218
240
 
219
241
  def _translate_data_object(
220
242
  self, do_id: str, url: str, do_type: str, checksum: str
221
243
  ) -> nmdc.DataObject:
222
- """Create nmdc DataObject which is the output of an OmicsProcessing process. This
244
+ """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
223
245
  object mainly contains information about the sequencing file that was generated as
224
246
  the result of running a Bioinformatics workflow on a certain ProcessedSample, which
225
247
  is the result of a LibraryPreparation process.
@@ -280,10 +302,9 @@ class NeonSoilDataTranslator(Translator):
280
302
  input_mass=_create_quantity_value(
281
303
  _get_value_or_none(extraction_row, "sampleMass"), "g"
282
304
  ),
283
- quality_control_report=nmdc.QualityControlReport(
284
- status=_get_value_or_none(extraction_row, "qaqcStatus")
285
- ),
305
+ qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
286
306
  processing_institution=processing_institution,
307
+ type="nmdc:Extraction",
287
308
  )
288
309
 
289
310
  def _translate_library_preparation(
@@ -296,13 +317,13 @@ class NeonSoilDataTranslator(Translator):
296
317
  """
297
318
  Create LibraryPreparation process object. The input to LibraryPreparation process
298
319
  is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
299
- process is fed as input to an OmicsProcessing object.
320
+ process is fed as input to an NucleotideSequencing object.
300
321
 
301
322
  :param library_preparation_id: Minted id for LibraryPreparation process.
302
323
  :param library_preparation_input: Input to LibraryPreparation process is output from
303
324
  Extraction process.
304
325
  :param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
305
- is also input to OmicsProcessing.
326
+ is also input to NucleotideSequencing.
306
327
  :param library_preparation_row: Metadata required to populate LibraryPreparation.
307
328
  :return: Object that using LibraryPreparation process model.
308
329
  """
@@ -321,31 +342,32 @@ class NeonSoilDataTranslator(Translator):
321
342
  start_date=_get_value_or_none(library_preparation_row, "collectDate"),
322
343
  end_date=_get_value_or_none(library_preparation_row, "processedDate"),
323
344
  processing_institution=processing_institution,
345
+ type="nmdc:LibraryPreparation",
324
346
  )
325
347
 
326
- def _translate_omics_processing(
348
+ def _translate_nucleotide_sequencing(
327
349
  self,
328
- omics_processing_id: str,
350
+ nucleotide_sequencing_id: str,
329
351
  processed_sample_id: str,
330
352
  raw_data_file_data: str,
331
- omics_processing_row: pd.DataFrame,
332
- ) -> nmdc.OmicsProcessing:
333
- """Create nmdc OmicsProcessing object. This class typically models the run of a
334
- Bioinformatics workflow on sequence data from a biosample. The input to an OmicsProcessing
335
- process is the output from a LibraryPreparation process, and the output of OmicsProcessing
353
+ nucleotide_sequencing_row: pd.DataFrame,
354
+ ):
355
+ """Create nmdc NucleotideSequencing object. This class typically models the run of a
356
+ Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
357
+ process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
336
358
  is a DataObject which has the FASTQ sequence file URLs embedded in them.
337
359
 
338
- :param omics_processing_id: Minted id for an OmicsProcessing process.
360
+ :param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
339
361
  :param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
340
362
  :param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
341
363
  files embedded in them.
342
- :param omics_processing_row: DataFrame with metadata for an OmicsProcessing workflow
364
+ :param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
343
365
  process/run.
344
- :return: OmicsProcessing object that models a Bioinformatics workflow process/run.
366
+ :return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
345
367
  """
346
368
  processing_institution = None
347
369
  sequencing_facility = _get_value_or_none(
348
- omics_processing_row, "sequencingFacilityID"
370
+ nucleotide_sequencing_row, "sequencingFacilityID"
349
371
  )
350
372
  if sequencing_facility is not None:
351
373
  if re.search("Battelle", sequencing_facility, re.IGNORECASE):
@@ -353,19 +375,21 @@ class NeonSoilDataTranslator(Translator):
353
375
  elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
354
376
  processing_institution = "ANL"
355
377
 
356
- return nmdc.OmicsProcessing(
357
- id=omics_processing_id,
378
+ return nmdc.NucleotideSequencing(
379
+ id=nucleotide_sequencing_id,
358
380
  has_input=processed_sample_id,
359
381
  has_output=raw_data_file_data,
360
382
  processing_institution=processing_institution,
361
- ncbi_project_name=_get_value_or_none(omics_processing_row, "ncbiProjectID"),
362
- omics_type=_create_controlled_term_value(
363
- omics_processing_row["investigation_type"].values[0]
383
+ ncbi_project_name=_get_value_or_none(
384
+ nucleotide_sequencing_row, "ncbiProjectID"
364
385
  ),
365
- instrument_name=f"{_get_value_or_none(omics_processing_row, 'sequencingMethod')} {_get_value_or_none(omics_processing_row, 'instrument_model')}",
366
- part_of="nmdc:sty-11-34xj1150",
367
- name=f"Terrestrial soil microbial communities - {_get_value_or_none(omics_processing_row, 'dnaSampleID')}",
368
- type="nmdc:OmicsProcessing",
386
+ instrument_used=self._get_instrument_id(
387
+ _get_value_or_none(nucleotide_sequencing_row, "instrument_model")
388
+ ),
389
+ name=f"Terrestrial soil microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
390
+ type="nmdc:NucleotideSequencing",
391
+ associated_studies=["nmdc:sty-11-34xj1150"],
392
+ analyte_category="metagenome",
369
393
  )
370
394
 
371
395
  def get_database(self) -> nmdc.Database:
@@ -373,10 +397,9 @@ class NeonSoilDataTranslator(Translator):
373
397
  nmdc object creation methods as well as the nmdc type (QuantityValue, GeolocationValue, etc.)
374
398
  creation methods, to make an nmdc Database object. It populates multiple sets in the Mongo database -
375
399
  * `biosample_set`: uses `_translate_biosample()`
376
- * `pooling_set`: uses `_translate_pooling_process()`
377
- * `extraction_set`: uses `_translate_extraction_process()`
378
- * `library_preparation_set`: uses `_translate_library_preparation()`
379
- * `omics_processing_set`: uses `_translate_omics_processing()`
400
+ * `material_processing_set`: uses `_translate_pooling_process()`, `_translate_extraction_process()`,
401
+ `_translate_library_preparation()`
402
+ * `data_generation_set`: uses `_translate_nucleotide_sequencing()`
380
403
  * `processed_sample_set`: uses `_translate_processed_sample()`
381
404
  * `data_object_set`: uses `_translate_data_object()`
382
405
  The core Biosample information is in the `sls_soilCoreCollection` table. However, we
@@ -607,14 +630,13 @@ class NeonSoilDataTranslator(Translator):
607
630
  mms_metagenomeDnaExtraction.processedDate,
608
631
  mms_metagenomeSequencing.sequencingFacilityID,
609
632
  mms_metagenomeSequencing.ncbiProjectID,
610
- mms_metagenomeSequencing.investigation_type,
611
633
  mms_metagenomeSequencing.sequencingMethod,
612
634
  mms_metagenomeSequencing.instrument_model
613
635
  FROM mms_metagenomeSequencing
614
636
  LEFT JOIN mms_metagenomeDnaExtraction ON mms_metagenomeDnaExtraction.dnaSampleID = mms_metagenomeSequencing.dnaSampleID
615
637
  """
616
638
  library_preparation_table = pd.read_sql_query(query, self.conn)
617
- omics_processing_table = pd.read_sql_query(query, self.conn)
639
+ nucleotide_sequencing_table = pd.read_sql_query(query, self.conn)
618
640
 
619
641
  nmdc_pooling_ids = self._id_minter("nmdc:Pooling", len(pooling_ids_dict))
620
642
  neon_to_nmdc_pooling_ids = dict(
@@ -653,12 +675,12 @@ class NeonSoilDataTranslator(Translator):
653
675
  zip(library_prepration_ids, nmdc_library_preparation_processed_sample_ids)
654
676
  )
655
677
 
656
- omics_processing_ids = omics_processing_table["dnaSampleID"]
657
- nmdc_omics_processing_ids = self._id_minter(
658
- "nmdc:OmicsProcessing", len(omics_processing_ids)
678
+ nucleotide_sequencing_ids = nucleotide_sequencing_table["dnaSampleID"]
679
+ nmdc_nucleotide_sequencing_ids = self._id_minter(
680
+ "nmdc:NucleotideSequencing", len(nucleotide_sequencing_ids)
659
681
  )
660
- neon_to_nmdc_omics_processing_ids = dict(
661
- zip(omics_processing_ids, nmdc_omics_processing_ids)
682
+ neon_to_nmdc_nucleotide_sequencing_ids = dict(
683
+ zip(nucleotide_sequencing_ids, nmdc_nucleotide_sequencing_ids)
662
684
  )
663
685
 
664
686
  neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
@@ -701,7 +723,7 @@ class NeonSoilDataTranslator(Translator):
701
723
  # if the number of biosamples that are input to a pooling process
702
724
  # is one or less, then ignore it and go straight to extraction
703
725
  if len(bsm_values_list) > 1:
704
- database.pooling_set.append(
726
+ database.material_processing_set.append(
705
727
  self._translate_pooling_process(
706
728
  pooling_process_id,
707
729
  processed_sample_id,
@@ -734,7 +756,7 @@ class NeonSoilDataTranslator(Translator):
734
756
  # handler for creating extraction process records
735
757
  # for both pooled and non-pooled samples
736
758
  if "|" in genomics_pooled_id_list:
737
- database.extraction_set.append(
759
+ database.material_processing_set.append(
738
760
  self._translate_extraction_process(
739
761
  extraction_id,
740
762
  extraction_input,
@@ -755,7 +777,7 @@ class NeonSoilDataTranslator(Translator):
755
777
 
756
778
  extraction_input = neon_to_nmdc_biosample_ids[neon_biosample_id]
757
779
 
758
- database.extraction_set.append(
780
+ database.material_processing_set.append(
759
781
  self._translate_extraction_process(
760
782
  extraction_id,
761
783
  extraction_input,
@@ -772,7 +794,9 @@ class NeonSoilDataTranslator(Translator):
772
794
  dna_sample_id
773
795
  ]
774
796
 
775
- omics_processing_id = neon_to_nmdc_omics_processing_ids[dna_sample_id]
797
+ nucleotide_sequencing_id = neon_to_nmdc_nucleotide_sequencing_ids[
798
+ dna_sample_id
799
+ ]
776
800
 
777
801
  genomics_sample_id = library_preparation_table[
778
802
  library_preparation_table["dnaSampleID"] == dna_sample_id
@@ -787,7 +811,7 @@ class NeonSoilDataTranslator(Translator):
787
811
  library_preparation_table["dnaSampleID"] == dna_sample_id
788
812
  ]
789
813
 
790
- database.library_preparation_set.append(
814
+ database.material_processing_set.append(
791
815
  self._translate_library_preparation(
792
816
  library_preparation_id,
793
817
  library_preparation_input,
@@ -809,9 +833,9 @@ class NeonSoilDataTranslator(Translator):
809
833
  if item in neon_to_nmdc_data_object_ids:
810
834
  has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
811
835
 
812
- database.omics_processing_set.append(
813
- self._translate_omics_processing(
814
- omics_processing_id,
836
+ database.data_generation_set.append(
837
+ self._translate_nucleotide_sequencing(
838
+ nucleotide_sequencing_id,
815
839
  processed_sample_id,
816
840
  has_output_do_ids,
817
841
  library_preparation_row,
@@ -1,6 +1,6 @@
1
1
  import re
2
2
  import sqlite3
3
- from typing import Dict, Optional
3
+ from typing import Dict, Optional, Union
4
4
 
5
5
  import pandas as pd
6
6
  import requests
@@ -36,6 +36,7 @@ SURFACE_WATER_LOCAL_SCALE_MAPPINGS = {
36
36
  "term_id": "ENVO:01000409",
37
37
  "term_name": "freshwater littoral zone",
38
38
  },
39
+ "inflow": {"term_id": "ENVO:00000476", "term_name": "lake inlet"},
39
40
  },
40
41
  "river": {"term_id": "ENVO:01000297", "term_name": "freshwater river"},
41
42
  "stream": {"term_id": "ENVO:03605007", "term_name": "freshwater stream"},
@@ -58,6 +59,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
58
59
  site_code_mapping: dict,
59
60
  neon_envo_mappings_file: pd.DataFrame,
60
61
  neon_raw_data_file_mappings_file: pd.DataFrame,
62
+ neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
61
63
  *args,
62
64
  **kwargs,
63
65
  ) -> None:
@@ -108,6 +110,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
108
110
 
109
111
  self.site_code_mapping = site_code_mapping
110
112
 
113
+ self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
114
+
111
115
  def _translate_biosample(
112
116
  self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
113
117
  ) -> nmdc.Biosample:
@@ -136,16 +140,17 @@ class NeonSurfaceWaterDataTranslator(Translator):
136
140
  has_minimum_numeric_value=nmdc.Float(minimum_depth),
137
141
  has_maximum_numeric_value=nmdc.Float(maximum_depth),
138
142
  has_unit="m",
143
+ type="nmdc:QuantityValue",
139
144
  )
140
145
  else:
141
146
  depth = nmdc.QuantityValue(
142
147
  has_numeric_value=nmdc.Float(minimum_depth),
143
148
  has_unit="m",
149
+ type="nmdc:QuantityValue",
144
150
  )
145
151
 
146
152
  return nmdc.Biosample(
147
153
  id=nmdc_id,
148
- part_of="nmdc:sty-11-hht5sb92",
149
154
  env_broad_scale=_create_controlled_identified_term_value(
150
155
  SURFACE_WATER_BROAD_SCALE_MAPPINGS.get(
151
156
  biosample_row["aquaticSiteType"].values[0]
@@ -201,7 +206,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
201
206
  samp_size=_create_quantity_value(
202
207
  biosample_row["geneticFilteredSampleVolume"].values[0], "mL"
203
208
  ),
204
- env_package=nmdc.TextValue(has_raw_value="water"),
209
+ env_package=nmdc.TextValue(has_raw_value="water", type="nmdc:TextValue"),
210
+ associated_studies=["nmdc:sty-11-hht5sb92"],
205
211
  )
206
212
 
207
213
  def _translate_extraction_process(
@@ -243,6 +249,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
243
249
  _get_value_or_none(extraction_row, "extrQaqcStatus")
244
250
  ),
245
251
  processing_institution=processing_institution,
252
+ type="nmdc:Extraction",
246
253
  )
247
254
 
248
255
  def _translate_library_preparation(
@@ -255,13 +262,13 @@ class NeonSurfaceWaterDataTranslator(Translator):
255
262
  """
256
263
  Create LibraryPreparation process object. The input to LibraryPreparation process
257
264
  is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
258
- process is fed as input to an OmicsProcessing object.
265
+ process is fed as input to an NucleotideSequencing object.
259
266
 
260
267
  :param library_preparation_id: Minted id for LibraryPreparation process.
261
268
  :param library_preparation_input: Input to LibraryPreparation process is output from
262
269
  Extraction process.
263
270
  :param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
264
- is also input to OmicsProcessing.
271
+ is also input to NucleotideSequencing.
265
272
  :param library_preparation_row: Metadata required to populate LibraryPreparation.
266
273
  :return: Object that using LibraryPreparation process model.
267
274
  """
@@ -280,31 +287,47 @@ class NeonSurfaceWaterDataTranslator(Translator):
280
287
  start_date=_get_value_or_none(library_preparation_row, "seqCollectDate"),
281
288
  end_date=_get_value_or_none(library_preparation_row, "seqProcessedDate"),
282
289
  processing_institution=processing_institution,
290
+ type="nmdc:LibraryPreparation",
283
291
  )
284
292
 
285
- def _translate_omics_processing(
293
+ def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
294
+ if not instrument_model:
295
+ raise ValueError(
296
+ f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
297
+ )
298
+
299
+ df = self.neon_nmdc_instrument_map_df
300
+ matching_row = df[
301
+ df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
302
+ ]
303
+
304
+ if not matching_row.empty:
305
+ nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
306
+ return nmdc_instrument_id
307
+
308
+ def _translate_nucleotide_sequencing(
286
309
  self,
287
- omics_processing_id: str,
310
+ nucleotide_sequencing_id: str,
288
311
  processed_sample_id: str,
289
312
  raw_data_file_data: str,
290
- omics_processing_row: pd.DataFrame,
291
- ) -> nmdc.OmicsProcessing:
292
- """Create nmdc OmicsProcessing object. This class typically models the run of a
293
- Bioinformatics workflow on sequence data from a biosample. The input to an OmicsProcessing
294
- process is the output from a LibraryPreparation process, and the output of OmicsProcessing
313
+ nucleotide_sequencing_row: pd.DataFrame,
314
+ ):
315
+ """Create nmdc NucleotideSequencing object. This class typically models the run of a
316
+ Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
317
+ process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
295
318
  is a DataObject which has the FASTQ sequence file URLs embedded in them.
296
319
 
297
- :param omics_processing_id: Minted id for an OmicsProcessing process.
320
+ :param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
298
321
  :param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
299
322
  :param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
300
323
  files embedded in them.
301
- :param omics_processing_row: DataFrame with metadata for an OmicsProcessing workflow
324
+ :param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
302
325
  process/run.
303
- :return: OmicsProcessing object that models a Bioinformatics workflow process/run.
326
+ :return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
304
327
  """
305
328
  processing_institution = None
306
329
  sequencing_facility = _get_value_or_none(
307
- omics_processing_row, "sequencingFacilityID"
330
+ nucleotide_sequencing_row, "sequencingFacilityID"
308
331
  )
309
332
  if sequencing_facility is not None:
310
333
  if re.search("Battelle", sequencing_facility, re.IGNORECASE):
@@ -312,19 +335,21 @@ class NeonSurfaceWaterDataTranslator(Translator):
312
335
  elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
313
336
  processing_institution = "ANL"
314
337
 
315
- return nmdc.OmicsProcessing(
316
- id=omics_processing_id,
338
+ return nmdc.NucleotideSequencing(
339
+ id=nucleotide_sequencing_id,
317
340
  has_input=processed_sample_id,
318
341
  has_output=raw_data_file_data,
319
342
  processing_institution=processing_institution,
320
- ncbi_project_name=_get_value_or_none(omics_processing_row, "ncbiProjectID"),
321
- omics_type=_create_controlled_term_value(
322
- omics_processing_row["investigation_type"].values[0]
343
+ ncbi_project_name=_get_value_or_none(
344
+ nucleotide_sequencing_row, "ncbiProjectID"
345
+ ),
346
+ instrument_used=self._get_instrument_id(
347
+ _get_value_or_none(nucleotide_sequencing_row, "instrument_model")
323
348
  ),
324
- instrument_name=f"{_get_value_or_none(omics_processing_row, 'sequencingMethod')} {_get_value_or_none(omics_processing_row, 'instrument_model')}",
325
- part_of="nmdc:sty-11-hht5sb92",
326
- name=f"Surface water microbial communities - {_get_value_or_none(omics_processing_row, 'dnaSampleID')}",
327
- type="nmdc:OmicsProcessing",
349
+ name=f"Surface water microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
350
+ type="nmdc:NucleotideSequencing",
351
+ associated_studies=["nmdc:sty-11-hht5sb92"],
352
+ analyte_category="metagenome",
328
353
  )
329
354
 
330
355
  def _translate_processed_sample(
@@ -341,12 +366,14 @@ class NeonSurfaceWaterDataTranslator(Translator):
341
366
  :param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
342
367
  :return: ProcessedSample objects to be stored in `processed_sample_set`.
343
368
  """
344
- return nmdc.ProcessedSample(id=processed_sample_id, name=sample_id)
369
+ return nmdc.ProcessedSample(
370
+ id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
371
+ )
345
372
 
346
373
  def _translate_data_object(
347
374
  self, do_id: str, url: str, do_type: str, checksum: str
348
375
  ) -> nmdc.DataObject:
349
- """Create nmdc DataObject which is the output of an OmicsProcessing process. This
376
+ """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
350
377
  object mainly contains information about the sequencing file that was generated as
351
378
  the result of running a Bioinformatics workflow on a certain ProcessedSample, which
352
379
  is the result of a LibraryPreparation process.
@@ -485,7 +512,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
485
512
  )
486
513
 
487
514
  neon_omprc_ids = surface_water_samples["parentSampleID"]
488
- nmdc_omprc_ids = self._id_minter("nmdc:OmicsProcessing", len(neon_omprc_ids))
515
+ nmdc_omprc_ids = self._id_minter(
516
+ "nmdc:NucleotideSequencing", len(neon_omprc_ids)
517
+ )
489
518
  neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
490
519
 
491
520
  neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
@@ -515,7 +544,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
515
544
  processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
516
545
 
517
546
  if extraction_input is not None and processed_sample_id is not None:
518
- database.extraction_set.append(
547
+ database.material_processing_set.append(
519
548
  self._translate_extraction_process(
520
549
  nmdc_id,
521
550
  extraction_input,
@@ -561,7 +590,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
561
590
  processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
562
591
 
563
592
  if lib_prep_input is not None and processed_sample_id is not None:
564
- database.library_preparation_set.append(
593
+ database.material_processing_set.append(
565
594
  self._translate_library_preparation(
566
595
  nmdc_id,
567
596
  lib_prep_input,
@@ -608,8 +637,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
608
637
  )
609
638
  )
610
639
 
611
- database.omics_processing_set.append(
612
- self._translate_omics_processing(
640
+ database.data_generation_set.append(
641
+ self._translate_nucleotide_sequencing(
613
642
  neon_to_nmdc_omprc_ids.get(neon_id),
614
643
  processed_sample_id,
615
644
  has_output_do_ids,
@@ -50,7 +50,14 @@ def _create_controlled_identified_term_value(
50
50
  """
51
51
  if id is None or name is None:
52
52
  return None
53
- return nmdc.ControlledIdentifiedTermValue(term=nmdc.OntologyClass(id=id, name=name))
53
+ return nmdc.ControlledIdentifiedTermValue(
54
+ term=nmdc.OntologyClass(
55
+ id=id,
56
+ name=name,
57
+ type="nmdc:OntologyClass",
58
+ ),
59
+ type="nmdc:ControlledIdentifiedTermValue",
60
+ )
54
61
 
55
62
 
56
63
  def _create_controlled_term_value(name: str = None) -> nmdc.ControlledTermValue:
@@ -64,7 +71,10 @@ def _create_controlled_term_value(name: str = None) -> nmdc.ControlledTermValue:
64
71
  """
65
72
  if name is None:
66
73
  return None
67
- return nmdc.ControlledTermValue(has_raw_value=name)
74
+ return nmdc.ControlledTermValue(
75
+ has_raw_value=name,
76
+ type="nmdc:ControlledTermValue",
77
+ )
68
78
 
69
79
 
70
80
  def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
@@ -77,7 +87,7 @@ def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
77
87
  """
78
88
  if value is None:
79
89
  return None
80
- return nmdc.TimestampValue(has_raw_value=value)
90
+ return nmdc.TimestampValue(has_raw_value=value, type="nmdc:TimestampValue")
81
91
 
82
92
 
83
93
  def _create_quantity_value(
@@ -94,7 +104,9 @@ def _create_quantity_value(
94
104
  """
95
105
  if numeric_value is None or math.isnan(numeric_value):
96
106
  return None
97
- return nmdc.QuantityValue(has_numeric_value=float(numeric_value), has_unit=unit)
107
+ return nmdc.QuantityValue(
108
+ has_numeric_value=float(numeric_value), has_unit=unit, type="nmdc:QuantityValue"
109
+ )
98
110
 
99
111
 
100
112
  def _create_text_value(value: str = None) -> nmdc.TextValue:
@@ -106,7 +118,7 @@ def _create_text_value(value: str = None) -> nmdc.TextValue:
106
118
  """
107
119
  if value is None:
108
120
  return None
109
- return nmdc.TextValue(has_raw_value=value)
121
+ return nmdc.TextValue(has_raw_value=value, type="nmdc:TextValue")
110
122
 
111
123
 
112
124
  def _create_double_value(value: str = None) -> nmdc.Double:
@@ -119,7 +131,7 @@ def _create_double_value(value: str = None) -> nmdc.Double:
119
131
  """
120
132
  if value is None or math.isnan(value):
121
133
  return None
122
- return nmdc.Double(value)
134
+ return nmdc.Double(value, type="nmdc:Double")
123
135
 
124
136
 
125
137
  def _create_geolocation_value(
@@ -147,4 +159,5 @@ def _create_geolocation_value(
147
159
  return nmdc.GeolocationValue(
148
160
  latitude=nmdc.DecimalDegree(latitude),
149
161
  longitude=nmdc.DecimalDegree(longitude),
162
+ type="nmdc:GeolocationValue",
150
163
  )