nmdc-runtime 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import re
2
2
  import sqlite3
3
- from typing import Union
3
+ from typing import Optional, Union
4
4
 
5
5
  import pandas as pd
6
6
  import requests_cache
@@ -61,6 +61,7 @@ class NeonBenthicDataTranslator(Translator):
61
61
  "mms_benthicMetagenomeSequencing",
62
62
  "mms_benthicMetagenomeDnaExtraction",
63
63
  "amb_fieldParent",
64
+ "mms_benthicRawDataFiles", # <--- ensure this is present
64
65
  )
65
66
 
66
67
  if all(k in benthic_data for k in neon_amb_data_tables):
@@ -79,6 +80,12 @@ class NeonBenthicDataTranslator(Translator):
79
80
  benthic_data["amb_fieldParent"].to_sql(
80
81
  "amb_fieldParent", self.conn, if_exists="replace", index=False
81
82
  )
83
+ benthic_data["mms_benthicRawDataFiles"].to_sql(
84
+ "mms_benthicRawDataFiles",
85
+ self.conn,
86
+ if_exists="replace",
87
+ index=False,
88
+ )
82
89
  else:
83
90
  raise ValueError(
84
91
  f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
@@ -88,14 +95,19 @@ class NeonBenthicDataTranslator(Translator):
88
95
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
89
96
  )
90
97
 
91
- self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
92
- self.neon_raw_data_file_mappings_df.to_sql(
93
- "neonRawDataFile", self.conn, if_exists="replace", index=False
94
- )
98
+ self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
95
99
 
96
100
  self.site_code_mapping = site_code_mapping
101
+
97
102
  self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
98
103
 
104
+ def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
105
+ return nmdc.Manifest(
106
+ id=manifest_id,
107
+ manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
108
+ type="nmdc:Manifest",
109
+ )
110
+
99
111
  def _translate_biosample(
100
112
  self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
101
113
  ) -> nmdc.Biosample:
@@ -313,7 +325,7 @@ class NeonBenthicDataTranslator(Translator):
313
325
  )
314
326
 
315
327
  def _translate_data_object(
316
- self, do_id: str, url: str, do_type: str, checksum: str
328
+ self, do_id: str, url: str, do_type: str, manifest_id: str
317
329
  ) -> nmdc.DataObject:
318
330
  """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
319
331
  object mainly contains information about the sequencing file that was generated as
@@ -324,7 +336,6 @@ class NeonBenthicDataTranslator(Translator):
324
336
  :param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
325
337
  by Hugh Cross at NEON.
326
338
  :param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
327
- :param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
328
339
  at NEON.
329
340
  :return: DataObject with all the sequencing file metadata.
330
341
  """
@@ -337,14 +348,14 @@ class NeonBenthicDataTranslator(Translator):
337
348
  url=url,
338
349
  description=f"sequencing results for {basename}",
339
350
  type="nmdc:DataObject",
340
- md5_checksum=checksum,
341
351
  data_object_type=do_type,
352
+ in_manifest=manifest_id,
342
353
  )
343
354
 
344
- def get_database(self):
355
+ def get_database(self) -> nmdc.Database:
345
356
  database = nmdc.Database()
346
357
 
347
- query = """
358
+ join_query = """
348
359
  SELECT
349
360
  merged.laboratoryName,
350
361
  merged.sequencingFacilityID,
@@ -372,202 +383,190 @@ class NeonBenthicDataTranslator(Translator):
372
383
  afp.siteID,
373
384
  afp.sampleID,
374
385
  afp.collectDate
375
- FROM
376
- (
377
- SELECT
378
- bs.collectDate,
379
- bs.laboratoryName,
380
- bs.sequencingFacilityID,
381
- bs.processedDate,
382
- bs.dnaSampleID,
383
- bs.dnaSampleCode,
384
- bs.internalLabID,
385
- bs.instrument_model,
386
- bs.sequencingMethod,
387
- bs.investigation_type,
388
- bs.qaqcStatus,
389
- bs.ncbiProjectID,
390
- bd.genomicsSampleID,
391
- bd.sequenceAnalysisType,
392
- bd.sampleMass,
393
- bd.nucleicAcidConcentration
394
- FROM
395
- mms_benthicMetagenomeSequencing AS bs
396
- JOIN
397
- mms_benthicMetagenomeDnaExtraction AS bd
398
- ON
399
- bs.dnaSampleID = bd.dnaSampleID
400
- ) AS merged
386
+ FROM (
387
+ SELECT
388
+ bs.collectDate,
389
+ bs.laboratoryName,
390
+ bs.sequencingFacilityID,
391
+ bs.processedDate,
392
+ bs.dnaSampleID,
393
+ bs.dnaSampleCode,
394
+ bs.internalLabID,
395
+ bs.instrument_model,
396
+ bs.sequencingMethod,
397
+ bs.investigation_type,
398
+ bs.qaqcStatus,
399
+ bs.ncbiProjectID,
400
+ bd.genomicsSampleID,
401
+ bd.sequenceAnalysisType,
402
+ bd.sampleMass,
403
+ bd.nucleicAcidConcentration
404
+ FROM mms_benthicMetagenomeSequencing AS bs
405
+ JOIN mms_benthicMetagenomeDnaExtraction AS bd
406
+ ON bs.dnaSampleID = bd.dnaSampleID
407
+ ) AS merged
401
408
  LEFT JOIN amb_fieldParent AS afp
402
- ON
403
- merged.genomicsSampleID = afp.geneticSampleID
409
+ ON merged.genomicsSampleID = afp.geneticSampleID
404
410
  """
405
- benthic_samples = pd.read_sql_query(query, self.conn)
411
+ benthic_samples = pd.read_sql_query(join_query, self.conn)
406
412
  benthic_samples.to_sql(
407
413
  "benthicSamples", self.conn, if_exists="replace", index=False
408
414
  )
409
415
 
410
- neon_biosample_ids = benthic_samples["sampleID"]
411
- nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
412
- neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
416
+ sample_ids = benthic_samples["sampleID"]
417
+ nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
418
+ neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
413
419
 
414
- neon_extraction_ids = benthic_samples["sampleID"]
415
- nmdc_extraction_ids = self._id_minter(
416
- "nmdc:Extraction", len(neon_extraction_ids)
417
- )
418
- neon_to_nmdc_extraction_ids = dict(
419
- zip(neon_extraction_ids, nmdc_extraction_ids)
420
- )
420
+ nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
421
+ neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
421
422
 
422
- neon_extraction_processed_ids = benthic_samples["sampleID"]
423
423
  nmdc_extraction_processed_ids = self._id_minter(
424
- "nmdc:ProcessedSample", len(neon_extraction_processed_ids)
424
+ "nmdc:ProcessedSample", len(sample_ids)
425
425
  )
426
426
  neon_to_nmdc_extraction_processed_ids = dict(
427
- zip(neon_extraction_processed_ids, nmdc_extraction_processed_ids)
427
+ zip(sample_ids, nmdc_extraction_processed_ids)
428
428
  )
429
429
 
430
- neon_lib_prep_ids = benthic_samples["sampleID"]
431
- nmdc_lib_prep_ids = self._id_minter(
432
- "nmdc:LibraryPreparation", len(neon_lib_prep_ids)
433
- )
434
- neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
430
+ nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
431
+ neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
435
432
 
436
- neon_lib_prep_processed_ids = benthic_samples["sampleID"]
437
- nmdc_lib_prep_processed_ids = self._id_minter(
438
- "nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
433
+ nmdc_libprep_processed_ids = self._id_minter(
434
+ "nmdc:ProcessedSample", len(sample_ids)
439
435
  )
440
- neon_to_nmdc_lib_prep_processed_ids = dict(
441
- zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
436
+ neon_to_nmdc_libprep_processed_ids = dict(
437
+ zip(sample_ids, nmdc_libprep_processed_ids)
442
438
  )
443
439
 
444
- neon_omprc_ids = benthic_samples["sampleID"]
445
- nmdc_omprc_ids = self._id_minter(
446
- "nmdc:NucleotideSequencing", len(neon_omprc_ids)
447
- )
448
- neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
440
+ nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
441
+ neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
449
442
 
450
- neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
451
- neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
452
- nmdc_data_object_ids = self._id_minter(
453
- "nmdc:DataObject", len(neon_raw_file_paths)
454
- )
455
- neon_to_nmdc_data_object_ids = dict(
456
- zip(neon_raw_file_paths, nmdc_data_object_ids)
457
- )
443
+ raw_df = self.neon_raw_data_file_mappings_df
444
+ raw_file_paths = raw_df["rawDataFilePath"]
445
+ dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
446
+ neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
458
447
 
459
- for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
460
- biosample_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
448
+ for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
449
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
450
+ if row.empty:
451
+ continue
461
452
 
453
+ # Example of how you might call _translate_biosample:
462
454
  database.biosample_set.append(
463
- self._translate_biosample(neon_id, nmdc_id, biosample_row)
455
+ self._translate_biosample(neon_id, biosample_id, row)
464
456
  )
465
457
 
466
- for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
467
- extraction_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
458
+ for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
459
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
460
+ if row.empty:
461
+ continue
468
462
 
469
- extraction_input = neon_to_nmdc_biosample_ids.get(neon_id)
470
- processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
463
+ biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
464
+ extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
471
465
 
472
- if extraction_input is not None and processed_sample_id is not None:
466
+ if biosample_id and extraction_ps_id:
473
467
  database.material_processing_set.append(
474
468
  self._translate_extraction_process(
475
- nmdc_id,
476
- extraction_input,
477
- processed_sample_id,
478
- extraction_row,
469
+ extraction_id, biosample_id, extraction_ps_id, row
479
470
  )
480
471
  )
481
-
482
- genomics_sample_id = _get_value_or_none(
483
- extraction_row, "genomicsSampleID"
484
- )
485
-
472
+ genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
486
473
  database.processed_sample_set.append(
487
474
  self._translate_processed_sample(
488
- processed_sample_id,
475
+ extraction_ps_id,
489
476
  f"Extracted DNA from {genomics_sample_id}",
490
477
  )
491
478
  )
492
479
 
493
- query = """
480
+ query2 = """
494
481
  SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
495
- FROM neonRawDataFile
482
+ FROM mms_benthicRawDataFiles
496
483
  GROUP BY dnaSampleID
497
484
  """
498
- neon_raw_data_files = pd.read_sql_query(query, self.conn)
499
- neon_raw_data_files_dict = (
500
- neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
485
+ raw_data_files_df = pd.read_sql_query(query2, self.conn)
486
+ dna_files_dict = (
487
+ raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
501
488
  .str.split("|")
502
489
  .to_dict()
503
490
  )
504
- filtered_neon_raw_data_files_dict = {
505
- key: value
506
- for key, value in neon_raw_data_files_dict.items()
507
- if len(value) <= 2
508
- }
509
491
 
510
- for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
511
- lib_prep_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
492
+ dna_sample_to_manifest_id: dict[str, str] = {}
512
493
 
513
- lib_prep_input = neon_to_nmdc_extraction_processed_ids.get(neon_id)
514
- processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
494
+ for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
495
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
496
+ if row.empty:
497
+ continue
515
498
 
516
- if lib_prep_input is not None and processed_sample_id is not None:
517
- database.material_processing_set.append(
518
- self._translate_library_preparation(
519
- nmdc_id,
520
- lib_prep_input,
521
- processed_sample_id,
522
- lib_prep_row,
523
- )
499
+ extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
500
+ libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
501
+ if not extr_ps_id or not libprep_ps_id:
502
+ continue
503
+
504
+ database.material_processing_set.append(
505
+ self._translate_library_preparation(
506
+ libprep_id, extr_ps_id, libprep_ps_id, row
524
507
  )
508
+ )
525
509
 
526
- dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
510
+ dna_sample_id = _get_value_or_none(row, "dnaSampleID")
511
+ database.processed_sample_set.append(
512
+ self._translate_processed_sample(
513
+ libprep_ps_id,
514
+ f"Library preparation for {dna_sample_id}",
515
+ )
516
+ )
527
517
 
528
- database.processed_sample_set.append(
529
- self._translate_processed_sample(
530
- processed_sample_id,
531
- f"Library preparation for {dna_sample_id}",
518
+ filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
519
+ if not filepaths_for_dna:
520
+ # no raw files => skip
521
+ ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
522
+ if ntseq_id:
523
+ continue
524
+ continue
525
+
526
+ # If multiple => we create a Manifest
527
+ manifest_id: Optional[str] = None
528
+ if len(filepaths_for_dna) > 2:
529
+ if dna_sample_id not in dna_sample_to_manifest_id:
530
+ new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
531
+ dna_sample_to_manifest_id[dna_sample_id] = new_man_id
532
+ database.manifest_set.append(self._translate_manifest(new_man_id))
533
+ manifest_id = dna_sample_to_manifest_id[dna_sample_id]
534
+
535
+ has_input_value = self.samp_procsm_dict.get(neon_id)
536
+ if not has_input_value:
537
+ continue
538
+
539
+ dataobject_ids_for_run: list[str] = []
540
+ for fp in filepaths_for_dna:
541
+ if fp not in neon_to_nmdc_dataobject_ids:
542
+ continue
543
+ do_id = neon_to_nmdc_dataobject_ids[fp]
544
+
545
+ do_type = None
546
+ if "_R1.fastq.gz" in fp:
547
+ do_type = "Metagenome Raw Read 1"
548
+ elif "_R2.fastq.gz" in fp:
549
+ do_type = "Metagenome Raw Read 2"
550
+
551
+ database.data_object_set.append(
552
+ self._translate_data_object(
553
+ do_id=do_id,
554
+ url=fp,
555
+ do_type=do_type,
556
+ manifest_id=manifest_id,
532
557
  )
533
558
  )
534
-
535
- has_output = None
536
- has_output_do_ids = []
537
-
538
- if dna_sample_id in filtered_neon_raw_data_files_dict:
539
- has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
540
- for item in has_output:
541
- if item in neon_to_nmdc_data_object_ids:
542
- has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
543
-
544
- checksum = None
545
- do_type = None
546
-
547
- checksum = neon_raw_data_file_mappings_df[
548
- neon_raw_data_file_mappings_df["rawDataFilePath"] == item
549
- ]["checkSum"].values[0]
550
- if "_R1.fastq.gz" in item:
551
- do_type = "Metagenome Raw Read 1"
552
- elif "_R2.fastq.gz" in item:
553
- do_type = "Metagenome Raw Read 2"
554
-
555
- database.data_object_set.append(
556
- self._translate_data_object(
557
- neon_to_nmdc_data_object_ids.get(item),
558
- item,
559
- do_type,
560
- checksum,
561
- )
562
- )
563
-
564
- database.data_generation_set.append(
565
- self._translate_nucleotide_sequencing(
566
- neon_to_nmdc_omprc_ids.get(neon_id),
567
- processed_sample_id,
568
- has_output_do_ids,
569
- lib_prep_row,
570
- )
559
+ dataobject_ids_for_run.append(do_id)
560
+
561
+ ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
562
+ if ntseq_id:
563
+ database.data_generation_set.append(
564
+ self._translate_nucleotide_sequencing(
565
+ ntseq_id,
566
+ has_input_value, # <--- from self.samp_procsm_dict
567
+ dataobject_ids_for_run,
568
+ row,
571
569
  )
570
+ )
572
571
 
573
572
  return database