nmdc-runtime 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/site/export/ncbi_xml.py +0 -1
- nmdc_runtime/site/export/ncbi_xml_utils.py +0 -25
- nmdc_runtime/site/graphs.py +11 -0
- nmdc_runtime/site/ops.py +54 -12
- nmdc_runtime/site/repair/database_updater.py +12 -0
- nmdc_runtime/site/repository.py +2 -6
- nmdc_runtime/site/translation/gold_translator.py +11 -0
- nmdc_runtime/site/translation/neon_benthic_translator.py +156 -157
- nmdc_runtime/site/translation/submission_portal_translator.py +269 -51
- nmdc_runtime/site/util.py +8 -1
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/METADATA +19 -6
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/RECORD +16 -16
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info/licenses}/LICENSE +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
-
from typing import Union
|
|
3
|
+
from typing import Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import requests_cache
|
|
@@ -61,6 +61,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
61
61
|
"mms_benthicMetagenomeSequencing",
|
|
62
62
|
"mms_benthicMetagenomeDnaExtraction",
|
|
63
63
|
"amb_fieldParent",
|
|
64
|
+
"mms_benthicRawDataFiles", # <--- ensure this is present
|
|
64
65
|
)
|
|
65
66
|
|
|
66
67
|
if all(k in benthic_data for k in neon_amb_data_tables):
|
|
@@ -79,6 +80,12 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
79
80
|
benthic_data["amb_fieldParent"].to_sql(
|
|
80
81
|
"amb_fieldParent", self.conn, if_exists="replace", index=False
|
|
81
82
|
)
|
|
83
|
+
benthic_data["mms_benthicRawDataFiles"].to_sql(
|
|
84
|
+
"mms_benthicRawDataFiles",
|
|
85
|
+
self.conn,
|
|
86
|
+
if_exists="replace",
|
|
87
|
+
index=False,
|
|
88
|
+
)
|
|
82
89
|
else:
|
|
83
90
|
raise ValueError(
|
|
84
91
|
f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
|
|
@@ -88,14 +95,19 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
88
95
|
"neonEnvoTerms", self.conn, if_exists="replace", index=False
|
|
89
96
|
)
|
|
90
97
|
|
|
91
|
-
self.neon_raw_data_file_mappings_df =
|
|
92
|
-
self.neon_raw_data_file_mappings_df.to_sql(
|
|
93
|
-
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
94
|
-
)
|
|
98
|
+
self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
|
|
95
99
|
|
|
96
100
|
self.site_code_mapping = site_code_mapping
|
|
101
|
+
|
|
97
102
|
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
98
103
|
|
|
104
|
+
def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
|
|
105
|
+
return nmdc.Manifest(
|
|
106
|
+
id=manifest_id,
|
|
107
|
+
manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
|
|
108
|
+
type="nmdc:Manifest",
|
|
109
|
+
)
|
|
110
|
+
|
|
99
111
|
def _translate_biosample(
|
|
100
112
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
101
113
|
) -> nmdc.Biosample:
|
|
@@ -313,7 +325,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
313
325
|
)
|
|
314
326
|
|
|
315
327
|
def _translate_data_object(
|
|
316
|
-
self, do_id: str, url: str, do_type: str,
|
|
328
|
+
self, do_id: str, url: str, do_type: str, manifest_id: str
|
|
317
329
|
) -> nmdc.DataObject:
|
|
318
330
|
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
319
331
|
object mainly contains information about the sequencing file that was generated as
|
|
@@ -324,7 +336,6 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
324
336
|
:param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
|
|
325
337
|
by Hugh Cross at NEON.
|
|
326
338
|
:param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
|
|
327
|
-
:param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
|
|
328
339
|
at NEON.
|
|
329
340
|
:return: DataObject with all the sequencing file metadata.
|
|
330
341
|
"""
|
|
@@ -337,14 +348,14 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
337
348
|
url=url,
|
|
338
349
|
description=f"sequencing results for {basename}",
|
|
339
350
|
type="nmdc:DataObject",
|
|
340
|
-
md5_checksum=checksum,
|
|
341
351
|
data_object_type=do_type,
|
|
352
|
+
in_manifest=manifest_id,
|
|
342
353
|
)
|
|
343
354
|
|
|
344
|
-
def get_database(self):
|
|
355
|
+
def get_database(self) -> nmdc.Database:
|
|
345
356
|
database = nmdc.Database()
|
|
346
357
|
|
|
347
|
-
|
|
358
|
+
join_query = """
|
|
348
359
|
SELECT
|
|
349
360
|
merged.laboratoryName,
|
|
350
361
|
merged.sequencingFacilityID,
|
|
@@ -372,202 +383,190 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
372
383
|
afp.siteID,
|
|
373
384
|
afp.sampleID,
|
|
374
385
|
afp.collectDate
|
|
375
|
-
FROM
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
mms_benthicMetagenomeDnaExtraction AS bd
|
|
398
|
-
ON
|
|
399
|
-
bs.dnaSampleID = bd.dnaSampleID
|
|
400
|
-
) AS merged
|
|
386
|
+
FROM (
|
|
387
|
+
SELECT
|
|
388
|
+
bs.collectDate,
|
|
389
|
+
bs.laboratoryName,
|
|
390
|
+
bs.sequencingFacilityID,
|
|
391
|
+
bs.processedDate,
|
|
392
|
+
bs.dnaSampleID,
|
|
393
|
+
bs.dnaSampleCode,
|
|
394
|
+
bs.internalLabID,
|
|
395
|
+
bs.instrument_model,
|
|
396
|
+
bs.sequencingMethod,
|
|
397
|
+
bs.investigation_type,
|
|
398
|
+
bs.qaqcStatus,
|
|
399
|
+
bs.ncbiProjectID,
|
|
400
|
+
bd.genomicsSampleID,
|
|
401
|
+
bd.sequenceAnalysisType,
|
|
402
|
+
bd.sampleMass,
|
|
403
|
+
bd.nucleicAcidConcentration
|
|
404
|
+
FROM mms_benthicMetagenomeSequencing AS bs
|
|
405
|
+
JOIN mms_benthicMetagenomeDnaExtraction AS bd
|
|
406
|
+
ON bs.dnaSampleID = bd.dnaSampleID
|
|
407
|
+
) AS merged
|
|
401
408
|
LEFT JOIN amb_fieldParent AS afp
|
|
402
|
-
ON
|
|
403
|
-
merged.genomicsSampleID = afp.geneticSampleID
|
|
409
|
+
ON merged.genomicsSampleID = afp.geneticSampleID
|
|
404
410
|
"""
|
|
405
|
-
benthic_samples = pd.read_sql_query(
|
|
411
|
+
benthic_samples = pd.read_sql_query(join_query, self.conn)
|
|
406
412
|
benthic_samples.to_sql(
|
|
407
413
|
"benthicSamples", self.conn, if_exists="replace", index=False
|
|
408
414
|
)
|
|
409
415
|
|
|
410
|
-
|
|
411
|
-
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(
|
|
412
|
-
neon_to_nmdc_biosample_ids = dict(zip(
|
|
416
|
+
sample_ids = benthic_samples["sampleID"]
|
|
417
|
+
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
|
|
418
|
+
neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
|
|
413
419
|
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
"nmdc:Extraction", len(neon_extraction_ids)
|
|
417
|
-
)
|
|
418
|
-
neon_to_nmdc_extraction_ids = dict(
|
|
419
|
-
zip(neon_extraction_ids, nmdc_extraction_ids)
|
|
420
|
-
)
|
|
420
|
+
nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
|
|
421
|
+
neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
|
|
421
422
|
|
|
422
|
-
neon_extraction_processed_ids = benthic_samples["sampleID"]
|
|
423
423
|
nmdc_extraction_processed_ids = self._id_minter(
|
|
424
|
-
"nmdc:ProcessedSample", len(
|
|
424
|
+
"nmdc:ProcessedSample", len(sample_ids)
|
|
425
425
|
)
|
|
426
426
|
neon_to_nmdc_extraction_processed_ids = dict(
|
|
427
|
-
zip(
|
|
427
|
+
zip(sample_ids, nmdc_extraction_processed_ids)
|
|
428
428
|
)
|
|
429
429
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
"nmdc:LibraryPreparation", len(neon_lib_prep_ids)
|
|
433
|
-
)
|
|
434
|
-
neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
|
|
430
|
+
nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
|
|
431
|
+
neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
|
|
435
432
|
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
"nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
|
|
433
|
+
nmdc_libprep_processed_ids = self._id_minter(
|
|
434
|
+
"nmdc:ProcessedSample", len(sample_ids)
|
|
439
435
|
)
|
|
440
|
-
|
|
441
|
-
zip(
|
|
436
|
+
neon_to_nmdc_libprep_processed_ids = dict(
|
|
437
|
+
zip(sample_ids, nmdc_libprep_processed_ids)
|
|
442
438
|
)
|
|
443
439
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
"nmdc:NucleotideSequencing", len(neon_omprc_ids)
|
|
447
|
-
)
|
|
448
|
-
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
440
|
+
nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
|
|
441
|
+
neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
|
|
449
442
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
)
|
|
455
|
-
neon_to_nmdc_data_object_ids = dict(
|
|
456
|
-
zip(neon_raw_file_paths, nmdc_data_object_ids)
|
|
457
|
-
)
|
|
443
|
+
raw_df = self.neon_raw_data_file_mappings_df
|
|
444
|
+
raw_file_paths = raw_df["rawDataFilePath"]
|
|
445
|
+
dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
|
|
446
|
+
neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
|
|
458
447
|
|
|
459
|
-
for neon_id,
|
|
460
|
-
|
|
448
|
+
for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
|
|
449
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
450
|
+
if row.empty:
|
|
451
|
+
continue
|
|
461
452
|
|
|
453
|
+
# Example of how you might call _translate_biosample:
|
|
462
454
|
database.biosample_set.append(
|
|
463
|
-
self._translate_biosample(neon_id,
|
|
455
|
+
self._translate_biosample(neon_id, biosample_id, row)
|
|
464
456
|
)
|
|
465
457
|
|
|
466
|
-
for neon_id,
|
|
467
|
-
|
|
458
|
+
for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
|
|
459
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
460
|
+
if row.empty:
|
|
461
|
+
continue
|
|
468
462
|
|
|
469
|
-
|
|
470
|
-
|
|
463
|
+
biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
|
|
464
|
+
extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
471
465
|
|
|
472
|
-
if
|
|
466
|
+
if biosample_id and extraction_ps_id:
|
|
473
467
|
database.material_processing_set.append(
|
|
474
468
|
self._translate_extraction_process(
|
|
475
|
-
|
|
476
|
-
extraction_input,
|
|
477
|
-
processed_sample_id,
|
|
478
|
-
extraction_row,
|
|
469
|
+
extraction_id, biosample_id, extraction_ps_id, row
|
|
479
470
|
)
|
|
480
471
|
)
|
|
481
|
-
|
|
482
|
-
genomics_sample_id = _get_value_or_none(
|
|
483
|
-
extraction_row, "genomicsSampleID"
|
|
484
|
-
)
|
|
485
|
-
|
|
472
|
+
genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
|
|
486
473
|
database.processed_sample_set.append(
|
|
487
474
|
self._translate_processed_sample(
|
|
488
|
-
|
|
475
|
+
extraction_ps_id,
|
|
489
476
|
f"Extracted DNA from {genomics_sample_id}",
|
|
490
477
|
)
|
|
491
478
|
)
|
|
492
479
|
|
|
493
|
-
|
|
480
|
+
query2 = """
|
|
494
481
|
SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
|
|
495
|
-
FROM
|
|
482
|
+
FROM mms_benthicRawDataFiles
|
|
496
483
|
GROUP BY dnaSampleID
|
|
497
484
|
"""
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
485
|
+
raw_data_files_df = pd.read_sql_query(query2, self.conn)
|
|
486
|
+
dna_files_dict = (
|
|
487
|
+
raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
|
|
501
488
|
.str.split("|")
|
|
502
489
|
.to_dict()
|
|
503
490
|
)
|
|
504
|
-
filtered_neon_raw_data_files_dict = {
|
|
505
|
-
key: value
|
|
506
|
-
for key, value in neon_raw_data_files_dict.items()
|
|
507
|
-
if len(value) <= 2
|
|
508
|
-
}
|
|
509
491
|
|
|
510
|
-
|
|
511
|
-
lib_prep_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
492
|
+
dna_sample_to_manifest_id: dict[str, str] = {}
|
|
512
493
|
|
|
513
|
-
|
|
514
|
-
|
|
494
|
+
for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
|
|
495
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
496
|
+
if row.empty:
|
|
497
|
+
continue
|
|
515
498
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
499
|
+
extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
500
|
+
libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
|
|
501
|
+
if not extr_ps_id or not libprep_ps_id:
|
|
502
|
+
continue
|
|
503
|
+
|
|
504
|
+
database.material_processing_set.append(
|
|
505
|
+
self._translate_library_preparation(
|
|
506
|
+
libprep_id, extr_ps_id, libprep_ps_id, row
|
|
524
507
|
)
|
|
508
|
+
)
|
|
525
509
|
|
|
526
|
-
|
|
510
|
+
dna_sample_id = _get_value_or_none(row, "dnaSampleID")
|
|
511
|
+
database.processed_sample_set.append(
|
|
512
|
+
self._translate_processed_sample(
|
|
513
|
+
libprep_ps_id,
|
|
514
|
+
f"Library preparation for {dna_sample_id}",
|
|
515
|
+
)
|
|
516
|
+
)
|
|
527
517
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
518
|
+
filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
|
|
519
|
+
if not filepaths_for_dna:
|
|
520
|
+
# no raw files => skip
|
|
521
|
+
ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
|
|
522
|
+
if ntseq_id:
|
|
523
|
+
continue
|
|
524
|
+
continue
|
|
525
|
+
|
|
526
|
+
# If multiple => we create a Manifest
|
|
527
|
+
manifest_id: Optional[str] = None
|
|
528
|
+
if len(filepaths_for_dna) > 2:
|
|
529
|
+
if dna_sample_id not in dna_sample_to_manifest_id:
|
|
530
|
+
new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
531
|
+
dna_sample_to_manifest_id[dna_sample_id] = new_man_id
|
|
532
|
+
database.manifest_set.append(self._translate_manifest(new_man_id))
|
|
533
|
+
manifest_id = dna_sample_to_manifest_id[dna_sample_id]
|
|
534
|
+
|
|
535
|
+
has_input_value = self.samp_procsm_dict.get(neon_id)
|
|
536
|
+
if not has_input_value:
|
|
537
|
+
continue
|
|
538
|
+
|
|
539
|
+
dataobject_ids_for_run: list[str] = []
|
|
540
|
+
for fp in filepaths_for_dna:
|
|
541
|
+
if fp not in neon_to_nmdc_dataobject_ids:
|
|
542
|
+
continue
|
|
543
|
+
do_id = neon_to_nmdc_dataobject_ids[fp]
|
|
544
|
+
|
|
545
|
+
do_type = None
|
|
546
|
+
if "_R1.fastq.gz" in fp:
|
|
547
|
+
do_type = "Metagenome Raw Read 1"
|
|
548
|
+
elif "_R2.fastq.gz" in fp:
|
|
549
|
+
do_type = "Metagenome Raw Read 2"
|
|
550
|
+
|
|
551
|
+
database.data_object_set.append(
|
|
552
|
+
self._translate_data_object(
|
|
553
|
+
do_id=do_id,
|
|
554
|
+
url=fp,
|
|
555
|
+
do_type=do_type,
|
|
556
|
+
manifest_id=manifest_id,
|
|
532
557
|
)
|
|
533
558
|
)
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
checksum = None
|
|
545
|
-
do_type = None
|
|
546
|
-
|
|
547
|
-
checksum = neon_raw_data_file_mappings_df[
|
|
548
|
-
neon_raw_data_file_mappings_df["rawDataFilePath"] == item
|
|
549
|
-
]["checkSum"].values[0]
|
|
550
|
-
if "_R1.fastq.gz" in item:
|
|
551
|
-
do_type = "Metagenome Raw Read 1"
|
|
552
|
-
elif "_R2.fastq.gz" in item:
|
|
553
|
-
do_type = "Metagenome Raw Read 2"
|
|
554
|
-
|
|
555
|
-
database.data_object_set.append(
|
|
556
|
-
self._translate_data_object(
|
|
557
|
-
neon_to_nmdc_data_object_ids.get(item),
|
|
558
|
-
item,
|
|
559
|
-
do_type,
|
|
560
|
-
checksum,
|
|
561
|
-
)
|
|
562
|
-
)
|
|
563
|
-
|
|
564
|
-
database.data_generation_set.append(
|
|
565
|
-
self._translate_nucleotide_sequencing(
|
|
566
|
-
neon_to_nmdc_omprc_ids.get(neon_id),
|
|
567
|
-
processed_sample_id,
|
|
568
|
-
has_output_do_ids,
|
|
569
|
-
lib_prep_row,
|
|
570
|
-
)
|
|
559
|
+
dataobject_ids_for_run.append(do_id)
|
|
560
|
+
|
|
561
|
+
ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
|
|
562
|
+
if ntseq_id:
|
|
563
|
+
database.data_generation_set.append(
|
|
564
|
+
self._translate_nucleotide_sequencing(
|
|
565
|
+
ntseq_id,
|
|
566
|
+
has_input_value, # <--- from self.samp_procsm_dict
|
|
567
|
+
dataobject_ids_for_run,
|
|
568
|
+
row,
|
|
571
569
|
)
|
|
570
|
+
)
|
|
572
571
|
|
|
573
572
|
return database
|