nmdc-runtime 2.4.0__py3-none-any.whl → 2.5.0__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- nmdc_runtime/site/ops.py +6 -1
- nmdc_runtime/site/repair/database_updater.py +12 -0
- nmdc_runtime/site/repository.py +2 -2
- nmdc_runtime/site/translation/neon_benthic_translator.py +156 -157
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/METADATA +17 -4
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/RECORD +10 -10
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info/licenses}/LICENSE +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/ops.py
CHANGED
@@ -1100,7 +1100,12 @@ def materialize_alldocs(context) -> int:
         write_operations = []
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
-
+            try:
+                doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
+            except KeyError:
+                raise Exception(
+                    f"doc {doc['id']} in collection {coll_name} has no 'type'!"
+                )
             slots_to_include = ["id", "type"] + document_reference_ranged_slots[
                 doc_type
             ]
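The guard above fails fast when a document lacks a `type` field and otherwise strips the `nmdc:` prefix before looking up reference-ranged slots. A minimal, self-contained sketch of that behavior (the helper name and the sample document are hypothetical, not part of nmdc-runtime):

```python
# Hypothetical helper mirroring the logic added to materialize_alldocs above.
def type_without_prefix(doc: dict, coll_name: str) -> str:
    try:
        return doc["type"][5:]  # lop off "nmdc:" prefix, e.g. "nmdc:Biosample" -> "Biosample"
    except KeyError:
        raise Exception(f"doc {doc['id']} in collection {coll_name} has no 'type'!")

print(type_without_prefix({"id": "nmdc:bsm-11-abc123", "type": "nmdc:Biosample"}, "biosample_set"))
# Biosample
```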
nmdc_runtime/site/repair/database_updater.py
CHANGED
@@ -199,8 +199,20 @@ class DatabaseUpdater:
             if gbs.get("biosampleGoldId") not in nmdc_gold_ids
         ]
 
+        # use the GOLD study id to fetch all sequencing project records associated with the study
+        gold_sequencing_projects_for_study = (
+            self.gold_api_client.fetch_projects_by_study(gold_study_id)
+        )
+
+        # use the GOLD study id to fetch all analysis project records associated with the study
+        gold_analysis_projects_for_study = (
+            self.gold_api_client.fetch_analysis_projects_by_study(gold_study_id)
+        )
+
         gold_study_translator = GoldStudyTranslator(
             biosamples=missing_gold_biosamples,
+            projects=gold_sequencing_projects_for_study,
+            analysis_projects=gold_analysis_projects_for_study,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
         )
 
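The updater now pulls sequencing and analysis project records for the GOLD study and hands them to `GoldStudyTranslator` alongside the missing biosamples. A minimal sketch of that data flow with a stubbed client; the two method names come from the diff, while the record shapes and IDs below are illustrative assumptions rather than real GOLD API payloads:

```python
# Stub standing in for the real GOLD API client; only the two methods used above.
class StubGoldApiClient:
    def fetch_projects_by_study(self, gold_study_id: str) -> list[dict]:
        # illustrative sequencing-project record; field names are assumptions
        return [{"projectGoldId": "Gp0000001", "biosampleGoldId": "Gb0000001"}]

    def fetch_analysis_projects_by_study(self, gold_study_id: str) -> list[dict]:
        # illustrative analysis-project record; field names are assumptions
        return [{"apGoldId": "Ga0000001", "projects": ["Gp0000001"]}]

client = StubGoldApiClient()
projects = client.fetch_projects_by_study("Gs0000001")
analysis_projects = client.fetch_analysis_projects_by_study("Gs0000001")
# These two lists are what the updated code passes to GoldStudyTranslator via
# the new `projects=` and `analysis_projects=` keyword arguments.
print(len(projects), len(analysis_projects))
```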
nmdc_runtime/site/repository.py
CHANGED
@@ -744,7 +744,7 @@ def biosample_submission_ingest():
                 "config": {
                     "benthic_data_product": {
                         "product_id": "DP1.20279.001",
-                        "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
+                        "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
                     }
                 }
             },
@@ -771,7 +771,7 @@ def biosample_submission_ingest():
                 "config": {
                     "benthic_data_product": {
                         "product_id": "DP1.20279.001",
-                        "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
+                        "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
                     }
                 }
             },
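Both presets append `mms_mms_benthicRawDataFiles` to the comma-separated `product_tables` string. How the NEON ingest ops split that string is not shown in this diff; a sketch under that assumption:

```python
# Assumed parsing of the "product_tables" config value shown above into a list
# of NEON table names; the real op may split it differently.
product_tables = (
    "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, "
    "mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles"
)
table_names = [name.strip() for name in product_tables.split(",")]
print(table_names)
```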
nmdc_runtime/site/translation/neon_benthic_translator.py
CHANGED
@@ -1,6 +1,6 @@
 import re
 import sqlite3
-from typing import Union
+from typing import Optional, Union
 
 import pandas as pd
 import requests_cache
@@ -61,6 +61,7 @@ class NeonBenthicDataTranslator(Translator):
             "mms_benthicMetagenomeSequencing",
             "mms_benthicMetagenomeDnaExtraction",
             "amb_fieldParent",
+            "mms_benthicRawDataFiles",  # <--- ensure this is present
         )
 
         if all(k in benthic_data for k in neon_amb_data_tables):
@@ -79,6 +80,12 @@ class NeonBenthicDataTranslator(Translator):
             benthic_data["amb_fieldParent"].to_sql(
                 "amb_fieldParent", self.conn, if_exists="replace", index=False
             )
+            benthic_data["mms_benthicRawDataFiles"].to_sql(
+                "mms_benthicRawDataFiles",
+                self.conn,
+                if_exists="replace",
+                index=False,
+            )
         else:
             raise ValueError(
                 f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
@@ -88,14 +95,19 @@ class NeonBenthicDataTranslator(Translator):
             "neonEnvoTerms", self.conn, if_exists="replace", index=False
         )
 
-        self.neon_raw_data_file_mappings_df =
-        self.neon_raw_data_file_mappings_df.to_sql(
-            "neonRawDataFile", self.conn, if_exists="replace", index=False
-        )
+        self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
 
         self.site_code_mapping = site_code_mapping
+
         self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
 
+    def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
+        return nmdc.Manifest(
+            id=manifest_id,
+            manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
+            type="nmdc:Manifest",
+        )
+
     def _translate_biosample(
         self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
     ) -> nmdc.Biosample:
@@ -313,7 +325,7 @@ class NeonBenthicDataTranslator(Translator):
         )
 
     def _translate_data_object(
-        self, do_id: str, url: str, do_type: str,
+        self, do_id: str, url: str, do_type: str, manifest_id: str
     ) -> nmdc.DataObject:
         """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
         object mainly contains information about the sequencing file that was generated as
@@ -324,7 +336,6 @@ class NeonBenthicDataTranslator(Translator):
         :param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
         by Hugh Cross at NEON.
         :param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
-        :param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
         at NEON.
         :return: DataObject with all the sequencing file metadata.
         """
@@ -337,14 +348,14 @@ class NeonBenthicDataTranslator(Translator):
             url=url,
             description=f"sequencing results for {basename}",
             type="nmdc:DataObject",
-            md5_checksum=checksum,
             data_object_type=do_type,
+            in_manifest=manifest_id,
         )
 
-    def get_database(self):
+    def get_database(self) -> nmdc.Database:
         database = nmdc.Database()
 
-
+        join_query = """
         SELECT
             merged.laboratoryName,
             merged.sequencingFacilityID,
@@ -372,202 +383,190 @@ class NeonBenthicDataTranslator(Translator):
             afp.siteID,
             afp.sampleID,
             afp.collectDate
-        FROM
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mms_benthicMetagenomeDnaExtraction AS bd
-        ON
-            bs.dnaSampleID = bd.dnaSampleID
-        ) AS merged
+        FROM (
+            SELECT
+                bs.collectDate,
+                bs.laboratoryName,
+                bs.sequencingFacilityID,
+                bs.processedDate,
+                bs.dnaSampleID,
+                bs.dnaSampleCode,
+                bs.internalLabID,
+                bs.instrument_model,
+                bs.sequencingMethod,
+                bs.investigation_type,
+                bs.qaqcStatus,
+                bs.ncbiProjectID,
+                bd.genomicsSampleID,
+                bd.sequenceAnalysisType,
+                bd.sampleMass,
+                bd.nucleicAcidConcentration
+            FROM mms_benthicMetagenomeSequencing AS bs
+            JOIN mms_benthicMetagenomeDnaExtraction AS bd
+                ON bs.dnaSampleID = bd.dnaSampleID
+        ) AS merged
         LEFT JOIN amb_fieldParent AS afp
-        ON
-            merged.genomicsSampleID = afp.geneticSampleID
+            ON merged.genomicsSampleID = afp.geneticSampleID
         """
-        benthic_samples = pd.read_sql_query(
+        benthic_samples = pd.read_sql_query(join_query, self.conn)
         benthic_samples.to_sql(
             "benthicSamples", self.conn, if_exists="replace", index=False
         )
 
-
-        nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(
-        neon_to_nmdc_biosample_ids = dict(zip(
+        sample_ids = benthic_samples["sampleID"]
+        nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
+        neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
 
-
-
-            "nmdc:Extraction", len(neon_extraction_ids)
-        )
-        neon_to_nmdc_extraction_ids = dict(
-            zip(neon_extraction_ids, nmdc_extraction_ids)
-        )
+        nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
+        neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
 
-        neon_extraction_processed_ids = benthic_samples["sampleID"]
         nmdc_extraction_processed_ids = self._id_minter(
-            "nmdc:ProcessedSample", len(
+            "nmdc:ProcessedSample", len(sample_ids)
         )
         neon_to_nmdc_extraction_processed_ids = dict(
-            zip(
+            zip(sample_ids, nmdc_extraction_processed_ids)
         )
 
-
-
-            "nmdc:LibraryPreparation", len(neon_lib_prep_ids)
-        )
-        neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
+        nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
+        neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
 
-
-
-            "nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
+        nmdc_libprep_processed_ids = self._id_minter(
+            "nmdc:ProcessedSample", len(sample_ids)
         )
-
-            zip(
+        neon_to_nmdc_libprep_processed_ids = dict(
+            zip(sample_ids, nmdc_libprep_processed_ids)
         )
 
-
-
-            "nmdc:NucleotideSequencing", len(neon_omprc_ids)
-        )
-        neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
+        nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
+        neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
 
-
-
-
-
-        )
-        neon_to_nmdc_data_object_ids = dict(
-            zip(neon_raw_file_paths, nmdc_data_object_ids)
-        )
+        raw_df = self.neon_raw_data_file_mappings_df
+        raw_file_paths = raw_df["rawDataFilePath"]
+        dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
+        neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
 
-        for neon_id,
-
+        for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
+            row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+            if row.empty:
+                continue
 
+            # Example of how you might call _translate_biosample:
             database.biosample_set.append(
-                self._translate_biosample(neon_id,
+                self._translate_biosample(neon_id, biosample_id, row)
             )
 
-        for neon_id,
-
+        for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
+            row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+            if row.empty:
+                continue
 
-
-
+            biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
+            extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
 
-            if
+            if biosample_id and extraction_ps_id:
                 database.material_processing_set.append(
                     self._translate_extraction_process(
-
-                        extraction_input,
-                        processed_sample_id,
-                        extraction_row,
+                        extraction_id, biosample_id, extraction_ps_id, row
                     )
                 )
-
-                genomics_sample_id = _get_value_or_none(
-                    extraction_row, "genomicsSampleID"
-                )
-
+                genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
                 database.processed_sample_set.append(
                     self._translate_processed_sample(
-
+                        extraction_ps_id,
                         f"Extracted DNA from {genomics_sample_id}",
                     )
                 )
 
-
+        query2 = """
         SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
-        FROM
+        FROM mms_benthicRawDataFiles
         GROUP BY dnaSampleID
         """
-
-
-
+        raw_data_files_df = pd.read_sql_query(query2, self.conn)
+        dna_files_dict = (
+            raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
            .str.split("|")
            .to_dict()
        )
-        filtered_neon_raw_data_files_dict = {
-            key: value
-            for key, value in neon_raw_data_files_dict.items()
-            if len(value) <= 2
-        }
 
-
-            lib_prep_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+        dna_sample_to_manifest_id: dict[str, str] = {}
 
-
-
+        for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
+            row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+            if row.empty:
+                continue
 
-
-
-
-
-
-
-
-
+            extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
+            libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
+            if not extr_ps_id or not libprep_ps_id:
+                continue
+
+            database.material_processing_set.append(
+                self._translate_library_preparation(
+                    libprep_id, extr_ps_id, libprep_ps_id, row
                 )
+            )
 
-
+            dna_sample_id = _get_value_or_none(row, "dnaSampleID")
+            database.processed_sample_set.append(
+                self._translate_processed_sample(
+                    libprep_ps_id,
+                    f"Library preparation for {dna_sample_id}",
+                )
+            )
 
-
-
-
-
+            filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
+            if not filepaths_for_dna:
+                # no raw files => skip
+                ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
+                if ntseq_id:
+                    continue
+                continue
+
+            # If multiple => we create a Manifest
+            manifest_id: Optional[str] = None
+            if len(filepaths_for_dna) > 2:
+                if dna_sample_id not in dna_sample_to_manifest_id:
+                    new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
+                    dna_sample_to_manifest_id[dna_sample_id] = new_man_id
+                    database.manifest_set.append(self._translate_manifest(new_man_id))
+                manifest_id = dna_sample_to_manifest_id[dna_sample_id]
+
+            has_input_value = self.samp_procsm_dict.get(neon_id)
+            if not has_input_value:
+                continue
+
+            dataobject_ids_for_run: list[str] = []
+            for fp in filepaths_for_dna:
+                if fp not in neon_to_nmdc_dataobject_ids:
+                    continue
+                do_id = neon_to_nmdc_dataobject_ids[fp]
+
+                do_type = None
+                if "_R1.fastq.gz" in fp:
+                    do_type = "Metagenome Raw Read 1"
+                elif "_R2.fastq.gz" in fp:
+                    do_type = "Metagenome Raw Read 2"
+
+                database.data_object_set.append(
+                    self._translate_data_object(
+                        do_id=do_id,
+                        url=fp,
+                        do_type=do_type,
+                        manifest_id=manifest_id,
                     )
                 )
-
-
-
-
-
-
-
-
-
-
-                checksum = None
-                do_type = None
-
-                checksum = neon_raw_data_file_mappings_df[
-                    neon_raw_data_file_mappings_df["rawDataFilePath"] == item
-                ]["checkSum"].values[0]
-                if "_R1.fastq.gz" in item:
-                    do_type = "Metagenome Raw Read 1"
-                elif "_R2.fastq.gz" in item:
-                    do_type = "Metagenome Raw Read 2"
-
-                database.data_object_set.append(
-                    self._translate_data_object(
-                        neon_to_nmdc_data_object_ids.get(item),
-                        item,
-                        do_type,
-                        checksum,
-                    )
-                )
-
-            database.data_generation_set.append(
-                self._translate_nucleotide_sequencing(
-                    neon_to_nmdc_omprc_ids.get(neon_id),
-                    processed_sample_id,
-                    has_output_do_ids,
-                    lib_prep_row,
-                )
+                dataobject_ids_for_run.append(do_id)
+
+            ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
+            if ntseq_id:
+                database.data_generation_set.append(
+                    self._translate_nucleotide_sequencing(
+                        ntseq_id,
+                        has_input_value,  # <--- from self.samp_procsm_dict
+                        dataobject_ids_for_run,
+                        row,
                     )
+                )
 
         return database
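The main behavioral change in this file: raw data file paths are now grouped per `dnaSampleID` (via `GROUP_CONCAT`), and when a sample has more than two files (more than one R1/R2 pair) its DataObjects are grouped under a newly minted `nmdc:Manifest` with category `poolable_replicates`, referenced through the new `in_manifest` slot. A standalone sketch of just that grouping decision, using throwaway data in an in-memory SQLite database (the sample IDs, file names, and the minting counter below are fabricated):

```python
# Sketch of the per-dnaSampleID grouping and the ">2 files => Manifest" rule.
import sqlite3
from itertools import count

import pandas as pd

conn = sqlite3.connect(":memory:")
pd.DataFrame(
    {
        "dnaSampleID": ["S1", "S1", "S2", "S2", "S2", "S2"],
        "rawDataFilePath": [
            "S1_R1.fastq.gz", "S1_R2.fastq.gz",
            "S2_run1_R1.fastq.gz", "S2_run1_R2.fastq.gz",
            "S2_run2_R1.fastq.gz", "S2_run2_R2.fastq.gz",
        ],
    }
).to_sql("mms_benthicRawDataFiles", conn, index=False)

query = """
SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
FROM mms_benthicRawDataFiles
GROUP BY dnaSampleID
"""
dna_files_dict = (
    pd.read_sql_query(query, conn)
    .set_index("dnaSampleID")["rawDataFilePaths"]
    .str.split("|")
    .to_dict()
)

fake_minter = (f"nmdc:manifest-{i}" for i in count(1))  # stand-in for _id_minter
for dna_sample_id, filepaths in dna_files_dict.items():
    # More than one R1/R2 pair (i.e. more than 2 files) means the DataObjects
    # are poolable replicates and get grouped under a single Manifest.
    manifest_id = next(fake_minter) if len(filepaths) > 2 else None
    print(dna_sample_id, len(filepaths), manifest_id)
```

With this data, `S1` (2 files) gets no manifest and `S2` (4 files) gets one, matching the `len(filepaths_for_dna) > 2` branch in the translator.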
{nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: nmdc_runtime
-Version: 2.4.0
+Version: 2.5.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -17,6 +17,7 @@ Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: requires-python
 Dynamic: summary
 
@@ -37,8 +38,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
 * [nmdc-server](https://github.com/microbiomedata/nmdc-server)
   houses code specific to the data portal -- its database, back-end API, and front-end application.
 
-* [
-  references workflow code spread across several repositories, that take source data and produce computed data.
+* Workflows — documented in the [workflows](https://docs.microbiomedata.org/workflows/) section of the NMDC documentation website — take source data and produce computed data.
 
 * This repo (nmdc-runtime)
   * houses code that takes source data and computed data, and transforms it
@@ -156,6 +156,9 @@ Tests can be found in `tests` and are run with the following commands:
 ```bash
 make up-test
 make test
+
+# Run a Specific test file eg. tests/test_api/test_endpoints.py
+make test ARGS="tests/test_api/test_endpoints.py"
 ```
 
 As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
@@ -164,6 +167,16 @@ desired and does not break over time.
 [For hints on how to write tests for solids and pipelines in Dagster, see their documentation
 tutorial on Testing](https://docs.dagster.io/tutorial/testable).
 
+### RAM usage
+
+The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
+the `test` container shows "Error 137," here is something you can try as a workaround: In Docker Desktop, go to
+"Settings > Resources > Advanced," and increase the memory limit. One of our team members has
+found **12 GB** to be sufficient for running the tests.
+
+> Dedicating 12 GB of RAM to Docker may be prohibitive for some prospective developers.
+> There is an open [issue](https://github.com/microbiomedata/nmdc-runtime/issues/928) about the memory requirement.
+
 ## Publish to PyPI
 
 This repository contains a GitHub Actions workflow that publishes a Python package to [PyPI](https://pypi.org/project/nmdc-runtime/).
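Since the wheel metadata now records `Version: 2.5.0` and `Metadata-Version: 2.4`, both values can be read back from an installed environment; a small check, assuming nmdc-runtime 2.5.0 is installed:

```python
# Assumes nmdc-runtime 2.5.0 is installed in the current environment.
from importlib.metadata import metadata, version

print(version("nmdc-runtime"))                       # expected: 2.5.0
print(metadata("nmdc-runtime")["Metadata-Version"])  # expected: 2.4
```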
{nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/RECORD
CHANGED
@@ -37,8 +37,8 @@ nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
-nmdc_runtime/site/ops.py,sha256=
-nmdc_runtime/site/repository.py,sha256=
+nmdc_runtime/site/ops.py,sha256=tg-zRlVSUSJ7B0cJbBsUwmMRmpIUmK5tsL8ABnY0wnY,46626
+nmdc_runtime/site/repository.py,sha256=kVCoIMF2rgAMUAf9a6jk0WbejFpmWgxh6nN4U37Mgc8,43919
 nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
 nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -57,13 +57,13 @@ nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
 nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/repair/database_updater.py,sha256=
+nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14j5rBqQWF8R7BheY,11525
 nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
 nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
 nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
-nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=
+nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
 nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
 nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-2.
-nmdc_runtime-2.
-nmdc_runtime-2.
-nmdc_runtime-2.
-nmdc_runtime-2.
-nmdc_runtime-2.
+nmdc_runtime-2.5.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-2.5.0.dist-info/METADATA,sha256=tli66QKJC-48TzLXbI9iHMzTLyugbRBKj9CJEeKHXLY,8139
+nmdc_runtime-2.5.0.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+nmdc_runtime-2.5.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-2.5.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-2.5.0.dist-info/RECORD,,
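For reference, the `sha256=` values in the RECORD entries above follow the wheel RECORD convention: a SHA-256 digest, urlsafe-base64 encoded with the trailing `=` padding stripped, followed by the file size in bytes. A small sketch computing one such entry for an in-memory payload (the path and content here are placeholders, not taken from the wheel):

```python
# Compute a RECORD-style line: path,sha256=<urlsafe b64 digest, no padding>,<size>
import base64
import hashlib

def record_entry(path: str, data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{len(data)}"

print(record_entry("nmdc_runtime/site/ops.py", b"print('hello')\n"))
```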
{nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/entry_points.txt
File without changes

{nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info/licenses}/LICENSE
File without changes

{nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.5.0.dist-info}/top_level.txt
File without changes