nmdc-runtime 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/site/export/ncbi_xml.py +0 -1
- nmdc_runtime/site/export/ncbi_xml_utils.py +0 -25
- nmdc_runtime/site/graphs.py +11 -0
- nmdc_runtime/site/ops.py +54 -12
- nmdc_runtime/site/repair/database_updater.py +12 -0
- nmdc_runtime/site/repository.py +2 -6
- nmdc_runtime/site/translation/gold_translator.py +11 -0
- nmdc_runtime/site/translation/neon_benthic_translator.py +156 -157
- nmdc_runtime/site/translation/submission_portal_translator.py +269 -51
- nmdc_runtime/site/util.py +8 -1
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/METADATA +19 -6
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/RECORD +16 -16
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info/licenses}/LICENSE +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
+
from collections import namedtuple
|
|
3
4
|
from datetime import datetime
|
|
4
5
|
from enum import Enum
|
|
5
6
|
from functools import lru_cache
|
|
6
7
|
from importlib import resources
|
|
7
|
-
from typing import Any, List, Optional, Union
|
|
8
|
+
from typing import Any, List, Optional, Union, Tuple
|
|
9
|
+
from urllib.parse import urlparse
|
|
8
10
|
|
|
9
11
|
from linkml_runtime import SchemaView
|
|
10
12
|
from linkml_runtime.linkml_model import SlotDefinition
|
|
@@ -13,8 +15,38 @@ from toolz import concat, dissoc, get_in, groupby, valmap
|
|
|
13
15
|
|
|
14
16
|
from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
|
|
15
17
|
|
|
18
|
+
|
|
19
|
+
DataUrlSet = namedtuple("DataUrlSet", ["url", "md5_checksum"])
|
|
20
|
+
|
|
21
|
+
READ_1 = DataUrlSet("read_1_url", "read_1_md5_checksum")
|
|
22
|
+
READ_2 = DataUrlSet("read_2_url", "read_2_md5_checksum")
|
|
23
|
+
INTERLEAVED = DataUrlSet("interleaved_url", "interleaved_md5_checksum")
|
|
24
|
+
|
|
25
|
+
DATA_URL_SETS: list[DataUrlSet] = [READ_1, READ_2, INTERLEAVED]
|
|
26
|
+
|
|
16
27
|
BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
|
|
17
28
|
|
|
29
|
+
TAB_NAME_KEY = "__tab_name"
|
|
30
|
+
METAGENOME = nmdc.NucleotideSequencingEnum(nmdc.NucleotideSequencingEnum.metagenome)
|
|
31
|
+
METATRANSCRIPTOME = nmdc.NucleotideSequencingEnum(
|
|
32
|
+
nmdc.NucleotideSequencingEnum.metatranscriptome
|
|
33
|
+
)
|
|
34
|
+
TAB_NAME_TO_ANALYTE_CATEGORY: dict[str, nmdc.NucleotideSequencingEnum] = {
|
|
35
|
+
"metagenome_sequencing_non_interleaved_data": METAGENOME,
|
|
36
|
+
"metagenome_sequencing_interleaved_data": METAGENOME,
|
|
37
|
+
"metatranscriptome_sequencing_non_interleaved_data": METATRANSCRIPTOME,
|
|
38
|
+
"metatranscriptome_sequencing_interleaved_data": METATRANSCRIPTOME,
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str] = {
|
|
42
|
+
(READ_1, str(METAGENOME)): "Metagenome Raw Read 1",
|
|
43
|
+
(READ_2, str(METAGENOME)): "Metagenome Raw Read 2",
|
|
44
|
+
(INTERLEAVED, str(METAGENOME)): "Metagenome Raw Reads",
|
|
45
|
+
(READ_1, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 1",
|
|
46
|
+
(READ_2, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 2",
|
|
47
|
+
(INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
|
|
48
|
+
}
|
|
49
|
+
|
|
18
50
|
|
|
19
51
|
class EnvironmentPackage(Enum):
|
|
20
52
|
r"""
|
|
@@ -75,6 +107,18 @@ def group_dicts_by_key(key: str, seq: Optional[list[dict]]) -> Optional[dict]:
|
|
|
75
107
|
return grouped
|
|
76
108
|
|
|
77
109
|
|
|
110
|
+
def split_strip(string: str | None, sep: str) -> list[str] | None:
|
|
111
|
+
"""Split a string by a separator and strip whitespace from each part.
|
|
112
|
+
|
|
113
|
+
:param string: string to split
|
|
114
|
+
:param sep: separator to split by
|
|
115
|
+
:return: list of stripped strings, or None if string is None
|
|
116
|
+
"""
|
|
117
|
+
if string is None:
|
|
118
|
+
return None
|
|
119
|
+
return [s.strip() for s in string.split(sep)]
|
|
120
|
+
|
|
121
|
+
|
|
78
122
|
class SubmissionPortalTranslator(Translator):
|
|
79
123
|
"""A Translator subclass for handling submission portal entries
|
|
80
124
|
|
|
@@ -86,17 +130,15 @@ class SubmissionPortalTranslator(Translator):
|
|
|
86
130
|
|
|
87
131
|
def __init__(
|
|
88
132
|
self,
|
|
89
|
-
metadata_submission: JSON_OBJECT =
|
|
133
|
+
metadata_submission: Optional[JSON_OBJECT] = None,
|
|
90
134
|
*args,
|
|
91
135
|
nucleotide_sequencing_mapping: Optional[list] = None,
|
|
92
136
|
data_object_mapping: Optional[list] = None,
|
|
137
|
+
illumina_instrument_mapping: Optional[dict[str, str]] = None,
|
|
93
138
|
# Additional study-level metadata not captured by the submission portal currently
|
|
94
139
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
95
|
-
study_doi_category: Optional[str] = None,
|
|
96
|
-
study_doi_provider: Optional[str] = None,
|
|
97
140
|
study_category: Optional[str] = None,
|
|
98
141
|
study_pi_image_url: Optional[str] = None,
|
|
99
|
-
study_funding_sources: Optional[list[str]] = None,
|
|
100
142
|
# Additional biosample-level metadata with optional column mapping information not captured
|
|
101
143
|
# by the submission portal currently.
|
|
102
144
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
@@ -106,23 +148,17 @@ class SubmissionPortalTranslator(Translator):
|
|
|
106
148
|
) -> None:
|
|
107
149
|
super().__init__(*args, **kwargs)
|
|
108
150
|
|
|
109
|
-
self.metadata_submission = metadata_submission
|
|
151
|
+
self.metadata_submission: JSON_OBJECT = metadata_submission or {}
|
|
110
152
|
self.nucleotide_sequencing_mapping = nucleotide_sequencing_mapping
|
|
111
153
|
self.data_object_mapping = data_object_mapping
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
nmdc.DoiCategoryEnum(study_doi_category)
|
|
115
|
-
if study_doi_category
|
|
116
|
-
else nmdc.DoiCategoryEnum.dataset_doi
|
|
117
|
-
)
|
|
118
|
-
self.study_doi_provider = (
|
|
119
|
-
nmdc.DoiProviderEnum(study_doi_provider) if study_doi_provider else None
|
|
154
|
+
self.illumina_instrument_mapping: dict[str, str] = (
|
|
155
|
+
illumina_instrument_mapping or {}
|
|
120
156
|
)
|
|
157
|
+
|
|
121
158
|
self.study_category = (
|
|
122
159
|
nmdc.StudyCategoryEnum(study_category) if study_category else None
|
|
123
160
|
)
|
|
124
161
|
self.study_pi_image_url = study_pi_image_url
|
|
125
|
-
self.study_funding_sources = study_funding_sources
|
|
126
162
|
|
|
127
163
|
self.biosample_extras = group_dicts_by_key(
|
|
128
164
|
BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
|
|
@@ -153,28 +189,6 @@ class SubmissionPortalTranslator(Translator):
|
|
|
153
189
|
type=nmdc.PersonValue.class_class_curie,
|
|
154
190
|
)
|
|
155
191
|
|
|
156
|
-
def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[nmdc.Doi], None]:
|
|
157
|
-
"""Get DOI information from the context form data
|
|
158
|
-
|
|
159
|
-
:param metadata_submission: submission portal entry
|
|
160
|
-
:return: list of strings or None
|
|
161
|
-
"""
|
|
162
|
-
dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
|
|
163
|
-
if not dataset_doi:
|
|
164
|
-
return None
|
|
165
|
-
|
|
166
|
-
if not dataset_doi.startswith("doi:"):
|
|
167
|
-
dataset_doi = f"doi:{dataset_doi}"
|
|
168
|
-
|
|
169
|
-
return [
|
|
170
|
-
nmdc.Doi(
|
|
171
|
-
doi_value=dataset_doi,
|
|
172
|
-
doi_provider=self.study_doi_provider,
|
|
173
|
-
doi_category=self.study_doi_category,
|
|
174
|
-
type="nmdc:Doi",
|
|
175
|
-
)
|
|
176
|
-
]
|
|
177
|
-
|
|
178
192
|
def _get_has_credit_associations(
|
|
179
193
|
self, metadata_submission: JSON_OBJECT
|
|
180
194
|
) -> Union[List[nmdc.CreditAssociation], None]:
|
|
@@ -203,21 +217,34 @@ class SubmissionPortalTranslator(Translator):
|
|
|
203
217
|
def _get_gold_study_identifiers(
|
|
204
218
|
self, metadata_submission: JSON_OBJECT
|
|
205
219
|
) -> Union[List[str], None]:
|
|
206
|
-
"""Construct a GOLD CURIE from the
|
|
220
|
+
"""Construct a GOLD CURIE from the study form data
|
|
207
221
|
|
|
208
222
|
:param metadata_submission: submission portal entry
|
|
209
223
|
:return: GOLD CURIE
|
|
210
224
|
"""
|
|
211
|
-
gold_study_id = get_in(["
|
|
225
|
+
gold_study_id = get_in(["studyForm", "GOLDStudyId"], metadata_submission)
|
|
212
226
|
if not gold_study_id:
|
|
213
227
|
return None
|
|
214
228
|
|
|
215
229
|
return [self._ensure_curie(gold_study_id, default_prefix="gold")]
|
|
216
230
|
|
|
231
|
+
def _get_ncbi_bioproject_identifiers(
|
|
232
|
+
self, metadata_submission: JSON_OBJECT
|
|
233
|
+
) -> Union[List[str], None]:
|
|
234
|
+
"""Construct an NCBI BioProject CURIE from the study form data"""
|
|
235
|
+
|
|
236
|
+
ncbi_bioproject_id = get_in(
|
|
237
|
+
["studyForm", "NCBIBioProjectId"], metadata_submission
|
|
238
|
+
)
|
|
239
|
+
if not ncbi_bioproject_id:
|
|
240
|
+
return None
|
|
241
|
+
|
|
242
|
+
return [self._ensure_curie(ncbi_bioproject_id, default_prefix="bioproject")]
|
|
243
|
+
|
|
217
244
|
def _get_jgi_study_identifiers(
|
|
218
245
|
self, metadata_submission: JSON_OBJECT
|
|
219
246
|
) -> Union[List[str], None]:
|
|
220
|
-
"""Construct a JGI proposal CURIE from the multiomics
|
|
247
|
+
"""Construct a JGI proposal CURIE from the multiomics form data
|
|
221
248
|
|
|
222
249
|
:param metadata_submission: submission portal entry
|
|
223
250
|
:return: JGI proposal CURIE
|
|
@@ -228,6 +255,20 @@ class SubmissionPortalTranslator(Translator):
|
|
|
228
255
|
|
|
229
256
|
return [self._ensure_curie(jgi_study_id, default_prefix="jgi.proposal")]
|
|
230
257
|
|
|
258
|
+
def _get_emsl_project_identifiers(
|
|
259
|
+
self, metadata_submission: JSON_OBJECT
|
|
260
|
+
) -> Union[List[str], None]:
|
|
261
|
+
"""Construct an EMSL project CURIE from the multiomics form data
|
|
262
|
+
|
|
263
|
+
:param metadata_submission: submission portal entry
|
|
264
|
+
:return: EMSL project CURIE
|
|
265
|
+
"""
|
|
266
|
+
emsl_project_id = get_in(["multiOmicsForm", "studyNumber"], metadata_submission)
|
|
267
|
+
if not emsl_project_id:
|
|
268
|
+
return None
|
|
269
|
+
|
|
270
|
+
return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
|
|
271
|
+
|
|
231
272
|
def _get_quantity_value(
|
|
232
273
|
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
233
274
|
) -> Union[nmdc.QuantityValue, None]:
|
|
@@ -434,6 +475,75 @@ class SubmissionPortalTranslator(Translator):
|
|
|
434
475
|
|
|
435
476
|
return value
|
|
436
477
|
|
|
478
|
+
def _get_data_objects_from_fields(
|
|
479
|
+
self,
|
|
480
|
+
sample_data: JSON_OBJECT,
|
|
481
|
+
*,
|
|
482
|
+
url_field_name: str,
|
|
483
|
+
md5_checksum_field_name: str,
|
|
484
|
+
nucleotide_sequencing_id: str,
|
|
485
|
+
data_object_type: nmdc.FileTypeEnum,
|
|
486
|
+
) -> Tuple[List[nmdc.DataObject], nmdc.Manifest | None]:
|
|
487
|
+
"""Get DataObject instances based on the URLs and MD5 checksums in the given fields.
|
|
488
|
+
|
|
489
|
+
If the field provides multiple URLs, multiple DataObject instances will be created and a
|
|
490
|
+
Manifest will be created and provided in the second return value.
|
|
491
|
+
|
|
492
|
+
:param sample_data: sample data
|
|
493
|
+
:param url_field_name: field name for the URL
|
|
494
|
+
:param md5_checksum_field_name: field name for the MD5 checksum
|
|
495
|
+
:param nucleotide_sequencing_id: ID for the nmdc:NucleotideSequencing object that generated the data object(s)
|
|
496
|
+
:param data_object_type: FileTypeEnum representing the type of the data object
|
|
497
|
+
:return: tuple of (list of nmdc.DataObject instances, nmdc.Manifest or None)
|
|
498
|
+
"""
|
|
499
|
+
data_objects: List[nmdc.DataObject] = []
|
|
500
|
+
urls = split_strip(sample_data.get(url_field_name), ";")
|
|
501
|
+
if not urls:
|
|
502
|
+
return data_objects, None
|
|
503
|
+
|
|
504
|
+
md5_checksums = split_strip(sample_data.get(md5_checksum_field_name), ";")
|
|
505
|
+
if md5_checksums and len(urls) != len(md5_checksums):
|
|
506
|
+
raise ValueError(
|
|
507
|
+
f"{url_field_name} and {md5_checksum_field_name} must have the same number of values"
|
|
508
|
+
)
|
|
509
|
+
|
|
510
|
+
data_object_ids = self._id_minter("nmdc:DataObject", len(urls))
|
|
511
|
+
manifest: nmdc.Manifest | None = None
|
|
512
|
+
if len(urls) > 1:
|
|
513
|
+
manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
514
|
+
manifest = nmdc.Manifest(
|
|
515
|
+
id=manifest_id,
|
|
516
|
+
manifest_category=nmdc.ManifestCategoryEnum(
|
|
517
|
+
nmdc.ManifestCategoryEnum.poolable_replicates
|
|
518
|
+
),
|
|
519
|
+
type="nmdc:Manifest",
|
|
520
|
+
)
|
|
521
|
+
|
|
522
|
+
for i, url in enumerate(urls):
|
|
523
|
+
data_object_id = data_object_ids[i]
|
|
524
|
+
parsed_url = urlparse(url)
|
|
525
|
+
possible_filename = parsed_url.path.rsplit("/", 1)[-1]
|
|
526
|
+
data_object_slots = {
|
|
527
|
+
"id": data_object_id,
|
|
528
|
+
"name": possible_filename,
|
|
529
|
+
"description": f"{data_object_type} for {nucleotide_sequencing_id}",
|
|
530
|
+
"type": "nmdc:DataObject",
|
|
531
|
+
"url": url,
|
|
532
|
+
"md5_checksum": md5_checksums[i] if md5_checksums else None,
|
|
533
|
+
"in_manifest": [manifest.id] if manifest else None,
|
|
534
|
+
"data_category": nmdc.DataCategoryEnum(
|
|
535
|
+
nmdc.DataCategoryEnum.instrument_data
|
|
536
|
+
),
|
|
537
|
+
"data_object_type": data_object_type,
|
|
538
|
+
"was_generated_by": nucleotide_sequencing_id,
|
|
539
|
+
}
|
|
540
|
+
data_object_slots.update(
|
|
541
|
+
self._transform_dict_for_class(sample_data, "DataObject")
|
|
542
|
+
)
|
|
543
|
+
data_objects.append(nmdc.DataObject(**data_object_slots))
|
|
544
|
+
|
|
545
|
+
return data_objects, manifest
|
|
546
|
+
|
|
437
547
|
def _translate_study(
|
|
438
548
|
self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
|
|
439
549
|
) -> nmdc.Study:
|
|
@@ -448,18 +558,17 @@ class SubmissionPortalTranslator(Translator):
|
|
|
448
558
|
"""
|
|
449
559
|
return nmdc.Study(
|
|
450
560
|
alternative_names=self._get_from(
|
|
451
|
-
metadata_submission, ["
|
|
561
|
+
metadata_submission, ["studyForm", "alternativeNames"]
|
|
452
562
|
),
|
|
453
|
-
associated_dois=self._get_doi(metadata_submission),
|
|
454
563
|
description=self._get_from(
|
|
455
564
|
metadata_submission, ["studyForm", "description"]
|
|
456
565
|
),
|
|
457
566
|
funding_sources=self._get_from(
|
|
458
567
|
metadata_submission, ["studyForm", "fundingSources"]
|
|
459
568
|
),
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
569
|
+
emsl_project_identifiers=self._get_emsl_project_identifiers(
|
|
570
|
+
metadata_submission
|
|
571
|
+
),
|
|
463
572
|
gold_study_identifiers=self._get_gold_study_identifiers(
|
|
464
573
|
metadata_submission
|
|
465
574
|
),
|
|
@@ -467,8 +576,8 @@ class SubmissionPortalTranslator(Translator):
|
|
|
467
576
|
metadata_submission
|
|
468
577
|
),
|
|
469
578
|
id=nmdc_study_id,
|
|
470
|
-
insdc_bioproject_identifiers=self.
|
|
471
|
-
metadata_submission
|
|
579
|
+
insdc_bioproject_identifiers=self._get_ncbi_bioproject_identifiers(
|
|
580
|
+
metadata_submission
|
|
472
581
|
),
|
|
473
582
|
jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
|
|
474
583
|
metadata_submission
|
|
@@ -555,7 +664,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
555
664
|
if slot_definition.multivalued:
|
|
556
665
|
value_list = value
|
|
557
666
|
if isinstance(value, str):
|
|
558
|
-
value_list =
|
|
667
|
+
value_list = split_strip(value, "|")
|
|
559
668
|
transformed_value = [
|
|
560
669
|
self._transform_value_for_slot(item, slot_definition, unit)
|
|
561
670
|
for item in value_list
|
|
@@ -629,16 +738,18 @@ class SubmissionPortalTranslator(Translator):
|
|
|
629
738
|
:return: nmdc:Database object
|
|
630
739
|
"""
|
|
631
740
|
database = nmdc.Database()
|
|
632
|
-
|
|
633
|
-
nmdc_study_id = self._id_minter("nmdc:Study")[0]
|
|
634
|
-
|
|
635
741
|
metadata_submission_data = self.metadata_submission.get(
|
|
636
742
|
"metadata_submission", {}
|
|
637
743
|
)
|
|
744
|
+
|
|
745
|
+
# Generate one Study instance based on the metadata submission
|
|
746
|
+
nmdc_study_id = self._id_minter("nmdc:Study")[0]
|
|
638
747
|
database.study_set = [
|
|
639
748
|
self._translate_study(metadata_submission_data, nmdc_study_id)
|
|
640
749
|
]
|
|
641
750
|
|
|
751
|
+
# Automatically populate the `env_package` field in the sample data based on which
|
|
752
|
+
# environmental data tab the sample data came from.
|
|
642
753
|
sample_data = metadata_submission_data.get("sampleData", {})
|
|
643
754
|
for key in sample_data.keys():
|
|
644
755
|
env = key.removesuffix("_data").upper()
|
|
@@ -647,8 +758,16 @@ class SubmissionPortalTranslator(Translator):
|
|
|
647
758
|
for sample in sample_data[key]:
|
|
648
759
|
sample["env_package"] = package_name
|
|
649
760
|
except KeyError:
|
|
761
|
+
# This is expected when processing rows from tabs like the JGI/EMSL tabs or external
|
|
762
|
+
# sequencing data tabs.
|
|
650
763
|
pass
|
|
651
764
|
|
|
765
|
+
# Before regrouping the data by sample name, record which tab each object came from
|
|
766
|
+
for tab_name in sample_data.keys():
|
|
767
|
+
for tab in sample_data[tab_name]:
|
|
768
|
+
tab[TAB_NAME_KEY] = tab_name
|
|
769
|
+
|
|
770
|
+
# Reorganize the sample data by sample name and generate a unique NMDC ID for each
|
|
652
771
|
sample_data_by_id = groupby(
|
|
653
772
|
BIOSAMPLE_UNIQUE_KEY_SLOT,
|
|
654
773
|
concat(sample_data.values()),
|
|
@@ -658,6 +777,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
658
777
|
zip(sample_data_by_id.keys(), nmdc_biosample_ids)
|
|
659
778
|
)
|
|
660
779
|
|
|
780
|
+
# Translate the sample data into nmdc:Biosample objects
|
|
661
781
|
database.biosample_set = [
|
|
662
782
|
self._translate_biosample(
|
|
663
783
|
sample_data,
|
|
@@ -668,6 +788,104 @@ class SubmissionPortalTranslator(Translator):
|
|
|
668
788
|
if sample_data
|
|
669
789
|
]
|
|
670
790
|
|
|
791
|
+
# This section handles the translation of information in the external sequencing tabs into
|
|
792
|
+
# various NMDC objects.
|
|
793
|
+
database.data_generation_set = []
|
|
794
|
+
database.data_object_set = []
|
|
795
|
+
database.instrument_set = []
|
|
796
|
+
database.manifest_set = []
|
|
797
|
+
today = datetime.now().strftime("%Y-%m-%d")
|
|
798
|
+
for sample_data_id, sample_data in sample_data_by_id.items():
|
|
799
|
+
for tab in sample_data:
|
|
800
|
+
tab_name = tab.get(TAB_NAME_KEY)
|
|
801
|
+
analyte_category = TAB_NAME_TO_ANALYTE_CATEGORY.get(tab_name)
|
|
802
|
+
if not analyte_category:
|
|
803
|
+
# If the tab name cannot be mapped to an analyte category, that means we're
|
|
804
|
+
# not in an external sequencing data tab (e.g. this is an environmental data
|
|
805
|
+
# tab or a JGI/EMSL tab). Skip this tab.
|
|
806
|
+
continue
|
|
807
|
+
|
|
808
|
+
# Start by generating one NucleotideSequencing instance with a has_input
|
|
809
|
+
# relationship to the current Biosample instance.
|
|
810
|
+
nucleotide_sequencing_id = self._id_minter(
|
|
811
|
+
"nmdc:NucleotideSequencing", 1
|
|
812
|
+
)[0]
|
|
813
|
+
nucleotide_sequencing_slots = {
|
|
814
|
+
"id": nucleotide_sequencing_id,
|
|
815
|
+
"has_input": sample_data_to_nmdc_biosample_ids[sample_data_id],
|
|
816
|
+
"has_output": [],
|
|
817
|
+
"associated_studies": [nmdc_study_id],
|
|
818
|
+
"add_date": today,
|
|
819
|
+
"mod_date": today,
|
|
820
|
+
"analyte_category": analyte_category,
|
|
821
|
+
"type": "nmdc:NucleotideSequencing",
|
|
822
|
+
}
|
|
823
|
+
# If the protocol_link column was filled in, expand it into an nmdc:Protocol object
|
|
824
|
+
if "protocol_link" in tab:
|
|
825
|
+
protocol_link = tab.pop("protocol_link")
|
|
826
|
+
nucleotide_sequencing_slots["protocol_link"] = nmdc.Protocol(
|
|
827
|
+
url=protocol_link,
|
|
828
|
+
type="nmdc:Protocol",
|
|
829
|
+
)
|
|
830
|
+
# If model column was filled in, expand it into an nmdc:Instrument object. This is
|
|
831
|
+
# done by first checking the provided instrument mapping to see if the model is
|
|
832
|
+
# already present. If it is not, a new instrument object is created and added to the
|
|
833
|
+
# instrument_set. Currently, we only accept sequencing data in the submission portal
|
|
834
|
+
# that was generated by Illumina instruments, so the vendor is hardcoded here.
|
|
835
|
+
if "model" in tab:
|
|
836
|
+
model = tab.pop("model")
|
|
837
|
+
if model not in self.illumina_instrument_mapping:
|
|
838
|
+
# If the model is not already in the mapping, create a new record for it
|
|
839
|
+
nmdc_instrument_id = self._id_minter("nmdc:Instrument", 1)[0]
|
|
840
|
+
database.instrument_set.append(
|
|
841
|
+
nmdc.Instrument(
|
|
842
|
+
id=nmdc_instrument_id,
|
|
843
|
+
vendor=nmdc.InstrumentVendorEnum(
|
|
844
|
+
nmdc.InstrumentVendorEnum.illumina
|
|
845
|
+
),
|
|
846
|
+
model=nmdc.InstrumentModelEnum(model),
|
|
847
|
+
type="nmdc:Instrument",
|
|
848
|
+
)
|
|
849
|
+
)
|
|
850
|
+
self.illumina_instrument_mapping[model] = nmdc_instrument_id
|
|
851
|
+
nucleotide_sequencing_slots["instrument_used"] = (
|
|
852
|
+
self.illumina_instrument_mapping[model]
|
|
853
|
+
)
|
|
854
|
+
# Process the remaining columns according to the NucleotideSequencing class
|
|
855
|
+
# definition
|
|
856
|
+
nucleotide_sequencing_slots.update(
|
|
857
|
+
self._transform_dict_for_class(tab, "NucleotideSequencing")
|
|
858
|
+
)
|
|
859
|
+
nucleotide_sequencing = nmdc.NucleotideSequencing(
|
|
860
|
+
**nucleotide_sequencing_slots
|
|
861
|
+
)
|
|
862
|
+
database.data_generation_set.append(nucleotide_sequencing)
|
|
863
|
+
|
|
864
|
+
# Iterate over the columns that contain URLs and MD5 checksums and translate them
|
|
865
|
+
# into DataObject instances. Each of these DataObject instances will be connected
|
|
866
|
+
# to the NucleotideSequencing instance via the has_output/was_generated_by
|
|
867
|
+
# relationships.
|
|
868
|
+
for data_url in DATA_URL_SETS:
|
|
869
|
+
data_object_type = DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE[
|
|
870
|
+
(data_url, str(analyte_category))
|
|
871
|
+
]
|
|
872
|
+
data_objects, manifest = self._get_data_objects_from_fields(
|
|
873
|
+
tab,
|
|
874
|
+
url_field_name=data_url.url,
|
|
875
|
+
md5_checksum_field_name=data_url.md5_checksum,
|
|
876
|
+
nucleotide_sequencing_id=nucleotide_sequencing_id,
|
|
877
|
+
data_object_type=nmdc.FileTypeEnum(data_object_type),
|
|
878
|
+
)
|
|
879
|
+
if manifest:
|
|
880
|
+
database.manifest_set.append(manifest)
|
|
881
|
+
for data_object in data_objects:
|
|
882
|
+
nucleotide_sequencing.has_output.append(data_object.id)
|
|
883
|
+
database.data_object_set.append(data_object)
|
|
884
|
+
|
|
885
|
+
# This is the older way of handling attaching NucleotideSequencing and DataObject instances
|
|
886
|
+
# to the Biosample instances. This should now mainly be handled by the external sequencing
|
|
887
|
+
# data tabs in the submission portal. This code is being left in place for now in case it is
|
|
888
|
+
# needed in the future.
|
|
671
889
|
if self.nucleotide_sequencing_mapping:
|
|
672
890
|
# If there is data from a NucleotideSequencing mapping file, process it now. This part
|
|
673
891
|
# assumes that there is a column in that file with the header __biosample_samp_name
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
-
from dagster import op
|
|
4
3
|
from functools import lru_cache
|
|
5
4
|
from pymongo.database import Database as MongoDatabase
|
|
6
5
|
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
|
+
from toolz import groupby
|
|
7
7
|
|
|
8
8
|
from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
|
|
9
9
|
from nmdc_runtime.site.resources import mongo_resource
|
|
@@ -52,3 +52,10 @@ def get_basename(filename: str) -> str:
|
|
|
52
52
|
|
|
53
53
|
def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
|
|
54
54
|
return nmdc_study_id.replace(":", "_").replace("-", "_")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_instruments_by_id(mdb: MongoDatabase) -> dict[str, dict]:
|
|
58
|
+
"""Get all documents from the instrument_set collection in a dict keyed by id."""
|
|
59
|
+
return {
|
|
60
|
+
instrument["id"]: instrument for instrument in mdb["instrument_set"].find({})
|
|
61
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: nmdc_runtime
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.6.0
|
|
4
4
|
Summary: A runtime system for NMDC data management and orchestration
|
|
5
5
|
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
6
|
Author: Donny Winston
|
|
@@ -17,6 +17,7 @@ Dynamic: classifier
|
|
|
17
17
|
Dynamic: description
|
|
18
18
|
Dynamic: description-content-type
|
|
19
19
|
Dynamic: home-page
|
|
20
|
+
Dynamic: license-file
|
|
20
21
|
Dynamic: requires-python
|
|
21
22
|
Dynamic: summary
|
|
22
23
|
|
|
@@ -37,8 +38,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
|
|
|
37
38
|
* [nmdc-server](https://github.com/microbiomedata/nmdc-server)
|
|
38
39
|
houses code specific to the data portal -- its database, back-end API, and front-end application.
|
|
39
40
|
|
|
40
|
-
* [
|
|
41
|
-
references workflow code spread across several repositories, that take source data and produce computed data.
|
|
41
|
+
* Workflows — documented in the [workflows](https://docs.microbiomedata.org/workflows/) section of the NMDC documentation website — take source data and produce computed data.
|
|
42
42
|
|
|
43
43
|
* This repo (nmdc-runtime)
|
|
44
44
|
* houses code that takes source data and computed data, and transforms it
|
|
@@ -106,10 +106,10 @@ docker compose version
|
|
|
106
106
|
docker info
|
|
107
107
|
```
|
|
108
108
|
|
|
109
|
-
Ensure the permissions of
|
|
109
|
+
Ensure the permissions of `./.docker/mongoKeyFile` are such that only the file's owner can read or write the file.
|
|
110
110
|
|
|
111
111
|
```shell
|
|
112
|
-
chmod 600
|
|
112
|
+
chmod 600 ./.docker/mongoKeyFile
|
|
113
113
|
```
|
|
114
114
|
|
|
115
115
|
Ensure you have a `.env` file for the Docker services to source from. You may copy `.env.example` to
|
|
@@ -156,6 +156,9 @@ Tests can be found in `tests` and are run with the following commands:
|
|
|
156
156
|
```bash
|
|
157
157
|
make up-test
|
|
158
158
|
make test
|
|
159
|
+
|
|
160
|
+
# Run a specific test file, e.g. tests/test_api/test_endpoints.py
|
|
161
|
+
make test ARGS="tests/test_api/test_endpoints.py"
|
|
159
162
|
```
|
|
160
163
|
|
|
161
164
|
As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
|
|
@@ -164,6 +167,16 @@ desired and does not break over time.
|
|
|
164
167
|
[For hints on how to write tests for solids and pipelines in Dagster, see their documentation
|
|
165
168
|
tutorial on Testing](https://docs.dagster.io/tutorial/testable).
|
|
166
169
|
|
|
170
|
+
### RAM usage
|
|
171
|
+
|
|
172
|
+
The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
|
|
173
|
+
the `test` container shows "Error 137," here is something you can try as a workaround: In Docker Desktop, go to
|
|
174
|
+
"Settings > Resources > Advanced," and increase the memory limit. One of our team members has
|
|
175
|
+
found **12 GB** to be sufficient for running the tests.
|
|
176
|
+
|
|
177
|
+
> Dedicating 12 GB of RAM to Docker may be prohibitive for some prospective developers.
|
|
178
|
+
> There is an open [issue](https://github.com/microbiomedata/nmdc-runtime/issues/928) about the memory requirement.
|
|
179
|
+
|
|
167
180
|
## Publish to PyPI
|
|
168
181
|
|
|
169
182
|
This repository contains a GitHub Actions workflow that publishes a Python package to [PyPI](https://pypi.org/project/nmdc-runtime/).
|
|
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
|
|
|
36
36
|
nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
37
|
nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
|
|
38
38
|
nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
-
nmdc_runtime/site/graphs.py,sha256=
|
|
40
|
-
nmdc_runtime/site/ops.py,sha256=
|
|
41
|
-
nmdc_runtime/site/repository.py,sha256=
|
|
39
|
+
nmdc_runtime/site/graphs.py,sha256=DoKK6B6xkSwRcY5PVVo6jV_IA4HI5qL8xW9_n94jVfQ,15990
|
|
40
|
+
nmdc_runtime/site/ops.py,sha256=atZNkU5mzRRqTnaW39fvq7gVO2sKSH8ztVOp8_dOLbU,48048
|
|
41
|
+
nmdc_runtime/site/repository.py,sha256=nHu1skayyTjJWwGEf5eToX02cgBNTG_kdSluzJZ6rJc,43695
|
|
42
42
|
nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
|
|
43
|
-
nmdc_runtime/site/util.py,sha256
|
|
43
|
+
nmdc_runtime/site/util.py,sha256=h70UJCT9g-I63EJn0drZjv1iaQ8LHJTbG29R9kqJ04c,1821
|
|
44
44
|
nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
|
|
46
46
|
nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
|
|
@@ -51,23 +51,23 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
51
51
|
nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
|
|
52
52
|
nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
|
|
53
53
|
nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
nmdc_runtime/site/export/ncbi_xml.py,sha256=
|
|
55
|
-
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=
|
|
54
|
+
nmdc_runtime/site/export/ncbi_xml.py,sha256=4RqaT6qs1LDSiDDfF-JNZL5gOel8m65oCOelfr0blXs,26209
|
|
55
|
+
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=X35zbkxBxEyCnA9peY9YBAa_0oeoWy3DQEXoAXmc6vg,10100
|
|
56
56
|
nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
|
|
57
57
|
nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
|
|
59
59
|
nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
|
-
nmdc_runtime/site/repair/database_updater.py,sha256=
|
|
60
|
+
nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14j5rBqQWF8R7BheY,11525
|
|
61
61
|
nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
62
|
nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
|
|
63
63
|
nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
|
|
64
|
-
nmdc_runtime/site/translation/gold_translator.py,sha256=
|
|
64
|
+
nmdc_runtime/site/translation/gold_translator.py,sha256=HGbWeuxppqlVfU8F5oKTYIDoC6qaftugJeWFIALB9XE,32720
|
|
65
65
|
nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
|
|
66
|
-
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=
|
|
66
|
+
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
|
|
67
67
|
nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
|
|
68
68
|
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
|
|
69
69
|
nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
|
|
70
|
-
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=
|
|
70
|
+
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=UEeqlkz_YGqcnx8vomFysetOlXxDu23q0Ryr93SZy78,41684
|
|
71
71
|
nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
|
|
72
72
|
nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
|
|
73
73
|
nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
|
|
|
75
75
|
nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
|
|
76
76
|
nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
|
|
77
77
|
nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
|
|
78
|
-
nmdc_runtime-2.
|
|
79
|
-
nmdc_runtime-2.
|
|
80
|
-
nmdc_runtime-2.
|
|
81
|
-
nmdc_runtime-2.
|
|
82
|
-
nmdc_runtime-2.
|
|
83
|
-
nmdc_runtime-2.
|
|
78
|
+
nmdc_runtime-2.6.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
|
|
79
|
+
nmdc_runtime-2.6.0.dist-info/METADATA,sha256=RK075FB7BHmZL0fm3elHYjBV7YB4hsG-pQRHfgCUm0g,8155
|
|
80
|
+
nmdc_runtime-2.6.0.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
|
|
81
|
+
nmdc_runtime-2.6.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
|
|
82
|
+
nmdc_runtime-2.6.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
|
|
83
|
+
nmdc_runtime-2.6.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|