nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,18 +1,83 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
+
from collections import namedtuple
|
|
3
4
|
from datetime import datetime
|
|
5
|
+
from decimal import Decimal
|
|
6
|
+
from enum import Enum
|
|
4
7
|
from functools import lru_cache
|
|
5
8
|
from importlib import resources
|
|
6
|
-
from typing import Any, List, Optional, Union
|
|
9
|
+
from typing import Any, List, Optional, Union, Tuple
|
|
10
|
+
from urllib.parse import urlparse
|
|
7
11
|
|
|
8
12
|
from linkml_runtime import SchemaView
|
|
9
13
|
from linkml_runtime.linkml_model import SlotDefinition
|
|
10
14
|
from nmdc_schema import nmdc
|
|
11
|
-
from toolz import
|
|
15
|
+
from toolz import concat, dissoc, get_in, groupby, valmap
|
|
12
16
|
|
|
13
17
|
from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
|
|
14
18
|
|
|
15
19
|
|
|
20
|
+
DataUrlSet = namedtuple("DataUrlSet", ["url", "md5_checksum"])
|
|
21
|
+
|
|
22
|
+
READ_1 = DataUrlSet("read_1_url", "read_1_md5_checksum")
|
|
23
|
+
READ_2 = DataUrlSet("read_2_url", "read_2_md5_checksum")
|
|
24
|
+
INTERLEAVED = DataUrlSet("interleaved_url", "interleaved_md5_checksum")
|
|
25
|
+
|
|
26
|
+
DATA_URL_SETS: list[DataUrlSet] = [READ_1, READ_2, INTERLEAVED]
|
|
27
|
+
|
|
28
|
+
BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
|
|
29
|
+
|
|
30
|
+
TAB_NAME_KEY = "__tab_name"
|
|
31
|
+
METAGENOME = nmdc.NucleotideSequencingEnum(nmdc.NucleotideSequencingEnum.metagenome)
|
|
32
|
+
METATRANSCRIPTOME = nmdc.NucleotideSequencingEnum(
|
|
33
|
+
nmdc.NucleotideSequencingEnum.metatranscriptome
|
|
34
|
+
)
|
|
35
|
+
TAB_NAME_TO_ANALYTE_CATEGORY: dict[str, nmdc.NucleotideSequencingEnum] = {
|
|
36
|
+
"metagenome_sequencing_non_interleaved_data": METAGENOME,
|
|
37
|
+
"metagenome_sequencing_interleaved_data": METAGENOME,
|
|
38
|
+
"metatranscriptome_sequencing_non_interleaved_data": METATRANSCRIPTOME,
|
|
39
|
+
"metatranscriptome_sequencing_interleaved_data": METATRANSCRIPTOME,
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str] = {
|
|
43
|
+
(READ_1, str(METAGENOME)): "Metagenome Raw Read 1",
|
|
44
|
+
(READ_2, str(METAGENOME)): "Metagenome Raw Read 2",
|
|
45
|
+
(INTERLEAVED, str(METAGENOME)): "Metagenome Raw Reads",
|
|
46
|
+
(READ_1, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 1",
|
|
47
|
+
(READ_2, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 2",
|
|
48
|
+
(INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
UNIT_OVERRIDES: dict[str, dict[str, str]] = {
|
|
52
|
+
"Biosample": {
|
|
53
|
+
"depth": "m",
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class EnvironmentPackage(Enum):
|
|
59
|
+
r"""
|
|
60
|
+
Enumeration of all possible environmental packages.
|
|
61
|
+
|
|
62
|
+
>>> EnvironmentPackage.AIR.value
|
|
63
|
+
'air'
|
|
64
|
+
>>> EnvironmentPackage.SEDIMENT.value
|
|
65
|
+
'sediment'
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
AIR = "air"
|
|
69
|
+
BIOFILM = "microbial mat_biofilm"
|
|
70
|
+
BUILT_ENV = "built environment"
|
|
71
|
+
HCR_CORES = "hydrocarbon resources-cores"
|
|
72
|
+
HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"
|
|
73
|
+
HOST_ASSOCIATED = "host-associated"
|
|
74
|
+
MISC_ENVS = "miscellaneous natural or artificial environment"
|
|
75
|
+
PLANT_ASSOCIATED = "plant-associated"
|
|
76
|
+
SEDIMENT = "sediment"
|
|
77
|
+
SOIL = "soil"
|
|
78
|
+
WATER = "water"
|
|
79
|
+
|
|
80
|
+
|
|
16
81
|
@lru_cache
|
|
17
82
|
def _get_schema_view():
|
|
18
83
|
"""Return a SchemaView instance representing the NMDC schema"""
|
|
@@ -49,6 +114,18 @@ def group_dicts_by_key(key: str, seq: Optional[list[dict]]) -> Optional[dict]:
|
|
|
49
114
|
return grouped
|
|
50
115
|
|
|
51
116
|
|
|
117
|
+
def split_strip(string: str | None, sep: str) -> list[str] | None:
|
|
118
|
+
"""Split a string by a separator and strip whitespace from each part.
|
|
119
|
+
|
|
120
|
+
:param string: string to split
|
|
121
|
+
:param sep: separator to split by
|
|
122
|
+
:return: list of stripped strings
|
|
123
|
+
"""
|
|
124
|
+
if string is None:
|
|
125
|
+
return None
|
|
126
|
+
return [s.strip() for s in string.split(sep)]
|
|
127
|
+
|
|
128
|
+
|
|
52
129
|
class SubmissionPortalTranslator(Translator):
|
|
53
130
|
"""A Translator subclass for handling submission portal entries
|
|
54
131
|
|
|
@@ -60,17 +137,16 @@ class SubmissionPortalTranslator(Translator):
|
|
|
60
137
|
|
|
61
138
|
def __init__(
|
|
62
139
|
self,
|
|
63
|
-
metadata_submission: JSON_OBJECT =
|
|
64
|
-
omics_processing_mapping: Optional[list] = None,
|
|
65
|
-
data_object_mapping: Optional[list] = None,
|
|
140
|
+
metadata_submission: Optional[JSON_OBJECT] = None,
|
|
66
141
|
*args,
|
|
142
|
+
nucleotide_sequencing_mapping: Optional[list] = None,
|
|
143
|
+
data_object_mapping: Optional[list] = None,
|
|
144
|
+
illumina_instrument_mapping: Optional[dict[str, str]] = None,
|
|
67
145
|
# Additional study-level metadata not captured by the submission portal currently
|
|
68
146
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
69
|
-
study_doi_category: Optional[str] = None,
|
|
70
|
-
study_doi_provider: Optional[str] = None,
|
|
71
147
|
study_category: Optional[str] = None,
|
|
72
148
|
study_pi_image_url: Optional[str] = None,
|
|
73
|
-
|
|
149
|
+
study_id: Optional[str] = None,
|
|
74
150
|
# Additional biosample-level metadata with optional column mapping information not captured
|
|
75
151
|
# by the submission portal currently.
|
|
76
152
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
@@ -80,30 +156,34 @@ class SubmissionPortalTranslator(Translator):
|
|
|
80
156
|
) -> None:
|
|
81
157
|
super().__init__(*args, **kwargs)
|
|
82
158
|
|
|
83
|
-
self.metadata_submission = metadata_submission
|
|
84
|
-
self.
|
|
159
|
+
self.metadata_submission: JSON_OBJECT = metadata_submission or {}
|
|
160
|
+
self.nucleotide_sequencing_mapping = nucleotide_sequencing_mapping
|
|
85
161
|
self.data_object_mapping = data_object_mapping
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
nmdc.DoiCategoryEnum(study_doi_category)
|
|
89
|
-
if study_doi_category
|
|
90
|
-
else nmdc.DoiCategoryEnum.dataset_doi
|
|
91
|
-
)
|
|
92
|
-
self.study_doi_provider = (
|
|
93
|
-
nmdc.DoiProviderEnum(study_doi_provider) if study_doi_provider else None
|
|
162
|
+
self.illumina_instrument_mapping: dict[str, str] = (
|
|
163
|
+
illumina_instrument_mapping or {}
|
|
94
164
|
)
|
|
165
|
+
|
|
95
166
|
self.study_category = (
|
|
96
167
|
nmdc.StudyCategoryEnum(study_category) if study_category else None
|
|
97
168
|
)
|
|
98
169
|
self.study_pi_image_url = study_pi_image_url
|
|
99
|
-
self.
|
|
170
|
+
self.study_id = study_id
|
|
100
171
|
|
|
101
|
-
self.biosample_extras = group_dicts_by_key(
|
|
172
|
+
self.biosample_extras = group_dicts_by_key(
|
|
173
|
+
BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
|
|
174
|
+
)
|
|
102
175
|
self.biosample_extras_slot_mapping = group_dicts_by_key(
|
|
103
176
|
"subject_id", biosample_extras_slot_mapping
|
|
104
177
|
)
|
|
105
178
|
|
|
106
179
|
self.schema_view: SchemaView = _get_schema_view()
|
|
180
|
+
self._material_processing_subclass_names = []
|
|
181
|
+
for class_name in self.schema_view.class_descendants(
|
|
182
|
+
"MaterialProcessing", reflexive=False
|
|
183
|
+
):
|
|
184
|
+
class_def = self.schema_view.get_class(class_name)
|
|
185
|
+
if not class_def.abstract:
|
|
186
|
+
self._material_processing_subclass_names.append(class_name)
|
|
107
187
|
|
|
108
188
|
def _get_pi(
|
|
109
189
|
self, metadata_submission: JSON_OBJECT
|
|
@@ -122,29 +202,9 @@ class SubmissionPortalTranslator(Translator):
|
|
|
122
202
|
email=study_form.get("piEmail"),
|
|
123
203
|
orcid=study_form.get("piOrcid"),
|
|
124
204
|
profile_image_url=self.study_pi_image_url,
|
|
205
|
+
type=nmdc.PersonValue.class_class_curie,
|
|
125
206
|
)
|
|
126
207
|
|
|
127
|
-
def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[nmdc.Doi], None]:
|
|
128
|
-
"""Get DOI information from the context form data
|
|
129
|
-
|
|
130
|
-
:param metadata_submission: submission portal entry
|
|
131
|
-
:return: list of strings or None
|
|
132
|
-
"""
|
|
133
|
-
dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
|
|
134
|
-
if not dataset_doi:
|
|
135
|
-
return None
|
|
136
|
-
|
|
137
|
-
if not dataset_doi.startswith("doi:"):
|
|
138
|
-
dataset_doi = f"doi:{dataset_doi}"
|
|
139
|
-
|
|
140
|
-
return [
|
|
141
|
-
nmdc.Doi(
|
|
142
|
-
doi_value=dataset_doi,
|
|
143
|
-
doi_provider=self.study_doi_provider,
|
|
144
|
-
doi_category=self.study_doi_category,
|
|
145
|
-
)
|
|
146
|
-
]
|
|
147
|
-
|
|
148
208
|
def _get_has_credit_associations(
|
|
149
209
|
self, metadata_submission: JSON_OBJECT
|
|
150
210
|
) -> Union[List[nmdc.CreditAssociation], None]:
|
|
@@ -162,8 +222,10 @@ class SubmissionPortalTranslator(Translator):
|
|
|
162
222
|
applies_to_person=nmdc.PersonValue(
|
|
163
223
|
name=contributor.get("name"),
|
|
164
224
|
orcid=contributor.get("orcid"),
|
|
225
|
+
type="nmdc:PersonValue",
|
|
165
226
|
),
|
|
166
227
|
applied_roles=contributor.get("roles"),
|
|
228
|
+
type="nmdc:CreditAssociation",
|
|
167
229
|
)
|
|
168
230
|
for contributor in contributors
|
|
169
231
|
]
|
|
@@ -171,72 +233,92 @@ class SubmissionPortalTranslator(Translator):
|
|
|
171
233
|
def _get_gold_study_identifiers(
|
|
172
234
|
self, metadata_submission: JSON_OBJECT
|
|
173
235
|
) -> Union[List[str], None]:
|
|
174
|
-
"""Construct a GOLD CURIE from the
|
|
236
|
+
"""Construct a GOLD CURIE from the study form data
|
|
175
237
|
|
|
176
238
|
:param metadata_submission: submission portal entry
|
|
177
239
|
:return: GOLD CURIE
|
|
178
240
|
"""
|
|
179
|
-
gold_study_id = get_in(["
|
|
241
|
+
gold_study_id = get_in(["studyForm", "GOLDStudyId"], metadata_submission)
|
|
180
242
|
if not gold_study_id:
|
|
181
243
|
return None
|
|
182
244
|
|
|
183
|
-
return [self.
|
|
245
|
+
return [self._ensure_curie(gold_study_id, default_prefix="gold")]
|
|
184
246
|
|
|
185
|
-
def
|
|
186
|
-
self,
|
|
187
|
-
) -> Union[
|
|
188
|
-
"""Construct a
|
|
247
|
+
def _get_ncbi_bioproject_identifiers(
|
|
248
|
+
self, metadata_submission: JSON_OBJECT
|
|
249
|
+
) -> Union[List[str], None]:
|
|
250
|
+
"""Construct a NCBI Bioproject CURIE from the study form data"""
|
|
189
251
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
196
|
-
If the pattern is not matched at all None is returned.
|
|
252
|
+
ncbi_bioproject_id = get_in(
|
|
253
|
+
["studyForm", "NCBIBioProjectId"], metadata_submission
|
|
254
|
+
)
|
|
255
|
+
if not ncbi_bioproject_id:
|
|
256
|
+
return None
|
|
197
257
|
|
|
198
|
-
|
|
199
|
-
to be stricter about what we accept or coerce into a controlled value set
|
|
258
|
+
return [self._ensure_curie(ncbi_bioproject_id, default_prefix="bioproject")]
|
|
200
259
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
260
|
+
def _get_jgi_study_identifiers(
|
|
261
|
+
self, metadata_submission: JSON_OBJECT
|
|
262
|
+
) -> Union[List[str], None]:
|
|
263
|
+
"""Construct a JGI proposal CURIE from the multiomics form data
|
|
264
|
+
|
|
265
|
+
:param metadata_submission: submission portal entry
|
|
266
|
+
:return: JGI proposal CURIE
|
|
204
267
|
"""
|
|
205
|
-
|
|
268
|
+
jgi_study_id = get_in(["multiOmicsForm", "JGIStudyId"], metadata_submission)
|
|
269
|
+
if not jgi_study_id:
|
|
206
270
|
return None
|
|
207
271
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
272
|
+
return [self._ensure_curie(jgi_study_id, default_prefix="jgi.proposal")]
|
|
273
|
+
|
|
274
|
+
def _get_emsl_project_identifiers(
|
|
275
|
+
self, metadata_submission: JSON_OBJECT
|
|
276
|
+
) -> Union[List[str], None]:
|
|
277
|
+
"""Construct an EMSL project CURIE from the multiomics form data
|
|
278
|
+
|
|
279
|
+
:param metadata_submission: submission portal entry
|
|
280
|
+
:return: EMSL project CURIE
|
|
281
|
+
"""
|
|
282
|
+
emsl_project_id = get_in(["multiOmicsForm", "studyNumber"], metadata_submission)
|
|
283
|
+
if not emsl_project_id:
|
|
213
284
|
return None
|
|
214
285
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
286
|
+
return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
|
|
287
|
+
|
|
288
|
+
def _get_quantity_value(
|
|
289
|
+
self,
|
|
290
|
+
raw_value: Optional[str | int | float],
|
|
291
|
+
slot_definition: SlotDefinition,
|
|
292
|
+
unit: Optional[str] = None,
|
|
293
|
+
) -> Union[nmdc.QuantityValue, None]:
|
|
294
|
+
"""Construct a nmdc:QuantityValue from a raw value string"""
|
|
295
|
+
|
|
296
|
+
# If the storage_units annotation is present on the slot and it only contains one unit (i.e.
|
|
297
|
+
# not a pipe-separated list of units) then use that unit.
|
|
298
|
+
if "storage_units" in slot_definition.annotations:
|
|
299
|
+
storage_units = slot_definition.annotations["storage_units"].value
|
|
300
|
+
if storage_units and "|" not in storage_units:
|
|
301
|
+
unit = storage_units
|
|
302
|
+
|
|
303
|
+
# If the raw_value is numeric, directly construct a QuantityValue with the inferred unit.
|
|
304
|
+
if isinstance(raw_value, (int, float)):
|
|
305
|
+
if unit is None:
|
|
306
|
+
raise ValueError(
|
|
307
|
+
f"While processing value for slot {slot_definition.name}, a numeric value was provided but no unit could be inferred."
|
|
308
|
+
)
|
|
309
|
+
# Constructing a Decimal directly from a float will maintain the full precision of the
|
|
310
|
+
# float (i.e. numbers like 0.5 cannot be represented exactly). Converting the float to
|
|
311
|
+
# a string first and then constructing the Decimal from that string will give a more
|
|
312
|
+
# expected value.
|
|
313
|
+
value_as_str = str(raw_value)
|
|
314
|
+
return nmdc.QuantityValue(
|
|
315
|
+
has_raw_value=value_as_str,
|
|
316
|
+
has_numeric_value=Decimal(value_as_str),
|
|
317
|
+
has_unit=unit,
|
|
318
|
+
type="nmdc:QuantityValue",
|
|
319
|
+
)
|
|
320
|
+
|
|
321
|
+
return self._parse_quantity_value(raw_value, unit)
|
|
240
322
|
|
|
241
323
|
def _get_ontology_class(
|
|
242
324
|
self, raw_value: Optional[str]
|
|
@@ -259,6 +341,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
259
341
|
return nmdc.OntologyClass(
|
|
260
342
|
name=match.group(1).strip(),
|
|
261
343
|
id=match.group(2).strip(),
|
|
344
|
+
type="nmdc:OntologyClass",
|
|
262
345
|
)
|
|
263
346
|
|
|
264
347
|
def _get_controlled_identified_term_value(
|
|
@@ -280,7 +363,9 @@ class SubmissionPortalTranslator(Translator):
|
|
|
280
363
|
return None
|
|
281
364
|
|
|
282
365
|
return nmdc.ControlledIdentifiedTermValue(
|
|
283
|
-
has_raw_value=raw_value,
|
|
366
|
+
has_raw_value=raw_value,
|
|
367
|
+
term=ontology_class,
|
|
368
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
284
369
|
)
|
|
285
370
|
|
|
286
371
|
def _get_controlled_term_value(
|
|
@@ -297,7 +382,10 @@ class SubmissionPortalTranslator(Translator):
|
|
|
297
382
|
if not raw_value:
|
|
298
383
|
return None
|
|
299
384
|
|
|
300
|
-
value = nmdc.ControlledTermValue(
|
|
385
|
+
value = nmdc.ControlledTermValue(
|
|
386
|
+
has_raw_value=raw_value,
|
|
387
|
+
type="nmdc:ControlledTermValue",
|
|
388
|
+
)
|
|
301
389
|
ontology_class = self._get_ontology_class(raw_value)
|
|
302
390
|
if ontology_class is not None:
|
|
303
391
|
value.term = ontology_class
|
|
@@ -327,7 +415,10 @@ class SubmissionPortalTranslator(Translator):
|
|
|
327
415
|
return None
|
|
328
416
|
|
|
329
417
|
return nmdc.GeolocationValue(
|
|
330
|
-
has_raw_value=raw_value,
|
|
418
|
+
has_raw_value=raw_value,
|
|
419
|
+
latitude=match.group(1),
|
|
420
|
+
longitude=match.group(2),
|
|
421
|
+
type="nmdc:GeolocationValue",
|
|
331
422
|
)
|
|
332
423
|
|
|
333
424
|
def _get_float(self, raw_value: Optional[str]) -> Union[float, None]:
|
|
@@ -376,6 +467,127 @@ class SubmissionPortalTranslator(Translator):
|
|
|
376
467
|
|
|
377
468
|
return value
|
|
378
469
|
|
|
470
|
+
def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
|
|
471
|
+
"""Collect and format DOIs from submission portal schema in nmdc format DOIs
|
|
472
|
+
|
|
473
|
+
If there were no DOIs, None is returned.
|
|
474
|
+
|
|
475
|
+
:param metadata_submission: submission portal entry
|
|
476
|
+
:return: list of nmdc.DOI objects
|
|
477
|
+
"""
|
|
478
|
+
data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
|
|
479
|
+
award_dois = self._get_from(
|
|
480
|
+
metadata_submission, ["multiOmicsForm", "awardDois"]
|
|
481
|
+
)
|
|
482
|
+
if data_dois and len(data_dois) > 0:
|
|
483
|
+
updated_data_dois = [
|
|
484
|
+
nmdc.Doi(
|
|
485
|
+
doi_category="dataset_doi",
|
|
486
|
+
doi_provider=doi["provider"],
|
|
487
|
+
doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
|
|
488
|
+
type="nmdc:Doi",
|
|
489
|
+
)
|
|
490
|
+
for doi in data_dois
|
|
491
|
+
]
|
|
492
|
+
else:
|
|
493
|
+
updated_data_dois = []
|
|
494
|
+
|
|
495
|
+
if award_dois and len(award_dois) > 0:
|
|
496
|
+
updated_award_dois = [
|
|
497
|
+
nmdc.Doi(
|
|
498
|
+
doi_category="award_doi",
|
|
499
|
+
doi_provider=doi["provider"],
|
|
500
|
+
doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
|
|
501
|
+
type="nmdc:Doi",
|
|
502
|
+
)
|
|
503
|
+
for doi in award_dois
|
|
504
|
+
]
|
|
505
|
+
else:
|
|
506
|
+
updated_award_dois = []
|
|
507
|
+
|
|
508
|
+
return_val = updated_data_dois + updated_award_dois
|
|
509
|
+
if len(return_val) == 0:
|
|
510
|
+
return_val = None
|
|
511
|
+
|
|
512
|
+
return return_val
|
|
513
|
+
|
|
514
|
+
def _get_data_objects_from_fields(
|
|
515
|
+
self,
|
|
516
|
+
sample_data: JSON_OBJECT,
|
|
517
|
+
*,
|
|
518
|
+
url_field_name: str,
|
|
519
|
+
md5_checksum_field_name: str,
|
|
520
|
+
nucleotide_sequencing_id: str,
|
|
521
|
+
data_object_type: nmdc.FileTypeEnum,
|
|
522
|
+
) -> Tuple[List[nmdc.DataObject], nmdc.Manifest | None]:
|
|
523
|
+
"""Get a DataObject instances based on the URLs and MD5 checksums in the given fields.
|
|
524
|
+
|
|
525
|
+
If the field provides multiple URLs, multiple DataObject instances will be created and a
|
|
526
|
+
Manifest will be created and provided in the second return value.
|
|
527
|
+
|
|
528
|
+
:param sample_data: sample data
|
|
529
|
+
:param url_field_name: field name for the URL
|
|
530
|
+
:param md5_checksum_field_name: field name for the MD5 checksum
|
|
531
|
+
:param nucleotide_sequencing_id: ID for the nmdc:NucleotideSequencing object that generated the data object(s)
|
|
532
|
+
:param data_object_type: FileTypeEnum representing the type of the data object
|
|
533
|
+
:return: nmdc.DataObject or None
|
|
534
|
+
"""
|
|
535
|
+
data_objects: List[nmdc.DataObject] = []
|
|
536
|
+
urls = split_strip(sample_data.get(url_field_name), ";")
|
|
537
|
+
if not urls:
|
|
538
|
+
return data_objects, None
|
|
539
|
+
|
|
540
|
+
md5_checksums = split_strip(sample_data.get(md5_checksum_field_name), ";")
|
|
541
|
+
if md5_checksums and len(urls) != len(md5_checksums):
|
|
542
|
+
raise ValueError(
|
|
543
|
+
f"{url_field_name} and {md5_checksum_field_name} must have the same number of values"
|
|
544
|
+
)
|
|
545
|
+
|
|
546
|
+
data_object_ids = self._id_minter("nmdc:DataObject", len(urls))
|
|
547
|
+
manifest: nmdc.Manifest | None = None
|
|
548
|
+
if len(urls) > 1:
|
|
549
|
+
manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
550
|
+
manifest = nmdc.Manifest(
|
|
551
|
+
id=manifest_id,
|
|
552
|
+
manifest_category=nmdc.ManifestCategoryEnum(
|
|
553
|
+
nmdc.ManifestCategoryEnum.poolable_replicates
|
|
554
|
+
),
|
|
555
|
+
type="nmdc:Manifest",
|
|
556
|
+
)
|
|
557
|
+
|
|
558
|
+
for i, url in enumerate(urls):
|
|
559
|
+
data_object_id = data_object_ids[i]
|
|
560
|
+
parsed_url = urlparse(url)
|
|
561
|
+
possible_filename = parsed_url.path.rsplit("/", 1)[-1]
|
|
562
|
+
data_object_slots = {
|
|
563
|
+
"id": data_object_id,
|
|
564
|
+
"name": possible_filename,
|
|
565
|
+
"description": f"{data_object_type} for {nucleotide_sequencing_id}",
|
|
566
|
+
"type": "nmdc:DataObject",
|
|
567
|
+
"url": url,
|
|
568
|
+
"md5_checksum": md5_checksums[i] if md5_checksums else None,
|
|
569
|
+
"in_manifest": [manifest.id] if manifest else None,
|
|
570
|
+
"data_category": nmdc.DataCategoryEnum(
|
|
571
|
+
nmdc.DataCategoryEnum.instrument_data
|
|
572
|
+
),
|
|
573
|
+
"data_object_type": data_object_type,
|
|
574
|
+
"was_generated_by": nucleotide_sequencing_id,
|
|
575
|
+
}
|
|
576
|
+
data_object_slots.update(
|
|
577
|
+
self._transform_dict_for_class(sample_data, "DataObject")
|
|
578
|
+
)
|
|
579
|
+
data_objects.append(nmdc.DataObject(**data_object_slots))
|
|
580
|
+
|
|
581
|
+
return data_objects, manifest
|
|
582
|
+
|
|
583
|
+
def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
|
|
584
|
+
"""Parse a sample link in the form of `ProcessingName:SampleName,..."""
|
|
585
|
+
pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
|
|
586
|
+
match = re.match(pattern, sample_link)
|
|
587
|
+
if not match:
|
|
588
|
+
return None
|
|
589
|
+
return match.group(1), split_strip(match.group(2), ",")
|
|
590
|
+
|
|
379
591
|
def _translate_study(
|
|
380
592
|
self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
|
|
381
593
|
) -> nmdc.Study:
|
|
@@ -389,20 +601,18 @@ class SubmissionPortalTranslator(Translator):
|
|
|
389
601
|
:return: nmdc:Study object
|
|
390
602
|
"""
|
|
391
603
|
return nmdc.Study(
|
|
392
|
-
alternative_identifiers=self._get_from(
|
|
393
|
-
metadata_submission, ["multiOmicsForm", "JGIStudyId"]
|
|
394
|
-
),
|
|
395
604
|
alternative_names=self._get_from(
|
|
396
|
-
metadata_submission, ["
|
|
605
|
+
metadata_submission, ["studyForm", "alternativeNames"]
|
|
397
606
|
),
|
|
398
|
-
associated_dois=self._get_doi(metadata_submission),
|
|
399
607
|
description=self._get_from(
|
|
400
608
|
metadata_submission, ["studyForm", "description"]
|
|
401
609
|
),
|
|
402
|
-
funding_sources=self.
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
610
|
+
funding_sources=self._get_from(
|
|
611
|
+
metadata_submission, ["studyForm", "fundingSources"]
|
|
612
|
+
),
|
|
613
|
+
emsl_project_identifiers=self._get_emsl_project_identifiers(
|
|
614
|
+
metadata_submission
|
|
615
|
+
),
|
|
406
616
|
gold_study_identifiers=self._get_gold_study_identifiers(
|
|
407
617
|
metadata_submission
|
|
408
618
|
),
|
|
@@ -410,17 +620,22 @@ class SubmissionPortalTranslator(Translator):
|
|
|
410
620
|
metadata_submission
|
|
411
621
|
),
|
|
412
622
|
id=nmdc_study_id,
|
|
413
|
-
insdc_bioproject_identifiers=self.
|
|
414
|
-
metadata_submission
|
|
623
|
+
insdc_bioproject_identifiers=self._get_ncbi_bioproject_identifiers(
|
|
624
|
+
metadata_submission
|
|
625
|
+
),
|
|
626
|
+
jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
|
|
627
|
+
metadata_submission
|
|
415
628
|
),
|
|
416
629
|
name=self._get_from(metadata_submission, ["studyForm", "studyName"]),
|
|
417
630
|
notes=self._get_from(metadata_submission, ["studyForm", "notes"]),
|
|
418
631
|
principal_investigator=self._get_pi(metadata_submission),
|
|
419
632
|
study_category=self.study_category,
|
|
420
633
|
title=self._get_from(metadata_submission, ["studyForm", "studyName"]),
|
|
634
|
+
type="nmdc:Study",
|
|
421
635
|
websites=self._get_from(
|
|
422
636
|
metadata_submission, ["studyForm", "linkOutWebpage"]
|
|
423
637
|
),
|
|
638
|
+
associated_dois=self._get_study_dois(metadata_submission),
|
|
424
639
|
)
|
|
425
640
|
|
|
426
641
|
def _transform_value_for_slot(
|
|
@@ -428,15 +643,25 @@ class SubmissionPortalTranslator(Translator):
|
|
|
428
643
|
):
|
|
429
644
|
transformed_value = None
|
|
430
645
|
if slot.range == "TextValue":
|
|
431
|
-
transformed_value = nmdc.TextValue(
|
|
646
|
+
transformed_value = nmdc.TextValue(
|
|
647
|
+
has_raw_value=value,
|
|
648
|
+
type="nmdc:TextValue",
|
|
649
|
+
)
|
|
432
650
|
elif slot.range == "QuantityValue":
|
|
433
|
-
transformed_value = self._get_quantity_value(
|
|
651
|
+
transformed_value = self._get_quantity_value(
|
|
652
|
+
value,
|
|
653
|
+
slot,
|
|
654
|
+
unit=unit,
|
|
655
|
+
)
|
|
434
656
|
elif slot.range == "ControlledIdentifiedTermValue":
|
|
435
657
|
transformed_value = self._get_controlled_identified_term_value(value)
|
|
436
658
|
elif slot.range == "ControlledTermValue":
|
|
437
659
|
transformed_value = self._get_controlled_term_value(value)
|
|
438
660
|
elif slot.range == "TimestampValue":
|
|
439
|
-
transformed_value = nmdc.TimestampValue(
|
|
661
|
+
transformed_value = nmdc.TimestampValue(
|
|
662
|
+
has_raw_value=value,
|
|
663
|
+
type="nmdc:TimestampValue",
|
|
664
|
+
)
|
|
440
665
|
elif slot.range == "GeolocationValue":
|
|
441
666
|
transformed_value = self._get_geolocation_value(value)
|
|
442
667
|
elif slot.range == "float":
|
|
@@ -481,11 +706,22 @@ class SubmissionPortalTranslator(Translator):
|
|
|
481
706
|
logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
|
|
482
707
|
continue
|
|
483
708
|
|
|
709
|
+
# This step handles cases where the submission portal/schema instructs a user to
|
|
710
|
+
# provide a value in a specific unit. The unit cannot be parsed out of the raw value
|
|
711
|
+
# in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
|
|
712
|
+
# go away once units are encoded in the schema itself.
|
|
713
|
+
# See: https://github.com/microbiomedata/nmdc-schema/issues/2517
|
|
714
|
+
if class_name in UNIT_OVERRIDES:
|
|
715
|
+
# If the class has unit overrides, check if the slot is in the overrides
|
|
716
|
+
unit_overrides = UNIT_OVERRIDES[class_name]
|
|
717
|
+
if slot_name in unit_overrides:
|
|
718
|
+
unit = unit_overrides[slot_name]
|
|
719
|
+
|
|
484
720
|
slot_definition = self.schema_view.induced_slot(slot_name, class_name)
|
|
485
721
|
if slot_definition.multivalued:
|
|
486
722
|
value_list = value
|
|
487
723
|
if isinstance(value, str):
|
|
488
|
-
value_list =
|
|
724
|
+
value_list = split_strip(value, "|")
|
|
489
725
|
transformed_value = [
|
|
490
726
|
self._transform_value_for_slot(item, slot_definition, unit)
|
|
491
727
|
for item in value_list
|
|
@@ -503,7 +739,6 @@ class SubmissionPortalTranslator(Translator):
|
|
|
503
739
|
sample_data: List[JSON_OBJECT],
|
|
504
740
|
nmdc_biosample_id: str,
|
|
505
741
|
nmdc_study_id: str,
|
|
506
|
-
default_env_package: str,
|
|
507
742
|
) -> nmdc.Biosample:
|
|
508
743
|
"""Translate sample data from portal submission into an `nmdc:Biosample` object.
|
|
509
744
|
|
|
@@ -518,22 +753,30 @@ class SubmissionPortalTranslator(Translator):
|
|
|
518
753
|
from each applicable submission portal tab
|
|
519
754
|
:param nmdc_biosample_id: Minted nmdc:Biosample identifier for the translated object
|
|
520
755
|
:param nmdc_study_id: Minted nmdc:Study identifier for the related Study
|
|
521
|
-
:param default_env_package: Default value for `env_package` slot
|
|
522
756
|
:return: nmdc:Biosample
|
|
523
757
|
"""
|
|
524
|
-
|
|
758
|
+
env_idx = next(
|
|
759
|
+
(
|
|
760
|
+
i
|
|
761
|
+
for i, tab in enumerate(sample_data)
|
|
762
|
+
if tab.get("env_package") is not None
|
|
763
|
+
),
|
|
764
|
+
0,
|
|
765
|
+
)
|
|
766
|
+
biosample_key = sample_data[env_idx].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
|
|
525
767
|
slots = {
|
|
526
768
|
"id": nmdc_biosample_id,
|
|
527
|
-
"
|
|
528
|
-
"
|
|
529
|
-
"
|
|
769
|
+
"associated_studies": [nmdc_study_id],
|
|
770
|
+
"type": "nmdc:Biosample",
|
|
771
|
+
"name": sample_data[env_idx].get("samp_name", "").strip(),
|
|
772
|
+
"env_package": sample_data[env_idx].get("env_package"),
|
|
530
773
|
}
|
|
531
774
|
for tab in sample_data:
|
|
532
775
|
transformed_tab = self._transform_dict_for_class(tab, "Biosample")
|
|
533
776
|
slots.update(transformed_tab)
|
|
534
777
|
|
|
535
778
|
if self.biosample_extras:
|
|
536
|
-
raw_extras = self.biosample_extras.get(
|
|
779
|
+
raw_extras = self.biosample_extras.get(biosample_key)
|
|
537
780
|
if raw_extras:
|
|
538
781
|
transformed_extras = self._transform_dict_for_class(
|
|
539
782
|
raw_extras, "Biosample", self.biosample_extras_slot_mapping
|
|
@@ -552,47 +795,217 @@ class SubmissionPortalTranslator(Translator):
|
|
|
552
795
|
:return: nmdc:Database object
|
|
553
796
|
"""
|
|
554
797
|
database = nmdc.Database()
|
|
555
|
-
|
|
556
|
-
nmdc_study_id = self._id_minter("nmdc:Study")[0]
|
|
557
|
-
|
|
558
798
|
metadata_submission_data = self.metadata_submission.get(
|
|
559
799
|
"metadata_submission", {}
|
|
560
800
|
)
|
|
561
|
-
database.study_set = [
|
|
562
|
-
self._translate_study(metadata_submission_data, nmdc_study_id)
|
|
563
|
-
]
|
|
564
801
|
|
|
802
|
+
# Generate one Study instance based on the metadata submission, if a study_id wasn't provided
|
|
803
|
+
if self.study_id:
|
|
804
|
+
nmdc_study_id = self.study_id
|
|
805
|
+
else:
|
|
806
|
+
nmdc_study_id = self._id_minter("nmdc:Study")[0]
|
|
807
|
+
database.study_set = [
|
|
808
|
+
self._translate_study(metadata_submission_data, nmdc_study_id)
|
|
809
|
+
]
|
|
810
|
+
|
|
811
|
+
# Automatically populate the `env_package` field in the sample data based on which
|
|
812
|
+
# environmental data tab the sample data came from.
|
|
565
813
|
sample_data = metadata_submission_data.get("sampleData", {})
|
|
566
|
-
|
|
567
|
-
|
|
814
|
+
for key in sample_data.keys():
|
|
815
|
+
env = key.removesuffix("_data").upper()
|
|
816
|
+
try:
|
|
817
|
+
package_name = EnvironmentPackage[env].value
|
|
818
|
+
for sample in sample_data[key]:
|
|
819
|
+
sample["env_package"] = package_name
|
|
820
|
+
except KeyError:
|
|
821
|
+
# This is expected when processing rows from tabs like the JGI/EMSL tabs or external
|
|
822
|
+
# sequencing data tabs.
|
|
823
|
+
pass
|
|
824
|
+
|
|
825
|
+
# Before regrouping the data by sample name, record which tab each object came from
|
|
826
|
+
for tab_name in sample_data.keys():
|
|
827
|
+
for tab in sample_data[tab_name]:
|
|
828
|
+
tab[TAB_NAME_KEY] = tab_name
|
|
829
|
+
|
|
830
|
+
# Reorganize the sample data by sample name and generate a unique NMDC ID for each
|
|
831
|
+
sample_data_by_id = groupby(
|
|
832
|
+
BIOSAMPLE_UNIQUE_KEY_SLOT,
|
|
833
|
+
concat(sample_data.values()),
|
|
834
|
+
)
|
|
568
835
|
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
|
|
569
836
|
sample_data_to_nmdc_biosample_ids = dict(
|
|
570
837
|
zip(sample_data_by_id.keys(), nmdc_biosample_ids)
|
|
571
838
|
)
|
|
572
839
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
)
|
|
580
|
-
for sample_data_id, sample_data in sample_data_by_id.items()
|
|
581
|
-
if sample_data
|
|
582
|
-
]
|
|
840
|
+
# Translate the sample data into nmdc:Biosample objects
|
|
841
|
+
database.biosample_set = []
|
|
842
|
+
for sample_data_id, sample_data in sample_data_by_id.items():
|
|
843
|
+
# This shouldn't happen, but just in case skip empty sample data
|
|
844
|
+
if not sample_data:
|
|
845
|
+
continue
|
|
583
846
|
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
847
|
+
# Find the first tab that has a sample_link value and attempt to parse it
|
|
848
|
+
sample_link = ""
|
|
849
|
+
for tab in sample_data:
|
|
850
|
+
if tab.get("sample_link"):
|
|
851
|
+
sample_link = tab.get("sample_link")
|
|
852
|
+
break
|
|
853
|
+
parsed_sample_link = self._parse_sample_link(sample_link)
|
|
854
|
+
|
|
855
|
+
# If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
|
|
856
|
+
# format, then create a ProcessedSample and MaterialProcessing instance instead of a
|
|
857
|
+
# Biosample instance. The input samples must be present in the submission for this to
|
|
858
|
+
# work. An exception is raised if any of the referenced input samples are missing.
|
|
859
|
+
if parsed_sample_link is not None:
|
|
860
|
+
processing_type, processing_inputs = parsed_sample_link
|
|
861
|
+
if not all(
|
|
862
|
+
input_id in sample_data_to_nmdc_biosample_ids
|
|
863
|
+
for input_id in processing_inputs
|
|
864
|
+
):
|
|
865
|
+
raise ValueError(
|
|
866
|
+
f"Could not find all input samples in sample_link '{sample_link}'"
|
|
867
|
+
)
|
|
868
|
+
processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
|
|
869
|
+
database.processed_sample_set.append(
|
|
870
|
+
nmdc.ProcessedSample(
|
|
871
|
+
id=processed_sample_id,
|
|
872
|
+
type="nmdc:ProcessedSample",
|
|
873
|
+
name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
|
|
874
|
+
)
|
|
875
|
+
)
|
|
876
|
+
|
|
877
|
+
processing_class = getattr(nmdc, processing_type)
|
|
878
|
+
material_processing = processing_class(
|
|
879
|
+
id=self._id_minter(f"nmdc:{processing_type}")[0],
|
|
880
|
+
type=f"nmdc:{processing_type}",
|
|
881
|
+
has_input=[
|
|
882
|
+
sample_data_to_nmdc_biosample_ids[input_id]
|
|
883
|
+
for input_id in processing_inputs
|
|
884
|
+
],
|
|
885
|
+
has_output=[processed_sample_id],
|
|
886
|
+
)
|
|
887
|
+
database.material_processing_set.append(material_processing)
|
|
888
|
+
|
|
889
|
+
# If there was no sample_link or it doesn't follow the expected format, create a
|
|
890
|
+
# Biosample instance as normal.
|
|
891
|
+
else:
|
|
892
|
+
biosample = self._translate_biosample(
|
|
893
|
+
sample_data,
|
|
894
|
+
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
|
|
895
|
+
nmdc_study_id=nmdc_study_id,
|
|
896
|
+
)
|
|
897
|
+
database.biosample_set.append(biosample)
|
|
898
|
+
|
|
899
|
+
# This section handles the translation of information in the external sequencing tabs into
|
|
900
|
+
# various NMDC objects.
|
|
901
|
+
database.data_generation_set = []
|
|
902
|
+
database.data_object_set = []
|
|
903
|
+
database.instrument_set = []
|
|
904
|
+
database.manifest_set = []
|
|
905
|
+
today = datetime.now().strftime("%Y-%m-%d")
|
|
906
|
+
for sample_data_id, sample_data in sample_data_by_id.items():
|
|
907
|
+
for tab in sample_data:
|
|
908
|
+
tab_name = tab.get(TAB_NAME_KEY)
|
|
909
|
+
analyte_category = TAB_NAME_TO_ANALYTE_CATEGORY.get(tab_name)
|
|
910
|
+
if not analyte_category:
|
|
911
|
+
# If the tab name cannot be mapped to an analyte category, that means we're
|
|
912
|
+
# not in an external sequencing data tab (e.g. this is an environmental data
|
|
913
|
+
# tab or a JGI/EMSL tab). Skip this tab.
|
|
914
|
+
continue
|
|
915
|
+
|
|
916
|
+
# Start by generating one NucleotideSequencing instance with a has_input
|
|
917
|
+
# relationship to the current Biosample instance.
|
|
918
|
+
nucleotide_sequencing_id = self._id_minter(
|
|
919
|
+
"nmdc:NucleotideSequencing", 1
|
|
920
|
+
)[0]
|
|
921
|
+
nucleotide_sequencing_slots = {
|
|
922
|
+
"id": nucleotide_sequencing_id,
|
|
923
|
+
"has_input": sample_data_to_nmdc_biosample_ids[sample_data_id],
|
|
924
|
+
"has_output": [],
|
|
925
|
+
"associated_studies": [nmdc_study_id],
|
|
926
|
+
"add_date": today,
|
|
927
|
+
"mod_date": today,
|
|
928
|
+
"analyte_category": analyte_category,
|
|
929
|
+
"type": "nmdc:NucleotideSequencing",
|
|
930
|
+
}
|
|
931
|
+
# If the protocol_link column was filled in, expand it into an nmdc:Protocol object
|
|
932
|
+
if "protocol_link" in tab:
|
|
933
|
+
protocol_link = tab.pop("protocol_link")
|
|
934
|
+
nucleotide_sequencing_slots["protocol_link"] = nmdc.Protocol(
|
|
935
|
+
url=protocol_link,
|
|
936
|
+
type="nmdc:Protocol",
|
|
937
|
+
)
|
|
938
|
+
# If model column was filled in, expand it into an nmdc:Instrument object. This is
|
|
939
|
+
# done by first checking the provided instrument mapping to see if the model is
|
|
940
|
+
# already present. If it is not, a new instrument object is created and added to the
|
|
941
|
+
# instrument_set. Currently, we only accept sequencing data in the submission portal
|
|
942
|
+
# that was generated by Illumina instruments, so the vendor is hardcoded here.
|
|
943
|
+
if "model" in tab:
|
|
944
|
+
model = tab.pop("model")
|
|
945
|
+
if model not in self.illumina_instrument_mapping:
|
|
946
|
+
# If the model is not already in the mapping, create a new record for it
|
|
947
|
+
nmdc_instrument_id = self._id_minter("nmdc:Instrument", 1)[0]
|
|
948
|
+
database.instrument_set.append(
|
|
949
|
+
nmdc.Instrument(
|
|
950
|
+
id=nmdc_instrument_id,
|
|
951
|
+
vendor=nmdc.InstrumentVendorEnum(
|
|
952
|
+
nmdc.InstrumentVendorEnum.illumina
|
|
953
|
+
),
|
|
954
|
+
model=nmdc.InstrumentModelEnum(model),
|
|
955
|
+
type="nmdc:Instrument",
|
|
956
|
+
)
|
|
957
|
+
)
|
|
958
|
+
self.illumina_instrument_mapping[model] = nmdc_instrument_id
|
|
959
|
+
nucleotide_sequencing_slots["instrument_used"] = (
|
|
960
|
+
self.illumina_instrument_mapping[model]
|
|
961
|
+
)
|
|
962
|
+
# Process the remaining columns according to the NucleotideSequencing class
|
|
963
|
+
# definition
|
|
964
|
+
nucleotide_sequencing_slots.update(
|
|
965
|
+
self._transform_dict_for_class(tab, "NucleotideSequencing")
|
|
966
|
+
)
|
|
967
|
+
nucleotide_sequencing = nmdc.NucleotideSequencing(
|
|
968
|
+
**nucleotide_sequencing_slots
|
|
969
|
+
)
|
|
970
|
+
database.data_generation_set.append(nucleotide_sequencing)
|
|
971
|
+
|
|
972
|
+
# Iterate over the columns that contain URLs and MD5 checksums and translate them
|
|
973
|
+
# into DataObject instances. Each of these DataObject instances will be connected
|
|
974
|
+
# to the NucleotideSequencing instance via the has_output/was_generated_by
|
|
975
|
+
# relationships.
|
|
976
|
+
for data_url in DATA_URL_SETS:
|
|
977
|
+
data_object_type = DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE[
|
|
978
|
+
(data_url, str(analyte_category))
|
|
979
|
+
]
|
|
980
|
+
data_objects, manifest = self._get_data_objects_from_fields(
|
|
981
|
+
tab,
|
|
982
|
+
url_field_name=data_url.url,
|
|
983
|
+
md5_checksum_field_name=data_url.md5_checksum,
|
|
984
|
+
nucleotide_sequencing_id=nucleotide_sequencing_id,
|
|
985
|
+
data_object_type=nmdc.FileTypeEnum(data_object_type),
|
|
986
|
+
)
|
|
987
|
+
if manifest:
|
|
988
|
+
database.manifest_set.append(manifest)
|
|
989
|
+
for data_object in data_objects:
|
|
990
|
+
nucleotide_sequencing.has_output.append(data_object.id)
|
|
991
|
+
database.data_object_set.append(data_object)
|
|
992
|
+
|
|
993
|
+
# This is the older way of handling attaching NucleotideSequencing and DataObject instances
|
|
994
|
+
# to the Biosample instances. This should now mainly be handled by the external sequencing
|
|
995
|
+
# data tabs in the submission portal. This code is being left in place for now in case it is
|
|
996
|
+
# needed in the future.
|
|
997
|
+
if self.nucleotide_sequencing_mapping:
|
|
998
|
+
# If there is data from a NucleotideSequencing mapping file, process it now. This part
|
|
999
|
+
# assumes that there is a column in that file with the header __biosample_samp_name
|
|
587
1000
|
# that can be used to join with the sample data from the submission portal. The
|
|
588
|
-
# biosample identified by that `
|
|
589
|
-
# slot of the
|
|
590
|
-
# those objects will also be generated and referenced in the `has_output` slot
|
|
591
|
-
#
|
|
592
|
-
# sample data there is an implicit 1:1 relationship between Biosample
|
|
593
|
-
#
|
|
594
|
-
join_key = "
|
|
595
|
-
database.
|
|
1001
|
+
# biosample identified by that `samp_name` will be referenced in the `has_input`
|
|
1002
|
+
# slot of the NucleotideSequencing object. If a DataObject mapping file was also
|
|
1003
|
+
# provided, those objects will also be generated and referenced in the `has_output` slot
|
|
1004
|
+
# of the NucleotideSequencing object. By keying off of the `samp_name` slot of the
|
|
1005
|
+
# submission's sample data there is an implicit 1:1 relationship between Biosample
|
|
1006
|
+
# objects and NucleotideSequencing objects generated here.
|
|
1007
|
+
join_key = f"__biosample_{BIOSAMPLE_UNIQUE_KEY_SLOT}"
|
|
1008
|
+
database.data_generation_set = []
|
|
596
1009
|
database.data_object_set = []
|
|
597
1010
|
data_objects_by_sample_data_id = {}
|
|
598
1011
|
today = datetime.now().strftime("%Y-%m-%d")
|
|
@@ -608,45 +1021,47 @@ class SubmissionPortalTranslator(Translator):
|
|
|
608
1021
|
grouped,
|
|
609
1022
|
)
|
|
610
1023
|
|
|
611
|
-
for
|
|
612
|
-
# For each row in the
|
|
613
|
-
# id that corresponds to the sample ID from the submission
|
|
614
|
-
sample_data_id =
|
|
1024
|
+
for nucleotide_sequencing_row in self.nucleotide_sequencing_mapping:
|
|
1025
|
+
# For each row in the NucleotideSequencing mapping file, first grab the minted
|
|
1026
|
+
# Biosample id that corresponds to the sample ID from the submission
|
|
1027
|
+
sample_data_id = nucleotide_sequencing_row.pop(join_key)
|
|
615
1028
|
if (
|
|
616
1029
|
not sample_data_id
|
|
617
1030
|
or sample_data_id not in sample_data_to_nmdc_biosample_ids
|
|
618
1031
|
):
|
|
619
1032
|
logging.warning(
|
|
620
|
-
f"Unrecognized biosample
|
|
1033
|
+
f"Unrecognized biosample {BIOSAMPLE_UNIQUE_KEY_SLOT}: {sample_data_id}"
|
|
621
1034
|
)
|
|
622
1035
|
continue
|
|
623
1036
|
nmdc_biosample_id = sample_data_to_nmdc_biosample_ids[sample_data_id]
|
|
624
1037
|
|
|
625
|
-
# Transform the raw row data according to the
|
|
626
|
-
# generate an instance. A few key slots do not come from the mapping file, but
|
|
1038
|
+
# Transform the raw row data according to the NucleotideSequencing class's slots,
|
|
1039
|
+
# and generate an instance. A few key slots do not come from the mapping file, but
|
|
627
1040
|
# instead are defined here.
|
|
628
|
-
|
|
629
|
-
"id": self._id_minter("nmdc:
|
|
1041
|
+
nucleotide_sequencing_slots = {
|
|
1042
|
+
"id": self._id_minter("nmdc:NucleotideSequencing", 1)[0],
|
|
630
1043
|
"has_input": [nmdc_biosample_id],
|
|
631
1044
|
"has_output": [],
|
|
632
|
-
"
|
|
1045
|
+
"associated_studies": [nmdc_study_id],
|
|
633
1046
|
"add_date": today,
|
|
634
1047
|
"mod_date": today,
|
|
635
|
-
"type": "nmdc:
|
|
1048
|
+
"type": "nmdc:NucleotideSequencing",
|
|
636
1049
|
}
|
|
637
|
-
|
|
1050
|
+
nucleotide_sequencing_slots.update(
|
|
638
1051
|
self._transform_dict_for_class(
|
|
639
|
-
|
|
1052
|
+
nucleotide_sequencing_row, "NucleotideSequencing"
|
|
640
1053
|
)
|
|
641
1054
|
)
|
|
642
|
-
|
|
1055
|
+
nucleotide_sequencing = nmdc.NucleotideSequencing(
|
|
1056
|
+
**nucleotide_sequencing_slots
|
|
1057
|
+
)
|
|
643
1058
|
|
|
644
1059
|
for data_object_row in data_objects_by_sample_data_id.get(
|
|
645
1060
|
sample_data_id, []
|
|
646
1061
|
):
|
|
647
1062
|
# For each row in the DataObject mapping file that corresponds to the sample ID,
|
|
648
1063
|
# transform the raw row data according to the DataObject class's slots, generate
|
|
649
|
-
# an instance, and connect that instance's minted ID to the
|
|
1064
|
+
# an instance, and connect that instance's minted ID to the NucleotideSequencing
|
|
650
1065
|
# instance
|
|
651
1066
|
data_object_id = self._id_minter("nmdc:DataObject", 1)[0]
|
|
652
1067
|
data_object_slots = {
|
|
@@ -658,10 +1073,49 @@ class SubmissionPortalTranslator(Translator):
|
|
|
658
1073
|
)
|
|
659
1074
|
data_object = nmdc.DataObject(**data_object_slots)
|
|
660
1075
|
|
|
661
|
-
|
|
1076
|
+
nucleotide_sequencing.has_output.append(data_object_id)
|
|
662
1077
|
|
|
663
1078
|
database.data_object_set.append(data_object)
|
|
664
1079
|
|
|
665
|
-
database.
|
|
1080
|
+
database.data_generation_set.append(nucleotide_sequencing)
|
|
666
1081
|
|
|
667
1082
|
return database
|
|
1083
|
+
|
|
1084
|
+
@staticmethod
def set_study_images(
    nmdc_study: nmdc.Study,
    pi_image_url: Optional[str],
    primary_study_image_url: Optional[str],
    study_images_url: Optional[list[str]],
) -> None:
    """Attach image URLs to a study, mutating ``nmdc_study`` in place.

    :param nmdc_study: Study object to update
    :param pi_image_url: If truthy, stored as ``profile_image_url`` on the study's
        ``principal_investigator``; a new ``nmdc.PersonValue`` is created first when
        the study has no principal investigator set
    :param primary_study_image_url: If truthy, appended to ``study_image`` as an
        ``nmdc.ImageValue`` with ``display_order`` 0
    :param study_images_url: If truthy, each URL is appended to ``study_image`` as an
        ``nmdc.ImageValue`` with ``display_order`` counting up from 1
    :return: None
    """

    def _make_image(url, order):
        # Every image attached here shares the same type designator; only the URL
        # and its display position vary.
        return nmdc.ImageValue(
            type="nmdc:ImageValue",
            url=url,
            display_order=order,
        )

    # Principal investigator profile image
    if pi_image_url:
        if not nmdc_study.principal_investigator:
            placeholder_pi = nmdc.PersonValue(type="nmdc:PersonValue")
            nmdc_study.principal_investigator = placeholder_pi
        nmdc_study.principal_investigator.profile_image_url = pi_image_url

    # Primary study image always takes display position 0
    if primary_study_image_url:
        if not nmdc_study.study_image:
            nmdc_study.study_image = []
        nmdc_study.study_image.append(_make_image(primary_study_image_url, 0))

    # Remaining study images take positions 1, 2, 3, ... in the order given
    if study_images_url:
        if not nmdc_study.study_image:
            nmdc_study.study_image = []
        for position, extra_url in enumerate(study_images_url, start=1):
            nmdc_study.study_image.append(_make_image(extra_url, position))
|