nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -2,26 +2,94 @@ import collections
|
|
|
2
2
|
import re
|
|
3
3
|
from typing import List, Tuple, Union
|
|
4
4
|
from nmdc_schema import nmdc
|
|
5
|
+
import pandas as pd
|
|
5
6
|
|
|
6
7
|
from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
|
|
7
8
|
|
|
9
|
+
# Set of sequencing strategies from GOLD that we are filtering on
|
|
10
|
+
# based on the kind of samples that are required for NMDC
|
|
11
|
+
SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _is_valid_project(project: dict) -> bool:
|
|
15
|
+
"""A project is considered valid if:
|
|
16
|
+
1. `sequencingStrategy` is in {"Metagenome", "Metatranscriptome"}
|
|
17
|
+
2. if `sequencingCenters` == 'DOE Joint Genome Institute (JGI)' then
|
|
18
|
+
`projectStatus` must be in ("Permanent Draft", "Complete and Published")
|
|
19
|
+
3. otherwise, no `projectStatus` filter is applied
|
|
20
|
+
|
|
21
|
+
:param project: GOLD project object (structurally similar to response
|
|
22
|
+
from `/projects` endpoint)
|
|
23
|
+
:return: True if the project is valid, False otherwise
|
|
24
|
+
"""
|
|
25
|
+
if project.get("sequencingStrategy") not in SEQUENCING_STRATEGIES:
|
|
26
|
+
return False
|
|
27
|
+
|
|
28
|
+
if project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)":
|
|
29
|
+
return project.get("projectStatus") in (
|
|
30
|
+
"Permanent Draft",
|
|
31
|
+
"Complete and Published",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
return True
|
|
35
|
+
|
|
8
36
|
|
|
9
37
|
class GoldStudyTranslator(Translator):
|
|
10
38
|
def __init__(
|
|
11
39
|
self,
|
|
12
40
|
study: JSON_OBJECT = {},
|
|
41
|
+
study_type: str = "research_study",
|
|
13
42
|
biosamples: List[JSON_OBJECT] = [],
|
|
14
43
|
projects: List[JSON_OBJECT] = [],
|
|
15
44
|
analysis_projects: List[JSON_OBJECT] = [],
|
|
45
|
+
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
46
|
+
include_field_site_info: bool = False,
|
|
47
|
+
enable_biosample_filtering: bool = True,
|
|
16
48
|
*args,
|
|
17
49
|
**kwargs,
|
|
18
50
|
) -> None:
|
|
19
51
|
super().__init__(*args, **kwargs)
|
|
20
52
|
|
|
21
53
|
self.study = study
|
|
22
|
-
self.
|
|
23
|
-
self.
|
|
24
|
-
self.
|
|
54
|
+
self.study_type = nmdc.StudyCategoryEnum(study_type)
|
|
55
|
+
self.include_field_site_info = include_field_site_info
|
|
56
|
+
self.enable_biosample_filtering = enable_biosample_filtering
|
|
57
|
+
# Keep only biosamples that have at least one valid project (see
|
|
58
|
+
# `_is_valid_project`) if filtering is enabled
|
|
59
|
+
if enable_biosample_filtering:
|
|
60
|
+
self.biosamples = [
|
|
61
|
+
biosample
|
|
62
|
+
for biosample in biosamples
|
|
63
|
+
if any(
|
|
64
|
+
_is_valid_project(project)
|
|
65
|
+
for project in biosample.get("projects", [])
|
|
66
|
+
)
|
|
67
|
+
]
|
|
68
|
+
else:
|
|
69
|
+
self.biosamples = biosamples
|
|
70
|
+
# Collect the projectGoldIds of the projects in `projects` that pass
|
|
71
|
+
# the `_is_valid_project` check
|
|
72
|
+
valid_project_ids = {
|
|
73
|
+
project.get("projectGoldId")
|
|
74
|
+
for project in projects
|
|
75
|
+
if _is_valid_project(project)
|
|
76
|
+
}
|
|
77
|
+
# Filter projects to only those with `projectGoldId` in valid_project_ids
|
|
78
|
+
self.projects = [
|
|
79
|
+
project
|
|
80
|
+
for project in projects
|
|
81
|
+
if project.get("projectGoldId") in valid_project_ids
|
|
82
|
+
]
|
|
83
|
+
# Filter analysis_projects to only those with all `projects` in valid_project_ids
|
|
84
|
+
self.analysis_projects = [
|
|
85
|
+
analysis_project
|
|
86
|
+
for analysis_project in analysis_projects
|
|
87
|
+
if all(
|
|
88
|
+
project_id in valid_project_ids
|
|
89
|
+
for project_id in analysis_project.get("projects", [])
|
|
90
|
+
)
|
|
91
|
+
]
|
|
92
|
+
self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
|
|
25
93
|
|
|
26
94
|
self._projects_by_id = self._index_by_id(self.projects, "projectGoldId")
|
|
27
95
|
self._analysis_projects_by_id = self._index_by_id(
|
|
@@ -53,6 +121,9 @@ class GoldStudyTranslator(Translator):
|
|
|
53
121
|
:param gold_entity: GOLD entity object
|
|
54
122
|
:return: PersonValue corresponding to the first PI in the `contacts` field
|
|
55
123
|
"""
|
|
124
|
+
if "contacts" not in gold_entity:
|
|
125
|
+
return None
|
|
126
|
+
|
|
56
127
|
pi_dict = next(
|
|
57
128
|
(
|
|
58
129
|
contact
|
|
@@ -69,6 +140,7 @@ class GoldStudyTranslator(Translator):
|
|
|
69
140
|
has_raw_value=pi_dict.get("name"),
|
|
70
141
|
name=pi_dict.get("name"),
|
|
71
142
|
email=pi_dict.get("email"),
|
|
143
|
+
type="nmdc:PersonValue",
|
|
72
144
|
)
|
|
73
145
|
|
|
74
146
|
def _get_mod_date(self, gold_entity: JSON_OBJECT) -> Union[str, None]:
|
|
@@ -101,29 +173,67 @@ class GoldStudyTranslator(Translator):
|
|
|
101
173
|
for id in self._project_ids_by_biosample_id[gold_biosample_id]
|
|
102
174
|
)
|
|
103
175
|
return [
|
|
104
|
-
self.
|
|
176
|
+
self._ensure_curie(
|
|
177
|
+
project["ncbiBioSampleAccession"], default_prefix="biosample"
|
|
178
|
+
)
|
|
105
179
|
for project in biosample_projects
|
|
106
|
-
if project
|
|
180
|
+
if project.get("ncbiBioSampleAccession")
|
|
107
181
|
]
|
|
108
182
|
|
|
109
183
|
def _get_samp_taxon_id(
|
|
110
184
|
self, gold_biosample: JSON_OBJECT
|
|
111
|
-
) -> Union[nmdc.
|
|
112
|
-
"""Get a
|
|
185
|
+
) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
|
|
186
|
+
"""Get a ControlledIdentifiedTermValue representing the NCBI taxon
|
|
187
|
+
for a GOLD biosample
|
|
113
188
|
|
|
114
189
|
This method gets the `ncbiTaxName` and `ncbiTaxId` from a GOLD biosample object.
|
|
115
|
-
If both are not `None`, it constructs a
|
|
190
|
+
If both are not `None`, it constructs a ControlledIdentifiedTermValue of the format
|
|
116
191
|
`{ncbiTaxName} [NCBITaxon:{ncbiTaxId}]`. Otherwise, it returns `None`
|
|
117
192
|
|
|
118
193
|
:param gold_biosample: GOLD biosample object
|
|
119
|
-
:return:
|
|
194
|
+
:return: ControlledIdentifiedTermValue object
|
|
120
195
|
"""
|
|
121
196
|
ncbi_tax_name = gold_biosample.get("ncbiTaxName")
|
|
122
197
|
ncbi_tax_id = gold_biosample.get("ncbiTaxId")
|
|
123
198
|
if ncbi_tax_name is None or ncbi_tax_id is None:
|
|
124
199
|
return None
|
|
125
200
|
|
|
126
|
-
|
|
201
|
+
raw_value = f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]"
|
|
202
|
+
|
|
203
|
+
return nmdc.ControlledIdentifiedTermValue(
|
|
204
|
+
has_raw_value=raw_value,
|
|
205
|
+
term=nmdc.OntologyClass(
|
|
206
|
+
id=f"NCBITaxon:{ncbi_tax_id}",
|
|
207
|
+
name=ncbi_tax_name,
|
|
208
|
+
type="nmdc:OntologyClass",
|
|
209
|
+
),
|
|
210
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
def _get_host_taxid(
|
|
214
|
+
self, gold_biosample: JSON_OBJECT
|
|
215
|
+
) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
|
|
216
|
+
"""Get a ControlledIdentifiedTermValue representing the NCBI host taxon id
|
|
217
|
+
for a GOLD biosample
|
|
218
|
+
|
|
219
|
+
This method gets the `hostNcbiTaxid` from a GOLD biosample object.
|
|
220
|
+
If it is not `None`, it constructs a ControlledIdentifiedTermValue with the raw value
|
|
220
|
+
`NCBITaxon:{hostNcbiTaxid}`. Otherwise, it returns `None`
|
|
222
|
+
|
|
223
|
+
:param gold_biosample: GOLD biosample object
|
|
224
|
+
:return: ControlledIdentifiedTermValue object
|
|
225
|
+
"""
|
|
226
|
+
host_taxid = gold_biosample.get("hostNcbiTaxid")
|
|
227
|
+
if host_taxid is None:
|
|
228
|
+
return None
|
|
229
|
+
return nmdc.ControlledIdentifiedTermValue(
|
|
230
|
+
has_raw_value=f"NCBITaxon:{host_taxid}",
|
|
231
|
+
term=nmdc.OntologyClass(
|
|
232
|
+
id=f"NCBITaxon:{host_taxid}",
|
|
233
|
+
type="nmdc:OntologyClass",
|
|
234
|
+
),
|
|
235
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
236
|
+
)
|
|
127
237
|
|
|
128
238
|
def _get_samp_name(self, gold_biosample: JSON_OBJECT) -> Union[str, None]:
|
|
129
239
|
"""Get a sample name for a GOLD biosample object
|
|
@@ -183,7 +293,9 @@ class GoldStudyTranslator(Translator):
|
|
|
183
293
|
date_collected = gold_biosample.get("dateCollected")
|
|
184
294
|
if date_collected is None:
|
|
185
295
|
return None
|
|
186
|
-
return nmdc.TimestampValue(
|
|
296
|
+
return nmdc.TimestampValue(
|
|
297
|
+
has_raw_value=date_collected, type="nmdc:TimestampValue"
|
|
298
|
+
)
|
|
187
299
|
|
|
188
300
|
def _get_quantity_value(
|
|
189
301
|
self,
|
|
@@ -215,23 +327,21 @@ class GoldStudyTranslator(Translator):
|
|
|
215
327
|
has_raw_value=minimum_numeric_value,
|
|
216
328
|
has_numeric_value=nmdc.Double(minimum_numeric_value),
|
|
217
329
|
has_unit=unit,
|
|
330
|
+
type="nmdc:QuantityValue",
|
|
218
331
|
)
|
|
219
332
|
else:
|
|
220
333
|
return nmdc.QuantityValue(
|
|
221
334
|
has_minimum_numeric_value=nmdc.Double(minimum_numeric_value),
|
|
222
335
|
has_maximum_numeric_value=nmdc.Double(maximum_numeric_value),
|
|
223
336
|
has_unit=unit,
|
|
337
|
+
type="nmdc:QuantityValue",
|
|
224
338
|
)
|
|
225
339
|
|
|
226
340
|
field_value = gold_entity.get(gold_field)
|
|
227
341
|
if field_value is None:
|
|
228
342
|
return None
|
|
229
343
|
|
|
230
|
-
return
|
|
231
|
-
has_raw_value=field_value,
|
|
232
|
-
has_numeric_value=nmdc.Double(field_value),
|
|
233
|
-
has_unit=unit,
|
|
234
|
-
)
|
|
344
|
+
return self._parse_quantity_value(str(field_value), unit)
|
|
235
345
|
|
|
236
346
|
def _get_text_value(
|
|
237
347
|
self, gold_entity: JSON_OBJECT, gold_field: str
|
|
@@ -249,7 +359,7 @@ class GoldStudyTranslator(Translator):
|
|
|
249
359
|
field_value = gold_entity.get(gold_field)
|
|
250
360
|
if field_value is None:
|
|
251
361
|
return None
|
|
252
|
-
return nmdc.TextValue(has_raw_value=field_value)
|
|
362
|
+
return nmdc.TextValue(has_raw_value=field_value, type="nmdc:TextValue")
|
|
253
363
|
|
|
254
364
|
def _get_controlled_term_value(
|
|
255
365
|
self, gold_entity: JSON_OBJECT, gold_field: str
|
|
@@ -267,7 +377,9 @@ class GoldStudyTranslator(Translator):
|
|
|
267
377
|
field_value = gold_entity.get(gold_field)
|
|
268
378
|
if field_value is None:
|
|
269
379
|
return None
|
|
270
|
-
return nmdc.ControlledTermValue(
|
|
380
|
+
return nmdc.ControlledTermValue(
|
|
381
|
+
has_raw_value=field_value, type="nmdc:ControlledTermValue"
|
|
382
|
+
)
|
|
271
383
|
|
|
272
384
|
def _get_env_term_value(
|
|
273
385
|
self, gold_biosample: JSON_OBJECT, gold_field: str
|
|
@@ -277,8 +389,8 @@ class GoldStudyTranslator(Translator):
|
|
|
277
389
|
In GOLD entities ENVO terms are represented as a nested object with `id` and `label`
|
|
278
390
|
fields. This method extracts this type of nested object by the given field name, and
|
|
279
391
|
returns it as an `nmdc:ControlledIdentifiedTermValue` object. The `id` in the original
|
|
280
|
-
GOLD object be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
|
|
281
|
-
`ENVO:00005801`). If the value of the given field is `None` or if does not contain
|
|
392
|
+
GOLD object is reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
|
|
393
|
+
`ENVO:00005801`). If the value of the given field is `None` or if it does not contain
|
|
282
394
|
a nested object with an `id` field, `None` is returned.
|
|
283
395
|
|
|
284
396
|
:param gold_biosample: GOLD biosample object
|
|
@@ -292,8 +404,10 @@ class GoldStudyTranslator(Translator):
|
|
|
292
404
|
term=nmdc.OntologyClass(
|
|
293
405
|
id=env_field["id"].replace("_", ":"),
|
|
294
406
|
name=env_field.get("label"),
|
|
407
|
+
type="nmdc:OntologyClass",
|
|
295
408
|
),
|
|
296
409
|
has_raw_value=env_field["id"],
|
|
410
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
297
411
|
)
|
|
298
412
|
|
|
299
413
|
def _get_lat_lon(
|
|
@@ -316,22 +430,40 @@ class GoldStudyTranslator(Translator):
|
|
|
316
430
|
has_raw_value=f"{latitude} {longitude}",
|
|
317
431
|
latitude=nmdc.DecimalDegree(latitude),
|
|
318
432
|
longitude=nmdc.DecimalDegree(longitude),
|
|
433
|
+
type="nmdc:GeolocationValue",
|
|
319
434
|
)
|
|
320
435
|
|
|
321
|
-
def
|
|
322
|
-
"""Get instrument
|
|
436
|
+
def _get_instrument(self, gold_project: JSON_OBJECT) -> Union[str, None]:
|
|
437
|
+
"""Get instrument id referenced in instrument_set collection in Mongo.
|
|
438
|
+
Note: The instrument id is not retrieved by making a call to the database,
|
|
439
|
+
but rather looked up in a mapping parsed from a TSV file in the nmdc-schema repo,
|
|
440
|
+
which is held in self.gold_nmdc_instrument_map_df.
|
|
323
441
|
|
|
324
|
-
This method gets the
|
|
325
|
-
that value is not
|
|
326
|
-
|
|
442
|
+
This method gets the seqMethod field from a GOLD project object. If
|
|
443
|
+
that value is not None and is in the GOLD SeqMethod column of
|
|
444
|
+
self.gold_nmdc_instrument_map_df, the corresponding instrument id from the
|
|
445
|
+
NMDC instrument_set id column is returned. If the value of the field
|
|
446
|
+
is None or empty, None is returned. If no match is found, a ValueError is raised.
|
|
327
447
|
|
|
328
448
|
:param gold_project: GOLD project object
|
|
329
|
-
:return: Instrument
|
|
449
|
+
:return: id corresponding to an Instrument from instrument_set collection
|
|
330
450
|
"""
|
|
331
451
|
seq_method = gold_project.get("seqMethod")
|
|
332
452
|
if not seq_method:
|
|
333
453
|
return None
|
|
334
|
-
|
|
454
|
+
|
|
455
|
+
seq_method = seq_method[0].strip()
|
|
456
|
+
df = self.gold_nmdc_instrument_map_df
|
|
457
|
+
|
|
458
|
+
matching_row = df[df["GOLD SeqMethod"] == seq_method]
|
|
459
|
+
|
|
460
|
+
if not matching_row.empty:
|
|
461
|
+
instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
462
|
+
return instrument_id
|
|
463
|
+
|
|
464
|
+
raise ValueError(
|
|
465
|
+
f"seqMethod '{seq_method}' could not be found in the GOLD-NMDC instrument mapping TSV file."
|
|
466
|
+
)
|
|
335
467
|
|
|
336
468
|
def _get_processing_institution(
|
|
337
469
|
self, gold_project: JSON_OBJECT
|
|
@@ -401,12 +533,15 @@ class GoldStudyTranslator(Translator):
|
|
|
401
533
|
"""
|
|
402
534
|
return nmdc.Study(
|
|
403
535
|
description=gold_study.get("description"),
|
|
404
|
-
gold_study_identifiers=self.
|
|
536
|
+
gold_study_identifiers=self._ensure_curie(
|
|
537
|
+
gold_study["studyGoldId"], default_prefix="gold"
|
|
538
|
+
),
|
|
405
539
|
id=nmdc_study_id,
|
|
406
540
|
name=gold_study.get("studyName"),
|
|
407
541
|
principal_investigator=self._get_pi(gold_study),
|
|
408
542
|
title=gold_study.get("studyName"),
|
|
409
543
|
type="nmdc:Study",
|
|
544
|
+
study_category=self.study_type,
|
|
410
545
|
)
|
|
411
546
|
|
|
412
547
|
def _translate_biosample(
|
|
@@ -432,13 +567,11 @@ class GoldStudyTranslator(Translator):
|
|
|
432
567
|
gold_biosample_id = gold_biosample["biosampleGoldId"]
|
|
433
568
|
return nmdc.Biosample(
|
|
434
569
|
add_date=gold_biosample.get("addDate"),
|
|
435
|
-
alt=self._get_quantity_value(
|
|
436
|
-
gold_biosample, "altitudeInMeters", unit="meters"
|
|
437
|
-
),
|
|
570
|
+
alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
|
|
438
571
|
collected_from=nmdc_field_site_id,
|
|
439
572
|
collection_date=self._get_collection_date(gold_biosample),
|
|
440
573
|
depth=self._get_quantity_value(
|
|
441
|
-
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="
|
|
574
|
+
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
|
|
442
575
|
),
|
|
443
576
|
description=gold_biosample.get("description"),
|
|
444
577
|
diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
|
|
@@ -451,10 +584,12 @@ class GoldStudyTranslator(Translator):
|
|
|
451
584
|
env_local_scale=self._get_env_term_value(gold_biosample, "envoLocalScale"),
|
|
452
585
|
env_medium=self._get_env_term_value(gold_biosample, "envoMedium"),
|
|
453
586
|
geo_loc_name=self._get_text_value(gold_biosample, "geoLocation"),
|
|
454
|
-
gold_biosample_identifiers=self.
|
|
587
|
+
gold_biosample_identifiers=self._ensure_curie(
|
|
588
|
+
gold_biosample_id, default_prefix="gold"
|
|
589
|
+
),
|
|
455
590
|
habitat=gold_biosample.get("habitat"),
|
|
456
591
|
host_name=gold_biosample.get("hostName"),
|
|
457
|
-
host_taxid=self.
|
|
592
|
+
host_taxid=self._get_host_taxid(gold_biosample),
|
|
458
593
|
id=nmdc_biosample_id,
|
|
459
594
|
img_identifiers=self._get_img_identifiers(gold_biosample_id),
|
|
460
595
|
insdc_biosample_identifiers=self._get_insdc_biosample_identifiers(
|
|
@@ -466,7 +601,6 @@ class GoldStudyTranslator(Translator):
|
|
|
466
601
|
name=gold_biosample.get("biosampleName"),
|
|
467
602
|
ncbi_taxonomy_name=gold_biosample.get("ncbiTaxName"),
|
|
468
603
|
nitrite=self._get_quantity_value(gold_biosample, "nitrateConcentration"),
|
|
469
|
-
part_of=nmdc_study_id,
|
|
470
604
|
ph=gold_biosample.get("ph"),
|
|
471
605
|
pressure=self._get_quantity_value(gold_biosample, "pressure"),
|
|
472
606
|
samp_name=self._get_samp_name(gold_biosample),
|
|
@@ -476,53 +610,67 @@ class GoldStudyTranslator(Translator):
|
|
|
476
610
|
),
|
|
477
611
|
specific_ecosystem=gold_biosample.get("specificEcosystem"),
|
|
478
612
|
subsurface_depth=self._get_quantity_value(
|
|
479
|
-
gold_biosample, "subsurfaceDepthInMeters", unit="
|
|
613
|
+
gold_biosample, "subsurfaceDepthInMeters", unit="m"
|
|
480
614
|
),
|
|
481
615
|
temp=self._get_quantity_value(
|
|
482
616
|
gold_biosample, "sampleCollectionTemperature"
|
|
483
617
|
),
|
|
484
618
|
type="nmdc:Biosample",
|
|
619
|
+
associated_studies=[nmdc_study_id],
|
|
485
620
|
)
|
|
486
621
|
|
|
487
|
-
def
|
|
622
|
+
def _translate_nucleotide_sequencing(
|
|
488
623
|
self,
|
|
489
624
|
gold_project: JSON_OBJECT,
|
|
490
|
-
|
|
625
|
+
nmdc_nucleotide_sequencing_id: str,
|
|
491
626
|
nmdc_biosample_id: str,
|
|
492
627
|
nmdc_study_id: str,
|
|
493
|
-
)
|
|
494
|
-
"""Translate a GOLD project object into an `nmdc:
|
|
628
|
+
):
|
|
629
|
+
"""Translate a GOLD project object into an `nmdc:NucleotideSequencing` object.
|
|
495
630
|
|
|
496
|
-
This method translates a GOLD project object into an equivalent `nmdc:
|
|
631
|
+
This method translates a GOLD project object into an equivalent `nmdc:NucleotideSequencing`
|
|
497
632
|
object. Any minted NMDC IDs must be passed to this method. Internally, each
|
|
498
|
-
slot of the `nmdc:
|
|
633
|
+
slot of the `nmdc:NucleotideSequencing` is either directly pulled from the GOLD object or
|
|
499
634
|
one of the `_get_*` methods is used.
|
|
500
635
|
|
|
501
636
|
:param gold_project: GOLD project object
|
|
502
|
-
:param nmdc_omics_processing_id: Minted nmdc:
|
|
637
|
+
:param nmdc_nucleotide_sequencing_id: Minted nmdc:NucleotideSequencing identifier for the translated object
|
|
503
638
|
:param nmdc_biosample_id: Minted nmdc:Biosample identifier for the related Biosample
|
|
504
639
|
:param nmdc_study_id: Minted nmdc:Study identifier for the related Study
|
|
505
|
-
:return: nmdc:
|
|
640
|
+
:return: nmdc:NucleotideSequencing object
|
|
506
641
|
"""
|
|
507
642
|
gold_project_id = gold_project["projectGoldId"]
|
|
508
|
-
|
|
509
|
-
|
|
643
|
+
ncbi_bioproject_identifier = gold_project.get("ncbiBioProjectAccession")
|
|
644
|
+
insdc_bioproject_identifiers = []
|
|
645
|
+
if ncbi_bioproject_identifier:
|
|
646
|
+
insdc_bioproject_identifiers.append(
|
|
647
|
+
self._ensure_curie(
|
|
648
|
+
ncbi_bioproject_identifier,
|
|
649
|
+
default_prefix="bioproject",
|
|
650
|
+
)
|
|
651
|
+
)
|
|
652
|
+
|
|
653
|
+
return nmdc.NucleotideSequencing(
|
|
654
|
+
id=nmdc_nucleotide_sequencing_id,
|
|
510
655
|
name=gold_project.get("projectName"),
|
|
511
|
-
gold_sequencing_project_identifiers=self.
|
|
512
|
-
"gold"
|
|
656
|
+
gold_sequencing_project_identifiers=self._ensure_curie(
|
|
657
|
+
gold_project_id, default_prefix="gold"
|
|
513
658
|
),
|
|
514
659
|
ncbi_project_name=gold_project.get("projectName"),
|
|
515
|
-
type="nmdc:
|
|
660
|
+
type="nmdc:NucleotideSequencing",
|
|
516
661
|
has_input=nmdc_biosample_id,
|
|
517
|
-
part_of=nmdc_study_id,
|
|
518
662
|
add_date=gold_project.get("addDate"),
|
|
519
663
|
mod_date=self._get_mod_date(gold_project),
|
|
664
|
+
insdc_bioproject_identifiers=insdc_bioproject_identifiers,
|
|
520
665
|
principal_investigator=self._get_pi(gold_project),
|
|
521
|
-
omics_type=self._get_controlled_term_value(
|
|
522
|
-
gold_project, "sequencingStrategy"
|
|
523
|
-
),
|
|
524
|
-
instrument_name=self._get_instrument_name(gold_project),
|
|
525
666
|
processing_institution=self._get_processing_institution(gold_project),
|
|
667
|
+
instrument_used=self._get_instrument(gold_project),
|
|
668
|
+
analyte_category=(
|
|
669
|
+
gold_project.get("sequencingStrategy").lower()
|
|
670
|
+
if gold_project.get("sequencingStrategy")
|
|
671
|
+
else None
|
|
672
|
+
),
|
|
673
|
+
associated_studies=[nmdc_study_id],
|
|
526
674
|
)
|
|
527
675
|
|
|
528
676
|
def get_database(self) -> nmdc.Database:
|
|
@@ -546,28 +694,31 @@ class GoldStudyTranslator(Translator):
|
|
|
546
694
|
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(self.biosamples))
|
|
547
695
|
gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
|
|
548
696
|
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
697
|
+
if self.include_field_site_info:
|
|
698
|
+
gold_field_site_names = sorted(
|
|
699
|
+
{self._get_field_site_name(biosample) for biosample in self.biosamples}
|
|
700
|
+
)
|
|
701
|
+
nmdc_field_site_ids = self._id_minter(
|
|
702
|
+
"nmdc:FieldResearchSite", len(gold_field_site_names)
|
|
703
|
+
)
|
|
704
|
+
gold_name_to_nmdc_field_site_ids = dict(
|
|
705
|
+
zip(gold_field_site_names, nmdc_field_site_ids)
|
|
706
|
+
)
|
|
707
|
+
gold_biosample_to_nmdc_field_site_ids = {
|
|
708
|
+
biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
|
|
709
|
+
self._get_field_site_name(biosample)
|
|
710
|
+
]
|
|
711
|
+
for biosample in self.biosamples
|
|
712
|
+
}
|
|
713
|
+
else:
|
|
714
|
+
gold_biosample_to_nmdc_field_site_ids = {}
|
|
564
715
|
|
|
565
716
|
gold_project_ids = [project["projectGoldId"] for project in self.projects]
|
|
566
|
-
|
|
567
|
-
"nmdc:
|
|
717
|
+
nmdc_nucleotide_sequencing_ids = self._id_minter(
|
|
718
|
+
"nmdc:NucleotideSequencing", len(gold_project_ids)
|
|
568
719
|
)
|
|
569
|
-
|
|
570
|
-
zip(gold_project_ids,
|
|
720
|
+
gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
|
|
721
|
+
zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
|
|
571
722
|
)
|
|
572
723
|
|
|
573
724
|
database.study_set = [self._translate_study(self.study, nmdc_study_id)]
|
|
@@ -578,20 +729,21 @@ class GoldStudyTranslator(Translator):
|
|
|
578
729
|
biosample["biosampleGoldId"]
|
|
579
730
|
],
|
|
580
731
|
nmdc_study_id=nmdc_study_id,
|
|
581
|
-
nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids
|
|
582
|
-
biosample["biosampleGoldId"]
|
|
583
|
-
|
|
732
|
+
nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids.get(
|
|
733
|
+
biosample["biosampleGoldId"], None
|
|
734
|
+
),
|
|
584
735
|
)
|
|
585
736
|
for biosample in self.biosamples
|
|
586
737
|
]
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
738
|
+
if self.include_field_site_info:
|
|
739
|
+
database.field_research_site_set = [
|
|
740
|
+
nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
|
|
741
|
+
for name, id in gold_name_to_nmdc_field_site_ids.items()
|
|
742
|
+
]
|
|
743
|
+
database.data_generation_set = [
|
|
744
|
+
self._translate_nucleotide_sequencing(
|
|
593
745
|
project,
|
|
594
|
-
|
|
746
|
+
nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
|
|
595
747
|
project["projectGoldId"]
|
|
596
748
|
],
|
|
597
749
|
nmdc_biosample_id=gold_to_nmdc_biosample_ids[
|