nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
-
from typing import List
|
|
3
|
+
from typing import List, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
10
10
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
11
11
|
_get_value_or_none,
|
|
12
12
|
_create_controlled_identified_term_value,
|
|
13
|
-
_create_controlled_term_value,
|
|
14
13
|
_create_geolocation_value,
|
|
15
14
|
_create_quantity_value,
|
|
16
15
|
_create_timestamp_value,
|
|
@@ -26,6 +25,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
26
25
|
sls_data: dict,
|
|
27
26
|
neon_envo_mappings_file: pd.DataFrame,
|
|
28
27
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
28
|
+
neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
29
29
|
*args,
|
|
30
30
|
**kwargs,
|
|
31
31
|
) -> None:
|
|
@@ -99,6 +99,23 @@ class NeonSoilDataTranslator(Translator):
|
|
|
99
99
|
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
100
100
|
)
|
|
101
101
|
|
|
102
|
+
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
103
|
+
|
|
104
|
+
def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
|
|
105
|
+
if not instrument_model:
|
|
106
|
+
raise ValueError(
|
|
107
|
+
f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
df = self.neon_nmdc_instrument_map_df
|
|
111
|
+
matching_row = df[
|
|
112
|
+
df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
if not matching_row.empty:
|
|
116
|
+
nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
117
|
+
return nmdc_instrument_id
|
|
118
|
+
|
|
102
119
|
def _translate_biosample(
|
|
103
120
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
104
121
|
) -> nmdc.Biosample:
|
|
@@ -116,7 +133,6 @@ class NeonSoilDataTranslator(Translator):
|
|
|
116
133
|
"""
|
|
117
134
|
return nmdc.Biosample(
|
|
118
135
|
id=nmdc_id,
|
|
119
|
-
part_of="nmdc:sty-11-34xj1150",
|
|
120
136
|
env_broad_scale=_create_controlled_identified_term_value(
|
|
121
137
|
"ENVO:00000446", "terrestrial biome"
|
|
122
138
|
),
|
|
@@ -136,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
136
152
|
collection_date=_create_timestamp_value(
|
|
137
153
|
biosample_row["collectDate"].values[0]
|
|
138
154
|
),
|
|
139
|
-
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "
|
|
155
|
+
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
|
|
140
156
|
depth=nmdc.QuantityValue(
|
|
141
157
|
has_minimum_numeric_value=_get_value_or_none(
|
|
142
158
|
biosample_row, "sampleTopDepth"
|
|
@@ -145,26 +161,27 @@ class NeonSoilDataTranslator(Translator):
|
|
|
145
161
|
biosample_row, "sampleBottomDepth"
|
|
146
162
|
),
|
|
147
163
|
has_unit="m",
|
|
164
|
+
type="nmdc:QuantityValue",
|
|
148
165
|
),
|
|
149
166
|
samp_collec_device=_get_value_or_none(biosample_row, "soilSamplingDevice"),
|
|
150
167
|
soil_horizon=_get_value_or_none(biosample_row, "horizon"),
|
|
151
168
|
analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
|
|
152
169
|
env_package=_create_text_value(biosample_row["sampleType"].values[0]),
|
|
153
170
|
nitro=_create_quantity_value(
|
|
154
|
-
biosample_row["nitrogenPercent"].values[0], "
|
|
171
|
+
biosample_row["nitrogenPercent"].values[0], "%"
|
|
155
172
|
),
|
|
156
173
|
org_carb=_create_quantity_value(
|
|
157
|
-
biosample_row["organicCPercent"].values[0], "
|
|
174
|
+
biosample_row["organicCPercent"].values[0], "%"
|
|
158
175
|
),
|
|
159
176
|
carb_nitro_ratio=_create_quantity_value(
|
|
160
|
-
biosample_row["CNratio"].values[0],
|
|
177
|
+
biosample_row["CNratio"].values[0], "ratio"
|
|
161
178
|
),
|
|
162
179
|
ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
|
|
163
|
-
water_content=
|
|
164
|
-
f"{biosample_row['soilMoisture'].values[0]} g of water/g of dry soil"
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
180
|
+
water_content=(
|
|
181
|
+
[f"{biosample_row['soilMoisture'].values[0]} g of water/g of dry soil"]
|
|
182
|
+
if not biosample_row["soilMoisture"].isna().any()
|
|
183
|
+
else None
|
|
184
|
+
),
|
|
168
185
|
ammonium_nitrogen=_create_quantity_value(
|
|
169
186
|
biosample_row["kclAmmoniumNConc"].values[0], "mg/L"
|
|
170
187
|
),
|
|
@@ -172,6 +189,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
172
189
|
biosample_row["kclNitrateNitriteNConc"].values[0], "mg/L"
|
|
173
190
|
),
|
|
174
191
|
type="nmdc:Biosample",
|
|
192
|
+
associated_studies=["nmdc:sty-11-34xj1150"],
|
|
175
193
|
)
|
|
176
194
|
|
|
177
195
|
def _translate_pooling_process(
|
|
@@ -198,6 +216,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
198
216
|
has_input=bsm_input_values_list,
|
|
199
217
|
start_date=_get_value_or_none(pooling_row, "startDate"),
|
|
200
218
|
end_date=_get_value_or_none(pooling_row, "collectDate"),
|
|
219
|
+
type="nmdc:Pooling",
|
|
201
220
|
)
|
|
202
221
|
|
|
203
222
|
def _translate_processed_sample(
|
|
@@ -214,12 +233,14 @@ class NeonSoilDataTranslator(Translator):
|
|
|
214
233
|
:param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
|
|
215
234
|
:return: ProcessedSample objects to be stored in `processed_sample_set`.
|
|
216
235
|
"""
|
|
217
|
-
return nmdc.ProcessedSample(
|
|
236
|
+
return nmdc.ProcessedSample(
|
|
237
|
+
id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
|
|
238
|
+
)
|
|
218
239
|
|
|
219
240
|
def _translate_data_object(
|
|
220
241
|
self, do_id: str, url: str, do_type: str, checksum: str
|
|
221
242
|
) -> nmdc.DataObject:
|
|
222
|
-
"""Create nmdc DataObject which is the output of
|
|
243
|
+
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
223
244
|
object mainly contains information about the sequencing file that was generated as
|
|
224
245
|
the result of running a Bioinformatics workflow on a certain ProcessedSample, which
|
|
225
246
|
is the result of a LibraryPreparation process.
|
|
@@ -242,6 +263,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
242
263
|
description=f"sequencing results for {basename}",
|
|
243
264
|
type="nmdc:DataObject",
|
|
244
265
|
md5_checksum=checksum,
|
|
266
|
+
data_category=nmdc.DataCategoryEnum.instrument_data.text,
|
|
245
267
|
data_object_type=do_type,
|
|
246
268
|
)
|
|
247
269
|
|
|
@@ -280,10 +302,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
280
302
|
input_mass=_create_quantity_value(
|
|
281
303
|
_get_value_or_none(extraction_row, "sampleMass"), "g"
|
|
282
304
|
),
|
|
283
|
-
|
|
284
|
-
status=_get_value_or_none(extraction_row, "qaqcStatus")
|
|
285
|
-
),
|
|
305
|
+
qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
|
|
286
306
|
processing_institution=processing_institution,
|
|
307
|
+
type="nmdc:Extraction",
|
|
287
308
|
)
|
|
288
309
|
|
|
289
310
|
def _translate_library_preparation(
|
|
@@ -296,13 +317,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
296
317
|
"""
|
|
297
318
|
Create LibraryPreparation process object. The input to LibraryPreparation process
|
|
298
319
|
is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
|
|
299
|
-
process is fed as input to an
|
|
320
|
+
process is fed as input to an NucleotideSequencing object.
|
|
300
321
|
|
|
301
322
|
:param library_preparation_id: Minted id for LibraryPreparation process.
|
|
302
323
|
:param library_preparation_input: Input to LibraryPreparation process is output from
|
|
303
324
|
Extraction process.
|
|
304
325
|
:param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
|
|
305
|
-
is also input to
|
|
326
|
+
is also input to NucleotideSequencing.
|
|
306
327
|
:param library_preparation_row: Metadata required to populate LibraryPreparation.
|
|
307
328
|
:return: Object that using LibraryPreparation process model.
|
|
308
329
|
"""
|
|
@@ -321,31 +342,32 @@ class NeonSoilDataTranslator(Translator):
|
|
|
321
342
|
start_date=_get_value_or_none(library_preparation_row, "collectDate"),
|
|
322
343
|
end_date=_get_value_or_none(library_preparation_row, "processedDate"),
|
|
323
344
|
processing_institution=processing_institution,
|
|
345
|
+
type="nmdc:LibraryPreparation",
|
|
324
346
|
)
|
|
325
347
|
|
|
326
|
-
def
|
|
348
|
+
def _translate_nucleotide_sequencing(
|
|
327
349
|
self,
|
|
328
|
-
|
|
350
|
+
nucleotide_sequencing_id: str,
|
|
329
351
|
processed_sample_id: str,
|
|
330
352
|
raw_data_file_data: str,
|
|
331
|
-
|
|
332
|
-
)
|
|
333
|
-
"""Create nmdc
|
|
334
|
-
Bioinformatics workflow on sequence data from a biosample. The input to an
|
|
335
|
-
process is the output from a LibraryPreparation process, and the output of
|
|
353
|
+
nucleotide_sequencing_row: pd.DataFrame,
|
|
354
|
+
):
|
|
355
|
+
"""Create nmdc NucleotideSequencing object. This class typically models the run of a
|
|
356
|
+
Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
|
|
357
|
+
process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
|
|
336
358
|
is a DataObject which has the FASTQ sequence file URLs embedded in them.
|
|
337
359
|
|
|
338
|
-
:param
|
|
360
|
+
:param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
|
|
339
361
|
:param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
|
|
340
362
|
:param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
|
|
341
363
|
files embedded in them.
|
|
342
|
-
:param
|
|
364
|
+
:param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
|
|
343
365
|
process/run.
|
|
344
|
-
:return:
|
|
366
|
+
:return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
|
|
345
367
|
"""
|
|
346
368
|
processing_institution = None
|
|
347
369
|
sequencing_facility = _get_value_or_none(
|
|
348
|
-
|
|
370
|
+
nucleotide_sequencing_row, "sequencingFacilityID"
|
|
349
371
|
)
|
|
350
372
|
if sequencing_facility is not None:
|
|
351
373
|
if re.search("Battelle", sequencing_facility, re.IGNORECASE):
|
|
@@ -353,19 +375,21 @@ class NeonSoilDataTranslator(Translator):
|
|
|
353
375
|
elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
|
|
354
376
|
processing_institution = "ANL"
|
|
355
377
|
|
|
356
|
-
return nmdc.
|
|
357
|
-
id=
|
|
378
|
+
return nmdc.NucleotideSequencing(
|
|
379
|
+
id=nucleotide_sequencing_id,
|
|
358
380
|
has_input=processed_sample_id,
|
|
359
381
|
has_output=raw_data_file_data,
|
|
360
382
|
processing_institution=processing_institution,
|
|
361
|
-
ncbi_project_name=_get_value_or_none(
|
|
362
|
-
|
|
363
|
-
omics_processing_row["investigation_type"].values[0]
|
|
383
|
+
ncbi_project_name=_get_value_or_none(
|
|
384
|
+
nucleotide_sequencing_row, "ncbiProjectID"
|
|
364
385
|
),
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
386
|
+
instrument_used=self._get_instrument_id(
|
|
387
|
+
_get_value_or_none(nucleotide_sequencing_row, "instrument_model")
|
|
388
|
+
),
|
|
389
|
+
name=f"Terrestrial soil microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
|
|
390
|
+
type="nmdc:NucleotideSequencing",
|
|
391
|
+
associated_studies=["nmdc:sty-11-34xj1150"],
|
|
392
|
+
analyte_category="metagenome",
|
|
369
393
|
)
|
|
370
394
|
|
|
371
395
|
def get_database(self) -> nmdc.Database:
|
|
@@ -373,10 +397,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
373
397
|
nmdc object creation methods as well as the nmdc type (QuantityValue, GeolocationValue, etc.)
|
|
374
398
|
creation methods, to make an nmdc Database object. It populates multiple sets in the Mongo database -
|
|
375
399
|
* `biosample_set`: uses `_translate_biosample()`
|
|
376
|
-
* `
|
|
377
|
-
|
|
378
|
-
* `
|
|
379
|
-
* `omics_processing_set`: uses `_translate_omics_processing()`
|
|
400
|
+
* `material_processing_set`: uses `_translate_pooling_process()`, `_translate_extraction_process()`,
|
|
401
|
+
`_translate_library_preparation()`
|
|
402
|
+
* `data_generation_set`: uses `_translate_nucleotide_sequencing()`
|
|
380
403
|
* `processed_sample_set`: uses `_translate_processed_sample()`
|
|
381
404
|
* `data_object_set`: uses `_translate_data_object()`
|
|
382
405
|
The core Biosample information is in the `sls_soilCoreCollection` table. However, we
|
|
@@ -607,14 +630,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
607
630
|
mms_metagenomeDnaExtraction.processedDate,
|
|
608
631
|
mms_metagenomeSequencing.sequencingFacilityID,
|
|
609
632
|
mms_metagenomeSequencing.ncbiProjectID,
|
|
610
|
-
mms_metagenomeSequencing.investigation_type,
|
|
611
633
|
mms_metagenomeSequencing.sequencingMethod,
|
|
612
634
|
mms_metagenomeSequencing.instrument_model
|
|
613
635
|
FROM mms_metagenomeSequencing
|
|
614
636
|
LEFT JOIN mms_metagenomeDnaExtraction ON mms_metagenomeDnaExtraction.dnaSampleID = mms_metagenomeSequencing.dnaSampleID
|
|
615
637
|
"""
|
|
616
638
|
library_preparation_table = pd.read_sql_query(query, self.conn)
|
|
617
|
-
|
|
639
|
+
nucleotide_sequencing_table = pd.read_sql_query(query, self.conn)
|
|
618
640
|
|
|
619
641
|
nmdc_pooling_ids = self._id_minter("nmdc:Pooling", len(pooling_ids_dict))
|
|
620
642
|
neon_to_nmdc_pooling_ids = dict(
|
|
@@ -653,12 +675,12 @@ class NeonSoilDataTranslator(Translator):
|
|
|
653
675
|
zip(library_prepration_ids, nmdc_library_preparation_processed_sample_ids)
|
|
654
676
|
)
|
|
655
677
|
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
"nmdc:
|
|
678
|
+
nucleotide_sequencing_ids = nucleotide_sequencing_table["dnaSampleID"]
|
|
679
|
+
nmdc_nucleotide_sequencing_ids = self._id_minter(
|
|
680
|
+
"nmdc:NucleotideSequencing", len(nucleotide_sequencing_ids)
|
|
659
681
|
)
|
|
660
|
-
|
|
661
|
-
zip(
|
|
682
|
+
neon_to_nmdc_nucleotide_sequencing_ids = dict(
|
|
683
|
+
zip(nucleotide_sequencing_ids, nmdc_nucleotide_sequencing_ids)
|
|
662
684
|
)
|
|
663
685
|
|
|
664
686
|
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
@@ -701,7 +723,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
701
723
|
# if the number of biosamples that are input to a pooling process
|
|
702
724
|
# is one or less, then ignore it and go straight to extraction
|
|
703
725
|
if len(bsm_values_list) > 1:
|
|
704
|
-
database.
|
|
726
|
+
database.material_processing_set.append(
|
|
705
727
|
self._translate_pooling_process(
|
|
706
728
|
pooling_process_id,
|
|
707
729
|
processed_sample_id,
|
|
@@ -734,7 +756,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
734
756
|
# handler for creating extraction process records
|
|
735
757
|
# for both pooled and non-pooled samples
|
|
736
758
|
if "|" in genomics_pooled_id_list:
|
|
737
|
-
database.
|
|
759
|
+
database.material_processing_set.append(
|
|
738
760
|
self._translate_extraction_process(
|
|
739
761
|
extraction_id,
|
|
740
762
|
extraction_input,
|
|
@@ -755,7 +777,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
755
777
|
|
|
756
778
|
extraction_input = neon_to_nmdc_biosample_ids[neon_biosample_id]
|
|
757
779
|
|
|
758
|
-
database.
|
|
780
|
+
database.material_processing_set.append(
|
|
759
781
|
self._translate_extraction_process(
|
|
760
782
|
extraction_id,
|
|
761
783
|
extraction_input,
|
|
@@ -772,7 +794,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
772
794
|
dna_sample_id
|
|
773
795
|
]
|
|
774
796
|
|
|
775
|
-
|
|
797
|
+
nucleotide_sequencing_id = neon_to_nmdc_nucleotide_sequencing_ids[
|
|
798
|
+
dna_sample_id
|
|
799
|
+
]
|
|
776
800
|
|
|
777
801
|
genomics_sample_id = library_preparation_table[
|
|
778
802
|
library_preparation_table["dnaSampleID"] == dna_sample_id
|
|
@@ -787,7 +811,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
787
811
|
library_preparation_table["dnaSampleID"] == dna_sample_id
|
|
788
812
|
]
|
|
789
813
|
|
|
790
|
-
database.
|
|
814
|
+
database.material_processing_set.append(
|
|
791
815
|
self._translate_library_preparation(
|
|
792
816
|
library_preparation_id,
|
|
793
817
|
library_preparation_input,
|
|
@@ -809,9 +833,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
809
833
|
if item in neon_to_nmdc_data_object_ids:
|
|
810
834
|
has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
|
|
811
835
|
|
|
812
|
-
database.
|
|
813
|
-
self.
|
|
814
|
-
|
|
836
|
+
database.data_generation_set.append(
|
|
837
|
+
self._translate_nucleotide_sequencing(
|
|
838
|
+
nucleotide_sequencing_id,
|
|
815
839
|
processed_sample_id,
|
|
816
840
|
has_output_do_ids,
|
|
817
841
|
library_preparation_row,
|