nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
+
from typing import Optional, Union
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
import requests_cache
|
|
@@ -10,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
10
11
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
11
12
|
_get_value_or_none,
|
|
12
13
|
_create_controlled_identified_term_value,
|
|
13
|
-
_create_controlled_term_value,
|
|
14
14
|
_create_geolocation_value,
|
|
15
15
|
_create_quantity_value,
|
|
16
16
|
_create_timestamp_value,
|
|
@@ -47,6 +47,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
47
47
|
site_code_mapping: dict,
|
|
48
48
|
neon_envo_mappings_file: pd.DataFrame,
|
|
49
49
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
50
|
+
neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
50
51
|
*args,
|
|
51
52
|
**kwargs,
|
|
52
53
|
) -> None:
|
|
@@ -59,6 +60,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
59
60
|
"mms_benthicMetagenomeSequencing",
|
|
60
61
|
"mms_benthicMetagenomeDnaExtraction",
|
|
61
62
|
"amb_fieldParent",
|
|
63
|
+
"mms_benthicRawDataFiles", # <--- ensure this is present
|
|
62
64
|
)
|
|
63
65
|
|
|
64
66
|
if all(k in benthic_data for k in neon_amb_data_tables):
|
|
@@ -77,6 +79,12 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
77
79
|
benthic_data["amb_fieldParent"].to_sql(
|
|
78
80
|
"amb_fieldParent", self.conn, if_exists="replace", index=False
|
|
79
81
|
)
|
|
82
|
+
benthic_data["mms_benthicRawDataFiles"].to_sql(
|
|
83
|
+
"mms_benthicRawDataFiles",
|
|
84
|
+
self.conn,
|
|
85
|
+
if_exists="replace",
|
|
86
|
+
index=False,
|
|
87
|
+
)
|
|
80
88
|
else:
|
|
81
89
|
raise ValueError(
|
|
82
90
|
f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
|
|
@@ -86,19 +94,24 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
86
94
|
"neonEnvoTerms", self.conn, if_exists="replace", index=False
|
|
87
95
|
)
|
|
88
96
|
|
|
89
|
-
self.neon_raw_data_file_mappings_df =
|
|
90
|
-
self.neon_raw_data_file_mappings_df.to_sql(
|
|
91
|
-
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
92
|
-
)
|
|
97
|
+
self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
|
|
93
98
|
|
|
94
99
|
self.site_code_mapping = site_code_mapping
|
|
95
100
|
|
|
101
|
+
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
102
|
+
|
|
103
|
+
def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
|
|
104
|
+
return nmdc.Manifest(
|
|
105
|
+
id=manifest_id,
|
|
106
|
+
manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
|
|
107
|
+
type="nmdc:Manifest",
|
|
108
|
+
)
|
|
109
|
+
|
|
96
110
|
def _translate_biosample(
|
|
97
111
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
98
112
|
) -> nmdc.Biosample:
|
|
99
113
|
return nmdc.Biosample(
|
|
100
114
|
id=nmdc_id,
|
|
101
|
-
part_of="nmdc:sty-11-pzmd0x14",
|
|
102
115
|
env_broad_scale=_create_controlled_identified_term_value(
|
|
103
116
|
BENTHIC_BROAD_SCALE_MAPPINGS.get(
|
|
104
117
|
biosample_row["aquaticSiteType"].values[0]
|
|
@@ -146,8 +159,10 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
146
159
|
depth=nmdc.QuantityValue(
|
|
147
160
|
has_minimum_numeric_value=nmdc.Float("0"),
|
|
148
161
|
has_maximum_numeric_value=nmdc.Float("1"),
|
|
149
|
-
has_unit="
|
|
162
|
+
has_unit="m",
|
|
163
|
+
type="nmdc:QuantityValue",
|
|
150
164
|
),
|
|
165
|
+
associated_studies=["nmdc:sty-11-pzmd0x14"],
|
|
151
166
|
)
|
|
152
167
|
|
|
153
168
|
def _translate_extraction_process(
|
|
@@ -185,10 +200,9 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
185
200
|
input_mass=_create_quantity_value(
|
|
186
201
|
_get_value_or_none(extraction_row, "sampleMass"), "g"
|
|
187
202
|
),
|
|
188
|
-
|
|
189
|
-
status=_get_value_or_none(extraction_row, "qaqcStatus")
|
|
190
|
-
),
|
|
203
|
+
qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
|
|
191
204
|
processing_institution=processing_institution,
|
|
205
|
+
type="nmdc:Extraction",
|
|
192
206
|
)
|
|
193
207
|
|
|
194
208
|
def _translate_library_preparation(
|
|
@@ -201,13 +215,13 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
201
215
|
"""
|
|
202
216
|
Create LibraryPreparation process object. The input to LibraryPreparation process
|
|
203
217
|
is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
|
|
204
|
-
process is fed as input to an
|
|
218
|
+
process is fed as input to an NucleotideSequencing object.
|
|
205
219
|
|
|
206
220
|
:param library_preparation_id: Minted id for LibraryPreparation process.
|
|
207
221
|
:param library_preparation_input: Input to LibraryPreparation process is output from
|
|
208
222
|
Extraction process.
|
|
209
223
|
:param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
|
|
210
|
-
is also input to
|
|
224
|
+
is also input to NucleotideSequencing.
|
|
211
225
|
:param library_preparation_row: Metadata required to populate LibraryPreparation.
|
|
212
226
|
:return: Object that using LibraryPreparation process model.
|
|
213
227
|
"""
|
|
@@ -226,31 +240,47 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
226
240
|
start_date=_get_value_or_none(library_preparation_row, "collectDate"),
|
|
227
241
|
end_date=_get_value_or_none(library_preparation_row, "processedDate"),
|
|
228
242
|
processing_institution=processing_institution,
|
|
243
|
+
type="nmdc:LibraryPreparation",
|
|
229
244
|
)
|
|
230
245
|
|
|
231
|
-
def
|
|
246
|
+
def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
|
|
247
|
+
if not instrument_model:
|
|
248
|
+
raise ValueError(
|
|
249
|
+
f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
df = self.neon_nmdc_instrument_map_df
|
|
253
|
+
matching_row = df[
|
|
254
|
+
df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
|
|
255
|
+
]
|
|
256
|
+
|
|
257
|
+
if not matching_row.empty:
|
|
258
|
+
nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
259
|
+
return nmdc_instrument_id
|
|
260
|
+
|
|
261
|
+
def _translate_nucleotide_sequencing(
|
|
232
262
|
self,
|
|
233
|
-
|
|
263
|
+
nucleotide_sequencing_id: str,
|
|
234
264
|
processed_sample_id: str,
|
|
235
265
|
raw_data_file_data: str,
|
|
236
|
-
|
|
237
|
-
)
|
|
238
|
-
"""Create nmdc
|
|
239
|
-
Bioinformatics workflow on sequence data from a biosample. The input to an
|
|
240
|
-
process is the output from a LibraryPreparation process, and the output of
|
|
266
|
+
nucleotide_sequencing_row: pd.DataFrame,
|
|
267
|
+
):
|
|
268
|
+
"""Create nmdc NucleotideSequencing object. This class typically models the run of a
|
|
269
|
+
Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
|
|
270
|
+
process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
|
|
241
271
|
is a DataObject which has the FASTQ sequence file URLs embedded in them.
|
|
242
272
|
|
|
243
|
-
:param
|
|
273
|
+
:param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
|
|
244
274
|
:param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
|
|
245
275
|
:param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
|
|
246
276
|
files embedded in them.
|
|
247
|
-
:param
|
|
277
|
+
:param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
|
|
248
278
|
process/run.
|
|
249
|
-
:return:
|
|
279
|
+
:return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
|
|
250
280
|
"""
|
|
251
281
|
processing_institution = None
|
|
252
282
|
sequencing_facility = _get_value_or_none(
|
|
253
|
-
|
|
283
|
+
nucleotide_sequencing_row, "sequencingFacilityID"
|
|
254
284
|
)
|
|
255
285
|
if sequencing_facility is not None:
|
|
256
286
|
if re.search("Battelle", sequencing_facility, re.IGNORECASE):
|
|
@@ -258,19 +288,21 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
258
288
|
elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
|
|
259
289
|
processing_institution = "ANL"
|
|
260
290
|
|
|
261
|
-
return nmdc.
|
|
262
|
-
id=
|
|
291
|
+
return nmdc.NucleotideSequencing(
|
|
292
|
+
id=nucleotide_sequencing_id,
|
|
263
293
|
has_input=processed_sample_id,
|
|
264
294
|
has_output=raw_data_file_data,
|
|
265
295
|
processing_institution=processing_institution,
|
|
266
|
-
ncbi_project_name=_get_value_or_none(
|
|
267
|
-
|
|
268
|
-
omics_processing_row["investigation_type"].values[0]
|
|
296
|
+
ncbi_project_name=_get_value_or_none(
|
|
297
|
+
nucleotide_sequencing_row, "ncbiProjectID"
|
|
269
298
|
),
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
299
|
+
instrument_used=self._get_instrument_id(
|
|
300
|
+
_get_value_or_none(nucleotide_sequencing_row, "instrument_model")
|
|
301
|
+
),
|
|
302
|
+
name=f"Benthic microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
|
|
303
|
+
type="nmdc:NucleotideSequencing",
|
|
304
|
+
associated_studies=["nmdc:sty-11-pzmd0x14"],
|
|
305
|
+
analyte_category="metagenome",
|
|
274
306
|
)
|
|
275
307
|
|
|
276
308
|
def _translate_processed_sample(
|
|
@@ -287,12 +319,14 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
287
319
|
:param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
|
|
288
320
|
:return: ProcessedSample objects to be stored in `processed_sample_set`.
|
|
289
321
|
"""
|
|
290
|
-
return nmdc.ProcessedSample(
|
|
322
|
+
return nmdc.ProcessedSample(
|
|
323
|
+
id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
|
|
324
|
+
)
|
|
291
325
|
|
|
292
326
|
def _translate_data_object(
|
|
293
|
-
self, do_id: str, url: str, do_type: str,
|
|
327
|
+
self, do_id: str, url: str, do_type: str, manifest_id: str
|
|
294
328
|
) -> nmdc.DataObject:
|
|
295
|
-
"""Create nmdc DataObject which is the output of
|
|
329
|
+
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
296
330
|
object mainly contains information about the sequencing file that was generated as
|
|
297
331
|
the result of running a Bioinformatics workflow on a certain ProcessedSample, which
|
|
298
332
|
is the result of a LibraryPreparation process.
|
|
@@ -301,7 +335,6 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
301
335
|
:param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
|
|
302
336
|
by Hugh Cross at NEON.
|
|
303
337
|
:param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
|
|
304
|
-
:param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
|
|
305
338
|
at NEON.
|
|
306
339
|
:return: DataObject with all the sequencing file metadata.
|
|
307
340
|
"""
|
|
@@ -314,14 +347,15 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
314
347
|
url=url,
|
|
315
348
|
description=f"sequencing results for {basename}",
|
|
316
349
|
type="nmdc:DataObject",
|
|
317
|
-
md5_checksum=checksum,
|
|
318
350
|
data_object_type=do_type,
|
|
351
|
+
data_category=nmdc.DataCategoryEnum.instrument_data.text,
|
|
352
|
+
in_manifest=manifest_id,
|
|
319
353
|
)
|
|
320
354
|
|
|
321
|
-
def get_database(self):
|
|
355
|
+
def get_database(self) -> nmdc.Database:
|
|
322
356
|
database = nmdc.Database()
|
|
323
357
|
|
|
324
|
-
|
|
358
|
+
join_query = """
|
|
325
359
|
SELECT
|
|
326
360
|
merged.laboratoryName,
|
|
327
361
|
merged.sequencingFacilityID,
|
|
@@ -349,200 +383,190 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
349
383
|
afp.siteID,
|
|
350
384
|
afp.sampleID,
|
|
351
385
|
afp.collectDate
|
|
352
|
-
FROM
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
mms_benthicMetagenomeDnaExtraction AS bd
|
|
375
|
-
ON
|
|
376
|
-
bs.dnaSampleID = bd.dnaSampleID
|
|
377
|
-
) AS merged
|
|
386
|
+
FROM (
|
|
387
|
+
SELECT
|
|
388
|
+
bs.collectDate,
|
|
389
|
+
bs.laboratoryName,
|
|
390
|
+
bs.sequencingFacilityID,
|
|
391
|
+
bs.processedDate,
|
|
392
|
+
bs.dnaSampleID,
|
|
393
|
+
bs.dnaSampleCode,
|
|
394
|
+
bs.internalLabID,
|
|
395
|
+
bs.instrument_model,
|
|
396
|
+
bs.sequencingMethod,
|
|
397
|
+
bs.investigation_type,
|
|
398
|
+
bs.qaqcStatus,
|
|
399
|
+
bs.ncbiProjectID,
|
|
400
|
+
bd.genomicsSampleID,
|
|
401
|
+
bd.sequenceAnalysisType,
|
|
402
|
+
bd.sampleMass,
|
|
403
|
+
bd.nucleicAcidConcentration
|
|
404
|
+
FROM mms_benthicMetagenomeSequencing AS bs
|
|
405
|
+
JOIN mms_benthicMetagenomeDnaExtraction AS bd
|
|
406
|
+
ON bs.dnaSampleID = bd.dnaSampleID
|
|
407
|
+
) AS merged
|
|
378
408
|
LEFT JOIN amb_fieldParent AS afp
|
|
379
|
-
ON
|
|
380
|
-
merged.genomicsSampleID = afp.geneticSampleID
|
|
409
|
+
ON merged.genomicsSampleID = afp.geneticSampleID
|
|
381
410
|
"""
|
|
382
|
-
benthic_samples = pd.read_sql_query(
|
|
411
|
+
benthic_samples = pd.read_sql_query(join_query, self.conn)
|
|
383
412
|
benthic_samples.to_sql(
|
|
384
413
|
"benthicSamples", self.conn, if_exists="replace", index=False
|
|
385
414
|
)
|
|
386
415
|
|
|
387
|
-
|
|
388
|
-
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(
|
|
389
|
-
neon_to_nmdc_biosample_ids = dict(zip(
|
|
416
|
+
sample_ids = benthic_samples["sampleID"]
|
|
417
|
+
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
|
|
418
|
+
neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
|
|
390
419
|
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
"nmdc:Extraction", len(neon_extraction_ids)
|
|
394
|
-
)
|
|
395
|
-
neon_to_nmdc_extraction_ids = dict(
|
|
396
|
-
zip(neon_extraction_ids, nmdc_extraction_ids)
|
|
397
|
-
)
|
|
420
|
+
nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
|
|
421
|
+
neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
|
|
398
422
|
|
|
399
|
-
neon_extraction_processed_ids = benthic_samples["sampleID"]
|
|
400
423
|
nmdc_extraction_processed_ids = self._id_minter(
|
|
401
|
-
"nmdc:ProcessedSample", len(
|
|
424
|
+
"nmdc:ProcessedSample", len(sample_ids)
|
|
402
425
|
)
|
|
403
426
|
neon_to_nmdc_extraction_processed_ids = dict(
|
|
404
|
-
zip(
|
|
427
|
+
zip(sample_ids, nmdc_extraction_processed_ids)
|
|
405
428
|
)
|
|
406
429
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
"nmdc:LibraryPreparation", len(neon_lib_prep_ids)
|
|
410
|
-
)
|
|
411
|
-
neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
|
|
430
|
+
nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
|
|
431
|
+
neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
|
|
412
432
|
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
"nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
|
|
433
|
+
nmdc_libprep_processed_ids = self._id_minter(
|
|
434
|
+
"nmdc:ProcessedSample", len(sample_ids)
|
|
416
435
|
)
|
|
417
|
-
|
|
418
|
-
zip(
|
|
436
|
+
neon_to_nmdc_libprep_processed_ids = dict(
|
|
437
|
+
zip(sample_ids, nmdc_libprep_processed_ids)
|
|
419
438
|
)
|
|
420
439
|
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
440
|
+
nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
|
|
441
|
+
neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
|
|
424
442
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
)
|
|
430
|
-
neon_to_nmdc_data_object_ids = dict(
|
|
431
|
-
zip(neon_raw_file_paths, nmdc_data_object_ids)
|
|
432
|
-
)
|
|
443
|
+
raw_df = self.neon_raw_data_file_mappings_df
|
|
444
|
+
raw_file_paths = raw_df["rawDataFilePath"]
|
|
445
|
+
dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
|
|
446
|
+
neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
|
|
433
447
|
|
|
434
|
-
for neon_id,
|
|
435
|
-
|
|
448
|
+
for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
|
|
449
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
450
|
+
if row.empty:
|
|
451
|
+
continue
|
|
436
452
|
|
|
453
|
+
# Example of how you might call _translate_biosample:
|
|
437
454
|
database.biosample_set.append(
|
|
438
|
-
self._translate_biosample(neon_id,
|
|
455
|
+
self._translate_biosample(neon_id, biosample_id, row)
|
|
439
456
|
)
|
|
440
457
|
|
|
441
|
-
for neon_id,
|
|
442
|
-
|
|
458
|
+
for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
|
|
459
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
460
|
+
if row.empty:
|
|
461
|
+
continue
|
|
443
462
|
|
|
444
|
-
|
|
445
|
-
|
|
463
|
+
biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
|
|
464
|
+
extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
446
465
|
|
|
447
|
-
if
|
|
448
|
-
database.
|
|
466
|
+
if biosample_id and extraction_ps_id:
|
|
467
|
+
database.material_processing_set.append(
|
|
449
468
|
self._translate_extraction_process(
|
|
450
|
-
|
|
451
|
-
extraction_input,
|
|
452
|
-
processed_sample_id,
|
|
453
|
-
extraction_row,
|
|
469
|
+
extraction_id, biosample_id, extraction_ps_id, row
|
|
454
470
|
)
|
|
455
471
|
)
|
|
456
|
-
|
|
457
|
-
genomics_sample_id = _get_value_or_none(
|
|
458
|
-
extraction_row, "genomicsSampleID"
|
|
459
|
-
)
|
|
460
|
-
|
|
472
|
+
genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
|
|
461
473
|
database.processed_sample_set.append(
|
|
462
474
|
self._translate_processed_sample(
|
|
463
|
-
|
|
475
|
+
extraction_ps_id,
|
|
464
476
|
f"Extracted DNA from {genomics_sample_id}",
|
|
465
477
|
)
|
|
466
478
|
)
|
|
467
479
|
|
|
468
|
-
|
|
480
|
+
query2 = """
|
|
469
481
|
SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
|
|
470
|
-
FROM
|
|
482
|
+
FROM mms_benthicRawDataFiles
|
|
471
483
|
GROUP BY dnaSampleID
|
|
472
484
|
"""
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
485
|
+
raw_data_files_df = pd.read_sql_query(query2, self.conn)
|
|
486
|
+
dna_files_dict = (
|
|
487
|
+
raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
|
|
476
488
|
.str.split("|")
|
|
477
489
|
.to_dict()
|
|
478
490
|
)
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
lib_prep_input,
|
|
496
|
-
processed_sample_id,
|
|
497
|
-
lib_prep_row,
|
|
498
|
-
)
|
|
491
|
+
|
|
492
|
+
dna_sample_to_manifest_id: dict[str, str] = {}
|
|
493
|
+
|
|
494
|
+
for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
|
|
495
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
496
|
+
if row.empty:
|
|
497
|
+
continue
|
|
498
|
+
|
|
499
|
+
extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
500
|
+
libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
|
|
501
|
+
if not extr_ps_id or not libprep_ps_id:
|
|
502
|
+
continue
|
|
503
|
+
|
|
504
|
+
database.material_processing_set.append(
|
|
505
|
+
self._translate_library_preparation(
|
|
506
|
+
libprep_id, extr_ps_id, libprep_ps_id, row
|
|
499
507
|
)
|
|
508
|
+
)
|
|
500
509
|
|
|
501
|
-
|
|
510
|
+
dna_sample_id = _get_value_or_none(row, "dnaSampleID")
|
|
511
|
+
database.processed_sample_set.append(
|
|
512
|
+
self._translate_processed_sample(
|
|
513
|
+
libprep_ps_id,
|
|
514
|
+
f"Library preparation for {dna_sample_id}",
|
|
515
|
+
)
|
|
516
|
+
)
|
|
502
517
|
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
518
|
+
filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
|
|
519
|
+
if not filepaths_for_dna:
|
|
520
|
+
# no raw files => skip
|
|
521
|
+
ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
|
|
522
|
+
if ntseq_id:
|
|
523
|
+
continue
|
|
524
|
+
continue
|
|
525
|
+
|
|
526
|
+
# If multiple => we create a Manifest
|
|
527
|
+
manifest_id: Optional[str] = None
|
|
528
|
+
if len(filepaths_for_dna) > 2:
|
|
529
|
+
if dna_sample_id not in dna_sample_to_manifest_id:
|
|
530
|
+
new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
531
|
+
dna_sample_to_manifest_id[dna_sample_id] = new_man_id
|
|
532
|
+
database.manifest_set.append(self._translate_manifest(new_man_id))
|
|
533
|
+
manifest_id = dna_sample_to_manifest_id[dna_sample_id]
|
|
534
|
+
|
|
535
|
+
has_input_value = self.samp_procsm_dict.get(neon_id)
|
|
536
|
+
if not has_input_value:
|
|
537
|
+
continue
|
|
538
|
+
|
|
539
|
+
dataobject_ids_for_run: list[str] = []
|
|
540
|
+
for fp in filepaths_for_dna:
|
|
541
|
+
if fp not in neon_to_nmdc_dataobject_ids:
|
|
542
|
+
continue
|
|
543
|
+
do_id = neon_to_nmdc_dataobject_ids[fp]
|
|
544
|
+
|
|
545
|
+
do_type = None
|
|
546
|
+
if "_R1.fastq.gz" in fp:
|
|
547
|
+
do_type = "Metagenome Raw Read 1"
|
|
548
|
+
elif "_R2.fastq.gz" in fp:
|
|
549
|
+
do_type = "Metagenome Raw Read 2"
|
|
550
|
+
|
|
551
|
+
database.data_object_set.append(
|
|
552
|
+
self._translate_data_object(
|
|
553
|
+
do_id=do_id,
|
|
554
|
+
url=fp,
|
|
555
|
+
do_type=do_type,
|
|
556
|
+
manifest_id=manifest_id,
|
|
507
557
|
)
|
|
508
558
|
)
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
checksum = None
|
|
520
|
-
do_type = None
|
|
521
|
-
|
|
522
|
-
checksum = neon_raw_data_file_mappings_df[
|
|
523
|
-
neon_raw_data_file_mappings_df["rawDataFilePath"] == item
|
|
524
|
-
]["checkSum"].values[0]
|
|
525
|
-
if "_R1.fastq.gz" in item:
|
|
526
|
-
do_type = "Metagenome Raw Read 1"
|
|
527
|
-
elif "_R2.fastq.gz" in item:
|
|
528
|
-
do_type = "Metagenome Raw Read 2"
|
|
529
|
-
|
|
530
|
-
database.data_object_set.append(
|
|
531
|
-
self._translate_data_object(
|
|
532
|
-
neon_to_nmdc_data_object_ids.get(item),
|
|
533
|
-
item,
|
|
534
|
-
do_type,
|
|
535
|
-
checksum,
|
|
536
|
-
)
|
|
537
|
-
)
|
|
538
|
-
|
|
539
|
-
database.omics_processing_set.append(
|
|
540
|
-
self._translate_omics_processing(
|
|
541
|
-
neon_to_nmdc_omprc_ids.get(neon_id),
|
|
542
|
-
processed_sample_id,
|
|
543
|
-
has_output_do_ids,
|
|
544
|
-
lib_prep_row,
|
|
545
|
-
)
|
|
559
|
+
dataobject_ids_for_run.append(do_id)
|
|
560
|
+
|
|
561
|
+
ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
|
|
562
|
+
if ntseq_id:
|
|
563
|
+
database.data_generation_set.append(
|
|
564
|
+
self._translate_nucleotide_sequencing(
|
|
565
|
+
ntseq_id,
|
|
566
|
+
has_input_value, # <--- from self.samp_procsm_dict
|
|
567
|
+
dataobject_ids_for_run,
|
|
568
|
+
row,
|
|
546
569
|
)
|
|
570
|
+
)
|
|
547
571
|
|
|
548
572
|
return database
|