nmdc-runtime 1.4.2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Note: this version of nmdc-runtime has been flagged as a potentially problematic release.

nmdc_runtime/site/graphs.py CHANGED
@@ -38,9 +38,11 @@ from nmdc_runtime.site.ops import (
     neon_data_by_product,
     nmdc_schema_database_from_neon_soil_data,
     nmdc_schema_database_from_neon_benthic_data,
+    nmdc_schema_database_from_neon_surface_water_data,
     nmdc_schema_database_export_filename_neon,
     get_neon_pipeline_mms_data_product,
     get_neon_pipeline_sls_data_product,
+    get_neon_pipeline_surface_water_data_product,
     get_submission_portal_pipeline_inputs,
     get_csv_rows_from_url,
     get_neon_pipeline_benthic_data_product,
@@ -317,3 +319,65 @@ def ingest_neon_benthic_metadata():
     )
     run_id = submit_metadata_to_db(database)
     poll_for_run_completion(run_id)
+
+
+@graph
+def translate_neon_api_surface_water_metadata_to_nmdc_schema_database():
+    mms_surface_water_data_product = get_neon_pipeline_surface_water_data_product()
+
+    mms_surface_water = neon_data_by_product(mms_surface_water_data_product)
+
+    sites_mapping_dict = site_code_mapping()
+
+    (
+        neon_envo_mappings_file_url,
+        neon_raw_data_file_mappings_file_url,
+    ) = get_neon_pipeline_inputs()
+
+    neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
+
+    neon_raw_data_file_mappings_file = get_df_from_url(
+        neon_raw_data_file_mappings_file_url
+    )
+
+    database = nmdc_schema_database_from_neon_surface_water_data(
+        mms_surface_water,
+        sites_mapping_dict,
+        neon_envo_mappings_file,
+        neon_raw_data_file_mappings_file,
+    )
+
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = nmdc_schema_database_export_filename_neon()
+
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
+
+
+@graph
+def ingest_neon_surface_water_metadata():
+    mms_surface_water_data_product = get_neon_pipeline_surface_water_data_product()
+
+    mms_surface_water = neon_data_by_product(mms_surface_water_data_product)
+
+    sites_mapping_dict = site_code_mapping()
+
+    (
+        neon_envo_mappings_file_url,
+        neon_raw_data_file_mappings_file_url,
+    ) = get_neon_pipeline_inputs()
+
+    neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
+
+    neon_raw_data_file_mappings_file = get_df_from_url(
+        neon_raw_data_file_mappings_file_url
+    )
+
+    database = nmdc_schema_database_from_neon_benthic_data(
+        mms_surface_water,
+        sites_mapping_dict,
+        neon_envo_mappings_file,
+        neon_raw_data_file_mappings_file,
+    )
+    run_id = submit_metadata_to_db(database)
+    poll_for_run_completion(run_id)
nmdc_runtime/site/ops.py CHANGED
@@ -68,6 +68,9 @@ from nmdc_runtime.site.translation.neon_soil_translator import NeonSoilDataTrans
 from nmdc_runtime.site.translation.neon_benthic_translator import (
     NeonBenthicDataTranslator,
 )
+from nmdc_runtime.site.translation.neon_surface_water_translator import (
+    NeonSurfaceWaterDataTranslator,
+)
 from nmdc_runtime.site.translation.submission_portal_translator import (
     SubmissionPortalTranslator,
 )
@@ -784,6 +787,11 @@ def get_neon_pipeline_benthic_data_product(context: OpExecutionContext) -> dict:
     return context.op_config["benthic_data_product"]
 
 
+@op(config_schema={"surface_water_data_product": dict})
+def get_neon_pipeline_surface_water_data_product(context: OpExecutionContext) -> dict:
+    return context.op_config["surface_water_data_product"]
+
+
 @op(required_resource_keys={"neon_api_client"})
 def neon_data_by_product(
     context: OpExecutionContext, data_product: dict
@@ -862,6 +870,32 @@ def nmdc_schema_database_from_neon_benthic_data(
     return database
 
 
+@op(required_resource_keys={"runtime_api_site_client"})
+def nmdc_schema_database_from_neon_surface_water_data(
+    context: OpExecutionContext,
+    surface_water_data: Dict[str, pd.DataFrame],
+    site_code_mapping: Dict[str, str],
+    neon_envo_mappings_file: pd.DataFrame,
+    neon_raw_data_file_mappings_file: pd.DataFrame,
+) -> nmdc.Database:
+    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
+
+    def id_minter(*args, **kwargs):
+        response = client.mint_id(*args, **kwargs)
+        return response.json()
+
+    translator = NeonSurfaceWaterDataTranslator(
+        surface_water_data,
+        site_code_mapping,
+        neon_envo_mappings_file,
+        neon_raw_data_file_mappings_file,
+        id_minter=id_minter,
+    )
+
+    database = translator.get_database()
+    return database
+
+
 @op(
     out={
         "neon_envo_mappings_file_url": Out(),
nmdc_runtime/site/repository.py CHANGED
@@ -38,8 +38,10 @@ from nmdc_runtime.site.graphs import (
     hello_graph,
     translate_neon_api_soil_metadata_to_nmdc_schema_database,
     translate_neon_api_benthic_metadata_to_nmdc_schema_database,
+    translate_neon_api_surface_water_metadata_to_nmdc_schema_database,
     ingest_neon_soil_metadata,
     ingest_neon_benthic_metadata,
+    ingest_neon_surface_water_metadata,
 )
 from nmdc_runtime.site.resources import (
     get_mongo,
@@ -764,6 +766,89 @@ def biosample_submission_ingest():
                 },
             },
         ),
+        translate_neon_api_surface_water_metadata_to_nmdc_schema_database.to_job(
+            description="This job fetches the metadata associated with a given NEON data product code and translates it into an equivalent nmdc:Database object. The object is serialized to JSON and stored in DRS. This can be considered a dry-run for the `ingest_neon_metadata` job.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "neon_api_client": {
+                            "config": {
+                                "base_url": {"env": "NEON_API_BASE_URL"},
+                                "api_token": {"env": "NEON_API_TOKEN"},
+                            },
+                        },
+                        "mongo": {
+                            "config": {
+                                "dbname": {"env": "MONGO_DBNAME"},
+                                "host": {"env": "MONGO_HOST"},
+                                "password": {"env": "MONGO_PASSWORD"},
+                                "username": {"env": "MONGO_USERNAME"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "export_json_to_drs": {"config": {"username": "..."}},
+                    "get_neon_pipeline_inputs": {
+                        "inputs": {
+                            "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
+                            "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
+                        }
+                    },
+                    "get_neon_pipeline_surface_water_data_product": {
+                        "config": {
+                            "surface_water_data_product": {
+                                "product_id": "DP1.20281.001",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                            }
+                        }
+                    },
+                },
+            },
+        ),
+        ingest_neon_surface_water_metadata.to_job(
+            description="",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "neon_api_client": {
+                            "config": {
+                                "base_url": {"env": "NEON_API_BASE_URL"},
+                                "api_token": {"env": "NEON_API_TOKEN"},
+                            },
+                        }
+                    },
+                ),
+                "ops": {
+                    "get_neon_pipeline_surface_water_data_product": {
+                        "config": {
+                            "surface_water_data_product": {
+                                "product_id": "DP1.20281.001",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                            }
+                        }
+                    },
+                    "get_neon_pipeline_inputs": {
+                        "inputs": {
+                            "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
+                            "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
+                        }
+                    },
+                },
+            },
+        ),
     ]
 
 
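Both new jobs are registered alongside the existing soil and benthic jobs, so they can be looked up by name once the repository is loaded. A minimal sketch, assuming `biosample_submission_ingest` is a Dagster `@repository` (as the surrounding code suggests) and that the NEON_API_*, MONGO_*, and API_* environment variables referenced above are set:

from nmdc_runtime.site.repository import biosample_submission_ingest

# The @repository-decorated function evaluates to a RepositoryDefinition.
repo = biosample_submission_ingest
dry_run_job = repo.get_job(
    "translate_neon_api_surface_water_metadata_to_nmdc_schema_database"
)
ingest_job = repo.get_job("ingest_neon_surface_water_metadata")

# With the required environment variables in place, either job can be
# launched in-process for a smoke test; in production they run from Dagit.
result = dry_run_job.execute_in_process()
assert result.success
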
nmdc_runtime/site/translation/neon_surface_water_translator.py ADDED
@@ -0,0 +1,620 @@
+import re
+import sqlite3
+from typing import Dict, Optional
+
+import pandas as pd
+import requests
+import requests_cache
+
+from nmdc_schema import nmdc
+from nmdc_runtime.site.translation.translator import Translator
+from nmdc_runtime.site.util import get_basename
+from nmdc_runtime.site.translation.neon_utils import (
+    _get_value_or_none,
+    _create_controlled_identified_term_value,
+    _create_controlled_term_value,
+    _create_geolocation_value,
+    _create_quantity_value,
+    _create_timestamp_value,
+    _create_text_value,
+)
+
+
+SURFACE_WATER_BROAD_SCALE_MAPPINGS = {
+    "lake": {"term_id": "ENVO:01000252", "term_name": "freshwater lake biome"},
+    "river": {"term_id": "ENVO:01000253", "term_name": "freshwater river biome"},
+    "stream": {"term_id": "ENVO:03605008", "term_name": "freshwater stream biome"},
+}
+
+SURFACE_WATER_LOCAL_SCALE_MAPPINGS = {
+    "lake": {
+        "buoy.c0": {"term_id": "ENVO:01001191", "term_name": "water surface"},
+        "buoy.c1": {"term_id": "ENVO:00002131", "term_name": "epilimnion"},
+        "buoy.c2": {"term_id": "ENVO:00002269", "term_name": "thermocline"},
+        "buoy.c3": {"term_id": "ENVO:00002130", "term_name": "hypolimnion"},
+        "littoral": {
+            "term_id": "ENVO:01000409",
+            "term_name": "freshwater littoral zone",
+        },
+    },
+    "river": {"term_id": "ENVO:01000297", "term_name": "freshwater river"},
+    "stream": {"term_id": "ENVO:03605007", "term_name": "freshwater stream"},
+}
+
+SURFACE_WATER_MEDIUM_MAPPINGS = {
+    "lake": {
+        "term_id": "ENVO:04000007",
+        "term_name": "lake water",
+    },
+    "river": {"term_id": "ENVO:01000599", "term_name": "river water"},
+    "stream": {"term_id": "ENVO:03605006", "term_name": "stream water"},
+}
+
+
+class NeonSurfaceWaterDataTranslator(Translator):
+    def __init__(
+        self,
+        surface_water_data: dict,
+        site_code_mapping: dict,
+        neon_envo_mappings_file: pd.DataFrame,
+        neon_raw_data_file_mappings_file: pd.DataFrame,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+
+        self.conn = sqlite3.connect("neon.db")
+        requests_cache.install_cache("neon_api_cache")
+
+        neon_amb_data_tables = (
+            "mms_swMetagenomeSequencing",
+            "mms_swMetagenomeDnaExtraction",
+            "amc_fieldGenetic",
+            "amc_fieldSuperParent",
+        )
+
+        if all(k in surface_water_data for k in neon_amb_data_tables):
+            surface_water_data["mms_swMetagenomeSequencing"].to_sql(
+                "mms_swMetagenomeSequencing",
+                self.conn,
+                if_exists="replace",
+                index=False,
+            )
+            surface_water_data["mms_swMetagenomeDnaExtraction"].to_sql(
+                "mms_swMetagenomeDnaExtraction",
+                self.conn,
+                if_exists="replace",
+                index=False,
+            )
+            surface_water_data["amc_fieldGenetic"].to_sql(
+                "amc_fieldGenetic", self.conn, if_exists="replace", index=False
+            )
+            surface_water_data["amc_fieldSuperParent"].to_sql(
+                "amc_fieldSuperParent", self.conn, if_exists="replace", index=False
+            )
+        else:
+            raise ValueError(
+                f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
+            )
+
+        neon_envo_mappings_file.to_sql(
+            "neonEnvoTerms", self.conn, if_exists="replace", index=False
+        )
+
+        self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
+        self.neon_raw_data_file_mappings_df.to_sql(
+            "neonRawDataFile", self.conn, if_exists="replace", index=False
+        )
+
+        self.site_code_mapping = site_code_mapping
+
+    def _translate_biosample(
+        self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
+    ) -> nmdc.Biosample:
+        def map_local_scale(
+            aquatic_site_type: str, named_location: Optional[str] = None
+        ) -> Dict[str, str]:
+            if aquatic_site_type == "lake":
+                for key in SURFACE_WATER_LOCAL_SCALE_MAPPINGS.get(
+                    aquatic_site_type, {}
+                ):
+                    if key in named_location:
+                        return SURFACE_WATER_LOCAL_SCALE_MAPPINGS[aquatic_site_type][
+                            key
+                        ]
+            elif aquatic_site_type == "river" or aquatic_site_type == "stream":
+                return SURFACE_WATER_LOCAL_SCALE_MAPPINGS.get(aquatic_site_type, {})
+            return {}
+
+        depth = None
+        minimum_depth = biosample_row["lakeSampleDepth1"].values[0]
+        maximum_depth = biosample_row["lakeSampleDepth2"].values[0]
+
+        if not pd.isna(minimum_depth):
+            if not pd.isna(maximum_depth):
+                depth = nmdc.QuantityValue(
+                    has_minimum_numeric_value=nmdc.Float(minimum_depth),
+                    has_maximum_numeric_value=nmdc.Float(maximum_depth),
+                    has_unit="m",
+                )
+            else:
+                depth = nmdc.QuantityValue(
+                    has_numeric_value=nmdc.Float(minimum_depth),
+                    has_unit="m",
+                )
+
+        return nmdc.Biosample(
+            id=nmdc_id,
+            part_of="nmdc:sty-11-hht5sb92",
+            env_broad_scale=_create_controlled_identified_term_value(
+                SURFACE_WATER_BROAD_SCALE_MAPPINGS.get(
+                    biosample_row["aquaticSiteType"].values[0]
+                ).get("term_id"),
+                SURFACE_WATER_BROAD_SCALE_MAPPINGS.get(
+                    biosample_row["aquaticSiteType"].values[0]
+                ).get("term_name"),
+            ),
+            env_local_scale=_create_controlled_identified_term_value(
+                map_local_scale(
+                    biosample_row["aquaticSiteType"].values[0],
+                    biosample_row["namedLocation"].values[0],
+                ).get("term_id"),
+                map_local_scale(
+                    biosample_row["aquaticSiteType"].values[0],
+                    biosample_row["namedLocation"].values[0],
+                ).get("term_name"),
+            ),
+            env_medium=_create_controlled_identified_term_value(
+                SURFACE_WATER_MEDIUM_MAPPINGS.get(
+                    biosample_row["aquaticSiteType"].values[0]
+                ).get("term_id"),
+                SURFACE_WATER_MEDIUM_MAPPINGS.get(
+                    biosample_row["aquaticSiteType"].values[0]
+                ).get("term_name"),
+            ),
+            name=neon_id,
+            lat_lon=_create_geolocation_value(
+                biosample_row["decimalLatitude"].values[0],
+                biosample_row["decimalLongitude"].values[0],
+            ),
+            elev=nmdc.Float(biosample_row["elevation"].values[0]),
+            collection_date=_create_timestamp_value(
+                biosample_row["seqCollectDate"].values[0]
+            ),
+            geo_loc_name=_create_text_value(
+                self.site_code_mapping[biosample_row["siteID"].values[0]]
+                if biosample_row["siteID"].values[0]
+                else None
+            ),
+            samp_collec_device=biosample_row["samplerType"].values[0],
+            diss_oxygen=_create_quantity_value(
+                biosample_row["dissolvedOxygen"].values[0], "mg/L"
+            ),
+            conduc=_create_quantity_value(
+                biosample_row["specificConductance"].values[0], "uS/cm"
+            ),
+            temp=_create_quantity_value(biosample_row["waterTemp"].values[0], "Cel"),
+            type="nmdc:Biosample",
+            analysis_type="metagenomics",
+            biosample_categories="NEON",
+            depth=depth,
+            samp_size=_create_quantity_value(
+                biosample_row["geneticFilteredSampleVolume"].values[0], "mL"
+            ),
+            env_package=nmdc.TextValue(has_raw_value="water"),
+        )
+
+    def _translate_extraction_process(
+        self,
+        extraction_id: str,
+        extraction_input: str,
+        processed_sample_id: str,
+        extraction_row: pd.DataFrame,
+    ) -> nmdc.Extraction:
+        """
+        Create an nmdc Extraction process, which is a process to model the DNA extraction in
+        a metagenome sequencing experiment. The input to an Extraction process is the
+        output from a Pooling process.
+
+        :param extraction_id: Minted id for Extraction process.
+        :param extraction_input: Input to an Extraction process is the output from a Pooling process.
+        :param processed_sample_id: Output of Extraction process is a ProcessedSample.
+        :param extraction_row: DataFrame with Extraction process metadata.
+        :return: Extraction process object.
+        """
+        processing_institution = None
+        laboratory_name = _get_value_or_none(extraction_row, "laboratoryName")
+        if laboratory_name is not None:
+            if re.search("Battelle", laboratory_name, re.IGNORECASE):
+                processing_institution = "Battelle"
+            elif re.search("Argonne", laboratory_name, re.IGNORECASE):
+                processing_institution = "ANL"
+
+        return nmdc.Extraction(
+            id=extraction_id,
+            has_input=extraction_input,
+            has_output=processed_sample_id,
+            start_date=_get_value_or_none(extraction_row, "extrCollectDate"),
+            end_date=_get_value_or_none(extraction_row, "extrProcessedDate"),
+            input_mass=_create_quantity_value(
+                _get_value_or_none(extraction_row, "sampleMass"), "g"
+            ),
+            qc_status=nmdc.StatusEnum(
+                _get_value_or_none(extraction_row, "extrQaqcStatus")
+            ),
+            processing_institution=processing_institution,
+        )
+
+    def _translate_library_preparation(
+        self,
+        library_preparation_id: str,
+        library_preparation_input: str,
+        processed_sample_id: str,
+        library_preparation_row: pd.DataFrame,
+    ):
+        """
+        Create LibraryPreparation process object. The input to LibraryPreparation process
+        is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
+        process is fed as input to an OmicsProcessing object.
+
+        :param library_preparation_id: Minted id for LibraryPreparation process.
+        :param library_preparation_input: Input to LibraryPreparation process is output from
+            Extraction process.
+        :param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
+            is also input to OmicsProcessing.
+        :param library_preparation_row: Metadata required to populate LibraryPreparation.
+        :return: Object that using LibraryPreparation process model.
+        """
+        processing_institution = None
+        laboratory_name = _get_value_or_none(library_preparation_row, "laboratoryName")
+        if laboratory_name is not None:
+            if re.search("Battelle", laboratory_name, re.IGNORECASE):
+                processing_institution = "Battelle"
+            elif re.search("Argonne", laboratory_name, re.IGNORECASE):
+                processing_institution = "ANL"
+
+        return nmdc.LibraryPreparation(
+            id=library_preparation_id,
+            has_input=library_preparation_input,
+            has_output=processed_sample_id,
+            start_date=_get_value_or_none(library_preparation_row, "seqCollectDate"),
+            end_date=_get_value_or_none(library_preparation_row, "seqProcessedDate"),
+            processing_institution=processing_institution,
+        )
+
+    def _translate_omics_processing(
+        self,
+        omics_processing_id: str,
+        processed_sample_id: str,
+        raw_data_file_data: str,
+        omics_processing_row: pd.DataFrame,
+    ) -> nmdc.OmicsProcessing:
+        """Create nmdc OmicsProcessing object. This class typically models the run of a
+        Bioinformatics workflow on sequence data from a biosample. The input to an OmicsProcessing
+        process is the output from a LibraryPreparation process, and the output of OmicsProcessing
+        is a DataObject which has the FASTQ sequence file URLs embedded in them.
+
+        :param omics_processing_id: Minted id for an OmicsProcessing process.
+        :param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
+        :param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
+            files embedded in them.
+        :param omics_processing_row: DataFrame with metadata for an OmicsProcessing workflow
+            process/run.
+        :return: OmicsProcessing object that models a Bioinformatics workflow process/run.
+        """
+        processing_institution = None
+        sequencing_facility = _get_value_or_none(
+            omics_processing_row, "sequencingFacilityID"
+        )
+        if sequencing_facility is not None:
+            if re.search("Battelle", sequencing_facility, re.IGNORECASE):
+                processing_institution = "Battelle"
+            elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
+                processing_institution = "ANL"
+
+        return nmdc.OmicsProcessing(
+            id=omics_processing_id,
+            has_input=processed_sample_id,
+            has_output=raw_data_file_data,
+            processing_institution=processing_institution,
+            ncbi_project_name=_get_value_or_none(omics_processing_row, "ncbiProjectID"),
+            omics_type=_create_controlled_term_value(
+                omics_processing_row["investigation_type"].values[0]
+            ),
+            instrument_name=f"{_get_value_or_none(omics_processing_row, 'sequencingMethod')} {_get_value_or_none(omics_processing_row, 'instrument_model')}",
+            part_of="nmdc:sty-11-hht5sb92",
+            name=f"Surface water microbial communities - {_get_value_or_none(omics_processing_row, 'dnaSampleID')}",
+            type="nmdc:OmicsProcessing",
+        )
+
+    def _translate_processed_sample(
+        self, processed_sample_id: str, sample_id: str
+    ) -> nmdc.ProcessedSample:
+        """
+        Create an nmdc ProcessedSample. ProcessedSample is typically the output of a PlannedProcess
+        like Pooling, Extraction, LibraryPreparation, etc. We are using this to create a
+        reference for the nmdc minted ProcessedSample ids in `processed_sample_set`. We are
+        associating the minted ids with the name of the sample it is coming from which can be
+        a value from either the `genomicsSampleID` column or from the `dnaSampleID` column.
+
+        :param processed_sample_id: NMDC minted ProcessedSampleID.
+        :param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
+        :return: ProcessedSample objects to be stored in `processed_sample_set`.
+        """
+        return nmdc.ProcessedSample(id=processed_sample_id, name=sample_id)
+
+    def _translate_data_object(
+        self, do_id: str, url: str, do_type: str, checksum: str
+    ) -> nmdc.DataObject:
+        """Create nmdc DataObject which is the output of an OmicsProcessing process. This
+        object mainly contains information about the sequencing file that was generated as
+        the result of running a Bioinformatics workflow on a certain ProcessedSample, which
+        is the result of a LibraryPreparation process.
+
+        :param do_id: NMDC minted DataObject id.
+        :param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
+            by Hugh Cross at NEON.
+        :param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
+        :param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
+            at NEON.
+        :return: DataObject with all the sequencing file metadata.
+        """
+        file_name = get_basename(url)
+        basename = file_name.split(".", 1)[0]
+
+        return nmdc.DataObject(
+            id=do_id,
+            name=file_name,
+            url=url,
+            description=f"sequencing results for {basename}",
+            type="nmdc:DataObject",
+            md5_checksum=checksum,
+            data_object_type=do_type,
+        )
+
+    def get_database(self):
+        database = nmdc.Database()
+
+        query = """
+        SELECT
+            merged.laboratoryName,
+            merged.sequencingFacilityID,
+            merged.extrProcessedDate,
+            merged.seqProcessedDate,
+            merged.dnaSampleID,
+            merged.internalLabID,
+            merged.instrument_model,
+            merged.sequencingMethod,
+            merged.investigation_type,
+            merged.extrQaqcStatus,
+            merged.ncbiProjectID,
+            merged.genomicsSampleID,
+            merged.sequenceAnalysisType,
+            merged.sampleMass,
+            merged.nucleicAcidConcentration,
+            merged.siteID,
+            merged.seqCollectDate,
+            merged.extrCollectDate,
+            afg.geneticSampleID,
+            afg.geneticFilteredSampleVolume,
+            afg.sampleMaterial,
+            afs.parentSampleID,
+            afs.namedLocation,
+            afs.decimalLatitude,
+            afs.decimalLongitude,
+            afs.elevation,
+            afs.aquaticSiteType,
+            afs.samplerType,
+            afs.dissolvedOxygen,
+            afs.specificConductance,
+            afs.waterTemp,
+            afs.lakeSampleDepth1,
+            afs.lakeSampleDepth2
+        FROM
+            (
+                SELECT
+                    msq.collectDate AS seqCollectDate,
+                    mde.collectDate AS extrCollectDate,
+                    msq.laboratoryName,
+                    msq.sequencingFacilityID,
+                    msq.processedDate AS seqProcessedDate,
+                    mde.processedDate AS extrProcessedDate,
+                    msq.dnaSampleID,
+                    msq.internalLabID,
+                    msq.instrument_model,
+                    msq.sequencingMethod,
+                    msq.investigation_type,
+                    mde.qaqcStatus AS extrQaqcStatus,
+                    msq.ncbiProjectID,
+                    msq.siteID,
+                    msq.labPrepMethod,
+                    mde.genomicsSampleID,
+                    mde.sequenceAnalysisType,
+                    mde.sampleMass,
+                    mde.nucleicAcidConcentration,
+                    mde.nucleicAcidQuantMethod,
+                    mde.nucleicAcidPurity
+                FROM
+                    mms_swMetagenomeSequencing AS msq
+                JOIN
+                    mms_swMetagenomeDnaExtraction AS mde
+                ON
+                    msq.dnaSampleID = mde.dnaSampleID
+            ) AS merged
+        JOIN amc_fieldGenetic AS afg
+        ON
+            merged.genomicsSampleID = afg.geneticSampleID
+        JOIN amc_fieldSuperParent AS afs
+        ON
+            afg.parentSampleID = afs.parentSampleID
+        """
+        surface_water_samples = pd.read_sql_query(query, self.conn)
+
+        neon_biosample_ids = surface_water_samples["parentSampleID"]
+        nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
+        neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
+
+        neon_extraction_ids = surface_water_samples["parentSampleID"]
+        nmdc_extraction_ids = self._id_minter(
+            "nmdc:Extraction", len(neon_extraction_ids)
+        )
+        neon_to_nmdc_extraction_ids = dict(
+            zip(neon_extraction_ids, nmdc_extraction_ids)
+        )
+
+        neon_extraction_processed_ids = surface_water_samples["parentSampleID"]
+        nmdc_extraction_processed_ids = self._id_minter(
+            "nmdc:ProcessedSample", len(neon_extraction_processed_ids)
+        )
+        neon_to_nmdc_extraction_processed_ids = dict(
+            zip(neon_extraction_processed_ids, nmdc_extraction_processed_ids)
+        )
+
+        neon_lib_prep_ids = surface_water_samples["parentSampleID"]
+        nmdc_lib_prep_ids = self._id_minter(
+            "nmdc:LibraryPreparation", len(neon_lib_prep_ids)
+        )
+        neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
+
+        neon_lib_prep_processed_ids = surface_water_samples["parentSampleID"]
+        nmdc_lib_prep_processed_ids = self._id_minter(
+            "nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
+        )
+        neon_to_nmdc_lib_prep_processed_ids = dict(
+            zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
+        )
+
+        neon_omprc_ids = surface_water_samples["parentSampleID"]
+        nmdc_omprc_ids = self._id_minter("nmdc:OmicsProcessing", len(neon_omprc_ids))
+        neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
+
+        neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
+        neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
+        nmdc_data_object_ids = self._id_minter(
+            "nmdc:DataObject", len(neon_raw_file_paths)
+        )
+        neon_to_nmdc_data_object_ids = dict(
+            zip(neon_raw_file_paths, nmdc_data_object_ids)
+        )
+
+        for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
+            biosample_row = surface_water_samples[
+                surface_water_samples["parentSampleID"] == neon_id
+            ]
+
+            database.biosample_set.append(
+                self._translate_biosample(neon_id, nmdc_id, biosample_row)
+            )
+
+        for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
+            extraction_row = surface_water_samples[
+                surface_water_samples["parentSampleID"] == neon_id
+            ]
+
+            extraction_input = neon_to_nmdc_biosample_ids.get(neon_id)
+            processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
+
+            if extraction_input is not None and processed_sample_id is not None:
+                database.extraction_set.append(
+                    self._translate_extraction_process(
+                        nmdc_id,
+                        extraction_input,
+                        processed_sample_id,
+                        extraction_row,
+                    )
+                )
+
+                genomics_sample_id = _get_value_or_none(
+                    extraction_row, "genomicsSampleID"
+                )
+
+                database.processed_sample_set.append(
+                    self._translate_processed_sample(
+                        processed_sample_id,
+                        f"Extracted DNA from {genomics_sample_id}",
+                    )
+                )
+
+        query = """
+        SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
+        FROM neonRawDataFile
+        GROUP BY dnaSampleID
+        """
+        neon_raw_data_files = pd.read_sql_query(query, self.conn)
+        neon_raw_data_files_dict = (
+            neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
+            .str.split("|")
+            .to_dict()
+        )
+        filtered_neon_raw_data_files_dict = {
+            key: value
+            for key, value in neon_raw_data_files_dict.items()
+            if len(value) <= 2
+        }
+
+        for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
+            lib_prep_row = surface_water_samples[
+                surface_water_samples["parentSampleID"] == neon_id
+            ]
+
+            lib_prep_input = neon_to_nmdc_extraction_processed_ids.get(neon_id)
+            processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
+
+            if lib_prep_input is not None and processed_sample_id is not None:
+                database.library_preparation_set.append(
+                    self._translate_library_preparation(
+                        nmdc_id,
+                        lib_prep_input,
+                        processed_sample_id,
+                        lib_prep_row,
+                    )
+                )
+
+                dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
+
+                database.processed_sample_set.append(
+                    self._translate_processed_sample(
+                        processed_sample_id,
+                        f"Library preparation for {dna_sample_id}",
+                    )
+                )
+
+                has_output = None
+                has_output_do_ids = []
+
+                if dna_sample_id in filtered_neon_raw_data_files_dict:
+                    has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
+                    for item in has_output:
+                        if item in neon_to_nmdc_data_object_ids:
+                            has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
+
+                            checksum = None
+                            do_type = None
+
+                            checksum = neon_raw_data_file_mappings_df[
+                                neon_raw_data_file_mappings_df["rawDataFilePath"] == item
+                            ]["checkSum"].values[0]
+                            if "_R1.fastq.gz" in item:
+                                do_type = "Metagenome Raw Read 1"
+                            elif "_R2.fastq.gz" in item:
+                                do_type = "Metagenome Raw Read 2"
+
+                            database.data_object_set.append(
+                                self._translate_data_object(
+                                    neon_to_nmdc_data_object_ids.get(item),
+                                    item,
+                                    do_type,
+                                    checksum,
+                                )
+                            )
+
+                    database.omics_processing_set.append(
+                        self._translate_omics_processing(
+                            neon_to_nmdc_omprc_ids.get(neon_id),
+                            processed_sample_id,
+                            has_output_do_ids,
+                            lib_prep_row,
+                        )
+                    )
+
+        return database
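For orientation, the translator consumes the four NEON surface-water tables as a dict of pandas DataFrames keyed by table name, plus a site-code mapping and the two mapping files that the pipeline ops download. A minimal sketch of driving it outside Dagster — the local file paths and the stub `id_minter` below are hypothetical; the real pipeline mints ids through the runtime API client as shown in ops.py above:

import pandas as pd

from nmdc_runtime.site.translation.neon_surface_water_translator import (
    NeonSurfaceWaterDataTranslator,
)

# Hypothetical local exports of the four NEON tables for DP1.20281.001.
tables = {
    name: pd.read_csv(f"{name}.csv")
    for name in (
        "mms_swMetagenomeSequencing",
        "mms_swMetagenomeDnaExtraction",
        "amc_fieldGenetic",
        "amc_fieldSuperParent",
    )
}

# Stub standing in for RuntimeApiSiteClient.mint_id(...).json().
def id_minter(schema_class, how_many):
    return [f"{schema_class}-fake-{i}" for i in range(how_many)]

translator = NeonSurfaceWaterDataTranslator(
    tables,
    {"SUGG": "USA: Florida, Suggs Lake"},  # example site_code_mapping entry
    pd.read_csv("neon-nlcd-local-broad-mappings.tsv", sep="\t"),
    pd.read_csv("neon_raw_data_file_mappings.tsv", sep="\t"),
    id_minter=id_minter,
)
database = translator.get_database()  # nmdc:Database ready for JSON export
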
nmdc_runtime/site/translation/neon_utils.py CHANGED
@@ -22,7 +22,11 @@ def _get_value_or_none(data: pd.DataFrame, column_name: str) -> Union[str, float
     ):
         if column_name == "horizon":
             return f"{data[column_name].values[0]} horizon"
-        elif column_name == "qaqcStatus":
+        elif (
+            column_name == "qaqcStatus"
+            or column_name == "extrQaqcStatus"
+            or column_name == "seqQaqcStatus"
+        ):
             return data[column_name].values[0].lower()
         elif column_name == "sampleTopDepth":
             return float(data[column_name].values[0]) / 100
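The widened branch means the helper now lower-cases any of the three QA/QC status columns rather than only `qaqcStatus`, which is what lets the surface-water translator feed `extrQaqcStatus` straight into `nmdc.StatusEnum`. A quick illustration, assuming the surrounding guard (column present, non-null, non-empty) passes:

import pandas as pd

from nmdc_runtime.site.translation.neon_utils import _get_value_or_none

row = pd.DataFrame({"extrQaqcStatus": ["Pass"], "seqQaqcStatus": ["Fail"]})

# Both new column names are normalized to lowercase, matching the values
# the translators pass into nmdc.StatusEnum.
assert _get_value_or_none(row, "extrQaqcStatus") == "pass"
assert _get_value_or_none(row, "seqQaqcStatus") == "fail"
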
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nmdc_runtime
-Version: 1.4.2
+Version: 1.6.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -35,9 +35,9 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
 nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/graphs.py,sha256=siHlRnD2eS9nw3Ne049TcGG6I6IYFvjgWQuuSHzEOqc,9492
-nmdc_runtime/site/ops.py,sha256=YzDm7Dm2sELptwTew8DTOcS3nYBH_JegXhu3wzZuuiY,32482
-nmdc_runtime/site/repository.py,sha256=UgY9eMnNgZxa-Y0QeDyENh4KHtxuBWkYCjxltM4mTzA,30938
+nmdc_runtime/site/graphs.py,sha256=mOWZvT2Rk4X96RmVAvHQwur-FhNuMWAko3jjRLGygEE,11455
+nmdc_runtime/site/ops.py,sha256=YjaH2zqzd01cRcqV0E93RoaWt8T4ExESx4SSszmczZ8,33620
+nmdc_runtime/site/repository.py,sha256=QI9Gcjr68-DT2MPwOx87Vkxcwp3ZIOVaFZ9uCO13w9U,35502
 nmdc_runtime/site/resources.py,sha256=pQSwg1dRpL_D91gYLzzaOIDZ3qa69rPqSlsq5dS9i_M,17783
 nmdc_runtime/site/util.py,sha256=6hyVPpb6ZkWEG8Nm7uQxnZ-QmuPOG9hgWvl0mUBr5JU,1303
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,7 +64,8 @@ nmdc_runtime/site/translation/gold_translator.py,sha256=8i5FxrgAG4rLbM0mcCSBaZEz
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
 nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=e_7tXFrP0PpdhqUCxXmOaFViSuG36IIMDqyj3FHLcgQ,23069
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=cJJ_QPva5G5SIT_7DjCSsqbDvgbiKGqUYrxK3nx7_Lw,37634
-nmdc_runtime/site/translation/neon_utils.py,sha256=k8JYMnm-L981BTOdAMomR1CulS_Hz5v7aYxrJ94KEJc,5086
+nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=6LaFwBnVx6TN9v1D-G6LFrDxY0TK05AvMklx0E1tTeQ,26590
+nmdc_runtime/site/translation/neon_utils.py,sha256=mdxJVPb3zbD4DiKW3Fwgk22kjczKMwkcozvy7fwteTE,5203
 nmdc_runtime/site/translation/submission_portal_translator.py,sha256=lHcrfPR5wk3BcZ0Uw5zUyWu5XRVikgOzdzSb5nFVS9I,27964
 nmdc_runtime/site/translation/translator.py,sha256=xM9dM-nTgSWwu5HFoUVNHf8kqk9iiH4PgWdSx4OKxEk,601
 nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
@@ -73,9 +74,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-1.4.2.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-1.4.2.dist-info/METADATA,sha256=Y-wVXewG_8q6XgG2JMsAUyJJ_bdvadDKRUCWkmP-u4U,7424
-nmdc_runtime-1.4.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-nmdc_runtime-1.4.2.dist-info/entry_points.txt,sha256=nfH6-K9tDKv7va8ENfShsBnxVQoYJdEe7HHdwtkbh1Y,289
-nmdc_runtime-1.4.2.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-1.4.2.dist-info/RECORD,,
+nmdc_runtime-1.6.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-1.6.0.dist-info/METADATA,sha256=hKgDLZfx14AX3IWIi3C9vHa9YAP-agU7tsmKZ_kg8JY,7424
+nmdc_runtime-1.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+nmdc_runtime-1.6.0.dist-info/entry_points.txt,sha256=nfH6-K9tDKv7va8ENfShsBnxVQoYJdEe7HHdwtkbh1Y,289
+nmdc_runtime-1.6.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-1.6.0.dist-info/RECORD,,