nmdc-runtime 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,6 @@ from nmdc_runtime.infrastructure.database.impl.mongo.models.user import (
9
9
 
10
10
 
11
11
  class Container(containers.DeclarativeContainer):
12
-
13
12
  user_queries = providers.Singleton(UserQueries)
14
13
 
15
14
  user_service = providers.Factory(UserService, user_queries=user_queries)
@@ -196,7 +196,6 @@ class NMDC_ETL:
196
196
  print_df=False,
197
197
  print_dict=False,
198
198
  ) -> list:
199
-
200
199
  ## used for testing
201
200
  if test_rows != 0:
202
201
  nmdc_df = nmdc_df.head(test_rows)
@@ -995,7 +995,6 @@ def make_quantity_value(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
995
995
  for attribute in tx_attributes:
996
996
  for obj in nmdc_objs:
997
997
  if has_raw_value(obj, attribute):
998
-
999
998
  val = getattr(obj, attribute)
1000
999
 
1001
1000
  ## split raw value after first space
@@ -44,6 +44,9 @@ from nmdc_runtime.site.ops import (
44
44
  get_submission_portal_pipeline_inputs,
45
45
  get_csv_rows_from_url,
46
46
  get_neon_pipeline_benthic_data_product,
47
+ get_neon_pipeline_inputs,
48
+ get_df_from_url,
49
+ site_code_mapping,
47
50
  )
48
51
 
49
52
 
@@ -207,7 +210,20 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
207
210
  mms_data = neon_data_by_product(mms_data_product)
208
211
  sls_data = neon_data_by_product(sls_data_product)
209
212
 
210
- database = nmdc_schema_database_from_neon_soil_data(mms_data, sls_data)
213
+ (
214
+ neon_envo_mappings_file_url,
215
+ neon_raw_data_file_mappings_file_url,
216
+ ) = get_neon_pipeline_inputs()
217
+
218
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
219
+
220
+ neon_raw_data_file_mappings_file = get_df_from_url(
221
+ neon_raw_data_file_mappings_file_url
222
+ )
223
+
224
+ database = nmdc_schema_database_from_neon_soil_data(
225
+ mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
226
+ )
211
227
 
212
228
  database_dict = nmdc_schema_object_to_dict(database)
213
229
  filename = nmdc_schema_database_export_filename_neon()
@@ -224,18 +240,48 @@ def ingest_neon_soil_metadata():
224
240
  mms_data = neon_data_by_product(mms_data_product)
225
241
  sls_data = neon_data_by_product(sls_data_product)
226
242
 
227
- database = nmdc_schema_database_from_neon_soil_data(mms_data, sls_data)
243
+ (
244
+ neon_envo_mappings_file_url,
245
+ neon_raw_data_file_mappings_file_url,
246
+ ) = get_neon_pipeline_inputs()
247
+
248
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
249
+
250
+ neon_raw_data_file_mappings_file = get_df_from_url(
251
+ neon_raw_data_file_mappings_file_url
252
+ )
253
+
254
+ database = nmdc_schema_database_from_neon_soil_data(
255
+ mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
256
+ )
228
257
  run_id = submit_metadata_to_db(database)
229
258
  poll_for_run_completion(run_id)
230
259
 
231
260
 
232
261
  @graph
233
262
  def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
234
- mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
263
+ (
264
+ neon_envo_mappings_file_url,
265
+ neon_raw_data_file_mappings_file_url,
266
+ ) = get_neon_pipeline_inputs()
235
267
 
268
+ mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
236
269
  mms_benthic = neon_data_by_product(mms_benthic_data_product)
237
270
 
238
- database = nmdc_schema_database_from_neon_benthic_data(mms_benthic)
271
+ sites_mapping_dict = site_code_mapping()
272
+
273
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
274
+
275
+ neon_raw_data_file_mappings_file = get_df_from_url(
276
+ neon_raw_data_file_mappings_file_url
277
+ )
278
+
279
+ database = nmdc_schema_database_from_neon_benthic_data(
280
+ mms_benthic,
281
+ sites_mapping_dict,
282
+ neon_envo_mappings_file,
283
+ neon_raw_data_file_mappings_file,
284
+ )
239
285
 
240
286
  database_dict = nmdc_schema_object_to_dict(database)
241
287
  filename = nmdc_schema_database_export_filename_neon()
@@ -250,6 +296,24 @@ def ingest_neon_benthic_metadata():
250
296
 
251
297
  mms_benthic = neon_data_by_product(mms_benthic_data_product)
252
298
 
253
- database = nmdc_schema_database_from_neon_benthic_data(mms_benthic)
299
+ sites_mapping_dict = site_code_mapping()
300
+
301
+ (
302
+ neon_envo_mappings_file_url,
303
+ neon_raw_data_file_mappings_file_url,
304
+ ) = get_neon_pipeline_inputs()
305
+
306
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
307
+
308
+ neon_raw_data_file_mappings_file = get_df_from_url(
309
+ neon_raw_data_file_mappings_file_url
310
+ )
311
+
312
+ database = nmdc_schema_database_from_neon_benthic_data(
313
+ mms_benthic,
314
+ sites_mapping_dict,
315
+ neon_envo_mappings_file,
316
+ neon_raw_data_file_mappings_file,
317
+ )
254
318
  run_id = submit_metadata_to_db(database)
255
319
  poll_for_run_completion(run_id)
nmdc_runtime/site/ops.py CHANGED
@@ -6,7 +6,7 @@ import subprocess
6
6
  import tempfile
7
7
  from collections import defaultdict
8
8
  from datetime import datetime, timezone
9
- from io import BytesIO
9
+ from io import BytesIO, StringIO
10
10
  from typing import Tuple
11
11
  from zipfile import ZipFile
12
12
  import pandas as pd
@@ -815,6 +815,8 @@ def nmdc_schema_database_from_neon_soil_data(
815
815
  context: OpExecutionContext,
816
816
  mms_data: Dict[str, pd.DataFrame],
817
817
  sls_data: Dict[str, pd.DataFrame],
818
+ neon_envo_mappings_file: pd.DataFrame,
819
+ neon_raw_data_file_mappings_file: pd.DataFrame,
818
820
  ) -> nmdc.Database:
819
821
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
820
822
 
@@ -822,7 +824,13 @@ def nmdc_schema_database_from_neon_soil_data(
822
824
  response = client.mint_id(*args, **kwargs)
823
825
  return response.json()
824
826
 
825
- translator = NeonSoilDataTranslator(mms_data, sls_data, id_minter=id_minter)
827
+ translator = NeonSoilDataTranslator(
828
+ mms_data,
829
+ sls_data,
830
+ neon_envo_mappings_file,
831
+ neon_raw_data_file_mappings_file,
832
+ id_minter=id_minter,
833
+ )
826
834
 
827
835
  database = translator.get_database()
828
836
  return database
@@ -832,6 +840,9 @@ def nmdc_schema_database_from_neon_soil_data(
832
840
  def nmdc_schema_database_from_neon_benthic_data(
833
841
  context: OpExecutionContext,
834
842
  benthic_data: Dict[str, pd.DataFrame],
843
+ site_code_mapping: Dict[str, str],
844
+ neon_envo_mappings_file: pd.DataFrame,
845
+ neon_raw_data_file_mappings_file: pd.DataFrame,
835
846
  ) -> nmdc.Database:
836
847
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
837
848
 
@@ -839,12 +850,34 @@ def nmdc_schema_database_from_neon_benthic_data(
839
850
  response = client.mint_id(*args, **kwargs)
840
851
  return response.json()
841
852
 
842
- translator = NeonBenthicDataTranslator(benthic_data, id_minter=id_minter)
853
+ translator = NeonBenthicDataTranslator(
854
+ benthic_data,
855
+ site_code_mapping,
856
+ neon_envo_mappings_file,
857
+ neon_raw_data_file_mappings_file,
858
+ id_minter=id_minter,
859
+ )
843
860
 
844
861
  database = translator.get_database()
845
862
  return database
846
863
 
847
864
 
865
@op(
    out={
        "neon_envo_mappings_file_url": Out(),
        "neon_raw_data_file_mappings_file_url": Out(),
    }
)
def get_neon_pipeline_inputs(
    neon_envo_mappings_file_url: str,
    neon_raw_data_file_mappings_file_url: str,
) -> Tuple[str, str]:
    """Forward the two NEON mapping-file URLs as separate op outputs.

    The op performs no work of its own: it receives both URLs as inputs and
    hands them back unchanged, one per declared output.

    :param neon_envo_mappings_file_url: URL returned as the first output
    :param neon_raw_data_file_mappings_file_url: URL returned as the second output
    :return: both URLs, in the same order as the declared outputs
    """
    pipeline_inputs = (
        neon_envo_mappings_file_url,
        neon_raw_data_file_mappings_file_url,
    )
    return pipeline_inputs
879
+
880
+
848
881
  @op
849
882
  def nmdc_schema_database_export_filename_neon() -> str:
850
883
  return "database_from_neon_metadata.json"
@@ -872,3 +905,41 @@ def get_csv_rows_from_url(url: Optional[str]) -> List[Dict]:
872
905
  # Collect all the rows into a list of dicts while stripping out (valfilter) cells where the
873
906
  # value is an empty string (identity returns a Falsy value).
874
907
  return [valfilter(identity, row) for row in reader]
908
+
909
+
910
@op
def get_df_from_url(url: str) -> pd.DataFrame:
    """Download a TSV file and return its contents as a pandas DataFrame.

    :param url: raw URL of the TSV file to be downloaded as a DataFrame
    :return: pandas DataFrame of TSV data; an empty DataFrame when ``url`` is
        empty or None
    :raises requests.HTTPError: if the server responds with an error status
    """
    if not url:
        return pd.DataFrame()

    response = requests.get(url)
    response.raise_for_status()

    # Parse the payload that was just downloaded instead of handing the URL to
    # pandas, which would fetch the same file a second time and bypass the
    # status check above. Raw bytes are passed so that pandas applies its own
    # default decoding, exactly as it did when it read from the URL itself.
    df = pd.read_csv(BytesIO(response.content), delimiter="\t")

    return df
927
+
928
+
929
@op
def site_code_mapping() -> dict:
    """Build a lookup from NEON site code to a location label.

    Queries the NEON sites endpoint and, for every entry under ``"data"`` in
    the JSON response, maps ``siteCode`` to a label of the form
    ``"USA: <stateName>, <siteName>"`` with any `` NEON`` substring removed.

    :return: dictionary keyed by site code with the formatted label as value
    :raises Exception: if the endpoint responds with a status other than 200
    """
    endpoint = "https://data.neonscience.org/api/v0/sites/"
    response = requests.get(endpoint)

    # Bail out early on anything other than a plain success.
    if response.status_code != 200:
        raise Exception(
            f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}"
        )

    labels_by_code = {}
    for site in response.json()["data"]:
        label = f"USA: {site['stateName']}, {site['siteName']}"
        labels_by_code[site["siteCode"]] = label.replace(" NEON", "")
    return labels_by_code
@@ -1,5 +1,7 @@
1
1
  import json
2
2
 
3
+ from typing import Any
4
+
3
5
  from dagster import (
4
6
  repository,
5
7
  ScheduleDefinition,
@@ -592,10 +594,27 @@ def biosample_submission_ingest():
592
594
  "base_url": {"env": "NEON_API_BASE_URL"},
593
595
  "api_token": {"env": "NEON_API_TOKEN"},
594
596
  },
595
- }
597
+ },
598
+ "mongo": {
599
+ "config": {
600
+ "dbname": {"env": "MONGO_DBNAME"},
601
+ "host": {"env": "MONGO_HOST"},
602
+ "password": {"env": "MONGO_PASSWORD"},
603
+ "username": {"env": "MONGO_USERNAME"},
604
+ },
605
+ },
606
+ "runtime_api_site_client": {
607
+ "config": {
608
+ "base_url": {"env": "API_HOST"},
609
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
610
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
611
+ "site_id": {"env": "API_SITE_ID"},
612
+ },
613
+ },
596
614
  },
597
615
  ),
598
616
  "ops": {
617
+ "export_json_to_drs": {"config": {"username": "..."}},
599
618
  "get_neon_pipeline_mms_data_product": {
600
619
  "config": {
601
620
  "mms_data_product": {
@@ -612,7 +631,12 @@ def biosample_submission_ingest():
612
631
  }
613
632
  }
614
633
  },
615
- "export_json_to_drs": {"config": {"username": ""}},
634
+ "get_neon_pipeline_inputs": {
635
+ "inputs": {
636
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
637
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
638
+ }
639
+ },
616
640
  },
617
641
  },
618
642
  ),
@@ -648,6 +672,12 @@ def biosample_submission_ingest():
648
672
  }
649
673
  }
650
674
  },
675
+ "get_neon_pipeline_inputs": {
676
+ "inputs": {
677
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
678
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
679
+ }
680
+ },
651
681
  },
652
682
  },
653
683
  ),
@@ -663,10 +693,33 @@ def biosample_submission_ingest():
663
693
  "base_url": {"env": "NEON_API_BASE_URL"},
664
694
  "api_token": {"env": "NEON_API_TOKEN"},
665
695
  },
666
- }
696
+ },
697
+ "mongo": {
698
+ "config": {
699
+ "dbname": {"env": "MONGO_DBNAME"},
700
+ "host": {"env": "MONGO_HOST"},
701
+ "password": {"env": "MONGO_PASSWORD"},
702
+ "username": {"env": "MONGO_USERNAME"},
703
+ },
704
+ },
705
+ "runtime_api_site_client": {
706
+ "config": {
707
+ "base_url": {"env": "API_HOST"},
708
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
709
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
710
+ "site_id": {"env": "API_SITE_ID"},
711
+ },
712
+ },
667
713
  },
668
714
  ),
669
715
  "ops": {
716
+ "export_json_to_drs": {"config": {"username": "..."}},
717
+ "get_neon_pipeline_inputs": {
718
+ "inputs": {
719
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
720
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
721
+ }
722
+ },
670
723
  "get_neon_pipeline_benthic_data_product": {
671
724
  "config": {
672
725
  "benthic_data_product": {
@@ -675,7 +728,6 @@ def biosample_submission_ingest():
675
728
  }
676
729
  }
677
730
  },
678
- "export_json_to_drs": {"config": {"username": ""}},
679
731
  },
680
732
  },
681
733
  ),
@@ -703,6 +755,12 @@ def biosample_submission_ingest():
703
755
  }
704
756
  }
705
757
  },
758
+ "get_neon_pipeline_inputs": {
759
+ "inputs": {
760
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
761
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
762
+ }
763
+ },
706
764
  },
707
765
  },
708
766
  ),
@@ -212,7 +212,7 @@ class GoldStudyTranslator(Translator):
212
212
  return None
213
213
  elif minimum_numeric_value is not None and maximum_numeric_value is None:
214
214
  return nmdc.QuantityValue(
215
- has_raw_value=field_value,
215
+ has_raw_value=minimum_numeric_value,
216
216
  has_numeric_value=nmdc.Double(minimum_numeric_value),
217
217
  has_unit=unit,
218
218
  )
@@ -2,13 +2,20 @@ import re
2
2
  import sqlite3
3
3
 
4
4
  import pandas as pd
5
- import requests
6
5
  import requests_cache
7
6
 
8
7
  from nmdc_schema import nmdc
9
8
  from nmdc_runtime.site.translation.translator import Translator
10
9
  from nmdc_runtime.site.util import get_basename
11
- from nmdc_runtime.site.translation.neon_utils import _get_value_or_none, _create_controlled_identified_term_value, _create_controlled_term_value, _create_geolocation_value, _create_quantity_value, _create_timestamp_value, _create_text_value
10
+ from nmdc_runtime.site.translation.neon_utils import (
11
+ _get_value_or_none,
12
+ _create_controlled_identified_term_value,
13
+ _create_controlled_term_value,
14
+ _create_geolocation_value,
15
+ _create_quantity_value,
16
+ _create_timestamp_value,
17
+ _create_text_value,
18
+ )
12
19
 
13
20
 
14
21
  BENTHIC_BROAD_SCALE_MAPPINGS = {
@@ -34,7 +41,15 @@ BENTHIC_ENV_MEDIUM_MAPPINGS = {
34
41
 
35
42
 
36
43
  class NeonBenthicDataTranslator(Translator):
37
- def __init__(self, benthic_data: dict, *args, **kwargs) -> None:
44
+ def __init__(
45
+ self,
46
+ benthic_data: dict,
47
+ site_code_mapping: dict,
48
+ neon_envo_mappings_file: pd.DataFrame,
49
+ neon_raw_data_file_mappings_file: pd.DataFrame,
50
+ *args,
51
+ **kwargs,
52
+ ) -> None:
38
53
  super().__init__(*args, **kwargs)
39
54
 
40
55
  self.conn = sqlite3.connect("neon.db")
@@ -67,29 +82,16 @@ class NeonBenthicDataTranslator(Translator):
67
82
  f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
68
83
  )
69
84
 
70
- neon_envo_mappings_file = "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv"
71
- neon_envo_terms = pd.read_csv(neon_envo_mappings_file, delimiter="\t")
72
- neon_envo_terms.to_sql(
85
+ neon_envo_mappings_file.to_sql(
73
86
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
74
87
  )
75
88
 
76
- neon_raw_data_file_mappings_file = "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv"
77
- self.neon_raw_data_file_mappings_df = pd.read_csv(
78
- neon_raw_data_file_mappings_file, delimiter="\t"
79
- )
89
+ self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
80
90
  self.neon_raw_data_file_mappings_df.to_sql(
81
91
  "neonRawDataFile", self.conn, if_exists="replace", index=False
82
92
  )
83
93
 
84
- def get_site_by_code(self, site_code: str) -> str:
85
- site_response = requests.get(
86
- f"https://data.neonscience.org/api/v0/sites/{site_code}"
87
- )
88
- if site_response.status_code == 200:
89
- site_response = site_response.json()
90
- return f"USA: {site_response['data']['stateName']}, {site_response['data']['siteName']}".replace(
91
- " NEON", ""
92
- )
94
+ self.site_code_mapping = site_code_mapping
93
95
 
94
96
  def _translate_biosample(
95
97
  self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
@@ -134,7 +136,7 @@ class NeonBenthicDataTranslator(Translator):
134
136
  biosample_row["fieldSampleVolume"].values[0], "mL"
135
137
  ),
136
138
  geo_loc_name=_create_text_value(
137
- self.get_site_by_code(biosample_row["siteID"].values[0])
139
+ self.site_code_mapping[biosample_row["siteID"].values[0]]
138
140
  if biosample_row["siteID"].values[0]
139
141
  else None
140
142
  ),
@@ -210,9 +212,7 @@ class NeonBenthicDataTranslator(Translator):
210
212
  :return: Object that using LibraryPreparation process model.
211
213
  """
212
214
  processing_institution = None
213
- laboratory_name = _get_value_or_none(
214
- library_preparation_row, "laboratoryName"
215
- )
215
+ laboratory_name = _get_value_or_none(library_preparation_row, "laboratoryName")
216
216
  if laboratory_name is not None:
217
217
  if re.search("Battelle", laboratory_name, re.IGNORECASE):
218
218
  processing_institution = "Battelle"
@@ -263,9 +263,7 @@ class NeonBenthicDataTranslator(Translator):
263
263
  has_input=processed_sample_id,
264
264
  has_output=raw_data_file_data,
265
265
  processing_institution=processing_institution,
266
- ncbi_project_name=_get_value_or_none(
267
- omics_processing_row, "ncbiProjectID"
268
- ),
266
+ ncbi_project_name=_get_value_or_none(omics_processing_row, "ncbiProjectID"),
269
267
  omics_type=_create_controlled_term_value(
270
268
  omics_processing_row["investigation_type"].values[0]
271
269
  ),
@@ -434,9 +432,7 @@ class NeonBenthicDataTranslator(Translator):
434
432
  )
435
433
 
436
434
  for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
437
- biosample_row = benthic_samples[
438
- benthic_samples["sampleID"] == neon_id
439
- ]
435
+ biosample_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
440
436
 
441
437
  database.biosample_set.append(
442
438
  self._translate_biosample(neon_id, nmdc_id, biosample_row)
@@ -458,7 +454,9 @@ class NeonBenthicDataTranslator(Translator):
458
454
  )
459
455
  )
460
456
 
461
- genomics_sample_id = _get_value_or_none(extraction_row, "genomicsSampleID")
457
+ genomics_sample_id = _get_value_or_none(
458
+ extraction_row, "genomicsSampleID"
459
+ )
462
460
 
463
461
  database.processed_sample_set.append(
464
462
  self._translate_processed_sample(
@@ -1,17 +1,34 @@
1
1
  import re
2
2
  import sqlite3
3
- from typing import Union, List
3
+ from typing import List
4
4
 
5
5
  import pandas as pd
6
6
 
7
7
  from nmdc_schema import nmdc
8
8
  from nmdc_runtime.site.translation.translator import Translator
9
9
  from nmdc_runtime.site.util import get_basename
10
- from nmdc_runtime.site.translation.neon_utils import _get_value_or_none, _create_controlled_identified_term_value, _create_controlled_term_value, _create_geolocation_value, _create_quantity_value, _create_timestamp_value, _create_text_value, _create_double_value
10
+ from nmdc_runtime.site.translation.neon_utils import (
11
+ _get_value_or_none,
12
+ _create_controlled_identified_term_value,
13
+ _create_controlled_term_value,
14
+ _create_geolocation_value,
15
+ _create_quantity_value,
16
+ _create_timestamp_value,
17
+ _create_text_value,
18
+ _create_double_value,
19
+ )
11
20
 
12
21
 
13
22
  class NeonSoilDataTranslator(Translator):
14
- def __init__(self, mms_data: dict, sls_data: dict, *args, **kwargs) -> None:
23
+ def __init__(
24
+ self,
25
+ mms_data: dict,
26
+ sls_data: dict,
27
+ neon_envo_mappings_file: pd.DataFrame,
28
+ neon_raw_data_file_mappings_file: pd.DataFrame,
29
+ *args,
30
+ **kwargs,
31
+ ) -> None:
15
32
  super().__init__(*args, **kwargs)
16
33
 
17
34
  self.conn = sqlite3.connect("neon.db")
@@ -73,16 +90,11 @@ class NeonSoilDataTranslator(Translator):
73
90
  f"You are missing one of the soil periodic tables: {neon_sls_data_tables}"
74
91
  )
75
92
 
76
- neon_envo_mappings_file = "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv"
77
- neon_envo_terms = pd.read_csv(neon_envo_mappings_file, delimiter="\t")
78
- neon_envo_terms.to_sql(
93
+ neon_envo_mappings_file.to_sql(
79
94
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
80
95
  )
81
96
 
82
- neon_raw_data_file_mappings_file = "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv"
83
- self.neon_raw_data_file_mappings_df = pd.read_csv(
84
- neon_raw_data_file_mappings_file, delimiter="\t"
85
- )
97
+ self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
86
98
  self.neon_raw_data_file_mappings_df.to_sql(
87
99
  "neonRawDataFile", self.conn, if_exists="replace", index=False
88
100
  )
@@ -124,9 +136,7 @@ class NeonSoilDataTranslator(Translator):
124
136
  collection_date=_create_timestamp_value(
125
137
  biosample_row["collectDate"].values[0]
126
138
  ),
127
- temp=_create_quantity_value(
128
- biosample_row["soilTemp"].values[0], "Celsius"
129
- ),
139
+ temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Celsius"),
130
140
  depth=nmdc.QuantityValue(
131
141
  has_minimum_numeric_value=_get_value_or_none(
132
142
  biosample_row, "sampleTopDepth"
@@ -136,13 +146,9 @@ class NeonSoilDataTranslator(Translator):
136
146
  ),
137
147
  has_unit="m",
138
148
  ),
139
- samp_collec_device=_get_value_or_none(
140
- biosample_row, "soilSamplingDevice"
141
- ),
149
+ samp_collec_device=_get_value_or_none(biosample_row, "soilSamplingDevice"),
142
150
  soil_horizon=_get_value_or_none(biosample_row, "horizon"),
143
- analysis_type=_get_value_or_none(
144
- biosample_row, "sequenceAnalysisType"
145
- ),
151
+ analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
146
152
  env_package=_create_text_value(biosample_row["sampleType"].values[0]),
147
153
  nitro=_create_quantity_value(
148
154
  biosample_row["nitrogenPercent"].values[0], "percent"
@@ -301,9 +307,7 @@ class NeonSoilDataTranslator(Translator):
301
307
  :return: Object that using LibraryPreparation process model.
302
308
  """
303
309
  processing_institution = None
304
- laboratory_name = _get_value_or_none(
305
- library_preparation_row, "laboratoryName"
306
- )
310
+ laboratory_name = _get_value_or_none(library_preparation_row, "laboratoryName")
307
311
  if laboratory_name is not None:
308
312
  if re.search("Battelle", laboratory_name, re.IGNORECASE):
309
313
  processing_institution = "Battelle"
@@ -354,9 +358,7 @@ class NeonSoilDataTranslator(Translator):
354
358
  has_input=processed_sample_id,
355
359
  has_output=raw_data_file_data,
356
360
  processing_institution=processing_institution,
357
- ncbi_project_name=_get_value_or_none(
358
- omics_processing_row, "ncbiProjectID"
359
- ),
361
+ ncbi_project_name=_get_value_or_none(omics_processing_row, "ncbiProjectID"),
360
362
  omics_type=_create_controlled_term_value(
361
363
  omics_processing_row["investigation_type"].values[0]
362
364
  ),
@@ -5,35 +5,34 @@ import pandas as pd
5
5
  from nmdc_schema import nmdc
6
6
 
7
7
 
8
- def _get_value_or_none(
9
- data: pd.DataFrame, column_name: str
10
- ) -> Union[str, float, None]:
11
- """
12
- Get the value from the specified column in the data DataFrame.
13
- If the column value is NaN, return None. However, there are handlers
14
- for a select set of columns - horizon, qaqcStatus, sampleTopDepth,
15
- and sampleBottomDepth.
16
-
17
- :param data: DataFrame to read the column value from.
18
- :return: Either a string, float or None depending on the column/column values.
19
- """
20
- if (
21
- column_name in data
22
- and not data[column_name].isna().any()
23
- and not data[column_name].empty
24
- ):
25
- if column_name == "horizon":
26
- return f"{data[column_name].values[0]} horizon"
27
- elif column_name == "qaqcStatus":
28
- return data[column_name].values[0].lower()
29
- elif column_name == "sampleTopDepth":
30
- return float(data[column_name].values[0]) / 100
31
- elif column_name == "sampleBottomDepth":
32
- return float(data[column_name].values[0]) / 100
33
- else:
34
- return data[column_name].values[0]
8
+ def _get_value_or_none(data: pd.DataFrame, column_name: str) -> Union[str, float, None]:
9
+ """
10
+ Get the value from the specified column in the data DataFrame.
11
+ If the column value is NaN, return None. However, there are handlers
12
+ for a select set of columns - horizon, qaqcStatus, sampleTopDepth,
13
+ and sampleBottomDepth.
14
+
15
+ :param data: DataFrame to read the column value from.
16
+ :return: Either a string, float or None depending on the column/column values.
17
+ """
18
+ if (
19
+ column_name in data
20
+ and not data[column_name].isna().any()
21
+ and not data[column_name].empty
22
+ ):
23
+ if column_name == "horizon":
24
+ return f"{data[column_name].values[0]} horizon"
25
+ elif column_name == "qaqcStatus":
26
+ return data[column_name].values[0].lower()
27
+ elif column_name == "sampleTopDepth":
28
+ return float(data[column_name].values[0]) / 100
29
+ elif column_name == "sampleBottomDepth":
30
+ return float(data[column_name].values[0]) / 100
31
+ else:
32
+ return data[column_name].values[0]
33
+
34
+ return None
35
35
 
36
- return None
37
36
 
38
37
  def _create_controlled_identified_term_value(
39
38
  id: str = None, name: str = None
@@ -47,13 +46,10 @@ def _create_controlled_identified_term_value(
47
46
  """
48
47
  if id is None or name is None:
49
48
  return None
50
- return nmdc.ControlledIdentifiedTermValue(
51
- term=nmdc.OntologyClass(id=id, name=name)
52
- )
49
+ return nmdc.ControlledIdentifiedTermValue(term=nmdc.OntologyClass(id=id, name=name))
53
50
 
54
- def _create_controlled_term_value(
55
- name: str = None
56
- ) -> nmdc.ControlledTermValue:
51
+
52
+ def _create_controlled_term_value(name: str = None) -> nmdc.ControlledTermValue:
57
53
  """
58
54
  Create a ControlledIdentifiedTermValue object with the specified id and name.
59
55
 
@@ -66,6 +62,7 @@ def _create_controlled_term_value(
66
62
  return None
67
63
  return nmdc.ControlledTermValue(has_raw_value=name)
68
64
 
65
+
69
66
  def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
70
67
  """
71
68
  Create a TimestampValue object with the specified value.
@@ -78,6 +75,7 @@ def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
78
75
  return None
79
76
  return nmdc.TimestampValue(has_raw_value=value)
80
77
 
78
+
81
79
  def _create_quantity_value(
82
80
  numeric_value: Union[str, int, float] = None, unit: str = None
83
81
  ) -> nmdc.QuantityValue:
@@ -94,6 +92,7 @@ def _create_quantity_value(
94
92
  return None
95
93
  return nmdc.QuantityValue(has_numeric_value=float(numeric_value), has_unit=unit)
96
94
 
95
+
97
96
  def _create_text_value(value: str = None) -> nmdc.TextValue:
98
97
  """
99
98
  Create a TextValue object with the specified value.
@@ -105,6 +104,7 @@ def _create_text_value(value: str = None) -> nmdc.TextValue:
105
104
  return None
106
105
  return nmdc.TextValue(has_raw_value=value)
107
106
 
107
+
108
108
  def _create_double_value(value: str = None) -> nmdc.Double:
109
109
  """
110
110
  Create a Double object with the specified value.
@@ -117,6 +117,7 @@ def _create_double_value(value: str = None) -> nmdc.Double:
117
117
  return None
118
118
  return nmdc.Double(value)
119
119
 
120
+
120
121
  def _create_geolocation_value(
121
122
  latitude: str = None, longitude: str = None
122
123
  ) -> nmdc.GeolocationValue:
@@ -142,4 +143,4 @@ def _create_geolocation_value(
142
143
  return nmdc.GeolocationValue(
143
144
  latitude=nmdc.DecimalDegree(latitude),
144
145
  longitude=nmdc.DecimalDegree(longitude),
145
- )
146
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
- Name: nmdc-runtime
3
- Version: 1.2.0
2
+ Name: nmdc_runtime
3
+ Version: 1.3.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -1,5 +1,5 @@
1
1
  nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- nmdc_runtime/containers.py,sha256=WBzPue0PRoyKXFxgLR-aQcuHetTa8yC5JjI0dGBblYA,419
2
+ nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
3
3
  nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  nmdc_runtime/util.py,sha256=o74ZKOmSD79brPFAcQFsYpA6wh9287m0hDhDlIpn9VM,19872
5
5
  nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -23,8 +23,8 @@ nmdc_runtime/lib/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,
23
23
  nmdc_runtime/lib/extract_nmdc_data.py,sha256=xDFPoYsgkauN48R4v-tJIF0cP_p3J-sBjnyHd0InD9Y,1177
24
24
  nmdc_runtime/lib/load_nmdc_data.py,sha256=KO2cIqkY3cBCVcFIwsGokZNOKntOejZVG8ecq43NjFM,3934
25
25
  nmdc_runtime/lib/nmdc_dataframes.py,sha256=rVTczY2Jey1yE3x3nZ-RTgtdc2XkzLtKhB_PM3FIb-E,28849
26
- nmdc_runtime/lib/nmdc_etl_class.py,sha256=x83nKGi7peATlPLf3LGGU6cf42ZmpE__s1XQV2DpkPc,13523
27
- nmdc_runtime/lib/transform_nmdc_data.py,sha256=3w8Em5byzzOSgjC1LX2nq_d2O4FsTpkDC3UBiZYi9EM,39907
26
+ nmdc_runtime/lib/nmdc_etl_class.py,sha256=tVh3rKVMkBHQE65_LhKeIjCsaCZQk_HJzbc9K4xUNCs,13522
27
+ nmdc_runtime/lib/transform_nmdc_data.py,sha256=hij4lR3IMQRJQdL-rsP_I-m_WyFPsBMchV2MNFUkh0M,39906
28
28
  nmdc_runtime/minter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  nmdc_runtime/minter/bootstrap.py,sha256=5Ej6pJVBRryRIi0ZwEloY78Zky7iE2okF6tPwRI2axM,822
30
30
  nmdc_runtime/minter/config.py,sha256=mq_s0xjLZK-zwjwk3IGgnk9ZIvvejyyZ7_qZkLt3V-c,1409
@@ -35,9 +35,9 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
35
35
  nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
37
37
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- nmdc_runtime/site/graphs.py,sha256=Ipv6AKz_Itdcs3bu6QQ-wybwdzzBFVsh5AJP1DM16jY,7709
39
- nmdc_runtime/site/ops.py,sha256=knMVKlEaaQ2mx_-KRayEMcuIPvmjWdGwcyeSFqgtyyo,30538
40
- nmdc_runtime/site/repository.py,sha256=OXr5z9EH_2ZYmsbkwPhXPXg-ID9NpV_a4-zQEv5zxGs,27150
38
+ nmdc_runtime/site/graphs.py,sha256=siHlRnD2eS9nw3Ne049TcGG6I6IYFvjgWQuuSHzEOqc,9492
39
+ nmdc_runtime/site/ops.py,sha256=YzDm7Dm2sELptwTew8DTOcS3nYBH_JegXhu3wzZuuiY,32482
40
+ nmdc_runtime/site/repository.py,sha256=UgY9eMnNgZxa-Y0QeDyENh4KHtxuBWkYCjxltM4mTzA,30938
41
41
  nmdc_runtime/site/resources.py,sha256=pQSwg1dRpL_D91gYLzzaOIDZ3qa69rPqSlsq5dS9i_M,17783
42
42
  nmdc_runtime/site/util.py,sha256=6hyVPpb6ZkWEG8Nm7uQxnZ-QmuPOG9hgWvl0mUBr5JU,1303
43
43
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -60,11 +60,11 @@ nmdc_runtime/site/terminusdb/schema.py,sha256=3e39rHUSZsNbN_F0SHHNsvcEGRWtYa6O9K
60
60
  nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
61
  nmdc_runtime/site/translation/emsl.py,sha256=l6Q9Jj3RNJFQNYAU_TtKTJ7cyFcR93xBRs_lLdX0bMQ,1244
62
62
  nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
63
- nmdc_runtime/site/translation/gold_translator.py,sha256=vqTmZ5dqmaExIgA8nXpBYCs0d2OedNZgwbWJrt5Tm_M,26441
63
+ nmdc_runtime/site/translation/gold_translator.py,sha256=8i5FxrgAG4rLbM0mcCSBaZEzyReht6xwmpm4xeX4HwI,26451
64
64
  nmdc_runtime/site/translation/jgi.py,sha256=bh73r0uq5BT3ywXwIa1OEKKtz9LbFsSng472tdr-xtg,875
65
- nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=DON0ssFVE11fAHssSteP9lQdaRuu-Rfzi9V90vy27n8,23737
66
- nmdc_runtime/site/translation/neon_soil_translator.py,sha256=fzKvXfMv9biYgNRi_hqns8hZMED1Fj2Jgt1uBTPQwRQ,38036
67
- nmdc_runtime/site/translation/neon_utils.py,sha256=aBx1TKA1rUpA3pZR3LbL7bIrJ8beci7_pZSCgmPAzII,5208
65
+ nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=e_7tXFrP0PpdhqUCxXmOaFViSuG36IIMDqyj3FHLcgQ,23069
66
+ nmdc_runtime/site/translation/neon_soil_translator.py,sha256=x-FfNKsIv0efgxty9v4wOxNu5nrrS-N8phx12IqfLOI,37624
67
+ nmdc_runtime/site/translation/neon_utils.py,sha256=k8JYMnm-L981BTOdAMomR1CulS_Hz5v7aYxrJ94KEJc,5086
68
68
  nmdc_runtime/site/translation/submission_portal_translator.py,sha256=lHcrfPR5wk3BcZ0Uw5zUyWu5XRVikgOzdzSb5nFVS9I,27964
69
69
  nmdc_runtime/site/translation/translator.py,sha256=xM9dM-nTgSWwu5HFoUVNHf8kqk9iiH4PgWdSx4OKxEk,601
70
70
  nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
@@ -73,9 +73,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=TgckqKkFquHDLso77sn-jZRu5ZaBevGCt5p8
73
73
  nmdc_runtime/site/validation/gold.py,sha256=kJ1L081SZb-8qKpF731r5aQOueM206SUfUYMTTNTFMc,802
74
74
  nmdc_runtime/site/validation/jgi.py,sha256=lBo-FCtEYedT74CpW-Kdj512Ib963ik-4YIYmY5puDo,1298
75
75
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
76
- nmdc_runtime-1.2.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
77
- nmdc_runtime-1.2.0.dist-info/METADATA,sha256=MFVvY4uNvHv32d9XVbMk25MolCbsPYfilS85DTvQQtQ,7424
78
- nmdc_runtime-1.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
79
- nmdc_runtime-1.2.0.dist-info/entry_points.txt,sha256=nfH6-K9tDKv7va8ENfShsBnxVQoYJdEe7HHdwtkbh1Y,289
80
- nmdc_runtime-1.2.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
81
- nmdc_runtime-1.2.0.dist-info/RECORD,,
76
+ nmdc_runtime-1.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
77
+ nmdc_runtime-1.3.0.dist-info/METADATA,sha256=XiOhRGoaBESF48sWor9SMpTdCL8X9yPKIh6mnA9xZtY,7424
78
+ nmdc_runtime-1.3.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
79
+ nmdc_runtime-1.3.0.dist-info/entry_points.txt,sha256=nfH6-K9tDKv7va8ENfShsBnxVQoYJdEe7HHdwtkbh1Y,289
80
+ nmdc_runtime-1.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
81
+ nmdc_runtime-1.3.0.dist-info/RECORD,,