nmdc-runtime 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,6 @@ from nmdc_runtime.infrastructure.database.impl.mongo.models.user import (
9
9
 
10
10
 
11
11
  class Container(containers.DeclarativeContainer):
12
-
13
12
  user_queries = providers.Singleton(UserQueries)
14
13
 
15
14
  user_service = providers.Factory(UserService, user_queries=user_queries)
@@ -196,7 +196,6 @@ class NMDC_ETL:
196
196
  print_df=False,
197
197
  print_dict=False,
198
198
  ) -> list:
199
-
200
199
  ## used for testing
201
200
  if test_rows != 0:
202
201
  nmdc_df = nmdc_df.head(test_rows)
@@ -995,7 +995,6 @@ def make_quantity_value(nmdc_objs: list, tx_attributes: list, **kwargs) -> list:
995
995
  for attribute in tx_attributes:
996
996
  for obj in nmdc_objs:
997
997
  if has_raw_value(obj, attribute):
998
-
999
998
  val = getattr(obj, attribute)
1000
999
 
1001
1000
  ## split raw value after first space
@@ -36,12 +36,17 @@ from nmdc_runtime.site.ops import (
36
36
  translate_portal_submission_to_nmdc_schema_database,
37
37
  validate_metadata,
38
38
  neon_data_by_product,
39
- nmdc_schema_database_from_neon_data,
39
+ nmdc_schema_database_from_neon_soil_data,
40
+ nmdc_schema_database_from_neon_benthic_data,
40
41
  nmdc_schema_database_export_filename_neon,
41
42
  get_neon_pipeline_mms_data_product,
42
43
  get_neon_pipeline_sls_data_product,
43
44
  get_submission_portal_pipeline_inputs,
44
45
  get_csv_rows_from_url,
46
+ get_neon_pipeline_benthic_data_product,
47
+ get_neon_pipeline_inputs,
48
+ get_df_from_url,
49
+ site_code_mapping,
45
50
  )
46
51
 
47
52
 
@@ -198,14 +203,27 @@ def ingest_metadata_submission():
198
203
 
199
204
 
200
205
  @graph
201
- def translate_neon_api_metadata_to_nmdc_schema_database():
206
+ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
202
207
  mms_data_product = get_neon_pipeline_mms_data_product()
203
208
  sls_data_product = get_neon_pipeline_sls_data_product()
204
209
 
205
210
  mms_data = neon_data_by_product(mms_data_product)
206
211
  sls_data = neon_data_by_product(sls_data_product)
207
212
 
208
- database = nmdc_schema_database_from_neon_data(mms_data, sls_data)
213
+ (
214
+ neon_envo_mappings_file_url,
215
+ neon_raw_data_file_mappings_file_url,
216
+ ) = get_neon_pipeline_inputs()
217
+
218
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
219
+
220
+ neon_raw_data_file_mappings_file = get_df_from_url(
221
+ neon_raw_data_file_mappings_file_url
222
+ )
223
+
224
+ database = nmdc_schema_database_from_neon_soil_data(
225
+ mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
226
+ )
209
227
 
210
228
  database_dict = nmdc_schema_object_to_dict(database)
211
229
  filename = nmdc_schema_database_export_filename_neon()
@@ -215,13 +233,87 @@ def translate_neon_api_metadata_to_nmdc_schema_database():
215
233
 
216
234
 
217
235
  @graph
218
- def ingest_neon_metadata():
236
+ def ingest_neon_soil_metadata():
219
237
  mms_data_product = get_neon_pipeline_mms_data_product()
220
238
  sls_data_product = get_neon_pipeline_sls_data_product()
221
239
 
222
240
  mms_data = neon_data_by_product(mms_data_product)
223
241
  sls_data = neon_data_by_product(sls_data_product)
224
242
 
225
- database = nmdc_schema_database_from_neon_data(mms_data, sls_data)
243
+ (
244
+ neon_envo_mappings_file_url,
245
+ neon_raw_data_file_mappings_file_url,
246
+ ) = get_neon_pipeline_inputs()
247
+
248
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
249
+
250
+ neon_raw_data_file_mappings_file = get_df_from_url(
251
+ neon_raw_data_file_mappings_file_url
252
+ )
253
+
254
+ database = nmdc_schema_database_from_neon_soil_data(
255
+ mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
256
+ )
257
+ run_id = submit_metadata_to_db(database)
258
+ poll_for_run_completion(run_id)
259
+
260
+
261
+ @graph
262
+ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
263
+ (
264
+ neon_envo_mappings_file_url,
265
+ neon_raw_data_file_mappings_file_url,
266
+ ) = get_neon_pipeline_inputs()
267
+
268
+ mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
269
+ mms_benthic = neon_data_by_product(mms_benthic_data_product)
270
+
271
+ sites_mapping_dict = site_code_mapping()
272
+
273
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
274
+
275
+ neon_raw_data_file_mappings_file = get_df_from_url(
276
+ neon_raw_data_file_mappings_file_url
277
+ )
278
+
279
+ database = nmdc_schema_database_from_neon_benthic_data(
280
+ mms_benthic,
281
+ sites_mapping_dict,
282
+ neon_envo_mappings_file,
283
+ neon_raw_data_file_mappings_file,
284
+ )
285
+
286
+ database_dict = nmdc_schema_object_to_dict(database)
287
+ filename = nmdc_schema_database_export_filename_neon()
288
+
289
+ outputs = export_json_to_drs(database_dict, filename)
290
+ add_output_run_event(outputs)
291
+
292
+
293
+ @graph
294
+ def ingest_neon_benthic_metadata():
295
+ mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
296
+
297
+ mms_benthic = neon_data_by_product(mms_benthic_data_product)
298
+
299
+ sites_mapping_dict = site_code_mapping()
300
+
301
+ (
302
+ neon_envo_mappings_file_url,
303
+ neon_raw_data_file_mappings_file_url,
304
+ ) = get_neon_pipeline_inputs()
305
+
306
+ neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
307
+
308
+ neon_raw_data_file_mappings_file = get_df_from_url(
309
+ neon_raw_data_file_mappings_file_url
310
+ )
311
+
312
+ database = nmdc_schema_database_from_neon_benthic_data(
313
+ mms_benthic,
314
+ sites_mapping_dict,
315
+ neon_envo_mappings_file,
316
+ neon_raw_data_file_mappings_file,
317
+ )
226
318
  run_id = submit_metadata_to_db(database)
227
319
  poll_for_run_completion(run_id)
nmdc_runtime/site/ops.py CHANGED
@@ -6,7 +6,7 @@ import subprocess
6
6
  import tempfile
7
7
  from collections import defaultdict
8
8
  from datetime import datetime, timezone
9
- from io import BytesIO
9
+ from io import BytesIO, StringIO
10
10
  from typing import Tuple
11
11
  from zipfile import ZipFile
12
12
  import pandas as pd
@@ -64,7 +64,10 @@ from nmdc_runtime.site.resources import (
64
64
  NeonApiClient,
65
65
  )
66
66
  from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
67
- from nmdc_runtime.site.translation.neon_translator import NeonDataTranslator
67
+ from nmdc_runtime.site.translation.neon_soil_translator import NeonSoilDataTranslator
68
+ from nmdc_runtime.site.translation.neon_benthic_translator import (
69
+ NeonBenthicDataTranslator,
70
+ )
68
71
  from nmdc_runtime.site.translation.submission_portal_translator import (
69
72
  SubmissionPortalTranslator,
70
73
  )
@@ -776,6 +779,11 @@ def get_neon_pipeline_sls_data_product(context: OpExecutionContext) -> dict:
776
779
  return context.op_config["sls_data_product"]
777
780
 
778
781
 
782
+ @op(config_schema={"benthic_data_product": dict})
783
+ def get_neon_pipeline_benthic_data_product(context: OpExecutionContext) -> dict:
784
+ return context.op_config["benthic_data_product"]
785
+
786
+
779
787
  @op(required_resource_keys={"neon_api_client"})
780
788
  def neon_data_by_product(
781
789
  context: OpExecutionContext, data_product: dict
@@ -803,10 +811,38 @@ def neon_data_by_product(
803
811
 
804
812
 
805
813
  @op(required_resource_keys={"runtime_api_site_client"})
806
- def nmdc_schema_database_from_neon_data(
814
+ def nmdc_schema_database_from_neon_soil_data(
807
815
  context: OpExecutionContext,
808
816
  mms_data: Dict[str, pd.DataFrame],
809
817
  sls_data: Dict[str, pd.DataFrame],
818
+ neon_envo_mappings_file: pd.DataFrame,
819
+ neon_raw_data_file_mappings_file: pd.DataFrame,
820
+ ) -> nmdc.Database:
821
+ client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
822
+
823
+ def id_minter(*args, **kwargs):
824
+ response = client.mint_id(*args, **kwargs)
825
+ return response.json()
826
+
827
+ translator = NeonSoilDataTranslator(
828
+ mms_data,
829
+ sls_data,
830
+ neon_envo_mappings_file,
831
+ neon_raw_data_file_mappings_file,
832
+ id_minter=id_minter,
833
+ )
834
+
835
+ database = translator.get_database()
836
+ return database
837
+
838
+
839
+ @op(required_resource_keys={"runtime_api_site_client"})
840
+ def nmdc_schema_database_from_neon_benthic_data(
841
+ context: OpExecutionContext,
842
+ benthic_data: Dict[str, pd.DataFrame],
843
+ site_code_mapping: Dict[str, str],
844
+ neon_envo_mappings_file: pd.DataFrame,
845
+ neon_raw_data_file_mappings_file: pd.DataFrame,
810
846
  ) -> nmdc.Database:
811
847
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
812
848
 
@@ -814,12 +850,34 @@ def nmdc_schema_database_from_neon_data(
814
850
  response = client.mint_id(*args, **kwargs)
815
851
  return response.json()
816
852
 
817
- translator = NeonDataTranslator(mms_data, sls_data, id_minter=id_minter)
853
+ translator = NeonBenthicDataTranslator(
854
+ benthic_data,
855
+ site_code_mapping,
856
+ neon_envo_mappings_file,
857
+ neon_raw_data_file_mappings_file,
858
+ id_minter=id_minter,
859
+ )
818
860
 
819
861
  database = translator.get_database()
820
862
  return database
821
863
 
822
864
 
865
+ @op(
866
+ out={
867
+ "neon_envo_mappings_file_url": Out(),
868
+ "neon_raw_data_file_mappings_file_url": Out(),
869
+ }
870
+ )
871
+ def get_neon_pipeline_inputs(
872
+ neon_envo_mappings_file_url: str,
873
+ neon_raw_data_file_mappings_file_url: str,
874
+ ) -> Tuple[str, str]:
875
+ return (
876
+ neon_envo_mappings_file_url,
877
+ neon_raw_data_file_mappings_file_url,
878
+ )
879
+
880
+
823
881
  @op
824
882
  def nmdc_schema_database_export_filename_neon() -> str:
825
883
  return "database_from_neon_metadata.json"
@@ -847,3 +905,41 @@ def get_csv_rows_from_url(url: Optional[str]) -> List[Dict]:
847
905
  # Collect all the rows into a list of dicts while stripping out (valfilter) cells where the
848
906
  # value is an empty string (identity returns a Falsy value).
849
907
  return [valfilter(identity, row) for row in reader]
908
+
909
+
910
+ @op
911
+ def get_df_from_url(url: str) -> pd.DataFrame:
912
+ """Download and return a pandas DataFrame from the URL of a TSV file.
913
+
914
+ :param url: raw URL of the TSV file to be downloaded as a DataFrame
915
+ :return: pandas DataFrame of TSV data
916
+ """
917
+ if not url:
918
+ return pd.DataFrame()
919
+
920
+ response = requests.get(url)
921
+ response.raise_for_status()
922
+
923
+ # Using Pandas read_csv to directly read the file-like object
924
+ df = pd.read_csv(url, delimiter="\t")
925
+
926
+ return df
927
+
928
+
929
+ @op
930
+ def site_code_mapping() -> dict:
931
+ endpoint = "https://data.neonscience.org/api/v0/sites/"
932
+ response = requests.get(endpoint)
933
+ if response.status_code == 200:
934
+ sites_data = response.json()
935
+ site_code_mapping = {
936
+ site["siteCode"]: f"USA: {site['stateName']}, {site['siteName']}".replace(
937
+ " NEON", ""
938
+ )
939
+ for site in sites_data["data"]
940
+ }
941
+ return site_code_mapping
942
+ else:
943
+ raise Exception(
944
+ f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}"
945
+ )
@@ -1,5 +1,7 @@
1
1
  import json
2
2
 
3
+ from typing import Any
4
+
3
5
  from dagster import (
4
6
  repository,
5
7
  ScheduleDefinition,
@@ -34,8 +36,10 @@ from nmdc_runtime.site.graphs import (
34
36
  apply_changesheet,
35
37
  apply_metadata_in,
36
38
  hello_graph,
37
- translate_neon_api_metadata_to_nmdc_schema_database,
38
- ingest_neon_metadata,
39
+ translate_neon_api_soil_metadata_to_nmdc_schema_database,
40
+ translate_neon_api_benthic_metadata_to_nmdc_schema_database,
41
+ ingest_neon_soil_metadata,
42
+ ingest_neon_benthic_metadata,
39
43
  )
40
44
  from nmdc_runtime.site.resources import (
41
45
  get_mongo,
@@ -578,7 +582,7 @@ def biosample_submission_ingest():
578
582
  },
579
583
  },
580
584
  ),
581
- translate_neon_api_metadata_to_nmdc_schema_database.to_job(
585
+ translate_neon_api_soil_metadata_to_nmdc_schema_database.to_job(
582
586
  description="This job fetches the metadata associated with a given NEON data product code and translates it into an equivalent nmdc:Database object. The object is serialized to JSON and stored in DRS. This can be considered a dry-run for the `ingest_neon_metadata` job.",
583
587
  resource_defs=resource_defs,
584
588
  config={
@@ -590,10 +594,27 @@ def biosample_submission_ingest():
590
594
  "base_url": {"env": "NEON_API_BASE_URL"},
591
595
  "api_token": {"env": "NEON_API_TOKEN"},
592
596
  },
593
- }
597
+ },
598
+ "mongo": {
599
+ "config": {
600
+ "dbname": {"env": "MONGO_DBNAME"},
601
+ "host": {"env": "MONGO_HOST"},
602
+ "password": {"env": "MONGO_PASSWORD"},
603
+ "username": {"env": "MONGO_USERNAME"},
604
+ },
605
+ },
606
+ "runtime_api_site_client": {
607
+ "config": {
608
+ "base_url": {"env": "API_HOST"},
609
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
610
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
611
+ "site_id": {"env": "API_SITE_ID"},
612
+ },
613
+ },
594
614
  },
595
615
  ),
596
616
  "ops": {
617
+ "export_json_to_drs": {"config": {"username": "..."}},
597
618
  "get_neon_pipeline_mms_data_product": {
598
619
  "config": {
599
620
  "mms_data_product": {
@@ -610,11 +631,16 @@ def biosample_submission_ingest():
610
631
  }
611
632
  }
612
633
  },
613
- "export_json_to_drs": {"config": {"username": ""}},
634
+ "get_neon_pipeline_inputs": {
635
+ "inputs": {
636
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
637
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
638
+ }
639
+ },
614
640
  },
615
641
  },
616
642
  ),
617
- ingest_neon_metadata.to_job(
643
+ ingest_neon_soil_metadata.to_job(
618
644
  description="This job fetches the metadata associated with a given data product code and translates it into an equivalent nmdc:Database object. This object is validated and ingested into Mongo via a `POST /metadata/json:submit` request.",
619
645
  resource_defs=resource_defs,
620
646
  config={
@@ -646,6 +672,95 @@ def biosample_submission_ingest():
646
672
  }
647
673
  }
648
674
  },
675
+ "get_neon_pipeline_inputs": {
676
+ "inputs": {
677
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
678
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
679
+ }
680
+ },
681
+ },
682
+ },
683
+ ),
684
+ translate_neon_api_benthic_metadata_to_nmdc_schema_database.to_job(
685
+ description="This job fetches the metadata associated with a given NEON data product code and translates it into an equivalent nmdc:Database object. The object is serialized to JSON and stored in DRS. This can be considered a dry-run for the `ingest_neon_metadata` job.",
686
+ resource_defs=resource_defs,
687
+ config={
688
+ "resources": merge(
689
+ unfreeze(normal_resources),
690
+ {
691
+ "neon_api_client": {
692
+ "config": {
693
+ "base_url": {"env": "NEON_API_BASE_URL"},
694
+ "api_token": {"env": "NEON_API_TOKEN"},
695
+ },
696
+ },
697
+ "mongo": {
698
+ "config": {
699
+ "dbname": {"env": "MONGO_DBNAME"},
700
+ "host": {"env": "MONGO_HOST"},
701
+ "password": {"env": "MONGO_PASSWORD"},
702
+ "username": {"env": "MONGO_USERNAME"},
703
+ },
704
+ },
705
+ "runtime_api_site_client": {
706
+ "config": {
707
+ "base_url": {"env": "API_HOST"},
708
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
709
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
710
+ "site_id": {"env": "API_SITE_ID"},
711
+ },
712
+ },
713
+ },
714
+ ),
715
+ "ops": {
716
+ "export_json_to_drs": {"config": {"username": "..."}},
717
+ "get_neon_pipeline_inputs": {
718
+ "inputs": {
719
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
720
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
721
+ }
722
+ },
723
+ "get_neon_pipeline_benthic_data_product": {
724
+ "config": {
725
+ "benthic_data_product": {
726
+ "product_id": "DP1.20279.001",
727
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
728
+ }
729
+ }
730
+ },
731
+ },
732
+ },
733
+ ),
734
+ ingest_neon_benthic_metadata.to_job(
735
+ description="",
736
+ resource_defs=resource_defs,
737
+ config={
738
+ "resources": merge(
739
+ unfreeze(normal_resources),
740
+ {
741
+ "neon_api_client": {
742
+ "config": {
743
+ "base_url": {"env": "NEON_API_BASE_URL"},
744
+ "api_token": {"env": "NEON_API_TOKEN"},
745
+ },
746
+ }
747
+ },
748
+ ),
749
+ "ops": {
750
+ "get_neon_pipeline_benthic_data_product": {
751
+ "config": {
752
+ "benthic_data_product": {
753
+ "product_id": "DP1.20279.001",
754
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
755
+ }
756
+ }
757
+ },
758
+ "get_neon_pipeline_inputs": {
759
+ "inputs": {
760
+ "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
761
+ "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
762
+ }
763
+ },
649
764
  },
650
765
  },
651
766
  ),
@@ -212,7 +212,7 @@ class GoldStudyTranslator(Translator):
212
212
  return None
213
213
  elif minimum_numeric_value is not None and maximum_numeric_value is None:
214
214
  return nmdc.QuantityValue(
215
- has_raw_value=field_value,
215
+ has_raw_value=minimum_numeric_value,
216
216
  has_numeric_value=nmdc.Double(minimum_numeric_value),
217
217
  has_unit=unit,
218
218
  )