nmdc-runtime 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -0,0 +1,230 @@
1
+ from functools import lru_cache
2
+ from typing import Any, Dict, List
3
+ import pandas as pd
4
+ from nmdc_runtime.site.resources import (
5
+ RuntimeApiUserClient,
6
+ RuntimeApiSiteClient,
7
+ GoldApiClient,
8
+ )
9
+ from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
10
+ from nmdc_schema import nmdc
11
+
12
+
13
class DatabaseUpdater:
    """API for repairing connections in the NMDC database.

    The class generates `nmdc:Database` payloads containing records that are
    essentially missing "links"/"connections" for a single NMDC study. As common
    use cases for adding missing records are identified, helper methods can be
    added to this class.

    Responses from the GOLD API are memoized per instance, so an instance never
    asks GOLD twice for the same biosample ID and the cached data is released
    together with the instance.
    """

    def __init__(
        self,
        runtime_api_user_client: RuntimeApiUserClient,
        runtime_api_site_client: RuntimeApiSiteClient,
        gold_api_client: GoldApiClient,
        study_id: str,
        gold_nmdc_instrument_map_df: "pd.DataFrame | None" = None,
    ):
        """Store the API clients and the study context used by the generator methods.

        :param runtime_api_user_client: An object of RuntimeApiUserClient which can be
            used to retrieve instance records from the NMDC database.
        :param runtime_api_site_client: An object of RuntimeApiSiteClient which can be
            used to mint new IDs for the repaired records that need to be added into
            the NMDC database.
        :param gold_api_client: An object of GoldApiClient which can be used to retrieve
            records from GOLD via the GOLD API.
        :param study_id: NMDC study ID for which the missing records need to be added.
        :param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping
            file in the NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC
            instrument_set records. When omitted (or None), an empty dataframe is used.
        """
        self.runtime_api_user_client = runtime_api_user_client
        self.runtime_api_site_client = runtime_api_site_client
        self.gold_api_client = gold_api_client
        self.study_id = study_id
        # Build the empty default per instance: a `pd.DataFrame()` default in the
        # signature would be created once and shared by every instance.
        self.gold_nmdc_instrument_map_df = (
            pd.DataFrame()
            if gold_nmdc_instrument_map_df is None
            else gold_nmdc_instrument_map_df
        )
        # Per-instance memoization of GOLD responses, keyed by the biosample ID
        # exactly as it was passed in. (A functools cache on the methods would be
        # global and would keep every instance alive.)
        self._gold_biosample_cache: Dict[str, List[Dict[str, Any]]] = {}
        self._gold_projects_cache: Dict[str, List[Dict[str, Any]]] = {}

    @staticmethod
    def _strip_gold_prefix(gold_id: str) -> str:
        """Return `gold_id` with the "gold:" CURIE prefix removed."""
        return gold_id.replace("gold:", "")

    def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
        """Fetch response from GOLD /biosamples API for a given biosample id.

        The response is cached on this instance, so repeated calls with the same
        ID return the same object without contacting GOLD again.

        :param gold_biosample_id: GOLD biosample ID.
        :return: The response from the GOLD /biosamples API, as returned by
            `GoldApiClient.fetch_biosample_by_biosample_id`.
        """
        if gold_biosample_id not in self._gold_biosample_cache:
            self._gold_biosample_cache[gold_biosample_id] = (
                self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id)
            )
        return self._gold_biosample_cache[gold_biosample_id]

    def _fetch_gold_projects(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
        """Fetch response from GOLD /projects API for a given biosample id.

        The response is cached on this instance, so repeated calls with the same
        ID return the same object without contacting GOLD again.

        :param gold_biosample_id: GOLD biosample ID
        :return: The response from the GOLD /projects API, as returned by
            `GoldApiClient.fetch_projects_by_biosample`.
        """
        if gold_biosample_id not in self._gold_projects_cache:
            self._gold_projects_cache[gold_biosample_id] = (
                self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id)
            )
        return self._gold_projects_cache[gold_biosample_id]

    def generate_data_generation_set_records_from_gold_api_for_study(
        self,
    ) -> nmdc.Database:
        """This method creates missing data generation records for a given study in the NMDC
        database using metadata from GOLD. The way the logic works is, it first fetches all the
        biosamples associated with the study from the NMDC database. Then, it fetches all the
        biosample and project data associated with the individual biosamples from the GOLD API
        using the NMDC-GOLD biosample id mappings on the "gold_biosample_identifiers" key/slot.
        We use the GoldStudyTranslator class to mint the required number of `nmdc:DataGeneration`
        (`nmdc:NucleotideSequencing`) records based on the number of GOLD sequencing projects,
        and then reimplement only the part of logic from that class which is responsible for
        making data_generation_set records.

        GOLD biosample identifiers for which GOLD returns no biosample record are skipped,
        and so are projects that cannot be traced back to an NMDC biosample. No IDs are
        minted when no project survives the GoldStudyTranslator filtering.

        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on
            the frontend.
        """
        database = nmdc.Database()

        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
            self.study_id
        )

        all_gold_biosamples = []
        all_gold_projects = []
        # GOLD biosample ID (without the "gold:" prefix) -> NMDC biosample ID
        gold_to_nmdc_biosample_ids = {}
        for biosample in biosample_set:
            for gold_biosample_id in biosample.get("gold_biosample_identifiers") or []:
                gold_to_nmdc_biosample_ids[
                    self._strip_gold_prefix(gold_biosample_id)
                ] = biosample["id"]

                gold_biosample_records = self._fetch_gold_biosample(gold_biosample_id)
                if not gold_biosample_records:
                    # GOLD has nothing for this identifier, so there is nothing to
                    # translate; skip it instead of failing the whole study.
                    continue

                gold_projects = self._fetch_gold_projects(gold_biosample_id)
                # Attach the projects to a copy so the cached GOLD response is not mutated.
                gold_biosample = {**gold_biosample_records[0], "projects": gold_projects}

                all_gold_biosamples.append(gold_biosample)
                all_gold_projects.extend(gold_projects)

        gold_study_translator = GoldStudyTranslator(
            biosamples=all_gold_biosamples,
            projects=all_gold_projects,
            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
        )

        # The GoldStudyTranslator class has some pre-processing logic which filters out
        # invalid biosamples and projects (based on `sequencingStrategy`, `projectStatus`, etc.)
        filtered_biosamples = gold_study_translator.biosamples
        filtered_projects = gold_study_translator.projects

        database.data_generation_set = []
        if not filtered_projects:
            # Nothing to create, so do not ask the minting endpoint for zero IDs.
            return database

        gold_project_ids = [project["projectGoldId"] for project in filtered_projects]
        nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id(
            "nmdc:NucleotideSequencing", len(gold_project_ids)
        ).json()
        gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
            zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
        )

        # Index GOLD project ID -> GOLD biosample ID once, instead of re-scanning every
        # biosample for every project. `setdefault` keeps the first biosample that
        # lists a project, i.e. the one a front-to-back scan would have found.
        gold_project_to_gold_biosample_ids = {}
        for gold_biosample in filtered_biosamples:
            gold_biosample_id = gold_biosample.get("biosampleGoldId")
            if not gold_biosample_id:
                continue
            for gold_project in gold_biosample.get("projects", []):
                gold_project_to_gold_biosample_ids.setdefault(
                    gold_project.get("projectGoldId"), gold_biosample_id
                )

        # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing
        # records created is based on the number of GOLD sequencing projects
        for project in filtered_projects:
            gold_project_id = project["projectGoldId"]
            # map the projectGoldId to the NMDC biosample ID
            nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(
                gold_project_to_gold_biosample_ids.get(gold_project_id)
            )
            if not nmdc_biosample_id:
                continue

            database.data_generation_set.append(
                gold_study_translator._translate_nucleotide_sequencing(
                    project,
                    nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
                        gold_project_id
                    ],
                    nmdc_biosample_id=nmdc_biosample_id,
                    nmdc_study_id=self.study_id,
                )
            )

        return database

    def generate_biosample_set_from_gold_api_for_study(self) -> nmdc.Database:
        """This method creates biosample_set records for a given study in the NMDC database using
        metadata from GOLD. The logic works by first fetching the biosampleGoldId values of all
        biosamples associated with the study. Then, it fetches the list of all biosamples associated
        with the GOLD study using the GOLD API. There's pre-processing logic in the GoldStudyTranslator
        to filter out biosamples based on `sequencingStrategy` and `projectStatus`. On this list of
        filtered biosamples, we compute a "set difference" (conceptually) between the list of
        filtered samples and ones that are already in the NMDC database, i.e., we ignore biosamples
        that are already present in the database, and continue on to create biosample_set records for
        those that do not have records in the database already.

        No IDs are minted when there is no biosample left to create.

        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
        :raises IndexError: if no study record is returned for `study_id`, or the study
            record has no `gold_study_identifiers`.
        """
        database = nmdc.Database()

        # get a list of all biosamples associated with a given NMDC study id
        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
            self.study_id
        )

        # collect the GOLD biosample ids (`biosampleGoldId` values) that are already
        # referenced by the biosample_set records retrieved above
        nmdc_gold_ids = {
            self._strip_gold_prefix(gold_id)
            for biosample in biosample_set
            for gold_id in biosample.get("gold_biosample_identifiers") or []
        }

        # retrieve GOLD study id by looking at the `gold_study_identifiers` key/slot
        # on the NMDC study record (only the first identifier is used)
        nmdc_study = self.runtime_api_user_client.get_study(self.study_id)[0]
        gold_study_id = self._strip_gold_prefix(
            nmdc_study.get("gold_study_identifiers", [])[0]
        )

        # use the GOLD study id to fetch all biosample records associated with the study
        gold_biosamples_for_study = self.gold_api_client.fetch_biosamples_by_study(
            gold_study_id
        )

        # part of the code where we are (conceptually) computing a set difference between
        # the list of filtered samples and ones that are already in the NMDC database
        missing_gold_biosamples = [
            gbs
            for gbs in gold_biosamples_for_study
            if gbs.get("biosampleGoldId") not in nmdc_gold_ids
        ]

        gold_study_translator = GoldStudyTranslator(
            biosamples=missing_gold_biosamples,
            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
        )

        translated_biosamples = gold_study_translator.biosamples
        if not translated_biosamples:
            # Nothing is missing, so do not ask the minting endpoint for zero IDs.
            database.biosample_set = []
            return database

        # mint new NMDC biosample IDs for the "missing" biosamples
        gold_biosample_ids = [
            biosample["biosampleGoldId"] for biosample in translated_biosamples
        ]
        nmdc_biosample_ids = self.runtime_api_site_client.mint_id(
            "nmdc:Biosample", len(translated_biosamples)
        ).json()
        gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))

        database.biosample_set = [
            gold_study_translator._translate_biosample(
                biosample,
                nmdc_biosample_id=gold_to_nmdc_biosample_ids[
                    biosample["biosampleGoldId"]
                ],
                nmdc_study_id=self.study_id,
                nmdc_field_site_id=None,
            )
            for biosample in translated_biosamples
        ]

        return database
@@ -25,6 +25,7 @@ from nmdc_runtime.api.models.run import _add_run_fail_event
25
25
  from nmdc_runtime.api.models.trigger import Trigger
26
26
  from nmdc_runtime.site.export.study_metadata import export_study_biosamples_metadata
27
27
  from nmdc_runtime.site.graphs import (
28
+ generate_biosample_set_from_samples_in_gold,
28
29
  translate_metadata_submission_to_nmdc_schema_database,
29
30
  ingest_metadata_submission,
30
31
  gold_study_to_database,
@@ -44,6 +45,7 @@ from nmdc_runtime.site.graphs import (
44
45
  ingest_neon_surface_water_metadata,
45
46
  ensure_alldocs,
46
47
  nmdc_study_to_ncbi_submission_export,
48
+ generate_data_generation_set_for_biosamples_in_nmdc_study,
47
49
  )
48
50
  from nmdc_runtime.site.resources import (
49
51
  get_mongo,
@@ -113,6 +115,13 @@ housekeeping_weekly = ScheduleDefinition(
113
115
  job=housekeeping.to_job(**preset_normal),
114
116
  )
115
117
 
118
# Schedule that runs the `ensure_alldocs` job (built with the `preset_normal`
# job arguments) once a day at 03:00 in the America/New_York timezone.
ensure_alldocs_daily = ScheduleDefinition(
    name="daily_ensure_alldocs",
    cron_schedule="0 3 * * *",  # minute 0, hour 3, every day
    execution_timezone="America/New_York",
    job=ensure_alldocs.to_job(**preset_normal),
)
124
+
116
125
 
117
126
  def asset_materialization_metadata(asset_event, key):
118
127
  """Get metadata from an asset materialization event.
@@ -453,7 +462,7 @@ def repo():
453
462
  export_study_biosamples_metadata.to_job(**preset_normal),
454
463
  ensure_alldocs.to_job(**preset_normal),
455
464
  ]
456
- schedules = [housekeeping_weekly]
465
+ schedules = [housekeeping_weekly, ensure_alldocs_daily]
457
466
  sensors = [
458
467
  done_object_put_ops,
459
468
  ensure_gold_translation_job,
@@ -643,7 +652,7 @@ def biosample_submission_ingest():
643
652
  "inputs": {
644
653
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
645
654
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
646
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
655
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
647
656
  }
648
657
  },
649
658
  },
@@ -685,7 +694,7 @@ def biosample_submission_ingest():
685
694
  "inputs": {
686
695
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
687
696
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
688
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
697
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
689
698
  }
690
699
  },
691
700
  },
@@ -728,7 +737,7 @@ def biosample_submission_ingest():
728
737
  "inputs": {
729
738
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
730
739
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
731
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
740
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
732
741
  }
733
742
  },
734
743
  "get_neon_pipeline_benthic_data_product": {
@@ -770,7 +779,7 @@ def biosample_submission_ingest():
770
779
  "inputs": {
771
780
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
772
781
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
773
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
782
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
774
783
  }
775
784
  },
776
785
  },
@@ -813,14 +822,14 @@ def biosample_submission_ingest():
813
822
  "inputs": {
814
823
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
815
824
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
816
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
825
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
817
826
  }
818
827
  },
819
828
  "get_neon_pipeline_surface_water_data_product": {
820
829
  "config": {
821
830
  "surface_water_data_product": {
822
831
  "product_id": "DP1.20281.001",
823
- "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
832
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
824
833
  }
825
834
  }
826
835
  },
@@ -847,7 +856,7 @@ def biosample_submission_ingest():
847
856
  "config": {
848
857
  "surface_water_data_product": {
849
858
  "product_id": "DP1.20281.001",
850
- "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
859
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
851
860
  }
852
861
  }
853
862
  },
@@ -855,7 +864,7 @@ def biosample_submission_ingest():
855
864
  "inputs": {
856
865
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
857
866
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
858
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
867
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
859
868
  }
860
869
  },
861
870
  },
@@ -915,6 +924,97 @@ def biosample_export():
915
924
  ]
916
925
 
917
926
 
927
@repository
def database_records_stitching():
    """Repository of jobs that generate missing ("stitching") records for an NMDC study.

    Two jobs are returned, both built from the same run configuration:

    * `generate_data_generation_set_for_biosamples_in_nmdc_study`
    * `generate_biosample_set_from_samples_in_gold`

    The `nmdc_study_id` and the `export_json_to_drs` username are left as empty
    strings in the config, presumably to be filled in when a run is launched.

    :return: List with the two job definitions.
    """
    normal_resources = run_config_frozen__normal_env["resources"]

    def _job_config():
        """Build a fresh run config; both jobs use identical settings.

        A new dict is built on every call so the two jobs never share mutable
        config objects.
        """
        return {
            "resources": merge(
                unfreeze(normal_resources),
                {
                    "runtime_api_user_client": {
                        "config": {
                            "base_url": {"env": "API_HOST"},
                            "username": {"env": "API_ADMIN_USER"},
                            "password": {"env": "API_ADMIN_PASS"},
                        },
                    },
                    "runtime_api_site_client": {
                        "config": {
                            "base_url": {"env": "API_HOST"},
                            "client_id": {"env": "API_SITE_CLIENT_ID"},
                            "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
                            "site_id": {"env": "API_SITE_ID"},
                        },
                    },
                    "gold_api_client": {
                        "config": {
                            "base_url": {"env": "GOLD_API_BASE_URL"},
                            "username": {"env": "GOLD_API_USERNAME"},
                            "password": {"env": "GOLD_API_PASSWORD"},
                        },
                    },
                },
            ),
            "ops": {
                "get_database_updater_inputs": {
                    "config": {
                        "nmdc_study_id": "",
                        "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
                    }
                },
                "export_json_to_drs": {"config": {"username": ""}},
            },
        }

    return [
        generate_data_generation_set_for_biosamples_in_nmdc_study.to_job(
            description="This job can be used to create a data_generation_set JSON for biosamples that are already present in the NMDC database.",
            resource_defs=resource_defs,
            config=_job_config(),
        ),
        generate_biosample_set_from_samples_in_gold.to_job(
            description="This job can be used to create a biosample_set JSON from samples in GOLD for a given study in NMDC.",
            resource_defs=resource_defs,
            config=_job_config(),
        ),
    ]
+
1017
+
918
1018
  # @repository
919
1019
  # def validation():
920
1020
  # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
@@ -129,16 +129,23 @@ class RuntimeApiUserClient(RuntimeApiClient):
129
129
  return response.json()["cursor"]["firstBatch"]
130
130
 
131
131
def get_biosamples_for_study(self, study_id: str):
    """Return the biosample_set records associated with the given study.

    Sends a GET request to `/nmdcschema/biosample_set` with a JSON-encoded
    filter on `associated_studies` and returns the `resources` value of the
    JSON response.

    :param study_id: NMDC study ID used as the `associated_studies` filter value.
    :return: The value stored under "resources" in the response JSON.
    """
    # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
    # The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
    # but the tradeoff there is that we would need to make multiple requests to step through
    # each of the pages. By picking a large number for max_page_size, we can get all the results
    # in a single request.
    # This method previously used the /queries:run endpoint but the problem with that was that
    # it used to truncate the number of results returned to 100.
    # NOTE(review): a study with more than 10000 biosamples would presumably be cut off
    # at the first page, since the page token is never followed -- confirm.
    response = self.request(
        "GET",
        "/nmdcschema/biosample_set",
        {
            "filter": json.dumps({"associated_studies": study_id}),
            "max_page_size": 10000,
        },
    )
    response.raise_for_status()
    return response.json()["resources"]
142
149
 
143
150
  def get_omics_processing_by_name(self, name: str):
144
151
  response = self.request(
@@ -152,6 +159,18 @@ class RuntimeApiUserClient(RuntimeApiClient):
152
159
  response.raise_for_status()
153
160
  return response.json()["cursor"]["firstBatch"]
154
161
 
162
def get_study(self, study_id: str):
    """Look up study_set records whose `id` equals `study_id`.

    Sends a `find` query on `study_set` to the `/queries:run` endpoint and
    returns the first batch of results.

    :param study_id: NMDC study ID to match against the `id` field.
    :return: The value stored under `cursor.firstBatch` in the response JSON
        (callers in this package index it as a list of study records).
    """
    response = self.request(
        "POST",
        "/queries:run",
        {
            "find": "study_set",
            "filter": {"id": study_id},
        },
    )
    response.raise_for_status()
    return response.json()["cursor"]["firstBatch"]
173
+
155
174
 
156
175
  class RuntimeApiSiteClient(RuntimeApiClient):
157
176
  def __init__(
@@ -370,6 +389,18 @@ class GoldApiClient(BasicAuthClient):
370
389
  return None
371
390
  return results[0]
372
391
 
392
def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]:
    """Request the GOLD `/projects` endpoint for a given biosample.

    :param biosample_id: GOLD biosample ID; it is passed through
        `self._normalize_id` before being sent as the `biosampleGoldId` param.
    :return: Whatever `self.request` returns for the `/projects` call.
    """
    # Named `normalized_id` rather than `id` to avoid shadowing the builtin.
    normalized_id = self._normalize_id(biosample_id)
    return self.request("/projects", params={"biosampleGoldId": normalized_id})
396
+
397
def fetch_biosample_by_biosample_id(
    self, biosample_id: str
) -> List[Dict[str, Any]]:
    """Request the GOLD `/biosamples` endpoint for a given biosample.

    :param biosample_id: GOLD biosample ID; it is passed through
        `self._normalize_id` before being sent as the `biosampleGoldId` param.
    :return: Whatever `self.request` returns for the `/biosamples` call.
    """
    # Named `normalized_id` rather than `id` to avoid shadowing the builtin.
    normalized_id = self._normalize_id(biosample_id)
    return self.request("/biosamples", params={"biosampleGoldId": normalized_id})
403
+
373
404
 
374
405
  @resource(
375
406
  config_schema={
@@ -12,6 +12,29 @@ from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
12
12
  SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
13
13
 
14
14
 
15
def _is_valid_project(project: dict) -> bool:
    """Decide whether a GOLD project should be kept.

    Rules, applied in order:
    1. `sequencingStrategy` must be one of SEQUENCING_STRATEGIES, otherwise
       the project is rejected.
    2. When `sequencingCenters` equals 'DOE Joint Genome Institute (JGI)',
       `projectStatus` must be "Permanent Draft" or "Complete and Published".
    3. For any other `sequencingCenters` value no `projectStatus` filter is
       applied and the project is accepted.

    :param project: GOLD project object (structurally similar to response
        from `/projects` endpoint)
    :return: True if the project is valid, False otherwise
    """
    has_supported_strategy = project.get("sequencingStrategy") in SEQUENCING_STRATEGIES
    if not has_supported_strategy:
        return False

    # NOTE(review): this is a plain equality test against a string; if GOLD
    # delivers `sequencingCenters` as a list it would never match -- confirm.
    sequenced_at_jgi = (
        project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)"
    )
    if not sequenced_at_jgi:
        return True

    accepted_jgi_statuses = ("Permanent Draft", "Complete and Published")
    return project.get("projectStatus") in accepted_jgi_statuses
36
+
37
+
15
38
  class GoldStudyTranslator(Translator):
16
39
  def __init__(
17
40
  self,
@@ -36,16 +59,15 @@ class GoldStudyTranslator(Translator):
36
59
  biosample
37
60
  for biosample in biosamples
38
61
  if any(
39
- project.get("sequencingStrategy") in SEQUENCING_STRATEGIES
40
- for project in biosample.get("projects", [])
62
+ _is_valid_project(project) for project in biosample.get("projects", [])
41
63
  )
42
64
  ]
43
65
  # Fetch the valid projectGoldIds that are associated with filtered
44
66
  # biosamples on their `projects` field
45
67
  valid_project_ids = {
46
68
  project.get("projectGoldId")
47
- for biosample in self.biosamples
48
- for project in biosample.get("projects", [])
69
+ for project in projects
70
+ if _is_valid_project(project)
49
71
  }
50
72
  # Filter projects to only those with `projectGoldId` in valid_project_ids
51
73
  self.projects = [