nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,451 @@
1
+ from functools import lru_cache
2
+ from typing import Any, Dict, List, Union
3
+ import pandas as pd
4
+ from nmdc_runtime.site.resources import (
5
+ RuntimeApiUserClient,
6
+ RuntimeApiSiteClient,
7
+ GoldApiClient,
8
+ )
9
+ from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
10
+ from nmdc_schema import nmdc
11
+
12
+
13
class DatabaseUpdater:
    """API for repairing connections in the NMDC database.

    The class adds records that are essentially missing "links"/"connections".
    As common use cases for adding missing records are identified, helper
    methods can be added to this class.
    """

    def __init__(
        self,
        runtime_api_user_client: "RuntimeApiUserClient",
        runtime_api_site_client: "RuntimeApiSiteClient",
        gold_api_client: "GoldApiClient",
        study_id: str,
        gold_nmdc_instrument_map_df: Union[pd.DataFrame, None] = None,
        include_field_site_info: bool = False,
        enable_biosample_filtering: bool = True,
    ):
        """Store the API clients and settings used by the repair methods.

        :param runtime_api_user_client: An object of RuntimeApiUserClient which can be
            used to retrieve instance records from the NMDC database.
        :param runtime_api_site_client: An object of RuntimeApiSiteClient which can be
            used to mint new IDs for the repaired records that need to be added into
            the NMDC database.
        :param gold_api_client: An object of GoldApiClient which can be used to retrieve
            records from GOLD via the GOLD API.
        :param study_id: NMDC study ID for which the missing records need to be added.
        :param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping
            file in the NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC
            instrument_set records. Defaults to a new empty dataframe.
        :param include_field_site_info: Forwarded unchanged to GoldStudyTranslator.
        :param enable_biosample_filtering: Forwarded unchanged to GoldStudyTranslator.
        """
        self.runtime_api_user_client = runtime_api_user_client
        self.runtime_api_site_client = runtime_api_site_client
        self.gold_api_client = gold_api_client
        self.study_id = study_id
        # A fresh frame per instance: a `pd.DataFrame()` default in the signature
        # would be created once and shared by every instance.
        self.gold_nmdc_instrument_map_df = (
            pd.DataFrame()
            if gold_nmdc_instrument_map_df is None
            else gold_nmdc_instrument_map_df
        )
        self.include_field_site_info = include_field_site_info
        self.enable_biosample_filtering = enable_biosample_filtering

        # Per-instance memoisation of GOLD responses. A dict on the instance is
        # released together with the instance, whereas `functools.lru_cache` on a
        # method keys on `self` and keeps the instance alive in a shared cache.
        self._gold_biosample_cache: Dict[str, List[Dict[str, Any]]] = {}
        self._gold_projects_cache: Dict[str, Any] = {}

    @staticmethod
    def _strip_gold_prefix(gold_id: str) -> str:
        """Return `gold_id` with every "gold:" occurrence removed."""
        return gold_id.replace("gold:", "")

    def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
        """Fetch response from GOLD /biosamples API for a given biosample id.

        Responses are memoised per instance; callers must not mutate the
        returned objects.

        :param gold_biosample_id: GOLD biosample ID.
        :return: The response from the GOLD /biosamples API.
        """
        if gold_biosample_id not in self._gold_biosample_cache:
            self._gold_biosample_cache[gold_biosample_id] = (
                self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id)
            )
        return self._gold_biosample_cache[gold_biosample_id]

    def _fetch_gold_projects(self, gold_biosample_id: str):
        """Fetch response from GOLD /projects API for a given biosample id.

        Responses are memoised per instance; callers must not mutate the
        returned objects.

        :param gold_biosample_id: GOLD biosample ID
        :return: The response from the GOLD /projects API.
        """
        if gold_biosample_id not in self._gold_projects_cache:
            self._gold_projects_cache[gold_biosample_id] = (
                self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id)
            )
        return self._gold_projects_cache[gold_biosample_id]

    def generate_data_generation_set_records_from_gold_api_for_study(
        self,
    ) -> "nmdc.Database":
        """This method creates missing data generation records for a given study in the NMDC
        database using metadata from GOLD. It first fetches all the biosamples associated with
        the study from the NMDC database. Then, it fetches the biosample and project data
        associated with the individual biosamples from the GOLD API using the NMDC-GOLD
        biosample id mappings on the "gold_biosample_identifiers" key/slot. The
        GoldStudyTranslator class is used to filter the GOLD records, one
        `nmdc:NucleotideSequencing` ID is minted per remaining GOLD sequencing project, and
        only the part of the translator responsible for making data_generation_set records
        is invoked.

        A GOLD biosample ID for which GOLD returns an empty response is skipped.

        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on
            the frontend.
        """
        database = nmdc.Database()

        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
            self.study_id
        )

        all_gold_biosamples = []
        all_gold_projects = []
        # unprefixed GOLD biosample ID -> NMDC biosample ID
        gold_to_nmdc_biosample_ids = {}
        for biosample in biosample_set:
            for gold_biosample_id in biosample.get("gold_biosample_identifiers") or []:
                gold_to_nmdc_biosample_ids[
                    self._strip_gold_prefix(gold_biosample_id)
                ] = biosample["id"]

                gold_response = self._fetch_gold_biosample(gold_biosample_id)
                if not gold_response:
                    # GOLD has no record for this ID, so there is nothing to translate.
                    continue

                # Work on a copy so the memoised GOLD response is not modified.
                gold_biosample = dict(gold_response[0])
                gold_projects = self._fetch_gold_projects(gold_biosample_id)
                gold_biosample["projects"] = gold_projects

                all_gold_biosamples.append(gold_biosample)
                all_gold_projects.extend(gold_projects)

        gold_study_translator = GoldStudyTranslator(
            biosamples=all_gold_biosamples,
            projects=all_gold_projects,
            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
            include_field_site_info=self.include_field_site_info,
            enable_biosample_filtering=self.enable_biosample_filtering,
        )

        # The GoldStudyTranslator class has some pre-processing logic which filters out
        # invalid biosamples and projects (based on `sequencingStrategy`, `projectStatus`, etc.)
        filtered_biosamples = gold_study_translator.biosamples
        filtered_projects = gold_study_translator.projects

        database.data_generation_set = []
        if not filtered_projects:
            # Nothing to create, so do not ask the minter for zero IDs.
            return database

        gold_project_ids = [project["projectGoldId"] for project in filtered_projects]
        nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id(
            "nmdc:NucleotideSequencing", len(gold_project_ids)
        ).json()
        gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
            zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
        )

        # GOLD project ID -> GOLD biosample ID of the first filtered biosample that
        # lists the project. Built once instead of rescanning every biosample for
        # every project.
        project_to_gold_biosample_id = {}
        for gold_biosample in filtered_biosamples:
            for biosample_project in gold_biosample.get("projects", []):
                project_to_gold_biosample_id.setdefault(
                    biosample_project["projectGoldId"],
                    gold_biosample["biosampleGoldId"],
                )

        # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing
        # records created is based on the number of GOLD sequencing projects
        for project in filtered_projects:
            project_gold_id = project["projectGoldId"]
            biosample_gold_id = project_to_gold_biosample_id.get(project_gold_id)
            if not biosample_gold_id:
                continue

            nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id)
            if not nmdc_biosample_id:
                continue

            database.data_generation_set.append(
                gold_study_translator._translate_nucleotide_sequencing(
                    project,
                    nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
                        project_gold_id
                    ],
                    nmdc_biosample_id=nmdc_biosample_id,
                    nmdc_study_id=self.study_id,
                )
            )

        return database

    def generate_biosample_set_from_gold_api_for_study(self) -> "nmdc.Database":
        """This method creates biosample_set records for a given study in the NMDC database using
        metadata from GOLD. It first collects the GOLD biosample IDs of all biosamples already
        associated with the study in NMDC. Then, it fetches the list of all biosamples associated
        with the GOLD study using the GOLD API and keeps only those whose `biosampleGoldId` is
        not yet present in NMDC (conceptually a "set difference"). Those are passed through the
        GoldStudyTranslator, which has pre-processing logic to filter biosamples based on
        `sequencingStrategy` and `projectStatus`, and biosample_set records are created for the
        biosamples that remain.

        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on
            the frontend.
        :raises IndexError: If the study lookup returns an empty sequence or the study
            record has no `gold_study_identifiers`.
        """
        database = nmdc.Database()

        # get a list of all biosamples associated with a given NMDC study id
        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
            self.study_id
        )

        # collect the unprefixed GOLD biosample ids already linked to NMDC biosamples
        nmdc_gold_ids = {
            self._strip_gold_prefix(gold_id)
            for biosample in biosample_set
            for gold_id in biosample.get("gold_biosample_identifiers") or []
        }

        # retrieve GOLD study id by looking at the `gold_study_identifiers` key/slot
        # on the NMDC study record
        nmdc_study = self.runtime_api_user_client.get_study(self.study_id)[0]
        gold_study_id = self._strip_gold_prefix(
            nmdc_study.get("gold_study_identifiers", [])[0]
        )

        # use the GOLD study id to fetch all biosample records associated with the study
        gold_biosamples_for_study = self.gold_api_client.fetch_biosamples_by_study(
            gold_study_id
        )

        # keep only the GOLD biosamples that are not already in the NMDC database
        missing_gold_biosamples = [
            gbs
            for gbs in gold_biosamples_for_study
            if gbs.get("biosampleGoldId") not in nmdc_gold_ids
        ]

        # use the GOLD study id to fetch all sequencing project records associated with the study
        gold_sequencing_projects_for_study = (
            self.gold_api_client.fetch_projects_by_study(gold_study_id)
        )

        # use the GOLD study id to fetch all analysis project records associated with the study
        gold_analysis_projects_for_study = (
            self.gold_api_client.fetch_analysis_projects_by_study(gold_study_id)
        )

        gold_study_translator = GoldStudyTranslator(
            biosamples=missing_gold_biosamples,
            projects=gold_sequencing_projects_for_study,
            analysis_projects=gold_analysis_projects_for_study,
            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
            include_field_site_info=self.include_field_site_info,
            enable_biosample_filtering=self.enable_biosample_filtering,
        )

        translated_biosamples = gold_study_translator.biosamples
        if not translated_biosamples:
            # Nothing is missing, so do not ask the minter for zero IDs.
            database.biosample_set = []
            return database

        # mint new NMDC biosample IDs for the "missing" biosamples
        gold_biosample_ids = [
            biosample["biosampleGoldId"] for biosample in translated_biosamples
        ]
        nmdc_biosample_ids = self.runtime_api_site_client.mint_id(
            "nmdc:Biosample", len(translated_biosamples)
        ).json()
        gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))

        database.biosample_set = [
            gold_study_translator._translate_biosample(
                biosample,
                nmdc_biosample_id=gold_to_nmdc_biosample_ids[
                    biosample["biosampleGoldId"]
                ],
                nmdc_study_id=self.study_id,
                nmdc_field_site_id=None,
            )
            for biosample in translated_biosamples
        ]

        return database

    def queries_run_script_to_update_insdc_identifiers(
        self,
    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """This method creates a `/queries:run` API endpoint compatible update script that can be
        run using that API endpoint to update/add information on the `insdc_biosample_identifiers`
        field of biosample_set records and the `insdc_bioproject_identifiers` field on
        data_generation_set records.

        The information to be asserted is retrieved from the `ncbiBioSampleAccession` and
        `ncbiBioProjectAccession` fields on the GOLD `/projects` API endpoint. Accessions are
        stored with a `biosample:` / `bioproject:` prefix, and an update is only emitted for a
        record when it would add at least one identifier the record does not already hold.
        Identifier lists in the emitted updates are sorted.

        :return: A `/queries:run` update query compatible script serialized as a
            dictionary/JSON. A two-element list (biosample_set, data_generation_set) is
            returned when there are data_generation_set updates; otherwise a single
            biosample_set update dictionary is returned.
        """
        # Fetch all biosamples associated with the study
        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
            self.study_id
        )

        # Fetch all data_generation records associated with the study
        data_generation_set = (
            self.runtime_api_user_client.get_data_generation_records_for_study(
                self.study_id
            )
        )

        biosample_updates = []
        data_generation_updates = []

        # unprefixed GOLD project ID -> NCBI BioProject accession
        gold_project_to_bioproject = {}

        # Process biosamples for insdc_biosample_identifiers
        for biosample in biosample_set:
            # the list (usually one) of GOLD biosample identifiers on the record
            gold_biosample_identifiers = biosample.get("gold_biosample_identifiers", [])
            if not gold_biosample_identifiers:
                continue

            biosample_id = biosample.get("id")
            if not biosample_id:
                continue

            candidate_identifiers = set()
            for gold_biosample_id in gold_biosample_identifiers:
                # fetch projects associated with a GOLD biosample from the GOLD
                # `/projects` API endpoint
                gold_projects = self.gold_api_client.fetch_projects_by_biosample(
                    self._strip_gold_prefix(gold_biosample_id)
                )

                for project in gold_projects:
                    # Collect the BioSample accession for the biosample update
                    biosample_accession = (
                        project.get("ncbiBioSampleAccession") or ""
                    ).strip()
                    if biosample_accession:
                        candidate_identifiers.add(f"biosample:{biosample_accession}")

                    # Remember the BioProject accession for the data_generation pass
                    project_gold_id = project.get("projectGoldId")
                    bioproject_accession = (
                        project.get("ncbiBioProjectAccession") or ""
                    ).strip()
                    if project_gold_id and bioproject_accession:
                        gold_project_to_bioproject[project_gold_id] = (
                            bioproject_accession
                        )

            existing_biosample_identifiers = set(
                biosample.get("insdc_biosample_identifiers") or []
            )
            # Compare prefixed values: the stored identifiers carry the "biosample:"
            # prefix, so comparing raw accessions against them would never match.
            if not candidate_identifiers - existing_biosample_identifiers:
                continue

            biosample_updates.append(
                {
                    "q": {"id": biosample_id},
                    "u": {
                        "$set": {
                            "insdc_biosample_identifiers": sorted(
                                existing_biosample_identifiers | candidate_identifiers
                            )
                        }
                    },
                }
            )

        # Process data_generation records for insdc_bioproject_identifiers
        for data_generation in data_generation_set:
            data_generation_id = data_generation.get("id")
            if not data_generation_id:
                continue

            existing_bioproject_identifiers = set(
                data_generation.get("insdc_bioproject_identifiers") or []
            )
            # Start from what is already on the record, then add what GOLD reports.
            collected_bioproject_identifiers = set(existing_bioproject_identifiers)

            for gold_project_id in (
                data_generation.get("gold_sequencing_project_identifiers") or []
            ):
                bioproject_accession = gold_project_to_bioproject.get(
                    self._strip_gold_prefix(gold_project_id)
                )
                if bioproject_accession:
                    collected_bioproject_identifiers.add(
                        f"bioproject:{bioproject_accession}"
                    )

            # Only update if there are identifiers to add
            if collected_bioproject_identifiers != existing_bioproject_identifiers:
                data_generation_updates.append(
                    {
                        "q": {"id": data_generation_id},
                        "u": {
                            "$set": {
                                "insdc_bioproject_identifiers": sorted(
                                    collected_bioproject_identifiers
                                )
                            }
                        },
                    }
                )

        # Return updates for both collections
        if data_generation_updates:
            return [
                {"update": "biosample_set", "updates": biosample_updates},
                {"update": "data_generation_set", "updates": data_generation_updates},
            ]
        return {"update": "biosample_set", "updates": biosample_updates}