nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,311 @@
1
+ #!/usr/bin/env python
2
+ # nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py
3
+ """
4
+ missing_neon_soils_ecosystem_data.py: Create a changesheet for missing ecosystem data for NEON soils samples
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ import time
10
+ from pathlib import Path
11
+ from typing import List, Tuple
12
+
13
+ import click
14
+ from dotenv import load_dotenv
15
+
16
+ from nmdc_runtime.site.changesheets.base import (
17
+ Changesheet,
18
+ ChangesheetLineItem,
19
+ JSON_OBJECT,
20
+ )
21
+ from nmdc_runtime.site.normalization.gold import (
22
+ get_gold_biosample_name_suffix,
23
+ normalize_gold_id,
24
+ )
25
+ from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient
26
+
27
+ load_dotenv()
28
+ GOLD_NEON_SOIL_STUDY_ID = "Gs0144570"
29
+ NAME = "missing_neon_soils_ecosystem_data"
30
+ # omics processing to biosamples file in data directory
31
+ DATA_PATH = Path(__file__).parent.parent.joinpath("data")
32
+
33
+ log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log"
34
+ logging.basicConfig(
35
+ level=logging.INFO,
36
+ format="%(asctime)s %(levelname)s %(message)s",
37
+ filename=log_filename,
38
+ encoding="utf-8",
39
+ filemode="w",
40
+ )
41
+
42
+
43
def read_omics_processing_to_biosample_map() -> dict:
    """
    Load the omics-processing -> biosamples lookup from the bundled TSV file.

    Only rows beginning with "nmdc:" are used; every other row (header,
    blank lines, ...) is counted as skipped. Each used row must contain exactly
    two tab-separated columns: the omics processing ID and a "|"-joined list of
    biosample IDs.

    :return: dict mapping omics processing ID -> list of biosample IDs
    """
    tsv_path = DATA_PATH.joinpath("OmicsProcessing-to-catted-Biosamples.tsv")
    mapping = {}
    kept = 0
    dropped = 0
    with open(tsv_path) as tsv_file:
        for raw_line in tsv_file:
            if raw_line.startswith("nmdc:"):
                kept += 1
                # Two-column unpack: raises ValueError on a malformed row.
                omprc_id, joined_biosample_ids = raw_line.strip().split("\t")
                mapping[omprc_id] = joined_biosample_ids.split("|")
            else:
                dropped += 1
    logging.debug(
        f"Read {kept} lines from OmicsProcessing-to-catted-Biosamples.tsv"
    )
    logging.info(
        f"Skipped {dropped} lines from OmicsProcessing-to-catted-Biosamples.tsv"
    )
    return mapping
68
+
69
+
70
def gold_biosample_to_nmdc_biosamples_and_omics_processing_records(
    runtime_api_client, omprc_to_bs_map, goldbs
) -> Tuple[List[JSON_OBJECT], List[JSON_OBJECT]]:
    """
    Collect the NMDC biosamples and omics processing records that correspond
    to one GOLD biosample.

    Two lookups are combined:
      1. NMDC biosamples that reference the (normalized) GOLD biosample ID.
      2. NMDC omics processing records whose name contains the GOLD biosample
         name suffix; their biosamples are resolved through omprc_to_bs_map
         and fetched one by one from the API.

    :param runtime_api_client: client exposing get_biosamples_by_gold_biosample_id,
        get_omics_processing_by_name and request
    :param omprc_to_bs_map: Dict of omics processing ID to biosample IDs
    :param goldbs: a GOLD biosample (needs "biosampleGoldId" and "biosampleName")
    :return: (
        List of corresponding NMDC biosamples,
        List of corresponding NMDC omics processing records
    )
    """
    gold_id = normalize_gold_id(goldbs["biosampleGoldId"])
    name_suffix = get_gold_biosample_name_suffix(goldbs["biosampleName"])
    logging.info(f"goldbs_id: {gold_id}")
    logging.info(f"goldbs_name_suffix: {name_suffix}")

    # Lookup 1: NMDC biosamples found directly by GOLD biosample ID
    logging.info(f"Searching for NMDC biosamples with {gold_id}...")
    direct_matches = runtime_api_client.get_biosamples_by_gold_biosample_id(gold_id)
    logging.info(f"Found {len(direct_matches)} NMDC biosamples with {gold_id}...")
    matched_biosamples = [*direct_matches]

    # Lookup 2: NMDC biosamples reached through omics processing records whose
    # name contains the GOLD biosample name suffix
    logging.info(
        f"Searching for NMDC omics processing name containing {name_suffix}..."
    )
    omics_records = runtime_api_client.get_omics_processing_by_name(name_suffix)
    for record in omics_records:
        record_id = record["id"]
        logging.info(f"omprc_id: {record_id}")
        logging.info(
            f"Searching for NMDC biosamples with omics processing {record_id}..."
        )
        linked_ids = omprc_to_bs_map.get(record_id, [])
        logging.info(f"Found {len(linked_ids)} NMDC biosamples with {record_id}...")
        for linked_id in linked_ids:
            response = runtime_api_client.request("GET", f"/biosamples/{linked_id}")
            if response.status_code == 200:
                matched_biosamples.append(response.json())
            else:
                # A failed fetch is logged and skipped rather than aborting.
                logging.error(
                    f"Failed to retrieve NMDC biosample {linked_id}: {response.status_code}"
                )

    logging.info(f"Found {len(matched_biosamples)} NMDC biosamples for {gold_id}...")
    return matched_biosamples, omics_records
121
+
122
+
123
def compare_biosamples(goldbs, nmdcbs) -> List[ChangesheetLineItem]:
    """
    Build the changesheet line items needed to bring one NMDC biosample in
    line with its GOLD biosample.

    :param goldbs: a GOLD biosample
    :param nmdcbs: the corresponding NMDC biosample
    :return: ecosystem-metadata line items followed by GOLD-identifier line items
    """
    # Missing or differing ecosystem metadata
    ecosystem_items = _check_ecosystem_metadata(goldbs, nmdcbs)
    # Missing GOLD biosample identifier
    identifier_items = _check_gold_biosample_identifiers(goldbs, nmdcbs)
    return [*ecosystem_items, *identifier_items]
133
+
134
+
135
+ def _check_ecosystem_metadata(goldbs, nmdcbs) -> List[ChangesheetLineItem]:
136
+ # nmdc to gold ecosystem key map
137
+ ecosystem_key_map = {
138
+ "ecosystem": "ecosystem",
139
+ "ecosystem_category": "ecosystemCategory",
140
+ "ecosystem_type": "ecosystemType",
141
+ "ecosystem_subtype": "ecosystemSubtype",
142
+ }
143
+ changesheet_line_items = []
144
+ for nmdc_key, gold_key in ecosystem_key_map.items():
145
+ if not gold_key in goldbs:
146
+ logging.warning(f"no {gold_key} for {goldbs['biosampleGoldId']}...")
147
+ continue
148
+ if not nmdc_key in nmdcbs:
149
+ changesheet_line_items.append(
150
+ ChangesheetLineItem(
151
+ id=nmdcbs["id"],
152
+ action="update",
153
+ attribute=nmdc_key,
154
+ value=goldbs.get(gold_key),
155
+ )
156
+ )
157
+ continue
158
+ if nmdcbs[nmdc_key] != goldbs.get(gold_key):
159
+ changesheet_line_items.append(
160
+ ChangesheetLineItem(
161
+ id=nmdcbs["id"],
162
+ action="update",
163
+ attribute=nmdc_key,
164
+ value=goldbs.get(gold_key),
165
+ )
166
+ )
167
+ continue
168
+
169
+ return changesheet_line_items
170
+
171
+
172
def _check_gold_biosample_identifiers(goldbs, nmdcbs) -> List[ChangesheetLineItem]:
    """
    Produce an "insert" line item when the NMDC biosample does not yet list
    the (normalized) GOLD biosample ID among its gold_biosample_identifiers.

    A biosample that has no gold_biosample_identifiers slot at all (or has it
    set to None) is treated as having an empty list, mirroring how
    compare_projects handles a missing identifier slot, instead of raising
    KeyError.

    :param goldbs: a GOLD biosample (needs "biosampleGoldId")
    :param nmdcbs: the corresponding NMDC biosample
    :return: list with zero or one line item
    """
    goldbs_id = normalize_gold_id(goldbs["biosampleGoldId"])
    existing_ids = nmdcbs.get("gold_biosample_identifiers") or []
    if goldbs_id in existing_ids:
        return []
    return [
        ChangesheetLineItem(
            id=nmdcbs["id"],
            action="insert",
            attribute="gold_biosample_identifiers",
            # Trailing "|" matches the value format used by the other
            # insert line items in this script.
            value=goldbs_id + "|",
        )
    ]
185
+
186
+
187
def compare_projects(gold_project, omprc_record) -> "ChangesheetLineItem | None":
    """
    Decide whether a GOLD project ID must be inserted into an omics processing
    record's gold_sequencing_project_identifiers.

    :param gold_project: a GOLD project (needs "projectGoldId")
    :param omprc_record: an NMDC omics processing record (needs "id")
    :return: an "insert" line item when the normalized GOLD project ID is not
        yet listed on the record (including when the slot is absent or None);
        None when the ID is already present. Callers must skip None.
    """
    gold_project_id = normalize_gold_id(gold_project["projectGoldId"])
    existing_ids = omprc_record.get("gold_sequencing_project_identifiers") or []
    if gold_project_id in existing_ids:
        # Already recorded: no change required.
        return None
    return ChangesheetLineItem(
        id=omprc_record["id"],
        action="insert",
        attribute="gold_sequencing_project_identifiers",
        value=gold_project_id + "|",
    )
204
+
205
+
206
@click.command()
@click.option("--study_id", default=GOLD_NEON_SOIL_STUDY_ID, help="GOLD study ID")
@click.option("--use_dev_api", is_flag=True, default=False, help="Use the dev API")
def generate_changesheet(study_id, use_dev_api):
    """
    Generate a changesheet for missing ecosystem data for NEON soils samples by:
    1. Retrieving GOLD biosamples for the given study
    2. Finding the corresponding NMDC biosamples and omics processing records for each GOLD biosample
    3. Comparing the GOLD biosample to the NMDC biosamples and omics processing records
    4. Generating a changesheet for the differences

    WARNING: This script is not idempotent. It will generate a new changesheet each time it is run.

    Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output

    :param study_id: The GOLD study ID
    :param use_dev_api: Use the dev API (default: False)
    :return:
    """
    start_time = time.time()
    logging.info("starting missing_neon_soils_ecosystem_data.py...")
    logging.info(f"study_id: {study_id}")

    # Credentials and endpoints come from the environment (.env is loaded at
    # import time by load_dotenv()).
    gold_api_client = GoldApiClient(
        base_url=os.getenv("GOLD_API_BASE_URL"),
        username=os.getenv("GOLD_API_USERNAME"),
        password=os.getenv("GOLD_API_PASSWORD"),
    )
    logging.info("connected to GOLD API...")

    if use_dev_api:
        base_url = os.getenv("API_HOST_DEV")
        logging.info("using dev API...")
    else:
        base_url = os.getenv("API_HOST")
        logging.info("using prod API...")

    runtime_api_user_client = RuntimeApiUserClient(
        base_url=base_url,
        username=os.getenv("API_QUERY_USER"),
        password=os.getenv("API_QUERY_PASS"),
    )
    logging.info("connected to NMDC API...")

    # Retrieve GOLD biosamples for the given study
    gold_biosamples = gold_api_client.fetch_biosamples_by_study(study_id)
    logging.info(f"retrieved {len(gold_biosamples)} biosamples from GOLD API...")

    # omics processing to biosamples map generated by a SPARQL query
    omprc_to_bs_map = read_omics_processing_to_biosample_map()

    changesheet = Changesheet(name=NAME)
    # For each GOLD biosample, find the corresponding NMDC biosamples
    nmdcbs_count = 0
    unfindable_goldbs_ids = []
    for goldbs in gold_biosamples:
        (
            nmdc_biosamples,
            omprc_records,
        ) = gold_biosample_to_nmdc_biosamples_and_omics_processing_records(
            runtime_api_user_client, omprc_to_bs_map, goldbs
        )
        if not nmdc_biosamples:
            logging.warning(
                f"no corresponding NMDC biosamples found for {goldbs['biosampleGoldId']}..."
            )
            unfindable_goldbs_ids.append(goldbs["biosampleGoldId"])
            continue
        logging.info(
            f"found {len(nmdc_biosamples)} corresponding NMDC biosamples for {goldbs['biosampleGoldId']}..."
        )
        nmdcbs_count += len(nmdc_biosamples)
        for nmdcbs in nmdc_biosamples:
            logging.info(f"nmdcbs: {nmdcbs['id']}")
            changesheet.line_items.extend(compare_biosamples(goldbs, nmdcbs))

        # Insert gold project id into omprc alternative identifiers
        gold_projects = gold_api_client.request(
            "/projects", params={"biosampleGoldId": goldbs["biosampleGoldId"]}
        )
        for gold_project in gold_projects:
            for omprc_record in omprc_records:
                line_item = compare_projects(gold_project, omprc_record)
                # compare_projects yields None when the project ID is already
                # present; None must not end up in the changesheet.
                if line_item is not None:
                    changesheet.line_items.append(line_item)

    logging.info(f"Processed {len(gold_biosamples)} GOLD biosamples...")
    logging.info(f"found {nmdcbs_count} corresponding NMDC biosamples...")
    logging.info(f"unfindable_count: {len(unfindable_goldbs_ids)}...")
    # Distinct loop variable: do not rebind the list being iterated.
    for unfindable_goldbs_id in unfindable_goldbs_ids:
        logging.info(f"unfindable_goldbs_id: {unfindable_goldbs_id}...")
    logging.info(f"changesheet has {len(changesheet.line_items)} line items...")

    changesheet.write_changesheet()

    logging.info("Validating changesheet...")
    is_valid_changesheet = changesheet.validate_changesheet(base_url)
    logging.info(f"Changesheet is valid: {is_valid_changesheet}")

    logging.info(
        f"missing_neon_soils_ecosystem_data.py completed in {time.time() - start_time} seconds..."
    )
308
+
309
+
310
+ if __name__ == "__main__":
311
+ generate_changesheet()
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env python3
2
+ # coding: utf-8
3
+ # nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py
4
+ """
5
+ neon_soils_add_ncbi_ids.py: Add NCBI biosample accessions to neon soils
6
+ biosamples, NCBI bioproject accessions to omics processing, and
7
+ NCBI Umbrella bioproject accession to neon soils study.
8
+ """
9
+ import logging
10
+ import time
11
+
12
+ import click
13
+ from dotenv import load_dotenv
14
+
15
+ from nmdc_runtime.site.changesheets.base import (
16
+ Changesheet,
17
+ ChangesheetLineItem,
18
+ get_gold_client,
19
+ get_runtime_client,
20
+ )
21
+
22
+ load_dotenv()
23
+ NAME = "neon_soils_add_ncbi_ids"
24
+ NMDC_STUDY_ID = "nmdc:sty-11-34xj1150"
25
+ UMBRELLA_BIOPROJECT_ACCESSION = "PRJNA1029061"
26
+
27
+ log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log"
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format="%(asctime)s %(levelname)s %(message)s",
31
+ filename=log_filename,
32
+ encoding="utf-8",
33
+ filemode="w",
34
+ )
35
+
36
+
37
+ def _get_change_for_biosample(biosample, ncbi_biosample_accession):
38
+ """
39
+ Get the changes for the given biosample
40
+ :param biosample: dict - the biosample
41
+ :param ncbi_biosample_accession: str - the NCBI BioSample accession
42
+ :return: list - the changes
43
+ """
44
+ ncbi_biosample_accessions = biosample.get("insdc_biosample_identifiers", [])
45
+ if ncbi_biosample_accession in ncbi_biosample_accessions:
46
+ return
47
+ biosample_id = biosample["id"]
48
+ logging.info(f"creating change for biosample_id: {biosample_id}")
49
+ return ChangesheetLineItem(
50
+ id=biosample["id"],
51
+ action="insert",
52
+ attribute="insdc_biosample_identifiers",
53
+ value="biosample:" + ncbi_biosample_accession + "|",
54
+ )
55
+
56
+
57
+ def _get_change_for_omics_processing(
58
+ omics_processing_record, ncbi_bioproject_accession
59
+ ):
60
+ """
61
+ Get the changes for the given omics_processing_record
62
+ :param omics_processing_record:
63
+ :param ncbi_bioproject_accession:
64
+ :return:
65
+ """
66
+ ncbi_bioproject_accessions = omics_processing_record.get(
67
+ "insdc_bioproject_identifiers", []
68
+ )
69
+ if ncbi_bioproject_accession in ncbi_bioproject_accessions:
70
+ return
71
+ omics_processing_id = omics_processing_record["id"]
72
+ logging.info(f"creating change for omics_processing_id: {omics_processing_id}")
73
+ return ChangesheetLineItem(
74
+ id=omics_processing_id,
75
+ action="insert",
76
+ attribute="insdc_bioproject_identifiers",
77
+ value="bioproject:" + ncbi_bioproject_accession + "|",
78
+ )
79
+
80
+
81
@click.command()
@click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID")
@click.option(
    "--use_dev_api/--use_prod_api",
    default=True,
    help="Use the dev API (default) or the prod API",
)
def generate_changesheet(study_id, use_dev_api):
    """
    Generate a changesheet for neon soils study and biosamples by:
    0. Changesheet line item: Umbrella BioProjectAccession to
       study.insdc_bioproject_identifiers
    1. Retrieving all gold_study_identifiers for the neon soils study
    2. For each gold_study_identifier, retrieve the GOLD projects
    3. For each GOLD project,
        A. retrieve the corresponding NMDC biosample(s). For each biosample,
            - Changesheet line item:NCBI BioSampleAccession to
            insdc_biosample_identifiers
        B. Retrieve the corresponding NMDC omics_processing. For each,
            - Changesheet line item:NCBI BioProjectAccession to
            insdc_bioproject_identifiers

    WARNING: This script is not idempotent. It will generate a new changesheet
    each time it is run.
    Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output

    :param study_id: The NMDC study ID
    :param use_dev_api: Use the dev API (default: True); pass --use_prod_api
        to target the prod API
    :return:
    """
    start_time = time.time()
    logging.info(f"Generating changesheet for {study_id}")
    logging.info(f"Using dev API: {use_dev_api}")

    # Initialize the NMDC API
    runtime_client = get_runtime_client(use_dev_api)

    # Initialize the GOLD API
    gold_client = get_gold_client()

    # Initialize the changesheet
    changesheet = Changesheet(name=NAME)

    # 1. Retrieve all gold_study_identifiers for the neon soils study
    logging.info(f"Retrieving gold_study_identifiers for {study_id}")
    res = runtime_client.request("GET", f"/studies/{study_id}")
    # NOTE(review): assumes the response exposes status_code like the
    # responses used by the sibling changesheet script - confirm.
    if res.status_code != 200:
        logging.error(f"Failed to retrieve NMDC study {study_id}: {res.status_code}")
        raise click.ClickException(
            f"Failed to retrieve NMDC study {study_id}: {res.status_code}"
        )
    nmdc_study = res.json()

    # 0. Umbrella BioProject accession on the study, unless already recorded
    umbrella_curie = "bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION
    if umbrella_curie not in (nmdc_study.get("insdc_bioproject_identifiers") or []):
        changesheet.line_items.append(
            ChangesheetLineItem(
                id=study_id,
                action="insert",
                attribute="insdc_bioproject_identifiers",
                value=umbrella_curie + "|",
            )
        )

    gold_study_identifiers = nmdc_study["gold_study_identifiers"]
    logging.info(f"gold_study_identifiers: {gold_study_identifiers}")
    gold_project_count = 0
    biosample_count = 0
    for gold_study_identifier in gold_study_identifiers:
        # 2. For each gold_study_identifier, retrieve the GOLD projects
        if gold_study_identifier == "gold:Gs0144570":
            # TODO verify that this one has already been done
            continue
        logging.info(
            f"Retrieving GOLD projects for gold_study_identifier: {gold_study_identifier}"
        )
        projects = gold_client.fetch_projects_by_study(gold_study_identifier)
        logging.info(f"Retrieved {len(projects)} projects")

        # 3. For each GOLD project,
        for project in projects:
            gold_project_count += 1
            project_gold_id = project["projectGoldId"]
            biosample_gold_id = project["biosampleGoldId"]
            # The NCBI accessions may be absent or null for a project; those
            # projects simply contribute no line item for that accession.
            ncbi_bioproject_accession = project.get("ncbiBioProjectAccession")
            ncbi_biosample_accession = project.get("ncbiBioSampleAccession")

            # A. retrieve the corresponding NMDC biosample(s)
            logging.info(
                f"Retrieving NMDC biosamples for biosample_gold_id: {biosample_gold_id}"
            )
            biosamples = runtime_client.get_biosamples_by_gold_biosample_id(
                biosample_gold_id
            )
            logging.info(f"Retrieved {len(biosamples)} biosamples")
            for biosample in biosamples:
                biosample_count += 1
                biosample_id = biosample["id"]
                logging.info(f"biosample_id: {biosample_id}")
                if not ncbi_biosample_accession:
                    logging.warning(
                        f"no NCBI BioSample accession for project: {project_gold_id}"
                    )
                    continue
                # NcbiBioSampleAccession to insdc_biosample_identifiers
                change = _get_change_for_biosample(biosample, ncbi_biosample_accession)
                if change:
                    changesheet.line_items.append(change)

            # B. Retrieve the corresponding NMDC omics_processing
            logging.info(
                f"Retrieving NMDC omics_processing for project_gold_id: {project_gold_id}"
            )
            omics_processing_records = (
                runtime_client.get_omics_processing_records_by_gold_project_id(
                    project_gold_id
                )
            )
            logging.info(f"Retrieved {len(omics_processing_records)} omics_processings")
            for omics_processing in omics_processing_records:
                omics_processing_id = omics_processing["id"]
                logging.info(f"omics_processing_id: {omics_processing_id}")
                if not ncbi_bioproject_accession:
                    logging.warning(
                        f"no NCBI BioProject accession for project: {project_gold_id}"
                    )
                    continue
                # NcbiBioProjectAccession to insdc_bioproject_identifiers
                change = _get_change_for_omics_processing(
                    omics_processing, ncbi_bioproject_accession
                )
                if change:
                    changesheet.line_items.append(change)

    logging.info(f"gold_project_count: {gold_project_count}")
    logging.info(f"biosample_count: {biosample_count}")
    logging.info(f"changesheet has {len(changesheet.line_items)} line items")

    # Write the changesheet
    changesheet.write_changesheet()

    # Validate the changesheet
    if changesheet.validate_changesheet(runtime_client.base_url):
        logging.info("Changesheet is valid")
    else:
        logging.error("Changesheet is invalid")

    logging.info(f"Completed in {time.time() - start_time} seconds")
207
+
208
+
209
+ if __name__ == "__main__":
210
+ generate_changesheet()
@@ -0,0 +1,53 @@
1
# Dagster instance configuration.
# NOTE(review): nesting below is reconstructed from a flattened diff view;
# confirm against the packaged dagster.yaml before applying.

# Scheduler, run coordinator and run launcher implementations.
scheduler:
  module: dagster.core.scheduler
  class: DagsterDaemonScheduler

run_coordinator:
  module: dagster.core.run_coordinator
  class: QueuedRunCoordinator

run_launcher:
  module: dagster.core.launcher
  class: DefaultRunLauncher

# Run, schedule and event-log storage all point at the same Postgres host
# (dagster-postgresql:5432). Username, password and database name are read
# from the DAGSTER_POSTGRES_USER / DAGSTER_POSTGRES_PASSWORD /
# DAGSTER_POSTGRES_DB environment variables.
run_storage:
  module: dagster_postgres.run_storage
  class: PostgresRunStorage
  config:
    postgres_db:
      hostname: dagster-postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432

schedule_storage:
  module: dagster_postgres.schedule_storage
  class: PostgresScheduleStorage
  config:
    postgres_db:
      hostname: dagster-postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432

event_log_storage:
  module: dagster_postgres.event_log
  class: PostgresEventLogStorage
  config:
    postgres_db:
      hostname: dagster-postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432
@@ -0,0 +1,29 @@
1
+ #!/bin/bash
2
+
3
+ set -euo pipefail
4
+
5
# file_env VAR [DEFAULT]
# Resolve VAR from exactly one of: the variable VAR itself, or the contents of
# the file named by VAR_FILE. Falls back to DEFAULT (empty if omitted).
# Exits with status 1 when both VAR and VAR_FILE are set. On success VAR is
# exported and VAR_FILE is unset.
file_env() {
	local name="$1"
	local file_name="${name}_FILE"
	local fallback="${2:-}"
	# Indirect expansion; ":-" keeps this safe under `set -u`.
	local direct_value="${!name:-}"
	local file_path="${!file_name:-}"

	if [ -n "$direct_value" ] && [ -n "$file_path" ]; then
		echo >&2 "error: both $name and $file_name are set (but are exclusive)"
		exit 1
	fi

	local resolved="$fallback"
	if [ -n "$direct_value" ]; then
		resolved="$direct_value"
	elif [ -n "$file_path" ]; then
		resolved="$(< "$file_path")"
	fi

	export "$name"="$resolved"
	unset "$file_name"
}
22
+
23
+ file_env "MONGO_PASSWORD"
24
+ file_env "DAGSTER_POSTGRES_PASSWORD"
25
+
26
+ # Note: The `--no-sync` flag has no effect when used outside of a project,
27
+ # so we omit it from this command. If we were to include it here, uv
28
+ # would display a warning saying exactly that.
29
+ exec uv run --active dagster-daemon run
@@ -0,0 +1,26 @@
1
+ #!/bin/bash
2
+
3
+ set -euo pipefail
4
+
5
# file_env VAR [DEFAULT]
# Export VAR using, in order of preference: the current value of VAR, the
# contents of the file whose path is in VAR_FILE, or DEFAULT (empty when not
# given). Setting both VAR and VAR_FILE is an error (exit 1). VAR_FILE is
# unset afterwards.
file_env() {
	local target="$1"
	local target_file="${target}_FILE"
	local default_value="${2:-}"
	# Indirect expansion; ":-" keeps this safe under `set -u`.
	local env_value="${!target:-}"
	local secret_path="${!target_file:-}"

	if [ -n "$env_value" ] && [ -n "$secret_path" ]; then
		echo >&2 "error: both $target and $target_file are set (but are exclusive)"
		exit 1
	fi

	local chosen="$default_value"
	if [ -n "$env_value" ]; then
		chosen="$env_value"
	elif [ -n "$secret_path" ]; then
		chosen="$(< "$secret_path")"
	fi

	export "$target"="$chosen"
	unset "$target_file"
}
22
+
23
+ file_env "MONGO_PASSWORD"
24
+ file_env "DAGSTER_POSTGRES_PASSWORD"
25
+
26
+ exec uv run --active --no-sync dagit -h 0.0.0.0 -p 3000 -w workspace.yaml --read-only
@@ -0,0 +1,29 @@
1
+ #!/bin/bash
2
+
3
+ set -euo pipefail
4
+
5
# file_env VAR [DEFAULT]
# Populate and export VAR from either VAR itself or the file referenced by
# VAR_FILE (the two are mutually exclusive: having both set exits with 1).
# When neither is set, DEFAULT is used (empty if omitted). VAR_FILE is unset
# once VAR has been exported.
file_env() {
	local var_name="$1"
	local file_var_name="${var_name}_FILE"
	local default_val="${2:-}"
	# Indirect expansion; ":-" keeps this safe under `set -u`.
	local current_val="${!var_name:-}"
	local source_file="${!file_var_name:-}"

	if [ -n "$current_val" ] && [ -n "$source_file" ]; then
		echo >&2 "error: both $var_name and $file_var_name are set (but are exclusive)"
		exit 1
	fi

	local final_val="$default_val"
	if [ -n "$current_val" ]; then
		final_val="$current_val"
	elif [ -n "$source_file" ]; then
		final_val="$(< "$source_file")"
	fi

	export "$var_name"="$final_val"
	unset "$file_var_name"
}
22
+
23
+ file_env "MONGO_PASSWORD"
24
+ file_env "DAGSTER_POSTGRES_PASSWORD"
25
+
26
+ # Note: The `--no-sync` flag has no effect when used outside of a project,
27
+ # so we omit it from this command. If we were to include it here, uv
28
+ # would display a warning saying exactly that.
29
+ exec uv run --active dagit -h 0.0.0.0 -p 3000 -w workspace.yaml