nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/Dockerfile +167 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/mongo.py +435 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +270 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +796 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +425 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +7 -8
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +633 -13
- nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
- nmdc_runtime/site/graphs.py +8 -22
- nmdc_runtime/site/ops.py +147 -181
- nmdc_runtime/site/repository.py +2 -112
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/gold_translator.py +4 -12
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +90 -48
- nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
- nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
- nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
- nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py
|
|
3
|
+
"""
|
|
4
|
+
missing_neon_soils_ecosystem_data.py: Create a changesheet for missing ecosystem data for NEON soils samples
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import time
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import List, Tuple
|
|
12
|
+
|
|
13
|
+
import click
|
|
14
|
+
from dotenv import load_dotenv
|
|
15
|
+
|
|
16
|
+
from nmdc_runtime.site.changesheets.base import (
|
|
17
|
+
Changesheet,
|
|
18
|
+
ChangesheetLineItem,
|
|
19
|
+
JSON_OBJECT,
|
|
20
|
+
)
|
|
21
|
+
from nmdc_runtime.site.normalization.gold import (
|
|
22
|
+
get_gold_biosample_name_suffix,
|
|
23
|
+
normalize_gold_id,
|
|
24
|
+
)
|
|
25
|
+
from nmdc_runtime.site.resources import GoldApiClient, RuntimeApiUserClient
|
|
26
|
+
|
|
27
|
+
# Load settings via python-dotenv before any os.getenv() lookups are made.
load_dotenv()
# GOLD study used as the default for the --study_id option.
GOLD_NEON_SOIL_STUDY_ID = "Gs0144570"
# Shared by the Changesheet name and the log-file name prefix.
NAME = "missing_neon_soils_ecosystem_data"
# omics processing to biosamples file in data directory
DATA_PATH = Path(__file__).parent.parent.joinpath("data")

# NOTE: logging is configured as an import-time side effect. The log file name
# is timestamped and relative, and filemode "w" truncates any existing file of
# the same name.
log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    filename=log_filename,
    encoding="utf-8",
    filemode="w",
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def read_omics_processing_to_biosample_map() -> dict:
    """
    Map omics processing IDs to the biosample IDs listed for them.

    Reads OmicsProcessing-to-catted-Biosamples.tsv from DATA_PATH. Lines that
    do not start with "nmdc:" (e.g. a header) are skipped. Every other line
    must hold exactly two tab-separated columns: the omics processing ID and
    a "|"-separated list of biosample IDs.

    :return: dict of omics processing ID -> list of biosample IDs
    :raises ValueError: if a data line does not have exactly two columns
    """
    filename = "OmicsProcessing-to-catted-Biosamples.tsv"
    omics_processing_to_biosamples = {}
    lines_read = 0
    lines_skipped = 0
    # Explicit encoding so parsing does not depend on the platform default.
    with open(DATA_PATH.joinpath(filename), encoding="utf-8") as f:
        for line in f:
            if not line.startswith("nmdc:"):
                lines_skipped += 1
                continue
            lines_read += 1
            omics_processing_id, biosample_ids = line.strip().split("\t")
            omics_processing_to_biosamples[omics_processing_id] = (
                biosample_ids.split("|")
            )
    logging.debug(f"Read {lines_read} lines from {filename}")
    logging.info(f"Skipped {lines_skipped} lines from {filename}")
    return omics_processing_to_biosamples
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def gold_biosample_to_nmdc_biosamples_and_omics_processing_records(
    runtime_api_client, omprc_to_bs_map, goldbs
) -> Tuple[List[JSON_OBJECT], List[JSON_OBJECT]]:
    """
    Collect the NMDC biosamples and omics processing records that correspond
    to one GOLD biosample.

    Biosamples are gathered two ways: by the normalized GOLD biosample ID, and
    by following omics processing records whose name contains the GOLD
    biosample name suffix through omprc_to_bs_map. The two result sets are
    concatenated as-is (no de-duplication).

    :param runtime_api_client: client used to query the NMDC runtime API
    :param omprc_to_bs_map: Dict of omics processing ID to biosample IDs
    :param goldbs: a GOLD biosample
    :return: (
        List of corresponding NMDC biosamples,
        List of corresponding NMDC omics processing records
    )
    """
    goldbs_id = normalize_gold_id(goldbs["biosampleGoldId"])
    goldbs_name_suffix = get_gold_biosample_name_suffix(goldbs["biosampleName"])
    logging.info(f"goldbs_id: {goldbs_id}")
    logging.info(f"goldbs_name_suffix: {goldbs_name_suffix}")

    # Route 1: NMDC biosamples that already carry the GOLD biosample ID.
    logging.info(f"Searching for NMDC biosamples with {goldbs_id}...")
    direct_matches = runtime_api_client.get_biosamples_by_gold_biosample_id(goldbs_id)
    logging.info(f"Found {len(direct_matches)} NMDC biosamples with {goldbs_id}...")
    matched_biosamples = list(direct_matches)

    # Route 2: omics processing records named after the GOLD biosample, then
    # the biosamples mapped to each of those records.
    logging.info(
        f"Searching for NMDC omics processing name containing {goldbs_name_suffix}..."
    )
    omprc_records = runtime_api_client.get_omics_processing_by_name(goldbs_name_suffix)
    for record in omprc_records:
        record_id = record["id"]
        logging.info(f"omprc_id: {record_id}")
        logging.info(
            f"Searching for NMDC biosamples with omics processing {record_id}..."
        )
        linked_ids = omprc_to_bs_map.get(record_id, [])
        logging.info(f"Found {len(linked_ids)} NMDC biosamples with {record_id}...")
        for linked_id in linked_ids:
            response = runtime_api_client.request("GET", f"/biosamples/{linked_id}")
            if response.status_code == 200:
                matched_biosamples.append(response.json())
            else:
                # A failed lookup is logged and skipped, not fatal.
                logging.error(
                    f"Failed to retrieve NMDC biosample {linked_id}: {response.status_code}"
                )

    logging.info(f"Found {len(matched_biosamples)} NMDC biosamples for {goldbs_id}...")
    return matched_biosamples, omprc_records
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def compare_biosamples(goldbs, nmdcbs) -> List[ChangesheetLineItem]:
    """
    Build the changesheet line items needed to bring one NMDC biosample in
    line with its GOLD biosample.

    :param goldbs: a GOLD biosample
    :param nmdcbs: an NMDC biosample
    :return: ecosystem-metadata line items followed by GOLD-identifier line items
    """
    # Missing or differing ecosystem metadata first ...
    ecosystem_items = _check_ecosystem_metadata(goldbs, nmdcbs)
    # ... then a missing GOLD biosample identifier.
    identifier_items = _check_gold_biosample_identifiers(goldbs, nmdcbs)
    return [*ecosystem_items, *identifier_items]
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _check_ecosystem_metadata(goldbs, nmdcbs) -> List[ChangesheetLineItem]:
    """
    Compare the ecosystem fields of a GOLD biosample and an NMDC biosample.

    For each ecosystem field that GOLD provides, an "update" line item is
    produced when the NMDC biosample lacks the field or holds a different
    value. Fields that GOLD does not provide (key absent or value None) are
    logged and skipped, so an empty value is never written over NMDC data.

    :param goldbs: a GOLD biosample
    :param nmdcbs: an NMDC biosample
    :return: list of "update" line items, possibly empty
    """
    # nmdc to gold ecosystem key map
    ecosystem_key_map = {
        "ecosystem": "ecosystem",
        "ecosystem_category": "ecosystemCategory",
        "ecosystem_type": "ecosystemType",
        "ecosystem_subtype": "ecosystemSubtype",
    }
    changesheet_line_items = []
    for nmdc_key, gold_key in ecosystem_key_map.items():
        gold_value = goldbs.get(gold_key)
        if gold_value is None:
            logging.warning(f"no {gold_key} for {goldbs['biosampleGoldId']}...")
            continue
        # One branch covers both "missing in NMDC" and "differs from GOLD".
        if nmdc_key not in nmdcbs or nmdcbs[nmdc_key] != gold_value:
            changesheet_line_items.append(
                ChangesheetLineItem(
                    id=nmdcbs["id"],
                    action="update",
                    attribute=nmdc_key,
                    value=gold_value,
                )
            )

    return changesheet_line_items
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _check_gold_biosample_identifiers(goldbs, nmdcbs) -> List[ChangesheetLineItem]:
    """
    Check that the NMDC biosample records the GOLD biosample's identifier.

    :param goldbs: a GOLD biosample
    :param nmdcbs: an NMDC biosample
    :return: a single "insert" line item when the normalized GOLD ID is not
        in gold_biosample_identifiers, otherwise an empty list
    """
    goldbs_id = normalize_gold_id(goldbs["biosampleGoldId"])
    # The slot may be absent (or empty) on biosamples that were matched via
    # omics processing records; treat that as "no identifiers recorded"
    # instead of raising KeyError.
    existing_ids = nmdcbs.get("gold_biosample_identifiers") or []
    if goldbs_id in existing_ids:
        return []
    return [
        ChangesheetLineItem(
            id=nmdcbs["id"],
            action="insert",
            attribute="gold_biosample_identifiers",
            value=goldbs_id + "|",
        )
    ]
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def compare_projects(gold_project, omprc_record) -> ChangesheetLineItem:
    """
    Check that an omics processing record lists a GOLD project's identifier.

    :param gold_project: a GOLD project
    :param omprc_record: an NMDC omics processing record
    :return: an "insert" line item for gold_sequencing_project_identifiers
        when the normalized GOLD project ID is not listed (or the slot is
        absent). NOTE: returns None when the ID is already listed, so callers
        must handle a None result.
    """
    slot = "gold_sequencing_project_identifiers"
    gold_project_id = normalize_gold_id(gold_project["projectGoldId"])

    # An absent slot is handled the same way as a slot that lacks the ID.
    if gold_project_id in omprc_record.get(slot, []):
        return None

    return ChangesheetLineItem(
        id=omprc_record["id"],
        action="insert",
        attribute=slot,
        value=gold_project_id + "|",
    )
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
@click.command()
@click.option("--study_id", default=GOLD_NEON_SOIL_STUDY_ID, help="GOLD study ID")
@click.option("--use_dev_api", is_flag=True, default=False, help="Use the dev API")
def generate_changesheet(study_id, use_dev_api):
    """
    Generate a changesheet for missing ecosystem data for NEON soils samples by:
    1. Retrieving GOLD biosamples for the given study
    2. Finding the corresponding NMDC biosamples and omics processing records for each GOLD biosample
    3. Comparing the GOLD biosample to the NMDC biosamples and omics processing records
    4. Generating a changesheet for the differences

    WARNING: This script is not idempotent. It will generate a new changesheet each time it is run.

    Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output

    :param study_id: The GOLD study ID
    :param use_dev_api: Use the dev API (default: False)
    :return:
    """
    start_time = time.time()
    logging.info("starting missing_neon_soils_ecosystem_data.py...")
    logging.info(f"study_id: {study_id}")

    # Credentials and hosts come from the environment (see load_dotenv()).
    gold_api_client = GoldApiClient(
        base_url=os.getenv("GOLD_API_BASE_URL"),
        username=os.getenv("GOLD_API_USERNAME"),
        password=os.getenv("GOLD_API_PASSWORD"),
    )
    logging.info("connected to GOLD API...")

    if use_dev_api:
        base_url = os.getenv("API_HOST_DEV")
        logging.info("using dev API...")
    else:
        base_url = os.getenv("API_HOST")
        logging.info("using prod API...")

    runtime_api_user_client = RuntimeApiUserClient(
        base_url=base_url,
        username=os.getenv("API_QUERY_USER"),
        password=os.getenv("API_QUERY_PASS"),
    )
    logging.info("connected to NMDC API...")

    # Retrieve GOLD biosamples for the given study
    gold_biosamples = gold_api_client.fetch_biosamples_by_study(study_id)
    logging.info(f"retrieved {len(gold_biosamples)} biosamples from GOLD API...")

    # omics processing to biosamples map generated by a SPARQL query
    omprc_to_bs_map = read_omics_processing_to_biosample_map()

    changesheet = Changesheet(name=NAME)
    # For each GOLD biosample, find the corresponding NMDC biosamples
    nmdcbs_count = 0
    unfindable_goldbs_ids = []
    for goldbs in gold_biosamples:
        (
            nmdc_biosamples,
            omprc_records,
        ) = gold_biosample_to_nmdc_biosamples_and_omics_processing_records(
            runtime_api_user_client, omprc_to_bs_map, goldbs
        )
        if not nmdc_biosamples:
            logging.warning(
                f"no corresponding NMDC biosamples found for {goldbs['biosampleGoldId']}..."
            )
            unfindable_goldbs_ids.append(goldbs["biosampleGoldId"])
            continue
        logging.info(
            f"found {len(nmdc_biosamples)} corresponding NMDC biosamples for {goldbs['biosampleGoldId']}..."
        )
        nmdcbs_count += len(nmdc_biosamples)
        for nmdcbs in nmdc_biosamples:
            logging.info(f"nmdcbs: {nmdcbs['id']}")
            changesheet.line_items.extend(compare_biosamples(goldbs, nmdcbs))

        # Insert gold project id into omprc alternative identifiers
        gold_projects = gold_api_client.request(
            "/projects", params={"biosampleGoldId": goldbs["biosampleGoldId"]}
        )
        for gold_project in gold_projects:
            for omprc_record in omprc_records:
                # compare_projects() returns None when the record already
                # lists the project ID; never add None to the changesheet.
                line_item = compare_projects(gold_project, omprc_record)
                if line_item is not None:
                    changesheet.line_items.append(line_item)

    logging.info(f"Processed {len(gold_biosamples)} GOLD biosamples...")
    logging.info(f"found {nmdcbs_count} corresponding NMDC biosamples...")
    logging.info(f"unfindable_count: {len(unfindable_goldbs_ids)}...")
    # Distinct loop name so the list itself is not rebound while iterating.
    for unfindable_goldbs_id in unfindable_goldbs_ids:
        logging.info(f"unfindable_goldbs_id: {unfindable_goldbs_id}...")
    logging.info(f"changesheet has {len(changesheet.line_items)} line items...")

    changesheet.write_changesheet()

    logging.info("Validating changesheet...")
    is_valid_changesheet = changesheet.validate_changesheet(base_url)
    logging.info(f"Changesheet is valid: {is_valid_changesheet}")

    logging.info(
        f"missing_neon_soils_ecosystem_data.py completed in {time.time() - start_time} seconds..."
    )
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    generate_changesheet()
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# coding: utf-8
|
|
3
|
+
# nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py
|
|
4
|
+
"""
|
|
5
|
+
neon_soils_add_ncbi_ids.py: Add NCBI biosample accessions to neon soils
|
|
6
|
+
biosamples, NCBI bioproject accessions to omics processing, and
|
|
7
|
+
NCBI Umbrella bioproject accession to neon soils study.
|
|
8
|
+
"""
|
|
9
|
+
import logging
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
from dotenv import load_dotenv
|
|
14
|
+
|
|
15
|
+
from nmdc_runtime.site.changesheets.base import (
|
|
16
|
+
Changesheet,
|
|
17
|
+
ChangesheetLineItem,
|
|
18
|
+
get_gold_client,
|
|
19
|
+
get_runtime_client,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Load settings via python-dotenv before the API clients are created.
load_dotenv()
# Shared by the Changesheet name and the log-file name prefix.
NAME = "neon_soils_add_ncbi_ids"
# Default for the --study_id option.
NMDC_STUDY_ID = "nmdc:sty-11-34xj1150"
# Inserted (with a "bioproject:" prefix) on the study itself.
UMBRELLA_BIOPROJECT_ACCESSION = "PRJNA1029061"

# NOTE: logging is configured as an import-time side effect. The log file name
# is timestamped and relative, and filemode "w" truncates any existing file of
# the same name.
log_filename = f"{NAME}-{time.strftime('%Y%m%d-%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    filename=log_filename,
    encoding="utf-8",
    filemode="w",
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_change_for_biosample(biosample, ncbi_biosample_accession):
|
|
38
|
+
"""
|
|
39
|
+
Get the changes for the given biosample
|
|
40
|
+
:param biosample: dict - the biosample
|
|
41
|
+
:param ncbi_biosample_accession: str - the NCBI BioSample accession
|
|
42
|
+
:return: list - the changes
|
|
43
|
+
"""
|
|
44
|
+
ncbi_biosample_accessions = biosample.get("insdc_biosample_identifiers", [])
|
|
45
|
+
if ncbi_biosample_accession in ncbi_biosample_accessions:
|
|
46
|
+
return
|
|
47
|
+
biosample_id = biosample["id"]
|
|
48
|
+
logging.info(f"creating change for biosample_id: {biosample_id}")
|
|
49
|
+
return ChangesheetLineItem(
|
|
50
|
+
id=biosample["id"],
|
|
51
|
+
action="insert",
|
|
52
|
+
attribute="insdc_biosample_identifiers",
|
|
53
|
+
value="biosample:" + ncbi_biosample_accession + "|",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _get_change_for_omics_processing(
|
|
58
|
+
omics_processing_record, ncbi_bioproject_accession
|
|
59
|
+
):
|
|
60
|
+
"""
|
|
61
|
+
Get the changes for the given omics_processing_record
|
|
62
|
+
:param omics_processing_record:
|
|
63
|
+
:param ncbi_bioproject_accession:
|
|
64
|
+
:return:
|
|
65
|
+
"""
|
|
66
|
+
ncbi_bioproject_accessions = omics_processing_record.get(
|
|
67
|
+
"insdc_bioproject_identifiers", []
|
|
68
|
+
)
|
|
69
|
+
if ncbi_bioproject_accession in ncbi_bioproject_accessions:
|
|
70
|
+
return
|
|
71
|
+
omics_processing_id = omics_processing_record["id"]
|
|
72
|
+
logging.info(f"creating change for omics_processing_id: {omics_processing_id}")
|
|
73
|
+
return ChangesheetLineItem(
|
|
74
|
+
id=omics_processing_id,
|
|
75
|
+
action="insert",
|
|
76
|
+
attribute="insdc_bioproject_identifiers",
|
|
77
|
+
value="bioproject:" + ncbi_bioproject_accession + "|",
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@click.command()
@click.option("--study_id", default=NMDC_STUDY_ID, help="NMDC study ID")
@click.option(
    "--use_dev_api/--use_prod_api",
    "use_dev_api",
    default=True,
    help="Use the dev API (default) or the prod API",
)
def generate_changesheet(study_id, use_dev_api):
    """
    Generate a changesheet for neon soils study and biosamples by:
    0. Changesheet line item: Umbrella BioProjectAccession to
        study.insdc_bioproject_identifiers
    1. Retrieving all gold_study_identifiers for the neon soils study
    2. For each gold_study_identifier, retrieve the GOLD projects
    3. For each GOLD project,
        A. retrieve the corresponding NMDC biosample(s). For each biosample,
            - Changesheet line item:NCBI BioSampleAccession to
            insdc_biosample_identifiers
        B. Retrieve the corresponding NMDC omics_processing. For each,
            - Changesheet line item:NCBI BioProjectAccession to
            insdc_bioproject_identifiers

    WARNING: This script is not idempotent. It will generate a new changesheet
    each time it is run.
    Changesheet is written to nmdc_runtime/site/changesheets/changesheets_output

    :param study_id: The NMDC study ID
    :param use_dev_api: Use the dev API (default: True); pass --use_prod_api
        to use the prod API
    :return:
    """
    start_time = time.time()
    logging.info(f"Generating changesheet for {study_id}")
    logging.info(f"Using dev API: {use_dev_api}")

    # Initialize the NMDC API
    runtime_client = get_runtime_client(use_dev_api)

    # Initialize the GOLD API
    gold_client = get_gold_client()

    # Initialize the changesheet
    changesheet = Changesheet(name=NAME)

    # 1. Retrieve all gold_study_identifiers for the neon soils study
    logging.info(f"Retrieving gold_study_identifiers for {study_id}")
    res = runtime_client.request("GET", f"/studies/{study_id}")
    nmdc_study = res.json()
    # 0. The umbrella BioProject is added to the study unconditionally.
    changesheet.line_items.append(
        ChangesheetLineItem(
            id=study_id,
            action="insert",
            attribute="insdc_bioproject_identifiers",
            value="bioproject:" + UMBRELLA_BIOPROJECT_ACCESSION + "|",
        )
    )

    gold_study_identifiers = nmdc_study["gold_study_identifiers"]
    logging.info(f"gold_study_identifiers: {gold_study_identifiers}")
    gold_project_count = 0
    biosample_count = 0
    for gold_study_identifier in gold_study_identifiers:
        # 2. For each gold_study_identifier, retrieve the GOLD projects
        if gold_study_identifier == "gold:Gs0144570":
            # TODO verify that this one has already been done
            continue
        logging.info(
            f"Retrieving GOLD projects for gold_study_identifier: {gold_study_identifier}"
        )
        projects = gold_client.fetch_projects_by_study(gold_study_identifier)
        logging.info(f"Retrieved {len(projects)} projects")

        # 3. For each GOLD project,
        for project in projects:
            gold_project_count += 1
            project_gold_id = project["projectGoldId"]
            biosample_gold_id = project["biosampleGoldId"]
            ncbi_bioproject_accession = project["ncbiBioProjectAccession"]
            ncbi_biosample_accession = project["ncbiBioSampleAccession"]

            # A. retrieve the corresponding NMDC biosample(s)
            logging.info(
                f"Retrieving NMDC biosamples for biosample_gold_id: {biosample_gold_id}"
            )
            biosamples = runtime_client.get_biosamples_by_gold_biosample_id(
                biosample_gold_id
            )
            logging.info(f"Retrieved {len(biosamples)} biosamples")
            for biosample in biosamples:
                biosample_count += 1
                biosample_id = biosample["id"]
                logging.info(f"biosample_id: {biosample_id}")
                # NcbiBioSampleAccession to insdc_biosample_identifiers
                change = _get_change_for_biosample(biosample, ncbi_biosample_accession)
                if change:
                    changesheet.line_items.append(change)

            # B. Retrieve the corresponding NMDC omics_processing
            logging.info(
                f"Retrieving NMDC omics_processing for project_gold_id: {project_gold_id}"
            )
            omics_processing_records = (
                runtime_client.get_omics_processing_records_by_gold_project_id(
                    project_gold_id
                )
            )
            logging.info(f"Retrieved {len(omics_processing_records)} omics_processings")
            for omics_processing in omics_processing_records:
                omics_processing_id = omics_processing["id"]
                logging.info(f"omics_processing_id: {omics_processing_id}")
                # NcbiBioProjectAccession to insdc_bioproject_identifiers
                change = _get_change_for_omics_processing(
                    omics_processing, ncbi_bioproject_accession
                )
                if change:
                    changesheet.line_items.append(change)

    logging.info(f"gold_project_count: {gold_project_count}")
    logging.info(f"biosample_count: {biosample_count}")
    logging.info(f"changesheet has {len(changesheet.line_items)} line items")

    # Write the changesheet
    changesheet.write_changesheet()

    # Validate the changesheet
    if changesheet.validate_changesheet(runtime_client.base_url):
        logging.info("Changesheet is valid")
    else:
        logging.error("Changesheet is invalid")

    logging.info(f"Completed in {time.time() - start_time} seconds")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    generate_changesheet()
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Dagster instance configuration.

# Schedules are run by the Dagster daemon.
scheduler:
  module: dagster.core.scheduler
  class: DagsterDaemonScheduler

# Runs are queued rather than launched immediately.
run_coordinator:
  module: dagster.core.run_coordinator
  class: QueuedRunCoordinator

run_launcher:
  module: dagster.core.launcher
  class: DefaultRunLauncher

# Run, schedule and event-log storage all use the same PostgreSQL server
# (host "dagster-postgresql", port 5432). User, password and database name
# are given as `env:` references to DAGSTER_POSTGRES_* variables rather than
# literal values.
run_storage:
  module: dagster_postgres.run_storage
  class: PostgresRunStorage
  config:
    postgres_db:
      hostname: dagster-postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432

schedule_storage:
  module: dagster_postgres.schedule_storage
  class: PostgresScheduleStorage
  config:
    postgres_db:
      hostname: dagster-postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432

event_log_storage:
  module: dagster_postgres.event_log
  class: PostgresEventLogStorage
  config:
    postgres_db:
      hostname: dagster-postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/bin/bash

# Abort on any error, on use of an unset variable, and on a failure anywhere
# in a pipeline.
set -euo pipefail

# file_env VAR [DEFAULT]
# Export VAR with a value taken from, in order of preference: the existing
# $VAR, the contents of the file named by $VAR_FILE, or DEFAULT (empty when
# omitted). Exits with status 1 if both $VAR and $VAR_FILE are set.
# VAR_FILE is unset afterwards.
file_env() {
	local var="$1"
	local fileVar="${var}_FILE"
	local def="${2:-}"
	# ${!name:-} is indirect expansion that stays safe under `set -u`.
	if [ "${!var:-}" ] && [ "${!fileVar:-}" ]; then
		echo >&2 "error: both $var and $fileVar are set (but are exclusive)"
		exit 1
	fi
	local val="$def"
	if [ "${!var:-}" ]; then
		val="${!var}"
	elif [ "${!fileVar:-}" ]; then
		# Read the whole file named by VAR_FILE as the value.
		val="$(< "${!fileVar}")"
	fi
	export "$var"="$val"
	unset "$fileVar"
}

# Allow these passwords to be supplied directly or via *_FILE paths.
file_env "MONGO_PASSWORD"
file_env "DAGSTER_POSTGRES_PASSWORD"

# Replace this shell with the Dagster daemon process.
exec uv run --active dagster-daemon run
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/bin/bash

# Abort on any error, on use of an unset variable, and on a failure anywhere
# in a pipeline.
set -euo pipefail

# file_env VAR [DEFAULT]
# Export VAR with a value taken from, in order of preference: the existing
# $VAR, the contents of the file named by $VAR_FILE, or DEFAULT (empty when
# omitted). Exits with status 1 if both $VAR and $VAR_FILE are set.
# VAR_FILE is unset afterwards.
file_env() {
	local var="$1"
	local fileVar="${var}_FILE"
	local def="${2:-}"
	# ${!name:-} is indirect expansion that stays safe under `set -u`.
	if [ "${!var:-}" ] && [ "${!fileVar:-}" ]; then
		echo >&2 "error: both $var and $fileVar are set (but are exclusive)"
		exit 1
	fi
	local val="$def"
	if [ "${!var:-}" ]; then
		val="${!var}"
	elif [ "${!fileVar:-}" ]; then
		# Read the whole file named by VAR_FILE as the value.
		val="$(< "${!fileVar}")"
	fi
	export "$var"="$val"
	unset "$fileVar"
}

# Allow these passwords to be supplied directly or via *_FILE paths.
file_env "MONGO_PASSWORD"
file_env "DAGSTER_POSTGRES_PASSWORD"

# Replace this shell with dagit, started with workspace.yaml and the
# --read-only flag (presumably host 0.0.0.0, port 3000 -- confirm against
# the dagit CLI).
exec uv run --active dagit -h 0.0.0.0 -p 3000 -w workspace.yaml --read-only
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/bin/bash

# Abort on any error, on use of an unset variable, and on a failure anywhere
# in a pipeline.
set -euo pipefail

# file_env VAR [DEFAULT]
# Export VAR with a value taken from, in order of preference: the existing
# $VAR, the contents of the file named by $VAR_FILE, or DEFAULT (empty when
# omitted). Exits with status 1 if both $VAR and $VAR_FILE are set.
# VAR_FILE is unset afterwards.
file_env() {
	local var="$1"
	local fileVar="${var}_FILE"
	local def="${2:-}"
	# ${!name:-} is indirect expansion that stays safe under `set -u`.
	if [ "${!var:-}" ] && [ "${!fileVar:-}" ]; then
		echo >&2 "error: both $var and $fileVar are set (but are exclusive)"
		exit 1
	fi
	local val="$def"
	if [ "${!var:-}" ]; then
		val="${!var}"
	elif [ "${!fileVar:-}" ]; then
		# Read the whole file named by VAR_FILE as the value.
		val="$(< "${!fileVar}")"
	fi
	export "$var"="$val"
	unset "$fileVar"
}

# Allow these passwords to be supplied directly or via *_FILE paths.
file_env "MONGO_PASSWORD"
file_env "DAGSTER_POSTGRES_PASSWORD"

# Replace this shell with dagit, started with workspace.yaml (presumably
# host 0.0.0.0, port 3000 -- confirm against the dagit CLI).
exec uv run --active dagit -h 0.0.0.0 -p 3000 -w workspace.yaml
|