nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/config.py +57 -1
- nmdc_runtime/mongo_util.py +90 -0
- nmdc_runtime/site/export/ncbi_xml.py +98 -27
- nmdc_runtime/site/export/ncbi_xml_utils.py +27 -25
- nmdc_runtime/site/graphs.py +72 -9
- nmdc_runtime/site/ops.py +408 -65
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +107 -6
- nmdc_runtime/site/resources.py +17 -4
- nmdc_runtime/site/translation/gold_translator.py +18 -9
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -0
- nmdc_runtime/site/translation/neon_soil_translator.py +1 -0
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -0
- nmdc_runtime/site/translation/submission_portal_translator.py +62 -0
- nmdc_runtime/util.py +53 -267
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/METADATA +18 -3
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/RECORD +21 -20
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from functools import lru_cache
|
|
2
|
-
from typing import Any, Dict, List
|
|
2
|
+
from typing import Any, Dict, List, Union
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from nmdc_runtime.site.resources import (
|
|
5
5
|
RuntimeApiUserClient,
|
|
@@ -18,6 +18,8 @@ class DatabaseUpdater:
|
|
|
18
18
|
gold_api_client: GoldApiClient,
|
|
19
19
|
study_id: str,
|
|
20
20
|
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
21
|
+
include_field_site_info: bool = False,
|
|
22
|
+
enable_biosample_filtering: bool = True,
|
|
21
23
|
):
|
|
22
24
|
"""This class serves as an API for repairing connections in the database by
|
|
23
25
|
adding records that are essentially missing "links"/"connections". As we identify
|
|
@@ -39,6 +41,8 @@ class DatabaseUpdater:
|
|
|
39
41
|
self.gold_api_client = gold_api_client
|
|
40
42
|
self.study_id = study_id
|
|
41
43
|
self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
|
|
44
|
+
self.include_field_site_info = include_field_site_info
|
|
45
|
+
self.enable_biosample_filtering = enable_biosample_filtering
|
|
42
46
|
|
|
43
47
|
@lru_cache
|
|
44
48
|
def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
|
|
@@ -95,6 +99,8 @@ class DatabaseUpdater:
|
|
|
95
99
|
biosamples=all_gold_biosamples,
|
|
96
100
|
projects=all_gold_projects,
|
|
97
101
|
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
102
|
+
include_field_site_info=self.include_field_site_info,
|
|
103
|
+
enable_biosample_filtering=self.enable_biosample_filtering,
|
|
98
104
|
)
|
|
99
105
|
|
|
100
106
|
# The GoldStudyTranslator class has some pre-processing logic which filters out
|
|
@@ -214,6 +220,8 @@ class DatabaseUpdater:
|
|
|
214
220
|
projects=gold_sequencing_projects_for_study,
|
|
215
221
|
analysis_projects=gold_analysis_projects_for_study,
|
|
216
222
|
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
223
|
+
include_field_site_info=self.include_field_site_info,
|
|
224
|
+
enable_biosample_filtering=self.enable_biosample_filtering,
|
|
217
225
|
)
|
|
218
226
|
|
|
219
227
|
translated_biosamples = gold_study_translator.biosamples
|
|
@@ -240,3 +248,204 @@ class DatabaseUpdater:
|
|
|
240
248
|
]
|
|
241
249
|
|
|
242
250
|
return database
|
|
251
|
+
|
|
252
|
+
def queries_run_script_to_update_insdc_identifiers(
    self,
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """Build a `/queries:run`-compatible update script that asserts INSDC identifiers.

    The script adds values to the `insdc_biosample_identifiers` field of
    biosample_set records and to the `insdc_bioproject_identifiers` field of
    data_generation_set records belonging to `self.study_id`.

    The values come from the `ncbiBioSampleAccession` and
    `ncbiBioProjectAccession` fields of the GOLD projects returned by
    `self.gold_api_client.fetch_projects_by_biosample` for every GOLD biosample
    referenced by the study's biosamples. Only GOLD projects reached this way
    are considered when updating data_generation_set records.

    A record gets an update entry only when at least one identifier would be
    added. Identifiers already on the record are kept, in their current order,
    and new ones are appended in the order they were discovered.

    :return: A single `{"update": "biosample_set", ...}` command when there are
        no data_generation_set updates; otherwise a list holding the
        biosample_set command followed by the data_generation_set command.
    """

    def _as_curie(prefix: str, accession: str) -> str:
        # Normalise whitespace and avoid double-prefixing a value that is
        # already written as "<prefix>:<accession>".
        accession = accession.strip()
        if accession.startswith(f"{prefix}:"):
            return accession
        return f"{prefix}:{accession}"

    # Fetch all biosamples and data_generation records associated with the study.
    biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
        self.study_id
    )
    data_generation_set = (
        self.runtime_api_user_client.get_data_generation_records_for_study(
            self.study_id
        )
    )

    biosample_updates: List[Dict[str, Any]] = []
    data_generation_updates: List[Dict[str, Any]] = []

    # GOLD project id (without the "gold:" prefix) -> "bioproject:<accession>".
    gold_project_to_bioproject: Dict[str, str] = {}

    # Pass 1: biosamples -> insdc_biosample_identifiers (and collect the
    # GOLD project -> BioProject mapping needed by pass 2).
    for biosample in biosample_set:
        gold_biosample_identifiers = biosample.get("gold_biosample_identifiers", [])
        biosample_id = biosample.get("id")
        if not gold_biosample_identifiers or not biosample_id:
            continue

        # De-duplicate while preserving the order already on the record.
        existing_identifiers = list(
            dict.fromkeys(biosample.get("insdc_biosample_identifiers", []))
        )
        merged_identifiers = list(existing_identifiers)
        seen_identifiers = set(existing_identifiers)

        for gold_biosample_id in gold_biosample_identifiers:
            normalized_id = gold_biosample_id.replace("gold:", "")
            gold_projects = self.gold_api_client.fetch_projects_by_biosample(
                normalized_id
            )

            for project in gold_projects:
                biosample_accession = project.get("ncbiBioSampleAccession")
                if biosample_accession and biosample_accession.strip():
                    # Compare in prefixed form: stored identifiers carry the
                    # "biosample:" prefix, GOLD accessions do not.
                    curie = _as_curie("biosample", biosample_accession)
                    if curie not in seen_identifiers:
                        seen_identifiers.add(curie)
                        merged_identifiers.append(curie)

                project_gold_id = project.get("projectGoldId")
                bioproject_accession = project.get("ncbiBioProjectAccession")
                if (
                    project_gold_id
                    and bioproject_accession
                    and bioproject_accession.strip()
                ):
                    gold_project_to_bioproject[project_gold_id] = _as_curie(
                        "bioproject", bioproject_accession
                    )

        # Only emit an update when something new would actually be added.
        if len(merged_identifiers) > len(existing_identifiers):
            biosample_updates.append(
                {
                    "q": {"id": biosample_id},
                    "u": {
                        "$set": {"insdc_biosample_identifiers": merged_identifiers}
                    },
                }
            )

    # Pass 2: data_generation records -> insdc_bioproject_identifiers.
    for data_generation in data_generation_set:
        data_generation_id = data_generation.get("id")
        if not data_generation_id:
            continue

        existing_identifiers = list(
            dict.fromkeys(data_generation.get("insdc_bioproject_identifiers", []))
        )
        merged_identifiers = list(existing_identifiers)
        seen_identifiers = set(existing_identifiers)

        for gold_project_id in data_generation.get(
            "gold_sequencing_project_identifiers", []
        ):
            curie = gold_project_to_bioproject.get(
                gold_project_id.replace("gold:", "")
            )
            if curie is not None and curie not in seen_identifiers:
                seen_identifiers.add(curie)
                merged_identifiers.append(curie)

        if len(merged_identifiers) > len(existing_identifiers):
            data_generation_updates.append(
                {
                    "q": {"id": data_generation_id},
                    "u": {
                        "$set": {"insdc_bioproject_identifiers": merged_identifiers}
                    },
                }
            )

    # The return shape depends on whether there are data_generation_set updates.
    if data_generation_updates:
        return [
            {"update": "biosample_set", "updates": biosample_updates},
            {"update": "data_generation_set", "updates": data_generation_updates},
        ]
    return {"update": "biosample_set", "updates": biosample_updates}
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -14,6 +14,7 @@ from dagster import (
|
|
|
14
14
|
DagsterRunStatus,
|
|
15
15
|
RunStatusSensorContext,
|
|
16
16
|
DefaultSensorStatus,
|
|
17
|
+
in_process_executor,
|
|
17
18
|
)
|
|
18
19
|
from starlette import status
|
|
19
20
|
from toolz import merge, get_in
|
|
@@ -44,8 +45,10 @@ from nmdc_runtime.site.graphs import (
|
|
|
44
45
|
ingest_neon_benthic_metadata,
|
|
45
46
|
ingest_neon_surface_water_metadata,
|
|
46
47
|
ensure_alldocs,
|
|
48
|
+
run_ontology_load,
|
|
47
49
|
nmdc_study_to_ncbi_submission_export,
|
|
48
50
|
generate_data_generation_set_for_biosamples_in_nmdc_study,
|
|
51
|
+
generate_update_script_for_insdc_biosample_identifiers,
|
|
49
52
|
)
|
|
50
53
|
from nmdc_runtime.site.resources import (
|
|
51
54
|
get_mongo,
|
|
@@ -123,6 +126,55 @@ ensure_alldocs_daily = ScheduleDefinition(
|
|
|
123
126
|
)
|
|
124
127
|
|
|
125
128
|
|
|
129
|
+
def _weekly_ontology_load_schedule(
    source_ontology: str, cron_schedule: str
) -> ScheduleDefinition:
    """Create the weekly schedule that runs `run_ontology_load` for one ontology.

    :param source_ontology: Value passed as the `source_ontology` config of the
        `load_ontology` op; also embedded in the schedule and job names.
    :param cron_schedule: Cron expression, interpreted in the
        "America/New_York" timezone.
    :return: A schedule named `weekly_load_<source_ontology>_ontology` whose job
        is named `scheduled_<source_ontology>_ontology_load`.
    """
    return ScheduleDefinition(
        name=f"weekly_load_{source_ontology}_ontology",
        cron_schedule=cron_schedule,
        execution_timezone="America/New_York",
        job=run_ontology_load.to_job(
            name=f"scheduled_{source_ontology}_ontology_load",
            config=unfreeze(
                merge(
                    run_config_frozen__normal_env,
                    {
                        "ops": {
                            "load_ontology": {
                                "config": {"source_ontology": source_ontology}
                            }
                        }
                    },
                )
            ),
            resource_defs=resource_defs,
        ),
    )


# Each ontology has its own Monday cron slot (07:00, 08:00 and 09:00).
load_envo_ontology_weekly = _weekly_ontology_load_schedule("envo", "0 7 * * 1")

load_uberon_ontology_weekly = _weekly_ontology_load_schedule("uberon", "0 8 * * 1")

load_po_ontology_weekly = _weekly_ontology_load_schedule("po", "0 9 * * 1")
|
|
176
|
+
|
|
177
|
+
|
|
126
178
|
def asset_materialization_metadata(asset_event, key):
|
|
127
179
|
"""Get metadata from an asset materialization event.
|
|
128
180
|
|
|
@@ -411,11 +463,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
|
|
|
411
463
|
yield SkipReason("; ".join(skip_notes))
|
|
412
464
|
|
|
413
465
|
|
|
414
|
-
# TODO ensure data_object_type values from file_type_enum
|
|
415
|
-
# see /metadata-translation/notebooks/202106_curation_updates.ipynb
|
|
416
|
-
# for details ("Create file_type_enum collection" section).
|
|
417
|
-
|
|
418
|
-
|
|
419
466
|
@sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
|
|
420
467
|
def done_object_put_ops(_context):
|
|
421
468
|
client = get_runtime_api_site_client(run_config_frozen__normal_env)
|
|
@@ -462,7 +509,13 @@ def repo():
|
|
|
462
509
|
export_study_biosamples_metadata.to_job(**preset_normal),
|
|
463
510
|
ensure_alldocs.to_job(**preset_normal),
|
|
464
511
|
]
|
|
465
|
-
schedules = [
|
|
512
|
+
schedules = [
|
|
513
|
+
housekeeping_weekly,
|
|
514
|
+
ensure_alldocs_daily,
|
|
515
|
+
load_envo_ontology_weekly,
|
|
516
|
+
load_uberon_ontology_weekly,
|
|
517
|
+
load_po_ontology_weekly,
|
|
518
|
+
]
|
|
466
519
|
sensors = [
|
|
467
520
|
done_object_put_ops,
|
|
468
521
|
ensure_gold_translation_job,
|
|
@@ -516,6 +569,7 @@ def biosample_submission_ingest():
|
|
|
516
569
|
"study_type": "research_study",
|
|
517
570
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
518
571
|
"include_field_site_info": False,
|
|
572
|
+
"enable_biosample_filtering": True,
|
|
519
573
|
},
|
|
520
574
|
},
|
|
521
575
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -960,6 +1014,8 @@ def database_records_stitching():
|
|
|
960
1014
|
"config": {
|
|
961
1015
|
"nmdc_study_id": "",
|
|
962
1016
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1017
|
+
"include_field_site_info": False,
|
|
1018
|
+
"enable_biosample_filtering": True,
|
|
963
1019
|
}
|
|
964
1020
|
},
|
|
965
1021
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -1002,12 +1058,57 @@ def database_records_stitching():
|
|
|
1002
1058
|
"config": {
|
|
1003
1059
|
"nmdc_study_id": "",
|
|
1004
1060
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1061
|
+
"include_field_site_info": False,
|
|
1062
|
+
"enable_biosample_filtering": True,
|
|
1005
1063
|
}
|
|
1006
1064
|
},
|
|
1007
1065
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
1008
1066
|
},
|
|
1009
1067
|
},
|
|
1010
1068
|
),
|
|
1069
|
+
generate_update_script_for_insdc_biosample_identifiers.to_job(
|
|
1070
|
+
description="This job generates a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.",
|
|
1071
|
+
resource_defs=resource_defs,
|
|
1072
|
+
config={
|
|
1073
|
+
"resources": merge(
|
|
1074
|
+
unfreeze(normal_resources),
|
|
1075
|
+
{
|
|
1076
|
+
"runtime_api_user_client": {
|
|
1077
|
+
"config": {
|
|
1078
|
+
"base_url": {"env": "API_HOST"},
|
|
1079
|
+
"username": {"env": "API_ADMIN_USER"},
|
|
1080
|
+
"password": {"env": "API_ADMIN_PASS"},
|
|
1081
|
+
},
|
|
1082
|
+
},
|
|
1083
|
+
"runtime_api_site_client": {
|
|
1084
|
+
"config": {
|
|
1085
|
+
"base_url": {"env": "API_HOST"},
|
|
1086
|
+
"client_id": {"env": "API_SITE_CLIENT_ID"},
|
|
1087
|
+
"client_secret": {"env": "API_SITE_CLIENT_SECRET"},
|
|
1088
|
+
"site_id": {"env": "API_SITE_ID"},
|
|
1089
|
+
},
|
|
1090
|
+
},
|
|
1091
|
+
"gold_api_client": {
|
|
1092
|
+
"config": {
|
|
1093
|
+
"base_url": {"env": "GOLD_API_BASE_URL"},
|
|
1094
|
+
"username": {"env": "GOLD_API_USERNAME"},
|
|
1095
|
+
"password": {"env": "GOLD_API_PASSWORD"},
|
|
1096
|
+
},
|
|
1097
|
+
},
|
|
1098
|
+
},
|
|
1099
|
+
),
|
|
1100
|
+
"ops": {
|
|
1101
|
+
"get_database_updater_inputs": {
|
|
1102
|
+
"config": {
|
|
1103
|
+
"nmdc_study_id": "",
|
|
1104
|
+
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1105
|
+
"include_field_site_info": False,
|
|
1106
|
+
"enable_biosample_filtering": True,
|
|
1107
|
+
}
|
|
1108
|
+
},
|
|
1109
|
+
},
|
|
1110
|
+
},
|
|
1111
|
+
),
|
|
1011
1112
|
]
|
|
1012
1113
|
|
|
1013
1114
|
|
nmdc_runtime/site/resources.py
CHANGED
|
@@ -109,7 +109,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
109
109
|
},
|
|
110
110
|
)
|
|
111
111
|
response.raise_for_status()
|
|
112
|
-
return response.json()["cursor"]["
|
|
112
|
+
return response.json()["cursor"]["batch"]
|
|
113
113
|
|
|
114
114
|
def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
|
|
115
115
|
gold_project_id = normalize_gold_id(gold_project_id)
|
|
@@ -126,7 +126,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
126
126
|
},
|
|
127
127
|
)
|
|
128
128
|
response.raise_for_status()
|
|
129
|
-
return response.json()["cursor"]["
|
|
129
|
+
return response.json()["cursor"]["batch"]
|
|
130
130
|
|
|
131
131
|
def get_biosamples_for_study(self, study_id: str):
|
|
132
132
|
# TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
|
|
@@ -147,6 +147,19 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
147
147
|
response.raise_for_status()
|
|
148
148
|
return response.json()["resources"]
|
|
149
149
|
|
|
150
|
+
def get_data_generation_records_for_study(self, study_id: str):
    """Fetch data_generation_set records whose `associated_studies` match a study.

    Issues a GET request to `/nmdcschema/data_generation_set` with a JSON-encoded
    filter on `associated_studies`.

    :param study_id: Identifier of the study to filter on.
    :return: The value under the `resources` key of the JSON response.
    :raises: Whatever `response.raise_for_status()` raises for an error response.
    """
    # TODO: a large max_page_size is used to avoid pagination; only one page is
    # read, so records beyond that page size would not be returned.
    response = self.request(
        "GET",
        "/nmdcschema/data_generation_set",
        {
            "filter": json.dumps({"associated_studies": study_id}),
            "max_page_size": 10000,
        },
    )
    response.raise_for_status()
    return response.json()["resources"]
|
|
162
|
+
|
|
150
163
|
def get_omics_processing_by_name(self, name: str):
|
|
151
164
|
response = self.request(
|
|
152
165
|
"POST",
|
|
@@ -157,7 +170,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
157
170
|
},
|
|
158
171
|
)
|
|
159
172
|
response.raise_for_status()
|
|
160
|
-
return response.json()["cursor"]["
|
|
173
|
+
return response.json()["cursor"]["batch"]
|
|
161
174
|
|
|
162
175
|
def get_study(self, study_id: str):
|
|
163
176
|
response = self.request(
|
|
@@ -169,7 +182,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
169
182
|
},
|
|
170
183
|
)
|
|
171
184
|
response.raise_for_status()
|
|
172
|
-
return response.json()["cursor"]["
|
|
185
|
+
return response.json()["cursor"]["batch"]
|
|
173
186
|
|
|
174
187
|
|
|
175
188
|
class RuntimeApiSiteClient(RuntimeApiClient):
|
|
@@ -45,6 +45,7 @@ class GoldStudyTranslator(Translator):
|
|
|
45
45
|
analysis_projects: List[JSON_OBJECT] = [],
|
|
46
46
|
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
47
47
|
include_field_site_info: bool = False,
|
|
48
|
+
enable_biosample_filtering: bool = True,
|
|
48
49
|
*args,
|
|
49
50
|
**kwargs,
|
|
50
51
|
) -> None:
|
|
@@ -53,15 +54,20 @@ class GoldStudyTranslator(Translator):
|
|
|
53
54
|
self.study = study
|
|
54
55
|
self.study_type = nmdc.StudyCategoryEnum(study_type)
|
|
55
56
|
self.include_field_site_info = include_field_site_info
|
|
57
|
+
self.enable_biosample_filtering = enable_biosample_filtering
|
|
56
58
|
# Filter biosamples to only those with `sequencingStrategy` of
|
|
57
|
-
# "Metagenome" or "Metatranscriptome"
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
59
|
+
# "Metagenome" or "Metatranscriptome" if filtering is enabled
|
|
60
|
+
if enable_biosample_filtering:
|
|
61
|
+
self.biosamples = [
|
|
62
|
+
biosample
|
|
63
|
+
for biosample in biosamples
|
|
64
|
+
if any(
|
|
65
|
+
_is_valid_project(project)
|
|
66
|
+
for project in biosample.get("projects", [])
|
|
67
|
+
)
|
|
68
|
+
]
|
|
69
|
+
else:
|
|
70
|
+
self.biosamples = biosamples
|
|
65
71
|
# Fetch the valid projectGoldIds that are associated with filtered
|
|
66
72
|
# biosamples on their `projects` field
|
|
67
73
|
valid_project_ids = {
|
|
@@ -116,6 +122,9 @@ class GoldStudyTranslator(Translator):
|
|
|
116
122
|
:param gold_entity: GOLD entity object
|
|
117
123
|
:return: PersonValue corresponding to the first PI in the `contacts` field
|
|
118
124
|
"""
|
|
125
|
+
if "contacts" not in gold_entity:
|
|
126
|
+
return None
|
|
127
|
+
|
|
119
128
|
pi_dict = next(
|
|
120
129
|
(
|
|
121
130
|
contact
|
|
@@ -169,7 +178,7 @@ class GoldStudyTranslator(Translator):
|
|
|
169
178
|
project["ncbiBioSampleAccession"], default_prefix="biosample"
|
|
170
179
|
)
|
|
171
180
|
for project in biosample_projects
|
|
172
|
-
if project
|
|
181
|
+
if project.get("ncbiBioSampleAccession")
|
|
173
182
|
]
|
|
174
183
|
|
|
175
184
|
def _get_samp_taxon_id(
|
|
@@ -349,6 +349,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
349
349
|
description=f"sequencing results for {basename}",
|
|
350
350
|
type="nmdc:DataObject",
|
|
351
351
|
data_object_type=do_type,
|
|
352
|
+
data_category=nmdc.DataCategoryEnum.instrument_data.text,
|
|
352
353
|
in_manifest=manifest_id,
|
|
353
354
|
)
|
|
354
355
|
|
|
@@ -397,6 +397,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
397
397
|
description=f"sequencing results for {basename}",
|
|
398
398
|
type="nmdc:DataObject",
|
|
399
399
|
data_object_type=do_type,
|
|
400
|
+
data_category=nmdc.DataCategoryEnum.instrument_data.text,
|
|
400
401
|
in_manifest=manifest_id,
|
|
401
402
|
)
|
|
402
403
|
|
|
@@ -47,6 +47,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
|
|
|
47
47
|
(INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
+
# Units to apply to slots for which the unit cannot be parsed out of the raw
# submitted value. The outer key is a class name, the inner key is a slot name
# on that class, and the value is the unit string to use.
# NOTE(review): "m" presumably means metres — confirm against the schema.
UNIT_OVERRIDES: dict[str, dict[str, str]] = {
    "Biosample": {
        "depth": "m",
    }
}
|
|
55
|
+
|
|
50
56
|
|
|
51
57
|
class EnvironmentPackage(Enum):
|
|
52
58
|
r"""
|
|
@@ -475,6 +481,50 @@ class SubmissionPortalTranslator(Translator):
|
|
|
475
481
|
|
|
476
482
|
return value
|
|
477
483
|
|
|
484
|
+
def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
    """Translate the DOIs of a submission portal entry into `nmdc.Doi` objects.

    Entries under `studyForm.dataDois` become `dataset_doi` records and entries
    under `multiOmicsForm.awardDois` become `award_doi` records, in that order.
    Each DOI value is passed through `self._ensure_curie` with a default prefix
    of "doi".

    :param metadata_submission: submission portal entry
    :return: list of nmdc.Doi objects, or None when no DOIs were found
    """
    dataset_entries = self._get_from(metadata_submission, ["studyForm", "dataDois"])
    award_entries = self._get_from(
        metadata_submission, ["multiOmicsForm", "awardDois"]
    )

    study_dois = []
    for category, entries in (
        ("dataset_doi", dataset_entries),
        ("award_doi", award_entries),
    ):
        # A missing or empty DOI list contributes nothing.
        if not entries:
            continue
        for entry in entries:
            study_dois.append(
                nmdc.Doi(
                    doi_category=category,
                    doi_provider=entry["provider"],
                    doi_value=self._ensure_curie(
                        entry["value"], default_prefix="doi"
                    ),
                    type="nmdc:Doi",
                )
            )

    # Callers receive None rather than an empty list.
    return study_dois or None
|
|
527
|
+
|
|
478
528
|
def _get_data_objects_from_fields(
|
|
479
529
|
self,
|
|
480
530
|
sample_data: JSON_OBJECT,
|
|
@@ -591,6 +641,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
591
641
|
websites=self._get_from(
|
|
592
642
|
metadata_submission, ["studyForm", "linkOutWebpage"]
|
|
593
643
|
),
|
|
644
|
+
associated_dois=self._get_study_dois(metadata_submission),
|
|
594
645
|
)
|
|
595
646
|
|
|
596
647
|
def _transform_value_for_slot(
|
|
@@ -660,6 +711,17 @@ class SubmissionPortalTranslator(Translator):
|
|
|
660
711
|
logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
|
|
661
712
|
continue
|
|
662
713
|
|
|
714
|
+
# This step handles cases where the submission portal/schema instructs a user to
|
|
715
|
+
# provide a value in a specific unit. The unit cannot be parsed out of the raw value
|
|
716
|
+
# in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
|
|
717
|
+
# go away once units are encoded in the schema itself.
|
|
718
|
+
# See: https://github.com/microbiomedata/nmdc-schema/issues/2517
|
|
719
|
+
if class_name in UNIT_OVERRIDES:
|
|
720
|
+
# If the class has unit overrides, check if the slot is in the overrides
|
|
721
|
+
unit_overrides = UNIT_OVERRIDES[class_name]
|
|
722
|
+
if slot_name in unit_overrides:
|
|
723
|
+
unit = unit_overrides[slot_name]
|
|
724
|
+
|
|
663
725
|
slot_definition = self.schema_view.induced_slot(slot_name, class_name)
|
|
664
726
|
if slot_definition.multivalued:
|
|
665
727
|
value_list = value
|