nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/export/ncbi_xml_utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from io import BytesIO, StringIO
|
|
2
|
-
from typing import Any, Dict, List
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
3
|
|
|
4
4
|
from nmdc_runtime.api.endpoints.util import strip_oid
|
|
5
5
|
from nmdc_runtime.minter.config import typecodes
|
|
@@ -99,31 +99,36 @@ def fetch_nucleotide_sequencing_from_biosamples(
|
|
|
99
99
|
for biosample in biosamples_list:
|
|
100
100
|
current_ids = [biosample["id"]]
|
|
101
101
|
collected_ntseq_objects = []
|
|
102
|
+
processed_ids = set() # Track already processed nucleotide sequencing IDs
|
|
102
103
|
|
|
103
104
|
while current_ids:
|
|
104
105
|
new_current_ids = []
|
|
105
106
|
for current_id in current_ids:
|
|
106
|
-
|
|
107
|
-
document
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
107
|
+
# Find all documents with current_id as input instead of just one
|
|
108
|
+
for document in all_docs_collection.find({"has_input": current_id}):
|
|
109
|
+
has_output = document.get("has_output")
|
|
110
|
+
if not has_output:
|
|
111
|
+
continue
|
|
112
|
+
|
|
113
|
+
for output_id in has_output:
|
|
114
|
+
if get_classname_from_typecode(output_id) == "DataObject":
|
|
115
|
+
# Only process if we haven't seen this document ID before
|
|
116
|
+
if document["id"] not in processed_ids:
|
|
117
|
+
nucleotide_sequencing_doc = (
|
|
118
|
+
data_generation_set.find_one(
|
|
119
|
+
{
|
|
120
|
+
"id": document["id"],
|
|
121
|
+
"type": "nmdc:NucleotideSequencing",
|
|
122
|
+
}
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
if nucleotide_sequencing_doc:
|
|
126
|
+
collected_ntseq_objects.append(
|
|
127
|
+
strip_oid(nucleotide_sequencing_doc)
|
|
128
|
+
)
|
|
129
|
+
processed_ids.add(document["id"])
|
|
130
|
+
else:
|
|
131
|
+
new_current_ids.append(output_id)
|
|
127
132
|
|
|
128
133
|
current_ids = new_current_ids
|
|
129
134
|
|
|
@@ -187,10 +192,7 @@ def handle_quantity_value(slot_value):
|
|
|
187
192
|
and "has_minimum_numeric_value" in slot_value
|
|
188
193
|
and "has_unit" in slot_value
|
|
189
194
|
):
|
|
190
|
-
range_value =
|
|
191
|
-
slot_value["has_maximum_numeric_value"]
|
|
192
|
-
- slot_value["has_minimum_numeric_value"]
|
|
193
|
-
)
|
|
195
|
+
range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
|
|
194
196
|
return f"{range_value} {slot_value['has_unit']}"
|
|
195
197
|
elif "has_raw_value" in slot_value:
|
|
196
198
|
return slot_value["has_raw_value"]
|
|
@@ -273,6 +275,120 @@ def load_mappings(url):
|
|
|
273
275
|
return attribute_mappings, slot_range_mappings
|
|
274
276
|
|
|
275
277
|
|
|
278
|
+
def check_pooling_for_biosamples(
|
|
279
|
+
material_processing_set: Collection, biosamples_list: List[Dict[str, Any]]
|
|
280
|
+
) -> Dict[str, Dict[str, Any]]:
|
|
281
|
+
"""Check which biosamples are part of pooling processes and return pooling information.
|
|
282
|
+
|
|
283
|
+
The way in which we check if a biosample is part of a Pooling process is by checking if
|
|
284
|
+
the biosample id has been asserted on the `has_input` slot/key of an `nmdc:Pooling` process
|
|
285
|
+
instance.
|
|
286
|
+
|
|
287
|
+
:param material_processing_set: reference to the material_processing_set collection
|
|
288
|
+
:param biosamples_list: list of all biosamples to check
|
|
289
|
+
:return: dictionary mapping biosample_id to pooling information (empty dict if not pooled)
|
|
290
|
+
"""
|
|
291
|
+
result = {}
|
|
292
|
+
# get list of all biosample IDs that are part of a given study
|
|
293
|
+
biosample_lookup = {bs["id"]: bs for bs in biosamples_list}
|
|
294
|
+
|
|
295
|
+
# get list of all pooling processes
|
|
296
|
+
pooling_processes = list(material_processing_set.find({"type": "nmdc:Pooling"}))
|
|
297
|
+
|
|
298
|
+
# initialize all biosamples as not pooled
|
|
299
|
+
for biosample in biosamples_list:
|
|
300
|
+
result[biosample["id"]] = {}
|
|
301
|
+
|
|
302
|
+
# process each pooling process
|
|
303
|
+
for pooling_process in pooling_processes:
|
|
304
|
+
pooled_biosample_ids = pooling_process.get("has_input", [])
|
|
305
|
+
|
|
306
|
+
# get the processed sample output from the pooling process
|
|
307
|
+
has_output = pooling_process.get("has_output", [])
|
|
308
|
+
processed_sample_id = None
|
|
309
|
+
|
|
310
|
+
for output_id in has_output:
|
|
311
|
+
if get_classname_from_typecode(output_id) == "ProcessedSample":
|
|
312
|
+
processed_sample_id = output_id
|
|
313
|
+
break
|
|
314
|
+
|
|
315
|
+
# aggregate the values on `collection_date` and `depth` slots
|
|
316
|
+
# here, we are collecting the `collection_date` and `depth` values
|
|
317
|
+
# asserted on each of the biosamples that are part of a given pooling
|
|
318
|
+
# process in the following way:
|
|
319
|
+
# example of aggregated `collection_date`: 2017-06-05T16:50Z/2017-06-05T17:47Z
|
|
320
|
+
# example of aggregated `depth`: 0-10 m
|
|
321
|
+
collection_dates = []
|
|
322
|
+
depths = []
|
|
323
|
+
|
|
324
|
+
for bs_id in pooled_biosample_ids:
|
|
325
|
+
biosample = biosample_lookup.get(bs_id)
|
|
326
|
+
if not biosample:
|
|
327
|
+
continue
|
|
328
|
+
|
|
329
|
+
if "collection_date" in biosample:
|
|
330
|
+
collection_date = biosample["collection_date"]
|
|
331
|
+
if (
|
|
332
|
+
isinstance(collection_date, dict)
|
|
333
|
+
and "has_raw_value" in collection_date
|
|
334
|
+
):
|
|
335
|
+
collection_dates.append(collection_date["has_raw_value"])
|
|
336
|
+
elif isinstance(collection_date, str):
|
|
337
|
+
collection_dates.append(collection_date)
|
|
338
|
+
|
|
339
|
+
if "depth" in biosample:
|
|
340
|
+
depth = biosample["depth"]
|
|
341
|
+
if isinstance(depth, dict):
|
|
342
|
+
if "has_numeric_value" in depth:
|
|
343
|
+
depths.append(depth["has_numeric_value"])
|
|
344
|
+
elif (
|
|
345
|
+
"has_minimum_numeric_value" in depth
|
|
346
|
+
and "has_maximum_numeric_value" in depth
|
|
347
|
+
):
|
|
348
|
+
depths.extend(
|
|
349
|
+
[
|
|
350
|
+
depth["has_minimum_numeric_value"],
|
|
351
|
+
depth["has_maximum_numeric_value"],
|
|
352
|
+
]
|
|
353
|
+
)
|
|
354
|
+
elif isinstance(depth, (int, float)):
|
|
355
|
+
depths.append(depth)
|
|
356
|
+
|
|
357
|
+
# create aggregated (forward slash separated) value for `collection_date`
|
|
358
|
+
aggregated_collection_date = None
|
|
359
|
+
if collection_dates:
|
|
360
|
+
sorted_dates = sorted(collection_dates)
|
|
361
|
+
if len(sorted_dates) > 1:
|
|
362
|
+
aggregated_collection_date = f"{sorted_dates[0]}/{sorted_dates[-1]}"
|
|
363
|
+
else:
|
|
364
|
+
aggregated_collection_date = sorted_dates[0]
|
|
365
|
+
|
|
366
|
+
# create aggregated (hyphen separated) value for `depth`
|
|
367
|
+
aggregated_depth = None
|
|
368
|
+
if depths:
|
|
369
|
+
min_depth = min(depths)
|
|
370
|
+
max_depth = max(depths)
|
|
371
|
+
if min_depth != max_depth:
|
|
372
|
+
aggregated_depth = f"{min_depth}-{max_depth} m"
|
|
373
|
+
else:
|
|
374
|
+
aggregated_depth = f"{min_depth} m"
|
|
375
|
+
|
|
376
|
+
# update all biosamples that are part of this pooling process
|
|
377
|
+
pooling_info = {
|
|
378
|
+
"processed_sample_id": processed_sample_id,
|
|
379
|
+
"pooling_process_id": pooling_process.get("id"),
|
|
380
|
+
"pooled_biosample_ids": pooled_biosample_ids,
|
|
381
|
+
"aggregated_collection_date": aggregated_collection_date,
|
|
382
|
+
"aggregated_depth": aggregated_depth,
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
for bs_id in pooled_biosample_ids:
|
|
386
|
+
if bs_id in result:
|
|
387
|
+
result[bs_id] = pooling_info
|
|
388
|
+
|
|
389
|
+
return result
|
|
390
|
+
|
|
391
|
+
|
|
276
392
|
def validate_xml(xml, xsd_url):
|
|
277
393
|
response = requests.get(xsd_url)
|
|
278
394
|
response.raise_for_status()
|
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from dagster import graph
|
|
2
2
|
|
|
3
3
|
from nmdc_runtime.site.ops import (
|
|
4
|
-
build_merged_db,
|
|
5
4
|
generate_biosample_set_for_nmdc_study_from_gold,
|
|
6
5
|
nmdc_schema_database_export_filename,
|
|
7
6
|
nmdc_schema_database_from_gold_study,
|
|
@@ -12,8 +11,6 @@ from nmdc_runtime.site.ops import (
|
|
|
12
11
|
gold_projects_by_study,
|
|
13
12
|
gold_study,
|
|
14
13
|
poll_for_run_completion,
|
|
15
|
-
run_etl,
|
|
16
|
-
local_file_to_api_object,
|
|
17
14
|
get_operation,
|
|
18
15
|
produce_curated_db,
|
|
19
16
|
delete_operations,
|
|
@@ -22,6 +19,7 @@ from nmdc_runtime.site.ops import (
|
|
|
22
19
|
filter_ops_done_object_puts,
|
|
23
20
|
hello,
|
|
24
21
|
mongo_stats,
|
|
22
|
+
run_script_to_update_insdc_biosample_identifiers,
|
|
25
23
|
submit_metadata_to_db,
|
|
26
24
|
filter_ops_undone_expired,
|
|
27
25
|
construct_jobs,
|
|
@@ -50,41 +48,27 @@ from nmdc_runtime.site.ops import (
|
|
|
50
48
|
get_df_from_url,
|
|
51
49
|
site_code_mapping,
|
|
52
50
|
materialize_alldocs,
|
|
51
|
+
load_ontology,
|
|
53
52
|
get_ncbi_export_pipeline_study,
|
|
54
53
|
get_data_objects_from_biosamples,
|
|
55
54
|
get_nucleotide_sequencing_from_biosamples,
|
|
56
55
|
get_library_preparation_from_biosamples,
|
|
56
|
+
get_aggregated_pooled_biosamples,
|
|
57
57
|
get_all_instruments,
|
|
58
58
|
get_ncbi_export_pipeline_inputs,
|
|
59
59
|
ncbi_submission_xml_from_nmdc_study,
|
|
60
60
|
ncbi_submission_xml_asset,
|
|
61
|
+
render_text,
|
|
61
62
|
get_database_updater_inputs,
|
|
62
63
|
post_submission_portal_biosample_ingest_record_stitching_filename,
|
|
63
64
|
generate_data_generation_set_post_biosample_ingest,
|
|
64
65
|
get_instrument_ids_by_model,
|
|
65
66
|
log_database_ids,
|
|
67
|
+
add_public_image_urls,
|
|
66
68
|
)
|
|
67
69
|
from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
|
|
68
70
|
|
|
69
71
|
|
|
70
|
-
@graph
|
|
71
|
-
def gold_translation():
|
|
72
|
-
"""
|
|
73
|
-
Translating an export of the JGI GOLD [1] SQL database to the NMDC database JSON schema.
|
|
74
|
-
|
|
75
|
-
[1] Genomes OnLine Database (GOLD) <https://gold.jgi.doe.gov/>.
|
|
76
|
-
"""
|
|
77
|
-
local_file_to_api_object(run_etl(build_merged_db()))
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
@graph()
|
|
81
|
-
def gold_translation_curation():
|
|
82
|
-
# TODO
|
|
83
|
-
# - have produce_curated_db do actual curation (see notebook), persisting to db.
|
|
84
|
-
# - more steps in pipeline? Or handoff via run_status_sensor on DagsterRunStatus.SUCCESS.
|
|
85
|
-
produce_curated_db(get_operation())
|
|
86
|
-
|
|
87
|
-
|
|
88
72
|
@graph()
|
|
89
73
|
def create_objects_from_site_object_puts():
|
|
90
74
|
delete_operations(
|
|
@@ -112,6 +96,16 @@ def ensure_alldocs():
|
|
|
112
96
|
materialize_alldocs()
|
|
113
97
|
|
|
114
98
|
|
|
99
|
+
@graph
|
|
100
|
+
def run_ontology_load():
|
|
101
|
+
"""
|
|
102
|
+
A graph for loading ontologies.
|
|
103
|
+
The source_ontology parameter is provided by the job configuration
|
|
104
|
+
and passed to the load_ontology op.
|
|
105
|
+
"""
|
|
106
|
+
load_ontology()
|
|
107
|
+
|
|
108
|
+
|
|
115
109
|
@graph
|
|
116
110
|
def ensure_jobs():
|
|
117
111
|
jobs = construct_jobs()
|
|
@@ -120,17 +114,24 @@ def ensure_jobs():
|
|
|
120
114
|
|
|
121
115
|
@graph
|
|
122
116
|
def apply_changesheet():
|
|
117
|
+
# Note: We use `_` as a "placeholder" variable.
|
|
118
|
+
# It's a variable to whose value we assign no significance. In this case, we use it to
|
|
119
|
+
# tell Dagster that one op depends upon the output of the other (so Dagster runs them
|
|
120
|
+
# in that order), without implying to maintainers that its value is significant to us.
|
|
121
|
+
# Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
|
|
122
|
+
# Reference (`_` variables): https://stackoverflow.com/a/47599668
|
|
123
123
|
sheet_in = get_changesheet_in()
|
|
124
124
|
outputs = perform_changesheet_updates(sheet_in)
|
|
125
|
-
add_output_run_event(outputs)
|
|
126
|
-
materialize_alldocs()
|
|
125
|
+
_ = add_output_run_event(outputs)
|
|
126
|
+
materialize_alldocs(waits_for=_)
|
|
127
127
|
|
|
128
128
|
|
|
129
129
|
@graph
|
|
130
130
|
def apply_metadata_in():
|
|
131
|
+
# Note: We use `_` as a "placeholder" variable.
|
|
131
132
|
outputs = perform_mongo_updates(get_json_in())
|
|
132
|
-
add_output_run_event(outputs)
|
|
133
|
-
materialize_alldocs()
|
|
133
|
+
_ = add_output_run_event(outputs)
|
|
134
|
+
materialize_alldocs(waits_for=_)
|
|
134
135
|
|
|
135
136
|
|
|
136
137
|
@graph
|
|
@@ -140,6 +141,7 @@ def gold_study_to_database():
|
|
|
140
141
|
study_type,
|
|
141
142
|
gold_nmdc_instrument_mapping_file_url,
|
|
142
143
|
include_field_site_info,
|
|
144
|
+
enable_biosample_filtering,
|
|
143
145
|
) = get_gold_study_pipeline_inputs()
|
|
144
146
|
|
|
145
147
|
projects = gold_projects_by_study(study_id)
|
|
@@ -156,6 +158,7 @@ def gold_study_to_database():
|
|
|
156
158
|
analysis_projects,
|
|
157
159
|
gold_nmdc_instrument_map_df,
|
|
158
160
|
include_field_site_info,
|
|
161
|
+
enable_biosample_filtering,
|
|
159
162
|
)
|
|
160
163
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
161
164
|
filename = nmdc_schema_database_export_filename(study)
|
|
@@ -172,6 +175,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
172
175
|
data_object_mapping_file_url,
|
|
173
176
|
biosample_extras_file_url,
|
|
174
177
|
biosample_extras_slot_mapping_file_url,
|
|
178
|
+
study_id,
|
|
175
179
|
) = get_submission_portal_pipeline_inputs()
|
|
176
180
|
|
|
177
181
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
@@ -192,6 +196,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
192
196
|
biosample_extras=biosample_extras,
|
|
193
197
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
194
198
|
instrument_mapping=instrument_mapping,
|
|
199
|
+
study_id=study_id,
|
|
195
200
|
)
|
|
196
201
|
|
|
197
202
|
validate_metadata(database)
|
|
@@ -212,6 +217,7 @@ def ingest_metadata_submission():
|
|
|
212
217
|
data_object_mapping_file_url,
|
|
213
218
|
biosample_extras_file_url,
|
|
214
219
|
biosample_extras_slot_mapping_file_url,
|
|
220
|
+
study_id,
|
|
215
221
|
) = get_submission_portal_pipeline_inputs()
|
|
216
222
|
|
|
217
223
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
@@ -232,7 +238,9 @@ def ingest_metadata_submission():
|
|
|
232
238
|
biosample_extras=biosample_extras,
|
|
233
239
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
234
240
|
instrument_mapping=instrument_mapping,
|
|
241
|
+
study_id=study_id,
|
|
235
242
|
)
|
|
243
|
+
database = add_public_image_urls(database, submission_id)
|
|
236
244
|
|
|
237
245
|
log_database_ids(database)
|
|
238
246
|
|
|
@@ -471,6 +479,7 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
471
479
|
)
|
|
472
480
|
data_object_records = get_data_objects_from_biosamples(biosamples)
|
|
473
481
|
library_preparation_records = get_library_preparation_from_biosamples(biosamples)
|
|
482
|
+
pooled_biosamples_data = get_aggregated_pooled_biosamples(biosamples)
|
|
474
483
|
all_instruments = get_all_instruments()
|
|
475
484
|
xml_data = ncbi_submission_xml_from_nmdc_study(
|
|
476
485
|
nmdc_study,
|
|
@@ -480,17 +489,26 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
480
489
|
data_object_records,
|
|
481
490
|
library_preparation_records,
|
|
482
491
|
all_instruments,
|
|
492
|
+
pooled_biosamples_data,
|
|
483
493
|
)
|
|
484
494
|
ncbi_submission_xml_asset(xml_data)
|
|
485
495
|
|
|
486
496
|
|
|
487
497
|
@graph
|
|
488
498
|
def generate_data_generation_set_for_biosamples_in_nmdc_study():
|
|
489
|
-
(
|
|
499
|
+
(
|
|
500
|
+
study_id,
|
|
501
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
502
|
+
include_field_site_info,
|
|
503
|
+
enable_biosample_filtering,
|
|
504
|
+
) = get_database_updater_inputs()
|
|
490
505
|
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
491
506
|
|
|
492
507
|
database = generate_data_generation_set_post_biosample_ingest(
|
|
493
|
-
study_id,
|
|
508
|
+
study_id,
|
|
509
|
+
gold_nmdc_instrument_map_df,
|
|
510
|
+
include_field_site_info,
|
|
511
|
+
enable_biosample_filtering,
|
|
494
512
|
)
|
|
495
513
|
|
|
496
514
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
@@ -503,11 +521,19 @@ def generate_data_generation_set_for_biosamples_in_nmdc_study():
|
|
|
503
521
|
|
|
504
522
|
@graph
|
|
505
523
|
def generate_biosample_set_from_samples_in_gold():
|
|
506
|
-
(
|
|
524
|
+
(
|
|
525
|
+
study_id,
|
|
526
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
527
|
+
include_field_site_info,
|
|
528
|
+
enable_biosample_filtering,
|
|
529
|
+
) = get_database_updater_inputs()
|
|
507
530
|
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
508
531
|
|
|
509
532
|
database = generate_biosample_set_for_nmdc_study_from_gold(
|
|
510
|
-
study_id,
|
|
533
|
+
study_id,
|
|
534
|
+
gold_nmdc_instrument_map_df,
|
|
535
|
+
include_field_site_info,
|
|
536
|
+
enable_biosample_filtering,
|
|
511
537
|
)
|
|
512
538
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
513
539
|
filename = post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
@@ -515,3 +541,28 @@ def generate_biosample_set_from_samples_in_gold():
|
|
|
515
541
|
)
|
|
516
542
|
outputs = export_json_to_drs(database_dict, filename)
|
|
517
543
|
add_output_run_event(outputs)
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
@graph
|
|
547
|
+
def generate_update_script_for_insdc_biosample_identifiers():
|
|
548
|
+
"""Generate a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.
|
|
549
|
+
|
|
550
|
+
This graph fetches the necessary inputs, then calls the run_script_to_update_insdc_biosample_identifiers op
|
|
551
|
+
to generate a script for updating biosample records with INSDC identifiers obtained from GOLD.
|
|
552
|
+
The script is returned as a dictionary that can be executed against MongoDB.
|
|
553
|
+
"""
|
|
554
|
+
(
|
|
555
|
+
study_id,
|
|
556
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
557
|
+
include_field_site_info,
|
|
558
|
+
enable_biosample_filtering,
|
|
559
|
+
) = get_database_updater_inputs()
|
|
560
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
561
|
+
|
|
562
|
+
update_script = run_script_to_update_insdc_biosample_identifiers(
|
|
563
|
+
study_id,
|
|
564
|
+
gold_nmdc_instrument_map_df,
|
|
565
|
+
include_field_site_info,
|
|
566
|
+
enable_biosample_filtering,
|
|
567
|
+
)
|
|
568
|
+
render_text(update_script)
|