nmdc-runtime 2.6.0__py3-none-any.whl → 2.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/config.py +4 -0
- nmdc_runtime/mongo_util.py +90 -0
- nmdc_runtime/site/export/ncbi_xml.py +98 -27
- nmdc_runtime/site/export/ncbi_xml_utils.py +27 -25
- nmdc_runtime/site/graphs.py +42 -5
- nmdc_runtime/site/ops.py +405 -14
- nmdc_runtime/site/repair/database_updater.py +202 -1
- nmdc_runtime/site/repository.py +100 -1
- nmdc_runtime/site/resources.py +13 -0
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -0
- nmdc_runtime/site/translation/neon_soil_translator.py +1 -0
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -0
- nmdc_runtime/util.py +56 -2
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/METADATA +18 -3
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/RECORD +19 -18
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/mongo_util.py
ADDED
@@ -0,0 +1,90 @@
+from pymongo import MongoClient
+from pymongo.database import Database
+from pymongo.collection import Collection
+from typing import Any, Mapping, Optional, Type, Callable
+from pymongo.client_session import ClientSession
+import inspect
+
+
+def _wrap_with_session(obj: Any, name: str, session: Optional[ClientSession]) -> Any:
+    """
+    Wraps a callable attribute of an object to automatically include a session
+    if the callable accepts a 'session' keyword argument.
+    """
+    attr = getattr(obj, name)
+    if callable(attr):
+        signature = inspect.signature(attr)
+        parameters = signature.parameters
+        accepts_session = any(
+            param.name == "session"
+            for param in parameters.values()
+            if param.kind
+            in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
+        )
+
+        def wrapper(*args, **kwargs):
+            if session is not None and accepts_session and "session" not in kwargs:
+                kwargs["session"] = session
+            return attr(*args, **kwargs)
+
+        return wrapper
+    return attr
+
+
+class SessionBoundCollection:
+    """
+    A wrapper around pymongo.collection.Collection that automatically passes a session
+    to methods that accept it.
+    """
+
+    def __init__(self, collection: Collection, session: Optional[ClientSession] = None):
+        self._collection = collection
+        self._session = session
+
+    def __getattr__(self, name: str):
+        return _wrap_with_session(self._collection, name, self._session)
+
+    def __getitem__(self, name: str) -> "SessionBoundCollection":
+        return SessionBoundCollection(self._collection[name], self._session)
+
+
+class SessionBoundDatabase(Database):
+    """
+    A wrapper around pymongo.database.Database that automatically passes a session
+    to methods that accept it.
+    """
+
+    def __init__(self, database: Database, session: Optional[ClientSession] = None):
+        super().__init__(
+            database.client,
+            database.name,
+            database.codec_options,
+            database.read_preference,
+            database.write_concern,
+            database.read_concern,
+        )
+        self._database = database
+        self._session = session
+
+    def __getattr__(self, name: str):
+        return _wrap_with_session(self._database, name, self._session)
+
+    def __getitem__(self, name: str) -> SessionBoundCollection:
+        return SessionBoundCollection(self._database[name], self._session)
+
+    def get_collection(self, name: str, **kwargs) -> SessionBoundCollection:
+        """Get a :class:`~pymongo.collection.Collection` with the given name and options."""
+        collection = super().get_collection(name, **kwargs)
+        return SessionBoundCollection(collection, self._session)
+
+    @property
+    def client(self):
+        return self._database.client
+
+    @property
+    def unbounded(self):
+        return self._database
+
+    @property
+    def name(self):
+        return self._database.name
nmdc_runtime/site/export/ncbi_xml.py
CHANGED

@@ -27,7 +27,11 @@ class NCBISubmissionXML:
         self.nmdc_study_id = nmdc_study.get("id")
         self.nmdc_study_title = nmdc_study.get("title")
         self.nmdc_study_description = nmdc_study.get("description")
-
+        # get the first INSDC BioProject ID from the NMDC study
+        self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")[0]
+        # the value asserted in "insdc_bioproject_identifiers" will be a CURIE, so extract
+        # everything after the prefix and delimiter (":")
+        self.ncbi_bioproject_id = self.ncbi_bioproject_id.split(":")[-1]
         self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
         nmdc_study_pi_name = (
             nmdc_study.get("principal_investigator", {}).get("name").split()
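
The CURIE handling above amounts to keeping everything after the last ":" delimiter; a tiny illustrative helper (hypothetical, not in the package) shows the behavior, including the case where no prefix is present.

# Hypothetical helper mirroring the split(":")[-1] logic above.
def curie_local_id(curie: str) -> str:
    """Return the part of a CURIE after its prefix delimiter (':')."""
    return curie.split(":")[-1]


assert curie_local_id("bioproject:PRJNA1234567") == "PRJNA1234567"
assert curie_local_id("PRJNA1234567") == "PRJNA1234567"  # no prefix: value unchanged
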
@@ -251,7 +255,11 @@ class NCBISubmissionXML:
                     children=[
                         self.set_element(
                             "Title",
-
+                            attributes.get(
+                                "name",
+                                # fallback title if "name" is not present
+                                f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
+                            ),
                         ),
                     ],
                 ),
@@ -577,18 +585,45 @@ class NCBISubmissionXML:
         biosample_library_preparation_list: list,
         instruments_dict: dict,
     ):
-        data_type = None
-
+        # data_type = None
+
+        biosamples_to_exclude = set()
         for bsm_ntseq in biosample_nucleotide_sequencing_list:
-            for
+            for bsm_id, ntseq_list in bsm_ntseq.items():
+                # Check if any processing_institution is "JGI"
                 for ntseq in ntseq_list:
-                    if
-
-
-
-
-
-
+                    if (
+                        "processing_institution" in ntseq
+                        and ntseq["processing_institution"] == "JGI"
+                    ):
+                        biosamples_to_exclude.add(bsm_id)
+                        break
+
+        # Filter biosample_nucleotide_sequencing_list to exclude JGI records
+        filtered_nucleotide_sequencing_list = []
+        for bsm_ntseq in biosample_nucleotide_sequencing_list:
+            filtered_dict = {}
+            for bsm_id, ntseq_list in bsm_ntseq.items():
+                if bsm_id not in biosamples_to_exclude:
+                    filtered_dict[bsm_id] = ntseq_list
+            if filtered_dict:  # Only add non-empty dictionaries
+                filtered_nucleotide_sequencing_list.append(filtered_dict)
+
+        # Filter biosamples_list to exclude JGI-processed biosamples
+        filtered_biosamples_list = [
+            biosample
+            for biosample in biosamples_list
+            if biosample.get("id") not in biosamples_to_exclude
+        ]
+
+        # Get data_type from filtered list
+        # for bsm_ntseq in filtered_nucleotide_sequencing_list:
+        #     for _, ntseq_list in bsm_ntseq.items():
+        #         for ntseq in ntseq_list:
+        #             if "analyte_category" in ntseq:
+        #                 data_type = handle_string_value(
+        #                     ntseq["analyte_category"]
+        #                 ).capitalize()

         self.set_description(
             email=self.nmdc_pi_email,
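
The exclusion pass above can be exercised in isolation; a short sketch with toy data (all IDs are fabricated examples) reproduces the same outcome using set and list comprehensions.

# Toy data illustrating the JGI-exclusion logic above (IDs are fabricated examples).
biosample_nucleotide_sequencing_list = [
    {"nmdc:bsm-00-aaa111": [{"processing_institution": "JGI"}]},
    {"nmdc:bsm-00-bbb222": [{"processing_institution": "UCSD"}]},
]
biosamples_list = [{"id": "nmdc:bsm-00-aaa111"}, {"id": "nmdc:bsm-00-bbb222"}]

biosamples_to_exclude = {
    bsm_id
    for bsm_ntseq in biosample_nucleotide_sequencing_list
    for bsm_id, ntseq_list in bsm_ntseq.items()
    if any(ntseq.get("processing_institution") == "JGI" for ntseq in ntseq_list)
}
filtered_biosamples_list = [
    b for b in biosamples_list if b.get("id") not in biosamples_to_exclude
]

assert biosamples_to_exclude == {"nmdc:bsm-00-aaa111"}
assert filtered_biosamples_list == [{"id": "nmdc:bsm-00-bbb222"}]
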
@@ -597,29 +632,65 @@
             org=self.ncbi_submission_metadata.get("organization", ""),
         )

-        if not
-
-
-
-
-
-
-
+        # if not self.ncbi_bioproject_id:
+        #     self.set_bioproject(
+        #         title=self.nmdc_study_title,
+        #         project_id=self.ncbi_bioproject_id,
+        #         description=self.nmdc_study_description,
+        #         data_type=data_type,
+        #         org=self.ncbi_submission_metadata.get("organization", ""),
+        #     )

         self.set_biosample(
             organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
             org=self.ncbi_submission_metadata.get("organization", ""),
-            bioproject_id=
-            nmdc_biosamples=
+            bioproject_id=self.ncbi_bioproject_id,
+            nmdc_biosamples=filtered_biosamples_list,
         )

+        # Also filter biosample_data_objects_list
+        filtered_data_objects_list = []
+        acceptable_extensions = [".fastq.gz", ".fastq"]
+
+        for entry in biosample_data_objects_list:
+            filtered_entry = {}
+            for biosample_id, data_objects in entry.items():
+                if biosample_id not in biosamples_to_exclude:
+                    # filter data_objects based on acceptable/allowed extensions
+                    # for "url" key in data_object
+                    filtered_objects = []
+                    for data_object in data_objects:
+                        if "url" in data_object:
+                            url = urlparse(data_object["url"])
+                            file_path = os.path.basename(url.path)
+                            if any(
+                                file_path.endswith(ext) for ext in acceptable_extensions
+                            ):
+                                filtered_objects.append(data_object)
+
+                    if filtered_objects:
+                        filtered_entry[biosample_id] = filtered_objects
+
+            if filtered_entry:  # Only add non-empty entries
+                filtered_data_objects_list.append(filtered_entry)
+
+        # Filter library preparation list as well
+        filtered_library_preparation_list = []
+        for lib_prep_dict in biosample_library_preparation_list:
+            filtered_lib_prep = {}
+            for biosample_id, lib_prep in lib_prep_dict.items():
+                if biosample_id not in biosamples_to_exclude:
+                    filtered_lib_prep[biosample_id] = lib_prep
+            if filtered_lib_prep:  # Only add non-empty entries
+                filtered_library_preparation_list.append(filtered_lib_prep)
+
         self.set_fastq(
-            biosample_data_objects=
-            bioproject_id=
+            biosample_data_objects=filtered_data_objects_list,
+            bioproject_id=self.ncbi_bioproject_id,
             org=self.ncbi_submission_metadata.get("organization", ""),
-            nmdc_nucleotide_sequencing=
-            nmdc_biosamples=
-            nmdc_library_preparation=
+            nmdc_nucleotide_sequencing=filtered_nucleotide_sequencing_list,
+            nmdc_biosamples=filtered_biosamples_list,
+            nmdc_library_preparation=filtered_library_preparation_list,
             all_instruments=instruments_dict,
         )

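
The URL filter above looks only at the path component of each data object URL, so query strings do not affect the extension check; a standalone sketch (the URL is an illustrative placeholder):

# Standalone sketch of the extension check above (the URL is an illustrative placeholder).
import os
from urllib.parse import urlparse

acceptable_extensions = [".fastq.gz", ".fastq"]
url = "https://data.example.org/downloads/sample_R1.fastq.gz?token=abc"

file_path = os.path.basename(urlparse(url).path)  # query string is ignored
assert file_path == "sample_R1.fastq.gz"
assert any(file_path.endswith(ext) for ext in acceptable_extensions)
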
nmdc_runtime/site/export/ncbi_xml_utils.py
CHANGED

@@ -99,31 +99,36 @@ def fetch_nucleotide_sequencing_from_biosamples(
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_ntseq_objects = []
+        processed_ids = set()  # Track already processed nucleotide sequencing IDs

         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-
-                document
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Find all documents with current_id as input instead of just one
+                for document in all_docs_collection.find({"has_input": current_id}):
+                    has_output = document.get("has_output")
+                    if not has_output:
+                        continue
+
+                    for output_id in has_output:
+                        if get_classname_from_typecode(output_id) == "DataObject":
+                            # Only process if we haven't seen this document ID before
+                            if document["id"] not in processed_ids:
+                                nucleotide_sequencing_doc = (
+                                    data_generation_set.find_one(
+                                        {
+                                            "id": document["id"],
+                                            "type": "nmdc:NucleotideSequencing",
+                                        }
+                                    )
+                                )
+                                if nucleotide_sequencing_doc:
+                                    collected_ntseq_objects.append(
+                                        strip_oid(nucleotide_sequencing_doc)
+                                    )
+                                processed_ids.add(document["id"])
+                        else:
+                            new_current_ids.append(output_id)

         current_ids = new_current_ids

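
The revised traversal walks has_input → has_output edges level by level and de-duplicates on the data-generation document id. A minimal in-memory sketch of the same idea follows; the documents and the DataObject test are hypothetical stand-ins for the Mongo queries used in the real code.

# In-memory sketch of the has_input/has_output walk with de-duplication
# (documents and is_data_object are hypothetical stand-ins for the Mongo queries).
docs = [
    {"id": "nmdc:dgns-00-1", "has_input": ["nmdc:bsm-00-1"], "has_output": ["nmdc:dobj-00-1"]},
    {"id": "nmdc:dgns-00-1", "has_input": ["nmdc:bsm-00-1"], "has_output": ["nmdc:dobj-00-2"]},
]


def is_data_object(identifier: str) -> bool:
    return identifier.startswith("nmdc:dobj-")


current_ids, processed_ids, collected = ["nmdc:bsm-00-1"], set(), []
while current_ids:
    new_current_ids = []
    for current_id in current_ids:
        for doc in (d for d in docs if current_id in d.get("has_input", [])):
            for output_id in doc.get("has_output", []):
                if is_data_object(output_id):
                    if doc["id"] not in processed_ids:
                        collected.append(doc)
                        processed_ids.add(doc["id"])
                else:
                    new_current_ids.append(output_id)
    current_ids = new_current_ids

assert len(collected) == 1  # the duplicate document id is only collected once
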
@@ -187,10 +192,7 @@ def handle_quantity_value(slot_value)
         and "has_minimum_numeric_value" in slot_value
         and "has_unit" in slot_value
     ):
-        range_value =
-            slot_value["has_maximum_numeric_value"]
-            - slot_value["has_minimum_numeric_value"]
-        )
+        range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
         return f"{range_value} {slot_value['has_unit']}"
     elif "has_raw_value" in slot_value:
         return slot_value["has_raw_value"]
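
After this change the min/max pair is rendered as a textual range rather than a numeric difference; for example (values are made up):

# Illustrative input for the range branch above (values are made up).
slot_value = {
    "has_minimum_numeric_value": 0.5,
    "has_maximum_numeric_value": 2.0,
    "has_unit": "mg/L",
}

range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
assert f"{range_value} {slot_value['has_unit']}" == "0.5 - 2.0 mg/L"
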
nmdc_runtime/site/graphs.py
CHANGED
@@ -1,4 +1,4 @@
-from dagster import graph
+from dagster import graph, GraphIn

 from nmdc_runtime.site.ops import (
     build_merged_db,
@@ -22,6 +22,7 @@ from nmdc_runtime.site.ops import (
     filter_ops_done_object_puts,
     hello,
     mongo_stats,
+    run_script_to_update_insdc_biosample_identifiers,
     submit_metadata_to_db,
     filter_ops_undone_expired,
     construct_jobs,
@@ -50,6 +51,7 @@ from nmdc_runtime.site.ops import (
     get_df_from_url,
     site_code_mapping,
     materialize_alldocs,
+    load_ontology,
     get_ncbi_export_pipeline_study,
     get_data_objects_from_biosamples,
     get_nucleotide_sequencing_from_biosamples,
@@ -58,6 +60,7 @@ from nmdc_runtime.site.ops import (
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
+    render_text,
     get_database_updater_inputs,
     post_submission_portal_biosample_ingest_record_stitching_filename,
     generate_data_generation_set_post_biosample_ingest,
@@ -112,6 +115,16 @@ def ensure_alldocs():
     materialize_alldocs()


+@graph
+def run_ontology_load():
+    """
+    A graph for loading ontologies.
+    The source_ontology parameter is provided by the job configuration
+    and passed to the load_ontology op.
+    """
+    load_ontology()
+
+
 @graph
 def ensure_jobs():
     jobs = construct_jobs()
@@ -120,17 +133,24 @@ def ensure_jobs():

 @graph
 def apply_changesheet():
+    # Note: We use `_` as a "placeholder" variable.
+    # It's a variable to whose value we assign no significance. In this case, we use it to
+    # tell Dagster that one op depends upon the output of the other (so Dagster runs them
+    # in that order), without implying to maintainers that its value is significant to us.
+    # Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
+    # Reference (`_` variables): https://stackoverflow.com/a/47599668
     sheet_in = get_changesheet_in()
     outputs = perform_changesheet_updates(sheet_in)
-    add_output_run_event(outputs)
-    materialize_alldocs()
+    _ = add_output_run_event(outputs)
+    materialize_alldocs(waits_for=_)


 @graph
 def apply_metadata_in():
+    # Note: We use `_` as a "placeholder" variable.
     outputs = perform_mongo_updates(get_json_in())
-    add_output_run_event(outputs)
-    materialize_alldocs()
+    _ = add_output_run_event(outputs)
+    materialize_alldocs(waits_for=_)


 @graph
@@ -515,3 +535,20 @@ def generate_biosample_set_from_samples_in_gold():
     )
     outputs = export_json_to_drs(database_dict, filename)
     add_output_run_event(outputs)
+
+
+@graph
+def generate_update_script_for_insdc_biosample_identifiers():
+    """Generate a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.
+
+    This graph fetches the necessary inputs, then calls the run_script_to_update_insdc_biosample_identifiers op
+    to generate a script for updating biosample records with INSDC identifiers obtained from GOLD.
+    The script is returned as a dictionary that can be executed against MongoDB.
+    """
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    update_script = run_script_to_update_insdc_biosample_identifiers(
+        study_id, gold_nmdc_instrument_map_df
+    )
+    render_text(update_script)