nmdc-runtime 2.2.1-py3-none-any.whl → 2.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/minter/config.py +18 -50
- nmdc_runtime/site/export/ncbi_xml.py +23 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +81 -30
- nmdc_runtime/site/graphs.py +39 -0
- nmdc_runtime/site/ops.py +131 -31
- nmdc_runtime/site/repair/__init__.py +0 -0
- nmdc_runtime/site/repair/database_updater.py +230 -0
- nmdc_runtime/site/repository.py +109 -9
- nmdc_runtime/site/resources.py +36 -5
- nmdc_runtime/site/translation/gold_translator.py +26 -4
- nmdc_runtime/site/translation/neon_surface_water_translator.py +128 -78
- nmdc_runtime/site/util.py +7 -2
- nmdc_runtime/util.py +143 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/METADATA +11 -3
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/RECORD +19 -17
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/minter/config.py
CHANGED

@@ -2,8 +2,9 @@ import os
 from functools import lru_cache
 from typing import List
 
-from …
+from nmdc_schema.id_helpers import get_typecode_for_future_ids
 
+from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from nmdc_runtime.api.db.mongo import get_mongo_db
 
 
@@ -12,55 +13,24 @@ def minting_service_id() -> str | None:
     return os.getenv("MINTING_SERVICE_ID")
 
 
-def extract_typecode_from_pattern(pattern: str) -> str:
-    r"""
-    Returns the typecode portion of the specified string.
-
-    >>> extract_typecode_from_pattern("foo-123-456$")  # original behavior
-    'foo'
-    >>> extract_typecode_from_pattern("(foo)-123-456$")  # returns first and only typecode
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar)-123-456$")  # returns first of 2 typecodes
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$")  # returns first of > 2 typecodes
-    'foo'
-    """
-
-    # Get the portion of the pattern preceding the first hyphen.
-    # e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
-    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
-
-    # If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
-    # e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
-    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
-        inner_pattern = typecode_sub_pattern[1:-1]
-
-        # Finally, get everything before the first `|`, if any.
-        # e.g. "apple|banana|carrot" → "apple"
-        # e.g. "apple" → "apple"
-        typecode = inner_pattern.split("|", maxsplit=1)[0]
-    else:
-        # Note: This is the original behavior, before we added support for multi-typecode patterns.
-        # e.g. "apple" → "apple"
-        typecode = typecode_sub_pattern
-
-    return typecode
-
-
 @lru_cache()
 def typecodes() -> List[dict]:
     r"""
     Returns a list of dictionaries containing typecodes and associated information derived from the schema.
 
-    …
+    Note: In this function, we rely on a helper function provided by the `nmdc-schema` package to extract—from a given
+          class's `id` slot's pattern—the typecode that the minter would use when generating an ID for an instance of
+          that class _today_; regardless of what it may have used in the past.
+
+    >>> typecode_descriptors = typecodes()
+    # Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
+    >>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
+    True
+    # Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
+    >>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    True
+    >>> any((td["name"] == "omprc" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    False
     """
     id_pattern_prefix = r"^(nmdc):"
 
@@ -69,16 +39,14 @@ def typecodes() -> List[dict]:
     for cls_name, defn in schema_dict["$defs"].items():
         match defn.get("properties"):
             case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
-                # …
-
-                index_of_first_character_following_prefix = len(id_pattern_prefix)
-                pattern_without_prefix = p[index_of_first_character_following_prefix:]
+                # Extract the typecode from the pattern.
+                typecode_for_future_ids = get_typecode_for_future_ids(slot_pattern=p)
 
                 rv.append(
                     {
                         "id": "nmdc:" + cls_name + "_" + "typecode",
                         "schema_class": "nmdc:" + cls_name,
-                        "name": …
+                        "name": typecode_for_future_ids,
                     }
                 )
             case _:
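For reference, the logic of the removed helper (take the portion of the pattern before the first hyphen; if that portion is parenthesized, take the first `|`-separated alternative) condenses to the sketch below. It reproduces only the old behavior: the replacement, `get_typecode_for_future_ids`, instead returns the typecode the minter would assign today, which for multi-typecode patterns need not be the first alternative (per the doctests above, `nmdc:MassSpectrometry` yields `dgms`, not `omprc`).

def extract_typecode(pattern: str) -> str:
    """Sketch of the removed helper's behavior (not the new nmdc-schema helper).

    >>> extract_typecode("foo-123-456$")
    'foo'
    >>> extract_typecode("(foo|bar|baz)-123-456$")
    'foo'
    """
    # Portion of the pattern preceding the first hyphen, e.g. "(foo|bar)".
    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
    # If parenthesized, the typecode is the first "|"-separated alternative.
    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
        return typecode_sub_pattern[1:-1].split("|", maxsplit=1)[0]
    return typecode_sub_pattern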
nmdc_runtime/site/export/ncbi_xml.py
CHANGED

@@ -4,7 +4,7 @@ import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 
-from typing import Any
+from typing import Any, List, Union
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     get_instruments,
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
                 )
                 # Currently, we are making the assumption that only one instrument
                 # is used to sequence a Biosample
-                …
+                instrument_used: List[str] = ntseq.get(
+                    "instrument_used", []
+                )
+                if not instrument_used:
+                    instrument_id = None
+                else:
+                    instrument_id = instrument_used[0]
+
                 instrument = all_instruments.get(instrument_id, {})
                 instrument_vendor = instrument.get("vendor", "")
                 instrument_model = instrument.get("model", "")
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
                         "Attribute", "NextSeq 550", {"name": "instrument_model"}
                    )
                )
+            elif instrument_model == "novaseq_6000":
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute",
+                        "NovaSeq 6000",
+                        {"name": "instrument_model"},
+                    )
+                )
+            elif instrument_model == "hiseq":
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute", "HiSeq", {"name": "instrument_model"}
+                    )
+                )
 
             if analyte_category == "metagenome":
                 sra_attributes.append(
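The new branches extend a growing `elif` chain keyed on the raw instrument model string. The same mapping could live in a lookup table; a sketch under the assumption that the pre-existing branch tests `instrument_model == "nextseq_550"` (only its body is visible in this diff), with the NCBI-facing labels taken verbatim from the `set_element` calls:

from typing import Optional

# Raw model key -> SRA "instrument_model" attribute value.
# The "nextseq_550" key is assumed; the diff shows only that branch's body.
INSTRUMENT_MODEL_LABELS = {
    "nextseq_550": "NextSeq 550",
    "novaseq_6000": "NovaSeq 6000",
    "hiseq": "HiSeq",
}

def instrument_model_label(instrument_model: str) -> Optional[str]:
    """Return the SRA attribute value for a raw model key, or None if unmapped."""
    return INSTRUMENT_MODEL_LABELS.get(instrument_model)

A table keeps the supported models greppable and makes adding the next sequencer a one-line change.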
nmdc_runtime/site/export/ncbi_xml_utils.py
CHANGED

@@ -1,6 +1,10 @@
 from io import BytesIO, StringIO
+from typing import Any, Dict, List, Union
+
+from nmdc_runtime.api.endpoints.util import strip_oid
 from nmdc_runtime.minter.config import typecodes
 from lxml import etree
+from pymongo.collection import Collection
 
 import csv
 import requests
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
         raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
 
 
-def fetch_data_objects_from_biosamples(…
+def fetch_data_objects_from_biosamples(
+    all_docs_collection: Collection,
+    data_object_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the data objects that are "associated" (derived from/products of)
+    with their respective biosamples by iterating over the alldocs collection recursively.
+    The methods returns a dictionary with biosample ids as keys and the associated list of
+    data objects as values.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_object_set: reference to the data_object_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated data objects as values
+    """
+    biosample_data_objects = []
+
+    def collect_data_objects(doc_ids, collected_objects, unique_ids):
+        for doc_id in doc_ids:
+            if (
+                get_classname_from_typecode(doc_id) == "DataObject"
+                and doc_id not in unique_ids
+            ):
+                data_obj = data_object_set.find_one({"id": doc_id})
+                if data_obj:
+                    collected_objects.append(strip_oid(data_obj))
+                    unique_ids.add(doc_id)
+
     biosample_data_objects = []
 
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_data_objects = []
+        unique_ids = set()
 
         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-                …
+                for doc in all_docs_collection.find({"has_input": current_id}):
+                    has_output = doc.get("has_output", [])
 
-                …
-                for output_id in has_output:
-                    if get_classname_from_typecode(output_id) == "DataObject":
-                        data_object_doc = all_docs_collection.find_one(
-                            {"id": output_id}
-                        )
-                        if data_object_doc:
-                            collected_data_objects.append(data_object_doc)
-                        else:
-                            new_current_ids.append(output_id)
+                    collect_data_objects(has_output, collected_data_objects, unique_ids)
+                    new_current_ids.extend(
+                        op
+                        for op in has_output
+                        if get_classname_from_typecode(op) != "DataObject"
+                    )
 
             current_ids = new_current_ids
 
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     return biosample_data_objects
 
 
-def fetch_nucleotide_sequencing_from_biosamples(…
+def fetch_nucleotide_sequencing_from_biosamples(
+    all_docs_collection: Collection,
+    data_generation_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the nucleotide sequencing process records that create data objects
+    for biosamples by iterating over the alldocs collection recursively.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_generation_set: reference to the data_generation_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
+        process objects as values
+    """
+    biosample_ntseq_objects = []
 
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
-        …
+        collected_ntseq_objects = []
 
         while current_ids:
             new_current_ids = []
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
 
                 for output_id in has_output:
                     if get_classname_from_typecode(output_id) == "DataObject":
-                        nucleotide_sequencing_doc = …
+                        nucleotide_sequencing_doc = data_generation_set.find_one(
                             {"id": document["id"]}
                         )
                         if nucleotide_sequencing_doc:
-                            …
+                            collected_ntseq_objects.append(
+                                strip_oid(nucleotide_sequencing_doc)
+                            )
                     else:
                         new_current_ids.append(output_id)
 
             current_ids = new_current_ids
 
-        if …
-        …
+        if collected_ntseq_objects:
+            biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
+
+    return biosample_ntseq_objects
 
-    return biosample_data_objects
 
+
+def fetch_library_preparation_from_biosamples(
+    all_docs_collection: Collection,
+    material_processing_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the library preparation process records that create processed samples,
+    which are further fed/inputted into (by `has_input` slot) a nucleotide sequencing process
+    for biosamples by iterating over the alldocs collection recursively.
 
-    …
+    :param all_docs_collection: reference to the alldocs collection
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated library preparation process
+        objects as values
+    """
     biosample_lib_prep = []
 
     for biosample in biosamples_list:
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
                     "has_input": output_id,
                     "type": {"$in": ["LibraryPreparation"]},
                 }
-                lib_prep_doc = …
+                lib_prep_doc = material_processing_set.find_one(lib_prep_query)
 
                 if lib_prep_doc:
-                    biosample_lib_prep.append({biosample_id: lib_prep_doc})
+                    biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
                     break  # Stop at the first document that meets the criteria
 
     return biosample_lib_prep
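All three fetch_* helpers share the same breadth-first walk over `alldocs`: keep a frontier of ids, find documents whose `has_input` includes an id on the frontier, harvest the `has_output` ids that resolve to the class of interest, and requeue the rest. A condensed sketch of that shared shape for the DataObject case, assuming the module's own `get_classname_from_typecode` and the imported `strip_oid` are in scope:

from typing import Any, Dict, List

from pymongo.collection import Collection

def walk_outputs_to_data_objects(
    all_docs_collection: Collection,
    data_object_set: Collection,
    start_id: str,
) -> List[Dict[str, Any]]:
    """Breadth-first walk from one biosample id to its derived DataObjects."""
    collected: List[Dict[str, Any]] = []
    seen: set = set()
    frontier = [start_id]
    while frontier:
        next_frontier = []
        for current_id in frontier:
            # Any process that consumed `current_id` may have produced outputs.
            for doc in all_docs_collection.find({"has_input": current_id}):
                for output_id in doc.get("has_output", []):
                    if get_classname_from_typecode(output_id) == "DataObject":
                        if output_id not in seen:
                            data_obj = data_object_set.find_one({"id": output_id})
                            if data_obj:
                                collected.append(strip_oid(data_obj))
                                seen.add(output_id)
                    else:
                        # Non-DataObject outputs extend the walk to the next hop.
                        next_frontier.append(output_id)
        frontier = next_frontier
    return collected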
nmdc_runtime/site/graphs.py
CHANGED

@@ -2,6 +2,7 @@ from dagster import graph
 
 from nmdc_runtime.site.ops import (
     build_merged_db,
+    generate_biosample_set_for_nmdc_study_from_gold,
     nmdc_schema_database_export_filename,
     nmdc_schema_database_from_gold_study,
     nmdc_schema_object_to_dict,
@@ -57,6 +58,9 @@ from nmdc_runtime.site.ops import (
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
+    get_database_updater_inputs,
+    post_submission_portal_biosample_ingest_record_stitching_filename,
+    generate_data_generation_set_post_biosample_ingest,
 )
 from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
@@ -117,12 +121,14 @@ def apply_changesheet():
     sheet_in = get_changesheet_in()
     outputs = perform_changesheet_updates(sheet_in)
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
 def apply_metadata_in():
     outputs = perform_mongo_updates(get_json_in())
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
@@ -465,3 +471,36 @@ def nmdc_study_to_ncbi_submission_export():
         all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)
+
+
+@graph
+def generate_data_generation_set_for_biosamples_in_nmdc_study():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_data_generation_set_post_biosample_ingest(
+        study_id, gold_nmdc_instrument_map_df
+    )
+
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
+
+
+@graph
+def generate_biosample_set_from_samples_in_gold():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_biosample_set_for_nmdc_study_from_gold(
+        study_id, gold_nmdc_instrument_map_df
+    )
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
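Both new graphs take their study id and instrument-mapping URL from op config on `get_database_updater_inputs` (its `config_schema` appears in the ops.py diff below). A hedged sketch of the run config a job built from either graph would accept; the values are placeholders, and any resource config (mongo, API clients) is omitted:

# Sketch only: keys follow the config_schema of get_database_updater_inputs;
# the study id and URL below are hypothetical placeholders.
run_config = {
    "ops": {
        "get_database_updater_inputs": {
            "config": {
                "nmdc_study_id": "nmdc:sty-11-xxxxxxxx",  # placeholder study id
                "gold_nmdc_instrument_mapping_file_url": (
                    "https://example.org/gold_instrument_map.tsv"  # placeholder URL
                ),
            }
        }
    }
}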
nmdc_runtime/site/ops.py
CHANGED

@@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
 from nmdc_runtime.site.translation.submission_portal_translator import (
     SubmissionPortalTranslator,
 )
-from nmdc_runtime.site.…
+from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
+from nmdc_runtime.site.util import (
+    run_and_log,
+    schema_collection_has_index_on_id,
+    nmdc_study_id_to_filename,
+)
 from nmdc_runtime.util import (
     drs_object_in_for,
     get_names_of_classes_in_effective_range_of_slot,
@@ -1054,10 +1059,7 @@ def materialize_alldocs(context) -> int:
     # TODO include functional_annotation_agg for "real-time" ref integrity checking.
     # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
     collection_names = populated_schema_collection_names_with_id_field(mdb)
-    context.log.info(f"{collection_names=}")
-
-    # Build alldocs
-    context.log.info("constructing `alldocs` collection")
+    context.log.info(f"constructing `alldocs` collection using {collection_names=}")
 
     document_class_names = set(
         chain.from_iterable(collection_name_to_class_names.values())
@@ -1070,7 +1072,8 @@ def materialize_alldocs(context) -> int:
         for cls_name in document_class_names
     }
 
-    # Any ancestor of a document class is a document-…
+    # Any ancestor of a document class is a document-referencable range,
+    # i.e., a valid range of a document-reference-ranged slot.
     document_referenceable_ranges = set(
         chain.from_iterable(
             schema_view.class_ancestors(cls_name) for cls_name in document_class_names
@@ -1086,17 +1089,15 @@ def materialize_alldocs(context) -> int:
     ):
         document_reference_ranged_slots[cls_name].append(slot_name)
 
-    # …
-    # …
-
-    # …
-    mdb.alldocs.drop()
+    # Build `alldocs` to a temporary collection for atomic replacement
+    # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
+    temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
+    temp_alldocs_collection = mdb[temp_alldocs_collection_name]
+    context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
 
     for coll_name in collection_names:
         context.log.info(f"{coll_name=}")
-        …
+        write_operations = []
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
             doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
@@ -1105,30 +1106,35 @@ def materialize_alldocs(context) -> int:
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
             new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-            …
-            if len(…
-                _ = …
+            write_operations.append(InsertOne(new_doc))
+            if len(write_operations) == BULK_WRITE_BATCH_SIZE:
+                _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+                write_operations.clear()
                 documents_processed_counter += BULK_WRITE_BATCH_SIZE
-        if len(…
-            _ = …
-            documents_processed_counter += len(…
+        if len(write_operations) > 0:
+            _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+            documents_processed_counter += len(write_operations)
         context.log.info(
             f"Inserted {documents_processed_counter} documents from {coll_name=} "
         )
 
     context.log.info(
-        f"…
+        f"produced `{temp_alldocs_collection.name}` collection with"
+        f" {temp_alldocs_collection.estimated_document_count()} docs."
     )
 
-    …
-    # …
-    …
+    context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
+    # Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
+    # so that `temp_alldocs_collection` will be "good to go" on renaming.
+    temp_alldocs_collection.create_index("id", unique=True)
+    # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
     slots_to_index = ["has_input", "has_output", "was_informed_by"]
-    […
+    [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")
+
+    context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
+    temp_alldocs_collection.rename("alldocs", dropTarget=True)
+
     return mdb.alldocs.estimated_document_count()
 
 
@@ -1182,8 +1188,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_object_set = mdb["data_object_set"]
     biosample_data_objects = fetch_data_objects_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_object_set, biosamples
     )
     return biosample_data_objects
 
@@ -1194,8 +1201,9 @@ def get_nucleotide_sequencing_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_generation_set = mdb["data_generation_set"]
     biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_generation_set, biosamples
     )
     return biosample_omics_processing
 
@@ -1206,8 +1214,9 @@ def get_library_preparation_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    material_processing_set = mdb["material_processing_set"]
     biosample_lib_prep = fetch_library_preparation_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, material_processing_set, biosamples
     )
     return biosample_lib_prep
 
@@ -1240,3 +1249,94 @@ def ncbi_submission_xml_from_nmdc_study(
         all_instruments,
     )
     return ncbi_xml
+
+
+@op
+def post_submission_portal_biosample_ingest_record_stitching_filename(
+    nmdc_study_id: str,
+) -> str:
+    filename = nmdc_study_id_to_filename(nmdc_study_id)
+    return f"missing_database_records_for_{filename}.json"
+
+
+@op(
+    config_schema={
+        "nmdc_study_id": str,
+        "gold_nmdc_instrument_mapping_file_url": str,
+    },
+    out={
+        "nmdc_study_id": Out(str),
+        "gold_nmdc_instrument_mapping_file_url": Out(str),
+    },
+)
+def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
+    return (
+        context.op_config["nmdc_study_id"],
+        context.op_config["gold_nmdc_instrument_mapping_file_url"],
+    )
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_data_generation_set_post_biosample_ingest(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = (
+        database_updater.generate_data_generation_set_records_from_gold_api_for_study()
+    )
+
+    return database
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_biosample_set_for_nmdc_study_from_gold(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = database_updater.generate_biosample_set_from_gold_api_for_study()
+
+    return database
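The reworked `materialize_alldocs` replaces drop-then-rebuild with build-then-rename: documents are bulk-written in unordered batches to a uniquely named temporary collection, indexes are created there while nothing reads it, and the temporary collection is renamed over `alldocs` with `dropTarget=True` so readers never observe a half-built collection. A minimal standalone pymongo sketch of that pattern; `BATCH_SIZE` stands in for the module's `BULK_WRITE_BATCH_SIZE`:

from bson import ObjectId
from pymongo import InsertOne

BATCH_SIZE = 1000  # stand-in for BULK_WRITE_BATCH_SIZE

def rebuild_collection(mdb, source_docs, target_name: str) -> int:
    """Build a collection under a temporary name, then atomically swap it in."""
    temp = mdb[f"tmp.{target_name}.{ObjectId()}"]  # unique, disposable name
    batch = []
    for doc in source_docs:
        batch.append(InsertOne(doc))
        if len(batch) == BATCH_SIZE:
            temp.bulk_write(batch, ordered=False)
            batch.clear()
    if batch:  # flush the final partial batch
        temp.bulk_write(batch, ordered=False)
    # Blocking index build, so the collection is ready the moment it is renamed.
    temp.create_index("id", unique=True)
    # renameCollection is atomic from the client's perspective; dropTarget=True
    # discards the old collection in the same step.
    temp.rename(target_name, dropTarget=True)
    return mdb[target_name].estimated_document_count()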