nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/site/export/ncbi_xml_utils.py

@@ -0,0 +1,405 @@
+from io import BytesIO, StringIO
+from typing import Any, Dict, List
+
+from nmdc_runtime.api.endpoints.util import strip_oid
+from nmdc_runtime.minter.config import typecodes
+from lxml import etree
+from pymongo.collection import Collection
+
+import csv
+import requests
+
+
+def _build_class_map(class_map_data):
+    return {
+        entry["name"]: entry["schema_class"].split(":")[1] for entry in class_map_data
+    }
+
+
+def get_classname_from_typecode(doc_id):
+    class_map_data = typecodes()
+    class_map = _build_class_map(class_map_data)
+
+    typecode = doc_id.split(":")[1].split("-")[0]
+    return class_map.get(typecode)
+
+
+def fetch_data_objects_from_biosamples(
+    all_docs_collection: Collection,
+    data_object_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the data objects that are "associated" (derived from/products of)
+    with their respective biosamples by iterating over the alldocs collection recursively.
+    The method returns a list of dictionaries, each with a biosample id as key and the
+    associated list of data objects as value.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_object_set: reference to the data_object_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated data objects as values
+    """
+    biosample_data_objects = []
+
+    def collect_data_objects(doc_ids, collected_objects, unique_ids):
+        for doc_id in doc_ids:
+            if (
+                get_classname_from_typecode(doc_id) == "DataObject"
+                and doc_id not in unique_ids
+            ):
+                data_obj = data_object_set.find_one({"id": doc_id})
+                if data_obj:
+                    collected_objects.append(strip_oid(data_obj))
+                    unique_ids.add(doc_id)
+
+    biosample_data_objects = []
+
+    for biosample in biosamples_list:
+        current_ids = [biosample["id"]]
+        collected_data_objects = []
+        unique_ids = set()
+
+        while current_ids:
+            new_current_ids = []
+            for current_id in current_ids:
+                for doc in all_docs_collection.find({"has_input": current_id}):
+                    has_output = doc.get("has_output", [])
+
+                    collect_data_objects(has_output, collected_data_objects, unique_ids)
+                    new_current_ids.extend(
+                        op
+                        for op in has_output
+                        if get_classname_from_typecode(op) != "DataObject"
+                    )
+
+            current_ids = new_current_ids
+
+        if collected_data_objects:
+            biosample_data_objects.append({biosample["id"]: collected_data_objects})
+
+    return biosample_data_objects
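For orientation, a minimal sketch of how this traversal helper might be called against a local MongoDB. The connection string, database name, and study id are illustrative placeholders, and the import path assumes this hunk is `nmdc_runtime/site/export/ncbi_xml_utils.py` (the only new file in this release with exactly +405 lines):

```python
# Minimal sketch only -- the Mongo URI, database name, and study id below are
# placeholders, not values from this diff.
from pymongo import MongoClient

from nmdc_runtime.site.export.ncbi_xml_utils import (
    fetch_data_objects_from_biosamples,
)

db = MongoClient("mongodb://localhost:27017")["nmdc"]

# Biosamples belonging to one study (same filter slot that the
# get_biosamples_by_study_id op uses further down in this diff).
biosamples = list(
    db["biosample_set"].find({"associated_studies": "nmdc:sty-11-example"})
)

# Walk alldocs from each biosample along has_input -> has_output edges,
# collecting every reachable DataObject.
per_biosample = fetch_data_objects_from_biosamples(
    db["alldocs"], db["data_object_set"], biosamples
)
for entry in per_biosample:
    for biosample_id, data_objects in entry.items():
        print(biosample_id, len(data_objects))
```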
+
+
+def fetch_nucleotide_sequencing_from_biosamples(
+    all_docs_collection: Collection,
+    data_generation_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the nucleotide sequencing process records that create data objects
+    for biosamples by iterating over the alldocs collection recursively.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_generation_set: reference to the data_generation_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
+        process objects as values
+    """
+    biosample_ntseq_objects = []
+
+    for biosample in biosamples_list:
+        current_ids = [biosample["id"]]
+        collected_ntseq_objects = []
+        processed_ids = set()  # Track already processed nucleotide sequencing IDs
+
+        while current_ids:
+            new_current_ids = []
+            for current_id in current_ids:
+                # Find all documents with current_id as input instead of just one
+                for document in all_docs_collection.find({"has_input": current_id}):
+                    has_output = document.get("has_output")
+                    if not has_output:
+                        continue
+
+                    for output_id in has_output:
+                        if get_classname_from_typecode(output_id) == "DataObject":
+                            # Only process if we haven't seen this document ID before
+                            if document["id"] not in processed_ids:
+                                nucleotide_sequencing_doc = (
+                                    data_generation_set.find_one(
+                                        {
+                                            "id": document["id"],
+                                            "type": "nmdc:NucleotideSequencing",
+                                        }
+                                    )
+                                )
+                                if nucleotide_sequencing_doc:
+                                    collected_ntseq_objects.append(
+                                        strip_oid(nucleotide_sequencing_doc)
+                                    )
+                                processed_ids.add(document["id"])
+                        else:
+                            new_current_ids.append(output_id)
+
+            current_ids = new_current_ids
+
+        if collected_ntseq_objects:
+            biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
+
+    return biosample_ntseq_objects
+
+
+def fetch_library_preparation_from_biosamples(
+    all_docs_collection: Collection,
+    material_processing_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the library preparation process records that create processed samples,
+    which are in turn fed (via the `has_input` slot) into a nucleotide sequencing process,
+    for biosamples by iterating over the alldocs collection recursively.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated library preparation
+        process objects as values
+    """
+    biosample_lib_prep = []
+
+    for biosample in biosamples_list:
+        biosample_id = biosample["id"]
+
+        # Step 1: Find any document with biosample id as has_input
+        initial_query = {"has_input": biosample_id}
+        initial_document = all_docs_collection.find_one(initial_query)
+
+        if not initial_document:
+            continue
+
+        initial_output = initial_document.get("has_output")
+        if not initial_output:
+            continue
+
+        # Step 2: Use has_output to find the library preparation document
+        for output_id in initial_output:
+            lib_prep_query = {
+                "has_input": output_id,
+                "type": {"$in": ["LibraryPreparation"]},
+            }
+            lib_prep_doc = material_processing_set.find_one(lib_prep_query)
+
+            if lib_prep_doc:
+                biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
+                break  # Stop at the first document that meets the criteria
+
+    return biosample_lib_prep
+
+
+def handle_quantity_value(slot_value):
+    if "has_numeric_value" in slot_value and "has_unit" in slot_value:
+        return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}"
+    elif (
+        "has_maximum_numeric_value" in slot_value
+        and "has_minimum_numeric_value" in slot_value
+        and "has_unit" in slot_value
+    ):
+        range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
+        return f"{range_value} {slot_value['has_unit']}"
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_text_value(slot_value):
+    return slot_value.get("has_raw_value", "Unknown format")
+
+
+def handle_timestamp_value(slot_value):
+    return slot_value.get("has_raw_value", "Unknown format")
+
+
+def handle_controlled_term_value(slot_value):
+    if "term" in slot_value:
+        term = slot_value["term"]
+        if "name" in term and "id" in term:
+            return f"{term['name']} [{term['id']}]"
+        elif "id" in term:
+            return term["id"]
+        elif "name" in term:
+            return term["name"]
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_controlled_identified_term_value(slot_value):
+    if "term" in slot_value:
+        term = slot_value["term"]
+        if "name" in term and "id" in term:
+            return f"{term['name']} [{term['id']}]"
+        elif "id" in term:
+            return term["id"]
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_geolocation_value(slot_value):
+    if "latitude" in slot_value and "longitude" in slot_value:
+        return f"{slot_value['latitude']} {slot_value['longitude']}"
+    elif "has_raw_value" in slot_value:
+        return slot_value["has_raw_value"]
+    return "Unknown format"
+
+
+def handle_float_value(slot_value):
+    return f"{slot_value:.2f}"
+
+
+def handle_string_value(slot_value):
+    return f"{slot_value}"
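These `handle_*` functions each flatten one NMDC slot range into a display string. A hedged sketch of how they might be dispatched using the slot-range names that `load_mappings` (below) reads from the mapping TSV; the key strings are assumptions based on NMDC schema class names, not mappings confirmed by this diff:

```python
# Hypothetical dispatch table; the exact nmdc_schema_slot_range strings are
# assumptions, not values confirmed by this diff.
from nmdc_runtime.site.export.ncbi_xml_utils import (
    handle_controlled_identified_term_value,
    handle_controlled_term_value,
    handle_float_value,
    handle_geolocation_value,
    handle_quantity_value,
    handle_string_value,
    handle_text_value,
    handle_timestamp_value,
)

HANDLERS = {
    "QuantityValue": handle_quantity_value,
    "TextValue": handle_text_value,
    "TimestampValue": handle_timestamp_value,
    "ControlledTermValue": handle_controlled_term_value,
    "ControlledIdentifiedTermValue": handle_controlled_identified_term_value,
    "GeolocationValue": handle_geolocation_value,
    "float": handle_float_value,
    "default": handle_string_value,
}

def render_slot(slot_value, slot_range):
    # Unmapped ranges fall back to plain string rendering ("default").
    return HANDLERS.get(slot_range, handle_string_value)(slot_value)

# render_slot({"has_numeric_value": 5, "has_unit": "m"}, "QuantityValue") -> "5 m"
```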
+
+
+def load_mappings(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    file_content = response.text
+
+    attribute_mappings = {}
+    slot_range_mappings = {}
+    reader = csv.DictReader(StringIO(file_content), delimiter="\t")
+    for row in reader:
+        if row["ignore"].strip():
+            continue
+
+        json_key = row["nmdc_schema_slot"]
+        # attribute mappings
+        xml_attribute_name = row["ncbi_biosample_attribute_name"]
+        attribute_mappings[json_key] = (
+            xml_attribute_name if xml_attribute_name else json_key
+        )
+
+        # slot range mappings
+        data_type = row["nmdc_schema_slot_range"]
+        slot_range_mappings[json_key] = data_type if data_type else "default"
+
+    return attribute_mappings, slot_range_mappings
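`load_mappings` expects a tab-separated file whose header includes `nmdc_schema_slot`, `ncbi_biosample_attribute_name`, `nmdc_schema_slot_range`, and `ignore`. A made-up example of the expected shape and the mappings it would produce (the rows are illustrative, not taken from the real mapping file):

```python
# Made-up rows illustrating the TSV shape load_mappings expects; the real
# mapping file is whatever the caller points `url` at.
example_tsv = (
    "nmdc_schema_slot\tncbi_biosample_attribute_name\tnmdc_schema_slot_range\tignore\n"
    "collection_date\tcollection_date\tTimestampValue\t\n"
    "depth\tdepth\tQuantityValue\t\n"
    "internal_slot\t\t\tx\n"
)

# For rows like these, load_mappings would return:
#   attribute_mappings  == {"collection_date": "collection_date", "depth": "depth"}
#   slot_range_mappings == {"collection_date": "TimestampValue", "depth": "QuantityValue"}
# The "internal_slot" row is skipped because its "ignore" column is non-empty.
```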
+
+
+def check_pooling_for_biosamples(
+    material_processing_set: Collection, biosamples_list: List[Dict[str, Any]]
+) -> Dict[str, Dict[str, Any]]:
+    """Check which biosamples are part of pooling processes and return pooling information.
+
+    A biosample is considered part of a Pooling process if its id is asserted on the
+    `has_input` slot/key of an `nmdc:Pooling` process instance.
+
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of all biosamples to check
+    :return: dictionary mapping biosample_id to pooling information (empty dict if not pooled)
+    """
+    result = {}
+    # get list of all biosample IDs that are part of a given study
+    biosample_lookup = {bs["id"]: bs for bs in biosamples_list}
+
+    # get list of all pooling processes
+    pooling_processes = list(material_processing_set.find({"type": "nmdc:Pooling"}))
+
+    # initialize all biosamples as not pooled
+    for biosample in biosamples_list:
+        result[biosample["id"]] = {}
+
+    # process each pooling process
+    for pooling_process in pooling_processes:
+        pooled_biosample_ids = pooling_process.get("has_input", [])
+
+        # get the processed sample output from the pooling process
+        has_output = pooling_process.get("has_output", [])
+        processed_sample_id = None
+
+        for output_id in has_output:
+            if get_classname_from_typecode(output_id) == "ProcessedSample":
+                processed_sample_id = output_id
+                break
+
+        # aggregate the values on `collection_date` and `depth` slots
+        # here, we are collecting the `collection_date` and `depth` values
+        # asserted on each of the biosamples that are part of a given pooling
+        # process in the following way:
+        # example of aggregated `collection_date`: 2017-06-05T16:50Z/2017-06-05T17:47Z
+        # example of aggregated `depth`: 0-10 m
+        collection_dates = []
+        depths = []
+
+        for bs_id in pooled_biosample_ids:
+            biosample = biosample_lookup.get(bs_id)
+            if not biosample:
+                continue
+
+            if "collection_date" in biosample:
+                collection_date = biosample["collection_date"]
+                if (
+                    isinstance(collection_date, dict)
+                    and "has_raw_value" in collection_date
+                ):
+                    collection_dates.append(collection_date["has_raw_value"])
+                elif isinstance(collection_date, str):
+                    collection_dates.append(collection_date)
+
+            if "depth" in biosample:
+                depth = biosample["depth"]
+                if isinstance(depth, dict):
+                    if "has_numeric_value" in depth:
+                        depths.append(depth["has_numeric_value"])
+                    elif (
+                        "has_minimum_numeric_value" in depth
+                        and "has_maximum_numeric_value" in depth
+                    ):
+                        depths.extend(
+                            [
+                                depth["has_minimum_numeric_value"],
+                                depth["has_maximum_numeric_value"],
+                            ]
+                        )
+                elif isinstance(depth, (int, float)):
+                    depths.append(depth)
+
+        # create aggregated (forward slash separated) value for `collection_date`
+        aggregated_collection_date = None
+        if collection_dates:
+            sorted_dates = sorted(collection_dates)
+            if len(sorted_dates) > 1:
+                aggregated_collection_date = f"{sorted_dates[0]}/{sorted_dates[-1]}"
+            else:
+                aggregated_collection_date = sorted_dates[0]
+
+        # create aggregated (hyphen separated) value for `depth`
+        aggregated_depth = None
+        if depths:
+            min_depth = min(depths)
+            max_depth = max(depths)
+            if min_depth != max_depth:
+                aggregated_depth = f"{min_depth}-{max_depth} m"
+            else:
+                aggregated_depth = f"{min_depth} m"
+
+        # update all biosamples that are part of this pooling process
+        pooling_info = {
+            "processed_sample_id": processed_sample_id,
+            "pooling_process_id": pooling_process.get("id"),
+            "pooled_biosample_ids": pooled_biosample_ids,
+            "aggregated_collection_date": aggregated_collection_date,
+            "aggregated_depth": aggregated_depth,
+        }
+
+        for bs_id in pooled_biosample_ids:
+            if bs_id in result:
+                result[bs_id] = pooling_info
+
+    return result
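Concretely, every biosample pooled by one process ends up mapped to the same record. The aggregated-value formats come from the comments in the function above; the ids are illustrative placeholders:

```python
# Shape of result[biosample_id] for a pooled biosample; all ids below are
# illustrative placeholders, not real NMDC identifiers. Biosamples that are
# not part of any pooling process map to {}.
pooling_info_example = {
    "processed_sample_id": "nmdc:procsm-11-abc123",
    "pooling_process_id": "nmdc:poolp-11-xyz789",
    "pooled_biosample_ids": ["nmdc:bsm-11-aaa111", "nmdc:bsm-11-bbb222"],
    "aggregated_collection_date": "2017-06-05T16:50Z/2017-06-05T17:47Z",
    "aggregated_depth": "0-10 m",
}
```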
+
+
+def validate_xml(xml, xsd_url):
+    response = requests.get(xsd_url)
+    response.raise_for_status()
+    xsd_content = response.text
+
+    xml_schema_doc = etree.parse(BytesIO(xsd_content.encode("utf-8")))
+    xml_schema = etree.XMLSchema(xml_schema_doc)
+
+    xml_doc = etree.parse(BytesIO(xml.encode("utf-8")))
+
+    if not xml_schema.validate(xml_doc):
+        raise ValueError(f"There were errors while validating against: {xsd_url}")
+
+    return True
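A short usage sketch for `validate_xml`; the XSD URL is a placeholder, not the real NCBI submission schema location:

```python
# Sketch: validate a generated submission XML string against a remote XSD.
# The URL is a placeholder; a failed XSD fetch raises via raise_for_status,
# and schema violations raise ValueError.
from nmdc_runtime.site.export.ncbi_xml_utils import validate_xml

xml_string = '<?xml version="1.0" encoding="UTF-8"?><Submission></Submission>'

try:
    validate_xml(xml_string, "https://example.org/ncbi/submission.xsd")
    print("XML is schema-valid.")
except ValueError as err:
    print(err)
```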
nmdc_runtime/site/export/study_metadata.py

@@ -1,10 +1,10 @@
 """
 Get NMDC study-associated metadata from search api
 """
+
 import csv
 from io import StringIO
 
-import requests
 from dagster import (
     op,
     get_dagster_logger,
@@ -25,13 +25,27 @@ def get_all_docs(client, collection, filter_):
     per_page = 200
     url_base = f"/{collection}?filter={filter_}&per_page={per_page}"
     results = []
-
+    response = client.request("GET", url_base)
+    if response.status_code != 200:
+        raise Exception(
+            f"Runtime API request failed with status {response.status_code}."
+            f" Check URL: {url_base}"
+        )
+    rv = response.json()
     results.extend(rv.get("results", []))
     page, count = rv["meta"]["page"], rv["meta"]["count"]
     assert count <= 10_000
     while page * per_page < count:
-
-
+        page += 1
+        url = f"{url_base}&page={page}"
+        response = client.request("GET", url)
+        if response.status_code != 200:
+            raise Exception(
+                f"Runtime API request failed with status {response.status_code}."
+                f" Check URL: {url}"
+            )
+        rv = response.json()
+        results.extend(rv.get("results", []))
     return results
 
 
@@ -114,3 +128,12 @@ def export_study_biosamples_as_csv(context: OpExecutionContext, study_export_inf
 def export_study_biosamples_metadata():
     outputs = export_study_biosamples_as_csv(get_study_biosamples_metadata())
     add_output_run_event(outputs)
+
+
+@op(required_resource_keys={"runtime_api_site_client"})
+def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict):
+    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
+    biosamples = get_all_docs(
+        client, "biosamples", f"associated_studies:{nmdc_study['id']}"
+    )
+    return biosamples
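The new op composes `get_all_docs` with the site client resource, using the Runtime search API's `field:value` filter syntax. A hedged sketch of the equivalent direct call outside Dagster; the `RuntimeApiSiteClient` constructor arguments are assumptions about the class in `nmdc_runtime/site/resources.py`, and all URL, id, and credential values are placeholders:

```python
# Non-Dagster sketch of the same lookup; constructor arguments and credential
# values are assumptions/placeholders, not confirmed by this diff.
from nmdc_runtime.site.export.study_metadata import get_all_docs
from nmdc_runtime.site.resources import RuntimeApiSiteClient

client = RuntimeApiSiteClient(
    base_url="https://api.microbiomedata.org",
    site_id="my_site",
    client_id="my_client_id",
    client_secret="my_client_secret",
)

# Paginates through /biosamples 200 records at a time (capped at 10,000).
biosamples = get_all_docs(
    client, "biosamples", "associated_studies:nmdc:sty-11-example"
)
print(f"fetched {len(biosamples)} biosamples")
```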