nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +22 -2
- nmdc_runtime/api/core/idgen.py +36 -6
- nmdc_runtime/api/db/mongo.py +0 -12
- nmdc_runtime/api/endpoints/find.py +65 -225
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
- nmdc_runtime/api/endpoints/objects.py +4 -11
- nmdc_runtime/api/endpoints/operations.py +0 -27
- nmdc_runtime/api/endpoints/queries.py +22 -0
- nmdc_runtime/api/endpoints/sites.py +0 -24
- nmdc_runtime/api/endpoints/util.py +57 -35
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +84 -60
- nmdc_runtime/api/models/util.py +12 -5
- nmdc_runtime/api/openapi.py +116 -180
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/minter/adapters/repository.py +21 -0
- nmdc_runtime/minter/domain/model.py +20 -0
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +632 -11
- nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
- nmdc_runtime/site/graphs.py +7 -0
- nmdc_runtime/site/ops.py +92 -34
- nmdc_runtime/site/repository.py +2 -0
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +87 -1
- nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
- nmdc_runtime/api/endpoints/ids.py +0 -192
- nmdc_runtime/client/__init__.py +0 -0
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/__init__.py +0 -0
- nmdc_runtime/core/db/Database.py +0 -13
- nmdc_runtime/core/db/__init__.py +0 -0
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/__init__.py +0 -0
- nmdc_runtime/domain/users/__init__.py +0 -0
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/models/user.py +0 -1
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -33
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -825
- nmdc_runtime/lib/nmdc_etl_class.py +0 -396
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/__init__.py +0 -0
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
- nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -275,6 +275,120 @@ def load_mappings(url):
|
|
|
275
275
|
return attribute_mappings, slot_range_mappings
|
|
276
276
|
|
|
277
277
|
|
|
278
|
+
def check_pooling_for_biosamples(
    material_processing_set: Collection, biosamples_list: List[Dict[str, Any]]
) -> Dict[str, Dict[str, Any]]:
    """Check which biosamples are part of pooling processes and return pooling information.

    A biosample is considered part of a Pooling process when its id has been asserted on
    the `has_input` slot/key of an `nmdc:Pooling` process instance.

    For every pooling process, the `collection_date` and `depth` values asserted on the
    pooled biosamples (only those present in `biosamples_list`) are aggregated as follows:

    - example of aggregated `collection_date`: 2017-06-05T16:50Z/2017-06-05T17:47Z
    - example of aggregated `depth`: 0-10 m

    All biosamples that belong to the same pooling process share one (identical) pooling
    information dictionary. If a biosample is an input of more than one pooling process,
    the process returned last by the query wins.

    :param material_processing_set: reference to the material_processing_set collection
    :param biosamples_list: list of all biosamples to check
    :return: dictionary mapping biosample_id to pooling information (empty dict if not pooled)
    """
    biosample_lookup = {bs["id"]: bs for bs in biosamples_list}

    # initialize all biosamples as not pooled
    result: Dict[str, Dict[str, Any]] = {bs_id: {} for bs_id in biosample_lookup}
    if not biosample_lookup:
        # nothing to check, so there is no need to query the database at all
        return result

    # Only fetch the pooling processes that take at least one of the given biosamples as
    # input; any other pooling process cannot contribute to `result`.
    pooling_processes = material_processing_set.find(
        {"type": "nmdc:Pooling", "has_input": {"$in": list(biosample_lookup)}}
    )

    for pooling_process in pooling_processes:
        # `or []` also covers documents in which the slot is present but null
        pooled_biosample_ids = pooling_process.get("has_input") or []

        # get the processed sample output from the pooling process
        processed_sample_id = next(
            (
                output_id
                for output_id in pooling_process.get("has_output") or []
                if get_classname_from_typecode(output_id) == "ProcessedSample"
            ),
            None,
        )

        # collect the `collection_date` and `depth` values of the pooled biosamples
        collection_dates = []
        depths = []
        for bs_id in pooled_biosample_ids:
            biosample = biosample_lookup.get(bs_id)
            if not biosample:
                continue
            collection_date = _pooled_collection_date(biosample)
            if collection_date is not None:
                collection_dates.append(collection_date)
            depths.extend(_pooled_depth_values(biosample))

        # create aggregated (forward slash separated) value for `collection_date`
        aggregated_collection_date = None
        if collection_dates:
            sorted_dates = sorted(collection_dates)
            if len(sorted_dates) > 1:
                aggregated_collection_date = f"{sorted_dates[0]}/{sorted_dates[-1]}"
            else:
                aggregated_collection_date = sorted_dates[0]

        # create aggregated (hyphen separated) value for `depth`
        # NOTE(review): the unit is always rendered as "m"; the `has_unit` value of the
        # depth is not consulted here - confirm that all depths are recorded in meters.
        aggregated_depth = None
        if depths:
            min_depth = min(depths)
            max_depth = max(depths)
            if min_depth != max_depth:
                aggregated_depth = f"{min_depth}-{max_depth} m"
            else:
                aggregated_depth = f"{min_depth} m"

        # update all biosamples that are part of this pooling process
        pooling_info = {
            "processed_sample_id": processed_sample_id,
            "pooling_process_id": pooling_process.get("id"),
            "pooled_biosample_ids": pooled_biosample_ids,
            "aggregated_collection_date": aggregated_collection_date,
            "aggregated_depth": aggregated_depth,
        }
        for bs_id in pooled_biosample_ids:
            if bs_id in result:
                result[bs_id] = pooling_info

    return result


def _pooled_collection_date(biosample: Dict[str, Any]) -> Any:
    """Return the `collection_date` value of a biosample, or None if there is none.

    Accepts either a dictionary carrying `has_raw_value` or a plain string.
    """
    collection_date = biosample.get("collection_date")
    if isinstance(collection_date, dict) and "has_raw_value" in collection_date:
        return collection_date["has_raw_value"]
    if isinstance(collection_date, str):
        return collection_date
    return None


def _pooled_depth_values(biosample: Dict[str, Any]) -> List[Any]:
    """Return the numeric `depth` values of a biosample (possibly an empty list).

    `has_numeric_value` takes precedence over the minimum/maximum pair. `None` values are
    dropped so that they can neither break `min()`/`max()` nor be rendered as "None m".
    """
    depth = biosample.get("depth")
    if isinstance(depth, dict):
        if "has_numeric_value" in depth:
            values = [depth["has_numeric_value"]]
        elif (
            "has_minimum_numeric_value" in depth
            and "has_maximum_numeric_value" in depth
        ):
            values = [
                depth["has_minimum_numeric_value"],
                depth["has_maximum_numeric_value"],
            ]
        else:
            values = []
    elif isinstance(depth, (int, float)):
        values = [depth]
    else:
        values = []
    return [value for value in values if value is not None]
|
|
390
|
+
|
|
391
|
+
|
|
278
392
|
def validate_xml(xml, xsd_url):
|
|
279
393
|
response = requests.get(xsd_url)
|
|
280
394
|
response.raise_for_status()
|
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -53,6 +53,7 @@ from nmdc_runtime.site.ops import (
|
|
|
53
53
|
get_data_objects_from_biosamples,
|
|
54
54
|
get_nucleotide_sequencing_from_biosamples,
|
|
55
55
|
get_library_preparation_from_biosamples,
|
|
56
|
+
get_aggregated_pooled_biosamples,
|
|
56
57
|
get_all_instruments,
|
|
57
58
|
get_ncbi_export_pipeline_inputs,
|
|
58
59
|
ncbi_submission_xml_from_nmdc_study,
|
|
@@ -173,6 +174,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
173
174
|
data_object_mapping_file_url,
|
|
174
175
|
biosample_extras_file_url,
|
|
175
176
|
biosample_extras_slot_mapping_file_url,
|
|
177
|
+
study_id,
|
|
176
178
|
) = get_submission_portal_pipeline_inputs()
|
|
177
179
|
|
|
178
180
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
@@ -193,6 +195,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
193
195
|
biosample_extras=biosample_extras,
|
|
194
196
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
195
197
|
instrument_mapping=instrument_mapping,
|
|
198
|
+
study_id=study_id,
|
|
196
199
|
)
|
|
197
200
|
|
|
198
201
|
validate_metadata(database)
|
|
@@ -213,6 +216,7 @@ def ingest_metadata_submission():
|
|
|
213
216
|
data_object_mapping_file_url,
|
|
214
217
|
biosample_extras_file_url,
|
|
215
218
|
biosample_extras_slot_mapping_file_url,
|
|
219
|
+
study_id,
|
|
216
220
|
) = get_submission_portal_pipeline_inputs()
|
|
217
221
|
|
|
218
222
|
metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
|
|
@@ -233,6 +237,7 @@ def ingest_metadata_submission():
|
|
|
233
237
|
biosample_extras=biosample_extras,
|
|
234
238
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
235
239
|
instrument_mapping=instrument_mapping,
|
|
240
|
+
study_id=study_id,
|
|
236
241
|
)
|
|
237
242
|
|
|
238
243
|
log_database_ids(database)
|
|
@@ -472,6 +477,7 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
472
477
|
)
|
|
473
478
|
data_object_records = get_data_objects_from_biosamples(biosamples)
|
|
474
479
|
library_preparation_records = get_library_preparation_from_biosamples(biosamples)
|
|
480
|
+
pooled_biosamples_data = get_aggregated_pooled_biosamples(biosamples)
|
|
475
481
|
all_instruments = get_all_instruments()
|
|
476
482
|
xml_data = ncbi_submission_xml_from_nmdc_study(
|
|
477
483
|
nmdc_study,
|
|
@@ -481,6 +487,7 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
481
487
|
data_object_records,
|
|
482
488
|
library_preparation_records,
|
|
483
489
|
all_instruments,
|
|
490
|
+
pooled_biosamples_data,
|
|
484
491
|
)
|
|
485
492
|
ncbi_submission_xml_asset(xml_data)
|
|
486
493
|
|
nmdc_runtime/site/ops.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
|
-
import mimetypes
|
|
5
4
|
import os
|
|
6
5
|
import subprocess
|
|
7
6
|
from collections import defaultdict
|
|
@@ -16,6 +15,7 @@ from ontology_loader.ontology_load_controller import OntologyLoaderController
|
|
|
16
15
|
import pandas as pd
|
|
17
16
|
import requests
|
|
18
17
|
from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
|
|
18
|
+
from toolz import dissoc
|
|
19
19
|
|
|
20
20
|
from bson import ObjectId, json_util
|
|
21
21
|
from dagster import (
|
|
@@ -73,7 +73,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
|
73
73
|
fetch_nucleotide_sequencing_from_biosamples,
|
|
74
74
|
fetch_library_preparation_from_biosamples,
|
|
75
75
|
)
|
|
76
|
-
from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
|
|
77
76
|
from nmdc_runtime.site.resources import (
|
|
78
77
|
NmdcPortalApiClient,
|
|
79
78
|
GoldApiClient,
|
|
@@ -95,15 +94,12 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
|
|
|
95
94
|
)
|
|
96
95
|
from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
|
|
97
96
|
from nmdc_runtime.site.util import (
|
|
98
|
-
run_and_log,
|
|
99
97
|
schema_collection_has_index_on_id,
|
|
100
98
|
nmdc_study_id_to_filename,
|
|
101
99
|
get_instruments_by_id,
|
|
102
100
|
)
|
|
103
101
|
from nmdc_runtime.util import (
|
|
104
|
-
drs_object_in_for,
|
|
105
102
|
pluralize,
|
|
106
|
-
put_object,
|
|
107
103
|
specialize_activity_set_docs,
|
|
108
104
|
collection_name_to_class_names,
|
|
109
105
|
nmdc_schema_view,
|
|
@@ -112,7 +108,7 @@ from nmdc_runtime.util import (
|
|
|
112
108
|
from nmdc_schema import nmdc
|
|
113
109
|
from pymongo import InsertOne, UpdateOne
|
|
114
110
|
from pymongo.database import Database as MongoDatabase
|
|
115
|
-
from
|
|
111
|
+
from pymongo.collection import Collection as MongoCollection
|
|
116
112
|
from toolz import get_in, valfilter, identity
|
|
117
113
|
|
|
118
114
|
|
|
@@ -373,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
|
|
|
373
369
|
|
|
374
370
|
@op(required_resource_keys={"runtime_api_site_client"})
|
|
375
371
|
def get_json_in(context):
|
|
372
|
+
"""
|
|
373
|
+
TODO: Document this function.
|
|
374
|
+
"""
|
|
376
375
|
object_id = context.op_config.get("object_id")
|
|
377
376
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
378
377
|
rv = client.get_object_bytes(object_id)
|
|
@@ -385,6 +384,9 @@ def get_json_in(context):
|
|
|
385
384
|
|
|
386
385
|
@op(required_resource_keys={"runtime_api_site_client", "mongo"})
|
|
387
386
|
def perform_mongo_updates(context, json_in):
|
|
387
|
+
"""
|
|
388
|
+
TODO: Document this function.
|
|
389
|
+
"""
|
|
388
390
|
mongo = context.resources.mongo
|
|
389
391
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
390
392
|
op_id = context.op_config.get("operation_id")
|
|
@@ -414,6 +416,9 @@ def perform_mongo_updates(context, json_in):
|
|
|
414
416
|
def _add_schema_docs_with_or_without_replacement(
|
|
415
417
|
mongo: MongoDBResource, docs: Dict[str, list]
|
|
416
418
|
):
|
|
419
|
+
"""
|
|
420
|
+
TODO: Document this function.
|
|
421
|
+
"""
|
|
417
422
|
coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
|
|
418
423
|
if all(coll_index_on_id_map[coll] for coll in docs.keys()):
|
|
419
424
|
replace = True
|
|
@@ -437,7 +442,13 @@ def _add_schema_docs_with_or_without_replacement(
|
|
|
437
442
|
f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
|
|
438
443
|
)
|
|
439
444
|
op_result = mongo.add_docs(docs, validate=False, replace=replace)
|
|
440
|
-
|
|
445
|
+
|
|
446
|
+
# Translate the operation result into a dictionary in which each item's key is a collection name
|
|
447
|
+
# and each item's value is the corresponding bulk API result (excluding the "upserted" field).
|
|
448
|
+
return {
|
|
449
|
+
collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
|
|
450
|
+
for collection_name, bulk_write_result in op_result.items()
|
|
451
|
+
}
|
|
441
452
|
|
|
442
453
|
|
|
443
454
|
@op(required_resource_keys={"mongo"})
|
|
@@ -545,27 +556,39 @@ def nmdc_schema_database_from_gold_study(
|
|
|
545
556
|
|
|
546
557
|
|
|
547
558
|
@op(
    required_resource_keys={"mongo"},
    out={
        "submission_id": Out(),
        "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
        "data_object_mapping_file_url": Out(Optional[str]),
        "biosample_extras_file_url": Out(Optional[str]),
        "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
        "study_id": Out(Optional[str]),
    },
)
def get_submission_portal_pipeline_inputs(
    context: OpExecutionContext,
    submission_id: str,
    nucleotide_sequencing_mapping_file_url: Optional[str],
    data_object_mapping_file_url: Optional[str],
    biosample_extras_file_url: Optional[str],
    biosample_extras_slot_mapping_file_url: Optional[str],
    study_id: Optional[str],
) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
    """
    Forward the submission portal pipeline inputs as the outputs of this op.

    When a `study_id` is given, the `study_set` collection is first searched for a
    document having that `id`; an exception is raised if no such document exists.
    All inputs are then returned unchanged, in the order declared in `out`.
    """
    if study_id:
        # Make sure the referenced study exists before anything downstream uses it.
        matching_study = context.resources.mongo.db.study_set.find_one(
            {"id": study_id}
        )
        if not matching_study:
            raise Exception(f"Study id: {study_id} does not exist in Mongo.")

    return (
        submission_id,
        nucleotide_sequencing_mapping_file_url,
        data_object_mapping_file_url,
        biosample_extras_file_url,
        biosample_extras_slot_mapping_file_url,
        study_id,
    )
|
|
570
593
|
|
|
571
594
|
|
|
@@ -590,6 +613,7 @@ def translate_portal_submission_to_nmdc_schema_database(
|
|
|
590
613
|
study_pi_image_url: Optional[str],
|
|
591
614
|
biosample_extras: Optional[list[dict]],
|
|
592
615
|
biosample_extras_slot_mapping: Optional[list[dict]],
|
|
616
|
+
study_id: Optional[str],
|
|
593
617
|
) -> nmdc.Database:
|
|
594
618
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
595
619
|
|
|
@@ -607,6 +631,7 @@ def translate_portal_submission_to_nmdc_schema_database(
|
|
|
607
631
|
biosample_extras=biosample_extras,
|
|
608
632
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
609
633
|
illumina_instrument_mapping=instrument_mapping,
|
|
634
|
+
study_id=study_id,
|
|
610
635
|
)
|
|
611
636
|
database = translator.get_database()
|
|
612
637
|
return database
|
|
@@ -947,7 +972,9 @@ def load_ontology(context: OpExecutionContext):
|
|
|
947
972
|
|
|
948
973
|
|
|
949
974
|
def _add_linked_instances_to_alldocs(
|
|
950
|
-
temp_collection
|
|
975
|
+
temp_collection: MongoCollection,
|
|
976
|
+
context: OpExecutionContext,
|
|
977
|
+
document_reference_ranged_slots_by_type: dict,
|
|
951
978
|
) -> None:
|
|
952
979
|
"""
|
|
953
980
|
Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
|
|
@@ -983,16 +1010,13 @@ def _add_linked_instances_to_alldocs(
|
|
|
983
1010
|
# Store the full type with prefix intact
|
|
984
1011
|
doc_type = doc["type"]
|
|
985
1012
|
# For looking up reference slots, we still need the type without prefix
|
|
986
|
-
# FIXME `document_reference_ranged_slots_by_type` should key on `doc_type`
|
|
987
1013
|
doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
|
|
988
1014
|
|
|
989
1015
|
# Record ID to type mapping - preserve the original type with prefix
|
|
990
1016
|
id_to_type_map[doc_id] = doc_type
|
|
991
1017
|
|
|
992
1018
|
# Find all document references from this document
|
|
993
|
-
reference_slots = document_reference_ranged_slots_by_type.get(
|
|
994
|
-
doc_type_no_prefix, []
|
|
995
|
-
)
|
|
1019
|
+
reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
|
|
996
1020
|
for slot in reference_slots:
|
|
997
1021
|
if slot in doc:
|
|
998
1022
|
# Handle both single-value and array references
|
|
@@ -1116,7 +1140,7 @@ def _add_linked_instances_to_alldocs(
|
|
|
1116
1140
|
# Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
|
|
1117
1141
|
#
|
|
1118
1142
|
@op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
|
|
1119
|
-
def materialize_alldocs(context) -> int:
|
|
1143
|
+
def materialize_alldocs(context: OpExecutionContext) -> int:
|
|
1120
1144
|
"""
|
|
1121
1145
|
This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
|
|
1122
1146
|
|
|
@@ -1167,17 +1191,16 @@ def materialize_alldocs(context) -> int:
|
|
|
1167
1191
|
)
|
|
1168
1192
|
)
|
|
1169
1193
|
|
|
1170
|
-
|
|
1171
|
-
# FIXME key on CURIE, e.g. `nmdc:Study`
|
|
1172
|
-
# (here, not upstream in `cls_slot_map`/`document_referenceable_ranges`, b/c `schema_view` used directly in those)
|
|
1173
|
-
document_reference_ranged_slots = defaultdict(list)
|
|
1194
|
+
document_reference_ranged_slots_by_type = defaultdict(list)
|
|
1174
1195
|
for cls_name, slot_map in cls_slot_map.items():
|
|
1175
1196
|
for slot_name, slot in slot_map.items():
|
|
1176
1197
|
if (
|
|
1177
1198
|
set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
|
|
1178
1199
|
& document_referenceable_ranges
|
|
1179
1200
|
):
|
|
1180
|
-
|
|
1201
|
+
document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
|
|
1202
|
+
slot_name
|
|
1203
|
+
)
|
|
1181
1204
|
|
|
1182
1205
|
# Build `alldocs` to a temporary collection for atomic replacement
|
|
1183
1206
|
# https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
|
|
@@ -1194,25 +1217,19 @@ def materialize_alldocs(context) -> int:
|
|
|
1194
1217
|
# Keep the full type with prefix for document
|
|
1195
1218
|
doc_type_full = doc["type"]
|
|
1196
1219
|
# Remove prefix for slot lookup and ancestor lookup
|
|
1197
|
-
doc_type = (
|
|
1198
|
-
doc_type_full[5:]
|
|
1199
|
-
if doc_type_full.startswith("nmdc:")
|
|
1200
|
-
else doc_type_full
|
|
1201
|
-
)
|
|
1220
|
+
doc_type = doc_type_full.removeprefix("nmdc:")
|
|
1202
1221
|
except KeyError:
|
|
1203
1222
|
raise Exception(
|
|
1204
1223
|
f"doc {doc['id']} in collection {coll_name} has no 'type'!"
|
|
1205
1224
|
)
|
|
1206
|
-
slots_to_include = ["id", "type"] +
|
|
1207
|
-
|
|
1225
|
+
slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
|
|
1226
|
+
doc_type_full
|
|
1208
1227
|
]
|
|
1209
1228
|
new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
|
|
1210
1229
|
|
|
1211
|
-
new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
|
|
1212
1230
|
# Get ancestors without the prefix, but add prefix to each one in the output
|
|
1213
|
-
ancestors = schema_view.class_ancestors(doc_type)
|
|
1214
1231
|
new_doc["_type_and_ancestors"] = [
|
|
1215
|
-
"nmdc:"
|
|
1232
|
+
f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
|
|
1216
1233
|
]
|
|
1217
1234
|
# InsertOne is a pymongo representation of a mongo command.
|
|
1218
1235
|
write_operations.append(InsertOne(new_doc))
|
|
@@ -1221,7 +1238,7 @@ def materialize_alldocs(context) -> int:
|
|
|
1221
1238
|
write_operations.clear()
|
|
1222
1239
|
documents_processed_counter += BULK_WRITE_BATCH_SIZE
|
|
1223
1240
|
if len(write_operations) > 0:
|
|
1224
|
-
# here bulk_write is a method on the
|
|
1241
|
+
# here bulk_write is a method on the pymongo db Collection class
|
|
1225
1242
|
_ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
|
|
1226
1243
|
documents_processed_counter += len(write_operations)
|
|
1227
1244
|
context.log.info(
|
|
@@ -1238,15 +1255,18 @@ def materialize_alldocs(context) -> int:
|
|
|
1238
1255
|
# so that `temp_alldocs_collection` will be "good to go" on renaming.
|
|
1239
1256
|
temp_alldocs_collection.create_index("id", unique=True)
|
|
1240
1257
|
# Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
|
|
1241
|
-
|
|
1242
|
-
|
|
1258
|
+
slots_to_index = {"_type_and_ancestors"} | {
|
|
1259
|
+
slot
|
|
1260
|
+
for slots in document_reference_ranged_slots_by_type.values()
|
|
1261
|
+
for slot in slots
|
|
1262
|
+
}
|
|
1243
1263
|
[temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
|
|
1244
|
-
context.log.info(f"created indexes on id
|
|
1264
|
+
context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
|
|
1245
1265
|
|
|
1246
1266
|
# Add related-ids fields to enable efficient relationship traversal
|
|
1247
1267
|
context.log.info("Adding fields for related ids to documents...")
|
|
1248
1268
|
_add_linked_instances_to_alldocs(
|
|
1249
|
-
temp_alldocs_collection, context,
|
|
1269
|
+
temp_alldocs_collection, context, document_reference_ranged_slots_by_type
|
|
1250
1270
|
)
|
|
1251
1271
|
context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
|
|
1252
1272
|
temp_alldocs_collection.create_index("_upstream.id")
|
|
@@ -1350,6 +1370,42 @@ def get_library_preparation_from_biosamples(
|
|
|
1350
1370
|
return biosample_lib_prep
|
|
1351
1371
|
|
|
1352
1372
|
|
|
1373
|
+
@op(required_resource_keys={"mongo"})
def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
    """
    Build the pooling information for the given biosamples and attach ProcessedSample names.

    The pooling information comes from `check_pooling_for_biosamples`, run against the
    `material_processing_set` collection. For every pooling information dictionary that
    carries a `processed_sample_id`, the matching document is looked up in the
    `processed_sample_set` collection and its `name` (empty string when the document has
    no `name`) is stored under the `processed_sample_name` key.
    """
    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples

    mdb = context.resources.mongo.db
    pooled_biosamples_data = check_pooling_for_biosamples(
        mdb["material_processing_set"], biosamples
    )

    # Gather the distinct ids of the ProcessedSamples produced by the pooling processes.
    processed_sample_ids = {
        info["processed_sample_id"]
        for info in pooled_biosamples_data.values()
        if info and info.get("processed_sample_id")
    }
    if not processed_sample_ids:
        # No pooled biosamples with a ProcessedSample output; nothing to look up.
        return pooled_biosamples_data

    # Fetch only the `id` and `name` of those ProcessedSamples.
    name_by_processed_sample_id = {
        doc["id"]: doc.get("name", "")
        for doc in mdb["processed_sample_set"].find(
            {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
        )
    }

    # Record the name on each pooling information dictionary whose sample was found.
    for info in pooled_biosamples_data.values():
        if not (info and info.get("processed_sample_id")):
            continue
        sample_id = info["processed_sample_id"]
        if sample_id in name_by_processed_sample_id:
            info["processed_sample_name"] = name_by_processed_sample_id[sample_id]

    return pooled_biosamples_data
|
|
1407
|
+
|
|
1408
|
+
|
|
1353
1409
|
@op(required_resource_keys={"mongo"})
|
|
1354
1410
|
def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
|
|
1355
1411
|
mdb = context.resources.mongo.db
|
|
@@ -1383,6 +1439,7 @@ def ncbi_submission_xml_from_nmdc_study(
|
|
|
1383
1439
|
data_object_records: list,
|
|
1384
1440
|
library_preparation_records: list,
|
|
1385
1441
|
all_instruments: dict,
|
|
1442
|
+
pooled_biosamples_data: dict,
|
|
1386
1443
|
) -> str:
|
|
1387
1444
|
ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
|
|
1388
1445
|
ncbi_xml = ncbi_exporter.get_submission_xml(
|
|
@@ -1391,6 +1448,7 @@ def ncbi_submission_xml_from_nmdc_study(
|
|
|
1391
1448
|
data_object_records,
|
|
1392
1449
|
library_preparation_records,
|
|
1393
1450
|
all_instruments,
|
|
1451
|
+
pooled_biosamples_data,
|
|
1394
1452
|
)
|
|
1395
1453
|
return ncbi_xml
|
|
1396
1454
|
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -502,6 +502,7 @@ def biosample_submission_ingest():
|
|
|
502
502
|
"data_object_mapping_file_url": None,
|
|
503
503
|
"biosample_extras_file_url": None,
|
|
504
504
|
"biosample_extras_slot_mapping_file_url": None,
|
|
505
|
+
"study_id": None,
|
|
505
506
|
}
|
|
506
507
|
},
|
|
507
508
|
"translate_portal_submission_to_nmdc_schema_database": {
|
|
@@ -538,6 +539,7 @@ def biosample_submission_ingest():
|
|
|
538
539
|
"data_object_mapping_file_url": None,
|
|
539
540
|
"biosample_extras_file_url": None,
|
|
540
541
|
"biosample_extras_slot_mapping_file_url": None,
|
|
542
|
+
"study_id": None,
|
|
541
543
|
}
|
|
542
544
|
},
|
|
543
545
|
"translate_portal_submission_to_nmdc_schema_database": {
|
nmdc_runtime/site/resources.py
CHANGED
|
@@ -520,11 +520,24 @@ class MongoDB:
|
|
|
520
520
|
self.db = self.client[dbname]
|
|
521
521
|
|
|
522
522
|
def add_docs(self, docs, validate=True, replace=True):
|
|
523
|
+
"""
|
|
524
|
+
TODO: Document this function.
|
|
525
|
+
"""
|
|
523
526
|
try:
|
|
524
527
|
if validate:
|
|
525
528
|
nmdc_jsonschema_validator_noidpatterns(docs)
|
|
526
529
|
rv = {}
|
|
527
|
-
for collection_name,
|
|
530
|
+
for collection_name, collection_docs in docs.items():
|
|
531
|
+
# If `collection_docs` is empty, abort this iteration.
|
|
532
|
+
#
|
|
533
|
+
# Note: We do this because the `bulk_write` method called below will raise
|
|
534
|
+
# an `InvalidOperation` exception if it is passed 0 operations.
|
|
535
|
+
#
|
|
536
|
+
# Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
|
|
537
|
+
#
|
|
538
|
+
if len(collection_docs) == 0:
|
|
539
|
+
continue
|
|
540
|
+
|
|
528
541
|
rv[collection_name] = self.db[collection_name].bulk_write(
|
|
529
542
|
[
|
|
530
543
|
(
|
|
@@ -532,7 +545,7 @@ class MongoDB:
|
|
|
532
545
|
if replace
|
|
533
546
|
else InsertOne(d)
|
|
534
547
|
)
|
|
535
|
-
for d in
|
|
548
|
+
for d in collection_docs
|
|
536
549
|
]
|
|
537
550
|
)
|
|
538
551
|
now = datetime.now(timezone.utc)
|
|
@@ -544,7 +557,7 @@ class MongoDB:
|
|
|
544
557
|
"ts": now,
|
|
545
558
|
# "dtl": {},
|
|
546
559
|
}
|
|
547
|
-
for d in
|
|
560
|
+
for d in collection_docs
|
|
548
561
|
]
|
|
549
562
|
)
|
|
550
563
|
return rv
|