nmdc-runtime 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/minter/config.py +18 -50
- nmdc_runtime/site/export/ncbi_xml.py +23 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +81 -30
- nmdc_runtime/site/graphs.py +39 -0
- nmdc_runtime/site/ops.py +131 -31
- nmdc_runtime/site/repair/__init__.py +0 -0
- nmdc_runtime/site/repair/database_updater.py +230 -0
- nmdc_runtime/site/repository.py +109 -9
- nmdc_runtime/site/resources.py +36 -5
- nmdc_runtime/site/translation/gold_translator.py +26 -4
- nmdc_runtime/site/translation/neon_surface_water_translator.py +128 -78
- nmdc_runtime/site/util.py +7 -2
- nmdc_runtime/util.py +143 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/METADATA +11 -3
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/RECORD +19 -17
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/top_level.txt +0 -0
|
@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
71
71
|
neon_amb_data_tables = (
|
|
72
72
|
"mms_swMetagenomeSequencing",
|
|
73
73
|
"mms_swMetagenomeDnaExtraction",
|
|
74
|
+
"mms_swRawDataFiles",
|
|
74
75
|
"amc_fieldGenetic",
|
|
75
76
|
"amc_fieldSuperParent",
|
|
76
77
|
)
|
|
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
88
89
|
if_exists="replace",
|
|
89
90
|
index=False,
|
|
90
91
|
)
|
|
92
|
+
surface_water_data["mms_swRawDataFiles"].to_sql(
|
|
93
|
+
"mms_swRawDataFiles", self.conn, if_exists="replace", index=False
|
|
94
|
+
)
|
|
91
95
|
surface_water_data["amc_fieldGenetic"].to_sql(
|
|
92
96
|
"amc_fieldGenetic", self.conn, if_exists="replace", index=False
|
|
93
97
|
)
|
|
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
103
107
|
"neonEnvoTerms", self.conn, if_exists="replace", index=False
|
|
104
108
|
)
|
|
105
109
|
|
|
106
|
-
self.neon_raw_data_file_mappings_df =
|
|
107
|
-
self.neon_raw_data_file_mappings_df.to_sql(
|
|
108
|
-
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
109
|
-
)
|
|
110
|
+
self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
|
|
110
111
|
|
|
111
112
|
self.site_code_mapping = site_code_mapping
|
|
112
113
|
|
|
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
371
372
|
)
|
|
372
373
|
|
|
373
374
|
def _translate_data_object(
|
|
374
|
-
self, do_id: str, url: str, do_type: str, checksum: str
|
|
375
|
+
self, do_id: str, url: str, do_type: str, manifest_id: str
|
|
375
376
|
) -> nmdc.DataObject:
|
|
376
377
|
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
377
378
|
object mainly contains information about the sequencing file that was generated as
|
|
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
395
396
|
url=url,
|
|
396
397
|
description=f"sequencing results for {basename}",
|
|
397
398
|
type="nmdc:DataObject",
|
|
398
|
-
md5_checksum=checksum,
|
|
399
399
|
data_object_type=do_type,
|
|
400
|
+
in_manifest=manifest_id,
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
|
|
404
|
+
return nmdc.Manifest(
|
|
405
|
+
id=manifest_id,
|
|
406
|
+
manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
|
|
407
|
+
type="nmdc:Manifest",
|
|
400
408
|
)
|
|
401
409
|
|
|
402
410
|
def get_database(self):
|
|
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
477
485
|
"""
|
|
478
486
|
surface_water_samples = pd.read_sql_query(query, self.conn)
|
|
479
487
|
|
|
488
|
+
# --------------------------------------------------
|
|
489
|
+
# Create mappings for minted NMDC IDs
|
|
490
|
+
# --------------------------------------------------
|
|
480
491
|
neon_biosample_ids = surface_water_samples["parentSampleID"]
|
|
481
492
|
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
|
|
482
493
|
neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
|
|
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
511
522
|
zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
|
|
512
523
|
)
|
|
513
524
|
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
)
|
|
518
|
-
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
519
|
-
|
|
520
|
-
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
521
|
-
neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
|
|
522
|
-
nmdc_data_object_ids = self._id_minter(
|
|
523
|
-
"nmdc:DataObject", len(neon_raw_file_paths)
|
|
524
|
-
)
|
|
525
|
-
neon_to_nmdc_data_object_ids = dict(
|
|
526
|
-
zip(neon_raw_file_paths, nmdc_data_object_ids)
|
|
527
|
-
)
|
|
528
|
-
|
|
525
|
+
# --------------------------------------------------
|
|
526
|
+
# STEP 1: Insert Biosamples
|
|
527
|
+
# --------------------------------------------------
|
|
529
528
|
for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
|
|
530
529
|
biosample_row = surface_water_samples[
|
|
531
530
|
surface_water_samples["parentSampleID"] == neon_id
|
|
532
531
|
]
|
|
532
|
+
# database.biosample_set.append(
|
|
533
|
+
# self._translate_biosample(neon_id, nmdc_id, biosample_row)
|
|
534
|
+
# )
|
|
533
535
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
536
|
+
# --------------------------------------------------
|
|
537
|
+
# STEP 2: Insert Extraction Processes
|
|
538
|
+
# --------------------------------------------------
|
|
538
539
|
for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
|
|
539
540
|
extraction_row = surface_water_samples[
|
|
540
541
|
surface_water_samples["parentSampleID"] == neon_id
|
|
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
557
558
|
extraction_row, "genomicsSampleID"
|
|
558
559
|
)
|
|
559
560
|
|
|
561
|
+
# Each Extraction process output => ProcessedSample
|
|
560
562
|
database.processed_sample_set.append(
|
|
561
563
|
self._translate_processed_sample(
|
|
562
564
|
processed_sample_id,
|
|
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
564
566
|
)
|
|
565
567
|
)
|
|
566
568
|
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
GROUP BY dnaSampleID
|
|
571
|
-
"""
|
|
572
|
-
neon_raw_data_files = pd.read_sql_query(query, self.conn)
|
|
573
|
-
neon_raw_data_files_dict = (
|
|
574
|
-
neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
|
|
575
|
-
.str.split("|")
|
|
576
|
-
.to_dict()
|
|
577
|
-
)
|
|
578
|
-
filtered_neon_raw_data_files_dict = {
|
|
579
|
-
key: value
|
|
580
|
-
for key, value in neon_raw_data_files_dict.items()
|
|
581
|
-
if len(value) <= 2
|
|
582
|
-
}
|
|
583
|
-
|
|
569
|
+
# --------------------------------------------------
|
|
570
|
+
# STEP 3: Insert LibraryPreparation Processes
|
|
571
|
+
# --------------------------------------------------
|
|
584
572
|
for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
|
|
585
573
|
lib_prep_row = surface_water_samples[
|
|
586
574
|
surface_water_samples["parentSampleID"] == neon_id
|
|
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
601
589
|
|
|
602
590
|
dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
|
|
603
591
|
|
|
592
|
+
# Each LibraryPreparation process output => ProcessedSample
|
|
604
593
|
database.processed_sample_set.append(
|
|
605
594
|
self._translate_processed_sample(
|
|
606
595
|
processed_sample_id,
|
|
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
608
597
|
)
|
|
609
598
|
)
|
|
610
599
|
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
600
|
+
# --------------------------------------------------
|
|
601
|
+
# STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
|
|
602
|
+
# and insert DataObjects + DataGeneration processes
|
|
603
|
+
# --------------------------------------------------
|
|
604
|
+
raw_query = """
|
|
605
|
+
SELECT dnaSampleID, sequencerRunID, rawDataFilePath
|
|
606
|
+
FROM mms_swRawDataFiles
|
|
607
|
+
"""
|
|
608
|
+
neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
|
|
609
|
+
|
|
610
|
+
for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
|
|
611
|
+
# 1) Pull out the row that corresponds to this parentSampleID
|
|
612
|
+
lib_prep_row = surface_water_samples[
|
|
613
|
+
surface_water_samples["parentSampleID"] == neon_id
|
|
614
|
+
]
|
|
615
|
+
|
|
616
|
+
# 2) Grab the dnaSampleID from that row
|
|
617
|
+
dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
|
|
618
|
+
if not dna_sample_id:
|
|
619
|
+
# No dnaSampleID => skip
|
|
620
|
+
continue
|
|
621
|
+
|
|
622
|
+
# 3) Find all raw files for that dnaSampleID
|
|
623
|
+
dna_files = neon_raw_data_files_df[
|
|
624
|
+
neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
|
|
625
|
+
]
|
|
626
|
+
if dna_files.empty:
|
|
627
|
+
# No raw files => skip
|
|
628
|
+
continue
|
|
629
|
+
|
|
630
|
+
# -----------------------------------------
|
|
631
|
+
# LOOKUP DICT: get "has_input" for this neon_id
|
|
632
|
+
# -----------------------------------------
|
|
633
|
+
has_input_value = self.samp_procsm_dict.get(neon_id)
|
|
634
|
+
# If some neon_id isn't in the dictionary, handle it as needed
|
|
635
|
+
if not has_input_value:
|
|
636
|
+
# Could skip, or raise an error, or set a default
|
|
637
|
+
continue
|
|
638
|
+
|
|
639
|
+
# -------------------------------------------
|
|
640
|
+
# 4) CREATE A MANIFEST IF MULTIPLE RAW FILES
|
|
641
|
+
# for this row's dnaSampleID
|
|
642
|
+
# -------------------------------------------
|
|
643
|
+
manifest_id = None
|
|
644
|
+
if len(dna_files) > 2:
|
|
645
|
+
# For each row that references a dnaSampleID with multiple raw files,
|
|
646
|
+
# mint exactly one new manifest record
|
|
647
|
+
manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
648
|
+
new_manifest = self._translate_manifest(manifest_id)
|
|
649
|
+
# Add to the database
|
|
650
|
+
database.manifest_set.append(new_manifest)
|
|
651
|
+
|
|
652
|
+
# -------------------------------------------
|
|
653
|
+
# 5) NOW GROUP FILES BY sequencerRunID
|
|
654
|
+
# => one data_generation record per run
|
|
655
|
+
# -------------------------------------------
|
|
656
|
+
lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
|
|
657
|
+
neon_id
|
|
658
|
+
)
|
|
659
|
+
if not lib_prep_processed_sample_id:
|
|
660
|
+
# If we don't have a ProcessedSample for some reason, skip
|
|
661
|
+
continue
|
|
662
|
+
|
|
663
|
+
for run_id, group_df in dna_files.groupby("sequencerRunID"):
|
|
664
|
+
# a) Mint new data_generation (NucleotideSequencing) ID for this run
|
|
665
|
+
data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
|
|
666
|
+
|
|
667
|
+
# b) Create DataObjects for each raw file in this run
|
|
668
|
+
data_object_ids = []
|
|
669
|
+
for raw_fp in group_df["rawDataFilePath"]:
|
|
670
|
+
do_id = self._id_minter("nmdc:DataObject", 1)[0]
|
|
671
|
+
|
|
672
|
+
# Distinguish read type
|
|
673
|
+
do_type = None
|
|
674
|
+
if "_R1.fastq.gz" in raw_fp:
|
|
675
|
+
do_type = "Metagenome Raw Read 1"
|
|
676
|
+
elif "_R2.fastq.gz" in raw_fp:
|
|
677
|
+
do_type = "Metagenome Raw Read 2"
|
|
678
|
+
|
|
679
|
+
# Create the DataObject
|
|
680
|
+
data_obj = self._translate_data_object(
|
|
681
|
+
do_id=do_id,
|
|
682
|
+
url=raw_fp,
|
|
683
|
+
do_type=do_type,
|
|
684
|
+
manifest_id=manifest_id, # link to the new Manifest if it exists
|
|
685
|
+
)
|
|
686
|
+
database.data_object_set.append(data_obj)
|
|
687
|
+
data_object_ids.append(do_id)
|
|
688
|
+
|
|
689
|
+
# c) Finally, create the data generation record for this run
|
|
690
|
+
database.data_generation_set.append(
|
|
691
|
+
self._translate_nucleotide_sequencing(
|
|
692
|
+
nucleotide_sequencing_id=data_generation_id,
|
|
693
|
+
processed_sample_id=has_input_value,
|
|
694
|
+
raw_data_file_data=data_object_ids,
|
|
695
|
+
nucleotide_sequencing_row=lib_prep_row,
|
|
647
696
|
)
|
|
697
|
+
)
|
|
648
698
|
|
|
649
699
|
return database
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from functools import lru_cache
|
|
3
|
-
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
4
2
|
|
|
3
|
+
from dagster import op
|
|
4
|
+
from functools import lru_cache
|
|
5
5
|
from pymongo.database import Database as MongoDatabase
|
|
6
|
+
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
7
|
|
|
7
8
|
from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
|
|
8
9
|
from nmdc_runtime.site.resources import mongo_resource
|
|
@@ -47,3 +48,7 @@ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
|
|
|
47
48
|
|
|
48
49
|
def get_basename(filename: str) -> str:
|
|
49
50
|
return os.path.basename(filename)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
|
|
54
|
+
return nmdc_study_id.replace(":", "_").replace("-", "_")
|
nmdc_runtime/util.py
CHANGED
|
@@ -24,6 +24,10 @@ from nmdc_schema.get_nmdc_view import ViewGetter
|
|
|
24
24
|
from pydantic import Field, BaseModel
|
|
25
25
|
from pymongo.database import Database as MongoDatabase
|
|
26
26
|
from pymongo.errors import OperationFailure
|
|
27
|
+
from refscan.lib.helpers import identify_references
|
|
28
|
+
from refscan.lib.Finder import Finder
|
|
29
|
+
from refscan.lib.ReferenceList import ReferenceList
|
|
30
|
+
from refscan.scanner import scan_outgoing_references
|
|
27
31
|
from toolz import merge, unique
|
|
28
32
|
|
|
29
33
|
from nmdc_runtime.api.core.util import sha256hash_from_file
|
|
@@ -120,6 +124,23 @@ def get_class_names_from_collection_spec(
|
|
|
120
124
|
return class_names
|
|
121
125
|
|
|
122
126
|
|
|
127
|
+
@lru_cache
|
|
128
|
+
def get_allowed_references() -> ReferenceList:
|
|
129
|
+
r"""
|
|
130
|
+
Returns a `ReferenceList` of all the inter-document references that
|
|
131
|
+
the NMDC Schema allows a schema-compliant MongoDB database to contain.
|
|
132
|
+
"""
|
|
133
|
+
|
|
134
|
+
# Identify the inter-document references that the schema allows a database to contain.
|
|
135
|
+
print("Identifying schema-allowed references.")
|
|
136
|
+
references = identify_references(
|
|
137
|
+
schema_view=nmdc_schema_view(),
|
|
138
|
+
collection_name_to_class_names=collection_name_to_class_names,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
return references
|
|
142
|
+
|
|
143
|
+
|
|
123
144
|
@lru_cache
|
|
124
145
|
def get_type_collections() -> dict:
|
|
125
146
|
"""Returns a dictionary mapping class names to Mongo collection names."""
|
|
@@ -353,6 +374,14 @@ def nmdc_database_collection_instance_class_names():
|
|
|
353
374
|
|
|
354
375
|
@lru_cache
|
|
355
376
|
def nmdc_database_collection_names():
|
|
377
|
+
r"""
|
|
378
|
+
TODO: Document this function.
|
|
379
|
+
|
|
380
|
+
TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
|
|
381
|
+
collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py`
|
|
382
|
+
instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
|
|
383
|
+
maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
|
|
384
|
+
"""
|
|
356
385
|
names = []
|
|
357
386
|
view = nmdc_schema_view()
|
|
358
387
|
all_classes = set(view.all_classes())
|
|
@@ -513,6 +542,13 @@ class OverlayDB(AbstractContextManager):
|
|
|
513
542
|
overlay collection, that id is marked as "seen" and will not also be returned when
|
|
514
543
|
subsequently scanning the (unmodified) base-database collection.
|
|
515
544
|
|
|
545
|
+
Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
|
|
546
|
+
database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
|
|
547
|
+
`overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
|
|
548
|
+
the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
|
|
549
|
+
"merging" just-in-time to process the method invocation. You can see an example of this in the implementation
|
|
550
|
+
of the `merge_find` method, which internally accesses both the real database and the overlaying database.
|
|
551
|
+
|
|
516
552
|
Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
|
|
517
553
|
documents from a base collection to the overlay, and then applying the updates to the overlay,
|
|
518
554
|
so that again, base collections are unmodified, and a "merge_find" call will produce a result
|
|
@@ -591,7 +627,33 @@ class OverlayDB(AbstractContextManager):
|
|
|
591
627
|
yield doc
|
|
592
628
|
|
|
593
629
|
|
|
594
|
-
def validate_json(in_docs: dict, mdb: MongoDatabase):
|
|
630
|
+
def validate_json(
|
|
631
|
+
in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
|
|
632
|
+
):
|
|
633
|
+
r"""
|
|
634
|
+
Checks whether the specified dictionary represents a valid instance of the `Database` class
|
|
635
|
+
defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
|
|
636
|
+
|
|
637
|
+
Example dictionary:
|
|
638
|
+
{
|
|
639
|
+
"biosample_set": [
|
|
640
|
+
{"id": "nmdc:bsm-00-000001", ...},
|
|
641
|
+
{"id": "nmdc:bsm-00-000002", ...}
|
|
642
|
+
],
|
|
643
|
+
"study_set": [
|
|
644
|
+
{"id": "nmdc:sty-00-000001", ...},
|
|
645
|
+
{"id": "nmdc:sty-00-000002", ...}
|
|
646
|
+
]
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
:param in_docs: The dictionary you want to validate
|
|
650
|
+
:param mdb: A reference to a MongoDB database
|
|
651
|
+
:param check_inter_document_references: Whether you want this function to check whether every document that
|
|
652
|
+
is referenced by any of the documents passed in would, indeed, exist
|
|
653
|
+
in the database, if the documents passed in were to be inserted into
|
|
654
|
+
the database. In other words, set this to `True` if you want this
|
|
655
|
+
function to perform referential integrity checks.
|
|
656
|
+
"""
|
|
595
657
|
validator = Draft7Validator(get_nmdc_jsonschema_dict())
|
|
596
658
|
docs = deepcopy(in_docs)
|
|
597
659
|
validation_errors = {}
|
|
@@ -599,6 +661,8 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
|
|
|
599
661
|
known_coll_names = set(nmdc_database_collection_names())
|
|
600
662
|
for coll_name, coll_docs in docs.items():
|
|
601
663
|
if coll_name not in known_coll_names:
|
|
664
|
+
# FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name).
|
|
665
|
+
# See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
|
|
602
666
|
if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
|
|
603
667
|
continue
|
|
604
668
|
else:
|
|
@@ -631,6 +695,84 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
|
|
|
631
695
|
except Exception as e:
|
|
632
696
|
return {"result": "errors", "detail": str(e)}
|
|
633
697
|
|
|
698
|
+
# Third pass (if enabled): Check inter-document references.
|
|
699
|
+
if check_inter_document_references is True:
|
|
700
|
+
# Prepare to use `refscan`.
|
|
701
|
+
#
|
|
702
|
+
# Note: We check the inter-document references in two stages, which are:
|
|
703
|
+
# 1. For each document in the JSON payload, check whether each document it references already exists
|
|
704
|
+
# (in the collections the schema says it can exist in) in the database. We use the
|
|
705
|
+
# `refscan` package to do this, which returns violation details we'll use in the second stage.
|
|
706
|
+
# 2. For each violation found in the first stage (i.e. each reference to a not-found document), we
|
|
707
|
+
# check whether that document exists (in the collections the schema says it can exist in) in the
|
|
708
|
+
# JSON payload. If it does, then we "waive" (i.e. discard) that violation.
|
|
709
|
+
# The violations that remain after those two stages are the ones we return to the caller.
|
|
710
|
+
#
|
|
711
|
+
# Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
|
|
712
|
+
# does not provide a means to perform arbitrary queries against its virtual "merged" database. It
|
|
713
|
+
# is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
|
|
714
|
+
# `refscan`'s `Finder` class accepts.
|
|
715
|
+
#
|
|
716
|
+
finder = Finder(database=mdb)
|
|
717
|
+
references = get_allowed_references()
|
|
718
|
+
reference_field_names_by_source_class_name = (
|
|
719
|
+
references.get_reference_field_names_by_source_class_name()
|
|
720
|
+
)
|
|
721
|
+
|
|
722
|
+
# Iterate over the collections in the JSON payload.
|
|
723
|
+
for source_collection_name, documents in in_docs.items():
|
|
724
|
+
for document in documents:
|
|
725
|
+
# Add an `_id` field to the document, since `refscan` requires the document to have one.
|
|
726
|
+
source_document = dict(document, _id=None)
|
|
727
|
+
violations = scan_outgoing_references(
|
|
728
|
+
document=source_document,
|
|
729
|
+
schema_view=nmdc_schema_view(),
|
|
730
|
+
reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
|
|
731
|
+
references=references,
|
|
732
|
+
finder=finder,
|
|
733
|
+
collection_names=nmdc_database_collection_names(),
|
|
734
|
+
source_collection_name=source_collection_name,
|
|
735
|
+
user_wants_to_locate_misplaced_documents=False,
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
# For each violation, check whether the misplaced document is in the JSON payload, itself.
|
|
739
|
+
for violation in violations:
|
|
740
|
+
can_waive_violation = False
|
|
741
|
+
# Determine which collections can contain the referenced document, based upon
|
|
742
|
+
# the schema class of which this source document is an instance.
|
|
743
|
+
target_collection_names = (
|
|
744
|
+
references.get_target_collection_names(
|
|
745
|
+
source_class_name=violation.source_class_name,
|
|
746
|
+
source_field_name=violation.source_field_name,
|
|
747
|
+
)
|
|
748
|
+
)
|
|
749
|
+
# Check whether the referenced document exists in any of those collections in the JSON payload.
|
|
750
|
+
for json_coll_name, json_coll_docs in in_docs.items():
|
|
751
|
+
if json_coll_name in target_collection_names:
|
|
752
|
+
for json_coll_doc in json_coll_docs:
|
|
753
|
+
if json_coll_doc["id"] == violation.target_id:
|
|
754
|
+
can_waive_violation = True
|
|
755
|
+
break # stop checking
|
|
756
|
+
if can_waive_violation:
|
|
757
|
+
break # stop checking
|
|
758
|
+
if not can_waive_violation:
|
|
759
|
+
violation_as_str = (
|
|
760
|
+
f"Document '{violation.source_document_id}' "
|
|
761
|
+
f"in collection '{violation.source_collection_name}' "
|
|
762
|
+
f"has a field '{violation.source_field_name}' that "
|
|
763
|
+
f"references a document having id "
|
|
764
|
+
f"'{violation.target_id}', but the latter document "
|
|
765
|
+
f"does not exist in any of the collections the "
|
|
766
|
+
f"NMDC Schema says it can exist in."
|
|
767
|
+
)
|
|
768
|
+
validation_errors[source_collection_name].append(
|
|
769
|
+
violation_as_str
|
|
770
|
+
)
|
|
771
|
+
|
|
772
|
+
# If any collection's error list is not empty, return an error response.
|
|
773
|
+
if any(len(v) > 0 for v in validation_errors.values()):
|
|
774
|
+
return {"result": "errors", "detail": validation_errors}
|
|
775
|
+
|
|
634
776
|
return {"result": "All Okay!"}
|
|
635
777
|
else:
|
|
636
778
|
return {"result": "errors", "detail": validation_errors}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: nmdc_runtime
|
|
3
|
-
Version: 2.2.1
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: A runtime system for NMDC data management and orchestration
|
|
5
5
|
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
6
|
Author: Donny Winston
|
|
@@ -11,6 +11,14 @@ Classifier: License :: OSI Approved :: Apache Software License
|
|
|
11
11
|
Requires-Python: >=3.10
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
|
+
Dynamic: author
|
|
15
|
+
Dynamic: author-email
|
|
16
|
+
Dynamic: classifier
|
|
17
|
+
Dynamic: description
|
|
18
|
+
Dynamic: description-content-type
|
|
19
|
+
Dynamic: home-page
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
14
22
|
|
|
15
23
|
A runtime system for NMDC data management and orchestration.
|
|
16
24
|
|
|
@@ -29,7 +37,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
|
|
|
29
37
|
* [nmdc-server](https://github.com/microbiomedata/nmdc-server)
|
|
30
38
|
houses code specific to the data portal -- its database, back-end API, and front-end application.
|
|
31
39
|
|
|
32
|
-
* [workflow_documentation](https://
|
|
40
|
+
* [workflow_documentation](https://docs.microbiomedata.org/workflows/)
|
|
33
41
|
references workflow code spread across several repositories, that take source data and produce computed data.
|
|
34
42
|
|
|
35
43
|
* This repo (nmdc-runtime)
|
|
@@ -2,7 +2,7 @@ nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
2
2
|
nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
|
|
3
3
|
nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
|
|
4
4
|
nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
nmdc_runtime/util.py,sha256=
|
|
5
|
+
nmdc_runtime/util.py,sha256=HzQsNMYG6Pb-IuBEE9HBzX_lNkII7jiNe65UFk34ZYA,31414
|
|
6
6
|
nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
|
|
@@ -28,7 +28,7 @@ nmdc_runtime/lib/nmdc_etl_class.py,sha256=tVh3rKVMkBHQE65_LhKeIjCsaCZQk_HJzbc9K4
|
|
|
28
28
|
nmdc_runtime/lib/transform_nmdc_data.py,sha256=hij4lR3IMQRJQdL-rsP_I-m_WyFPsBMchV2MNFUkh0M,39906
|
|
29
29
|
nmdc_runtime/minter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
30
|
nmdc_runtime/minter/bootstrap.py,sha256=5Ej6pJVBRryRIi0ZwEloY78Zky7iE2okF6tPwRI2axM,822
|
|
31
|
-
nmdc_runtime/minter/config.py,sha256=
|
|
31
|
+
nmdc_runtime/minter/config.py,sha256=gsXZropDeeTO5tmLAtRuoocwqL3HgfgqVAENyCbX-Gc,2739
|
|
32
32
|
nmdc_runtime/minter/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
33
|
nmdc_runtime/minter/adapters/repository.py,sha256=I-jmGP38-9kPhkogrwUht_Ir0CfHA9_5ZImw5I_wbcw,8323
|
|
34
34
|
nmdc_runtime/minter/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
|
|
|
36
36
|
nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
37
|
nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
|
|
38
38
|
nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
-
nmdc_runtime/site/graphs.py,sha256=
|
|
40
|
-
nmdc_runtime/site/ops.py,sha256=
|
|
41
|
-
nmdc_runtime/site/repository.py,sha256=
|
|
42
|
-
nmdc_runtime/site/resources.py,sha256=
|
|
43
|
-
nmdc_runtime/site/util.py,sha256
|
|
39
|
+
nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
|
|
40
|
+
nmdc_runtime/site/ops.py,sha256=p4F5SrDbFdKOrAHu1TUhWQA33QB7hdoQmCCuU-00Eqo,46445
|
|
41
|
+
nmdc_runtime/site/repository.py,sha256=pfx7WAVgdNaPhtfF2pak-tllqPMf4-yUeOXSpr4hu30,43861
|
|
42
|
+
nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
|
|
43
|
+
nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
|
|
44
44
|
nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
|
|
46
46
|
nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
|
|
@@ -51,19 +51,21 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
51
51
|
nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
|
|
52
52
|
nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
|
|
53
53
|
nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
nmdc_runtime/site/export/ncbi_xml.py,sha256=
|
|
55
|
-
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=
|
|
54
|
+
nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
|
|
55
|
+
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
|
|
56
56
|
nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
|
|
57
57
|
nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
|
|
59
|
+
nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
|
+
nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
|
|
59
61
|
nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
62
|
nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
|
|
61
63
|
nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
|
|
62
|
-
nmdc_runtime/site/translation/gold_translator.py,sha256=
|
|
64
|
+
nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
|
|
63
65
|
nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
|
|
64
66
|
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
|
|
65
67
|
nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
|
|
66
|
-
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=
|
|
68
|
+
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
|
|
67
69
|
nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
|
|
68
70
|
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
|
|
69
71
|
nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
|
|
@@ -73,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
|
|
|
73
75
|
nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
|
|
74
76
|
nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
|
|
75
77
|
nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
|
|
76
|
-
nmdc_runtime-2.
|
|
77
|
-
nmdc_runtime-2.
|
|
78
|
-
nmdc_runtime-2.
|
|
79
|
-
nmdc_runtime-2.
|
|
80
|
-
nmdc_runtime-2.
|
|
81
|
-
nmdc_runtime-2.
|
|
78
|
+
nmdc_runtime-2.4.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
|
|
79
|
+
nmdc_runtime-2.4.0.dist-info/METADATA,sha256=CeZZbucd3jrD0ZqGdreH2x7ALrM9pt4ksGV2olkkpPI,7401
|
|
80
|
+
nmdc_runtime-2.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
81
|
+
nmdc_runtime-2.4.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
|
|
82
|
+
nmdc_runtime-2.4.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
|
|
83
|
+
nmdc_runtime-2.4.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|