nmdc-runtime 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Consult the package registry's advisory page for this release for more details.

@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
71
71
  neon_amb_data_tables = (
72
72
  "mms_swMetagenomeSequencing",
73
73
  "mms_swMetagenomeDnaExtraction",
74
+ "mms_swRawDataFiles",
74
75
  "amc_fieldGenetic",
75
76
  "amc_fieldSuperParent",
76
77
  )
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
88
89
  if_exists="replace",
89
90
  index=False,
90
91
  )
92
+ surface_water_data["mms_swRawDataFiles"].to_sql(
93
+ "mms_swRawDataFiles", self.conn, if_exists="replace", index=False
94
+ )
91
95
  surface_water_data["amc_fieldGenetic"].to_sql(
92
96
  "amc_fieldGenetic", self.conn, if_exists="replace", index=False
93
97
  )
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
103
107
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
104
108
  )
105
109
 
106
- self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
107
- self.neon_raw_data_file_mappings_df.to_sql(
108
- "neonRawDataFile", self.conn, if_exists="replace", index=False
109
- )
110
+ self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
110
111
 
111
112
  self.site_code_mapping = site_code_mapping
112
113
 
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
371
372
  )
372
373
 
373
374
  def _translate_data_object(
374
- self, do_id: str, url: str, do_type: str, checksum: str
375
+ self, do_id: str, url: str, do_type: str, manifest_id: str
375
376
  ) -> nmdc.DataObject:
376
377
  """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
377
378
  object mainly contains information about the sequencing file that was generated as
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
395
396
  url=url,
396
397
  description=f"sequencing results for {basename}",
397
398
  type="nmdc:DataObject",
398
- md5_checksum=checksum,
399
399
  data_object_type=do_type,
400
+ in_manifest=manifest_id,
401
+ )
402
+
403
+ def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
404
+ return nmdc.Manifest(
405
+ id=manifest_id,
406
+ manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
407
+ type="nmdc:Manifest",
400
408
  )
401
409
 
402
410
  def get_database(self):
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
477
485
  """
478
486
  surface_water_samples = pd.read_sql_query(query, self.conn)
479
487
 
488
+ # --------------------------------------------------
489
+ # Create mappings for minted NMDC IDs
490
+ # --------------------------------------------------
480
491
  neon_biosample_ids = surface_water_samples["parentSampleID"]
481
492
  nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
482
493
  neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
511
522
  zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
512
523
  )
513
524
 
514
- neon_omprc_ids = surface_water_samples["parentSampleID"]
515
- nmdc_omprc_ids = self._id_minter(
516
- "nmdc:NucleotideSequencing", len(neon_omprc_ids)
517
- )
518
- neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
519
-
520
- neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
521
- neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
522
- nmdc_data_object_ids = self._id_minter(
523
- "nmdc:DataObject", len(neon_raw_file_paths)
524
- )
525
- neon_to_nmdc_data_object_ids = dict(
526
- zip(neon_raw_file_paths, nmdc_data_object_ids)
527
- )
528
-
525
+ # --------------------------------------------------
526
+ # STEP 1: Insert Biosamples
527
+ # --------------------------------------------------
529
528
  for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
530
529
  biosample_row = surface_water_samples[
531
530
  surface_water_samples["parentSampleID"] == neon_id
532
531
  ]
532
+ # database.biosample_set.append(
533
+ # self._translate_biosample(neon_id, nmdc_id, biosample_row)
534
+ # )
533
535
 
534
- database.biosample_set.append(
535
- self._translate_biosample(neon_id, nmdc_id, biosample_row)
536
- )
537
-
536
+ # --------------------------------------------------
537
+ # STEP 2: Insert Extraction Processes
538
+ # --------------------------------------------------
538
539
  for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
539
540
  extraction_row = surface_water_samples[
540
541
  surface_water_samples["parentSampleID"] == neon_id
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
557
558
  extraction_row, "genomicsSampleID"
558
559
  )
559
560
 
561
+ # Each Extraction process output => ProcessedSample
560
562
  database.processed_sample_set.append(
561
563
  self._translate_processed_sample(
562
564
  processed_sample_id,
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
564
566
  )
565
567
  )
566
568
 
567
- query = """
568
- SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
569
- FROM neonRawDataFile
570
- GROUP BY dnaSampleID
571
- """
572
- neon_raw_data_files = pd.read_sql_query(query, self.conn)
573
- neon_raw_data_files_dict = (
574
- neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
575
- .str.split("|")
576
- .to_dict()
577
- )
578
- filtered_neon_raw_data_files_dict = {
579
- key: value
580
- for key, value in neon_raw_data_files_dict.items()
581
- if len(value) <= 2
582
- }
583
-
569
+ # --------------------------------------------------
570
+ # STEP 3: Insert LibraryPreparation Processes
571
+ # --------------------------------------------------
584
572
  for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
585
573
  lib_prep_row = surface_water_samples[
586
574
  surface_water_samples["parentSampleID"] == neon_id
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
601
589
 
602
590
  dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
603
591
 
592
+ # Each LibraryPreparation process output => ProcessedSample
604
593
  database.processed_sample_set.append(
605
594
  self._translate_processed_sample(
606
595
  processed_sample_id,
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
608
597
  )
609
598
  )
610
599
 
611
- has_output = None
612
- has_output_do_ids = []
613
-
614
- if dna_sample_id in filtered_neon_raw_data_files_dict:
615
- has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
616
- for item in has_output:
617
- if item in neon_to_nmdc_data_object_ids:
618
- has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
619
-
620
- checksum = None
621
- do_type = None
622
-
623
- checksum = neon_raw_data_file_mappings_df[
624
- neon_raw_data_file_mappings_df["rawDataFilePath"] == item
625
- ]["checkSum"].values[0]
626
- if "_R1.fastq.gz" in item:
627
- do_type = "Metagenome Raw Read 1"
628
- elif "_R2.fastq.gz" in item:
629
- do_type = "Metagenome Raw Read 2"
630
-
631
- database.data_object_set.append(
632
- self._translate_data_object(
633
- neon_to_nmdc_data_object_ids.get(item),
634
- item,
635
- do_type,
636
- checksum,
637
- )
638
- )
639
-
640
- database.data_generation_set.append(
641
- self._translate_nucleotide_sequencing(
642
- neon_to_nmdc_omprc_ids.get(neon_id),
643
- processed_sample_id,
644
- has_output_do_ids,
645
- lib_prep_row,
646
- )
600
+ # --------------------------------------------------
601
+ # STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
602
+ # and insert DataObjects + DataGeneration processes
603
+ # --------------------------------------------------
604
+ raw_query = """
605
+ SELECT dnaSampleID, sequencerRunID, rawDataFilePath
606
+ FROM mms_swRawDataFiles
607
+ """
608
+ neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
609
+
610
+ for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
611
+ # 1) Pull out the row that corresponds to this parentSampleID
612
+ lib_prep_row = surface_water_samples[
613
+ surface_water_samples["parentSampleID"] == neon_id
614
+ ]
615
+
616
+ # 2) Grab the dnaSampleID from that row
617
+ dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
618
+ if not dna_sample_id:
619
+ # No dnaSampleID => skip
620
+ continue
621
+
622
+ # 3) Find all raw files for that dnaSampleID
623
+ dna_files = neon_raw_data_files_df[
624
+ neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
625
+ ]
626
+ if dna_files.empty:
627
+ # No raw files => skip
628
+ continue
629
+
630
+ # -----------------------------------------
631
+ # LOOKUP DICT: get "has_input" for this neon_id
632
+ # -----------------------------------------
633
+ has_input_value = self.samp_procsm_dict.get(neon_id)
634
+ # If some neon_id isn't in the dictionary, handle it as needed
635
+ if not has_input_value:
636
+ # Could skip, or raise an error, or set a default
637
+ continue
638
+
639
+ # -------------------------------------------
640
+ # 4) CREATE A MANIFEST IF MULTIPLE RAW FILES
641
+ # for this row's dnaSampleID
642
+ # -------------------------------------------
643
+ manifest_id = None
644
+ if len(dna_files) > 2:
645
+ # For each row that references a dnaSampleID with multiple raw files,
646
+ # mint exactly one new manifest record
647
+ manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
648
+ new_manifest = self._translate_manifest(manifest_id)
649
+ # Add to the database
650
+ database.manifest_set.append(new_manifest)
651
+
652
+ # -------------------------------------------
653
+ # 5) NOW GROUP FILES BY sequencerRunID
654
+ # => one data_generation record per run
655
+ # -------------------------------------------
656
+ lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
657
+ neon_id
658
+ )
659
+ if not lib_prep_processed_sample_id:
660
+ # If we don't have a ProcessedSample for some reason, skip
661
+ continue
662
+
663
+ for run_id, group_df in dna_files.groupby("sequencerRunID"):
664
+ # a) Mint new data_generation (NucleotideSequencing) ID for this run
665
+ data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
666
+
667
+ # b) Create DataObjects for each raw file in this run
668
+ data_object_ids = []
669
+ for raw_fp in group_df["rawDataFilePath"]:
670
+ do_id = self._id_minter("nmdc:DataObject", 1)[0]
671
+
672
+ # Distinguish read type
673
+ do_type = None
674
+ if "_R1.fastq.gz" in raw_fp:
675
+ do_type = "Metagenome Raw Read 1"
676
+ elif "_R2.fastq.gz" in raw_fp:
677
+ do_type = "Metagenome Raw Read 2"
678
+
679
+ # Create the DataObject
680
+ data_obj = self._translate_data_object(
681
+ do_id=do_id,
682
+ url=raw_fp,
683
+ do_type=do_type,
684
+ manifest_id=manifest_id, # link to the new Manifest if it exists
685
+ )
686
+ database.data_object_set.append(data_obj)
687
+ data_object_ids.append(do_id)
688
+
689
+ # c) Finally, create the data generation record for this run
690
+ database.data_generation_set.append(
691
+ self._translate_nucleotide_sequencing(
692
+ nucleotide_sequencing_id=data_generation_id,
693
+ processed_sample_id=has_input_value,
694
+ raw_data_file_data=data_object_ids,
695
+ nucleotide_sequencing_row=lib_prep_row,
647
696
  )
697
+ )
648
698
 
649
699
  return database
nmdc_runtime/site/util.py CHANGED
@@ -1,8 +1,9 @@
1
1
  import os
2
- from functools import lru_cache
3
- from subprocess import Popen, PIPE, STDOUT, CalledProcessError
4
2
 
3
+ from dagster import op
4
+ from functools import lru_cache
5
5
  from pymongo.database import Database as MongoDatabase
6
+ from subprocess import Popen, PIPE, STDOUT, CalledProcessError
6
7
 
7
8
  from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
8
9
  from nmdc_runtime.site.resources import mongo_resource
@@ -47,3 +48,7 @@ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
47
48
 
48
49
  def get_basename(filename: str) -> str:
49
50
  return os.path.basename(filename)
51
+
52
+
53
+ def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
54
+ return nmdc_study_id.replace(":", "_").replace("-", "_")
nmdc_runtime/util.py CHANGED
@@ -24,6 +24,10 @@ from nmdc_schema.get_nmdc_view import ViewGetter
24
24
  from pydantic import Field, BaseModel
25
25
  from pymongo.database import Database as MongoDatabase
26
26
  from pymongo.errors import OperationFailure
27
+ from refscan.lib.helpers import identify_references
28
+ from refscan.lib.Finder import Finder
29
+ from refscan.lib.ReferenceList import ReferenceList
30
+ from refscan.scanner import scan_outgoing_references
27
31
  from toolz import merge, unique
28
32
 
29
33
  from nmdc_runtime.api.core.util import sha256hash_from_file
@@ -120,6 +124,23 @@ def get_class_names_from_collection_spec(
120
124
  return class_names
121
125
 
122
126
 
127
+ @lru_cache
128
+ def get_allowed_references() -> ReferenceList:
129
+ r"""
130
+ Returns a `ReferenceList` of all the inter-document references that
131
+ the NMDC Schema allows a schema-compliant MongoDB database to contain.
132
+ """
133
+
134
+ # Identify the inter-document references that the schema allows a database to contain.
135
+ print("Identifying schema-allowed references.")
136
+ references = identify_references(
137
+ schema_view=nmdc_schema_view(),
138
+ collection_name_to_class_names=collection_name_to_class_names,
139
+ )
140
+
141
+ return references
142
+
143
+
123
144
  @lru_cache
124
145
  def get_type_collections() -> dict:
125
146
  """Returns a dictionary mapping class names to Mongo collection names."""
@@ -353,6 +374,14 @@ def nmdc_database_collection_instance_class_names():
353
374
 
354
375
  @lru_cache
355
376
  def nmdc_database_collection_names():
377
+ r"""
378
+ TODO: Document this function.
379
+
380
+ TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
381
+ collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py`
382
+ instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
383
+ maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
384
+ """
356
385
  names = []
357
386
  view = nmdc_schema_view()
358
387
  all_classes = set(view.all_classes())
@@ -513,6 +542,13 @@ class OverlayDB(AbstractContextManager):
513
542
  overlay collection, that id is marked as "seen" and will not also be returned when
514
543
  subsequently scanning the (unmodified) base-database collection.
515
544
 
545
+ Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
546
+ database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
547
+ `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
548
+ the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
549
+ "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
550
+ of the `merge_find` method, which internally accesses both the real database and the overlaying database.
551
+
516
552
  Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
517
553
  documents from a base collection to the overlay, and then applying the updates to the overlay,
518
554
  so that again, base collections are unmodified, and a "merge_find" call will produce a result
@@ -591,7 +627,33 @@ class OverlayDB(AbstractContextManager):
591
627
  yield doc
592
628
 
593
629
 
594
- def validate_json(in_docs: dict, mdb: MongoDatabase):
630
+ def validate_json(
631
+ in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
632
+ ):
633
+ r"""
634
+ Checks whether the specified dictionary represents a valid instance of the `Database` class
635
+ defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
636
+
637
+ Example dictionary:
638
+ {
639
+ "biosample_set": [
640
+ {"id": "nmdc:bsm-00-000001", ...},
641
+ {"id": "nmdc:bsm-00-000002", ...}
642
+ ],
643
+ "study_set": [
644
+ {"id": "nmdc:sty-00-000001", ...},
645
+ {"id": "nmdc:sty-00-000002", ...}
646
+ ]
647
+ }
648
+
649
+ :param in_docs: The dictionary you want to validate
650
+ :param mdb: A reference to a MongoDB database
651
+ :param check_inter_document_references: Whether you want this function to check whether every document that
652
+ is referenced by any of the documents passed in would, indeed, exist
653
+ in the database, if the documents passed in were to be inserted into
654
+ the database. In other words, set this to `True` if you want this
655
+ function to perform referential integrity checks.
656
+ """
595
657
  validator = Draft7Validator(get_nmdc_jsonschema_dict())
596
658
  docs = deepcopy(in_docs)
597
659
  validation_errors = {}
@@ -599,6 +661,8 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
599
661
  known_coll_names = set(nmdc_database_collection_names())
600
662
  for coll_name, coll_docs in docs.items():
601
663
  if coll_name not in known_coll_names:
664
+ # FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name).
665
+ # See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
602
666
  if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
603
667
  continue
604
668
  else:
@@ -631,6 +695,84 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
631
695
  except Exception as e:
632
696
  return {"result": "errors", "detail": str(e)}
633
697
 
698
+ # Third pass (if enabled): Check inter-document references.
699
+ if check_inter_document_references is True:
700
+ # Prepare to use `refscan`.
701
+ #
702
+ # Note: We check the inter-document references in two stages, which are:
703
+ # 1. For each document in the JSON payload, check whether each document it references already exists
704
+ # (in the collections the schema says it can exist in) in the database. We use the
705
+ # `refscan` package to do this, which returns violation details we'll use in the second stage.
706
+ # 2. For each violation found in the first stage (i.e. each reference to a not-found document), we
707
+ # check whether that document exists (in the collections the schema says it can exist in) in the
708
+ # JSON payload. If it does, then we "waive" (i.e. discard) that violation.
709
+ # The violations that remain after those two stages are the ones we return to the caller.
710
+ #
711
+ # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
712
+ # does not provide a means to perform arbitrary queries against its virtual "merged" database. It
713
+ # is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
714
+ # `refscan`'s `Finder` class accepts.
715
+ #
716
+ finder = Finder(database=mdb)
717
+ references = get_allowed_references()
718
+ reference_field_names_by_source_class_name = (
719
+ references.get_reference_field_names_by_source_class_name()
720
+ )
721
+
722
+ # Iterate over the collections in the JSON payload.
723
+ for source_collection_name, documents in in_docs.items():
724
+ for document in documents:
725
+ # Add an `_id` field to the document, since `refscan` requires the document to have one.
726
+ source_document = dict(document, _id=None)
727
+ violations = scan_outgoing_references(
728
+ document=source_document,
729
+ schema_view=nmdc_schema_view(),
730
+ reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
731
+ references=references,
732
+ finder=finder,
733
+ collection_names=nmdc_database_collection_names(),
734
+ source_collection_name=source_collection_name,
735
+ user_wants_to_locate_misplaced_documents=False,
736
+ )
737
+
738
+ # For each violation, check whether the misplaced document is in the JSON payload, itself.
739
+ for violation in violations:
740
+ can_waive_violation = False
741
+ # Determine which collections can contain the referenced document, based upon
742
+ # the schema class of which this source document is an instance.
743
+ target_collection_names = (
744
+ references.get_target_collection_names(
745
+ source_class_name=violation.source_class_name,
746
+ source_field_name=violation.source_field_name,
747
+ )
748
+ )
749
+ # Check whether the referenced document exists in any of those collections in the JSON payload.
750
+ for json_coll_name, json_coll_docs in in_docs.items():
751
+ if json_coll_name in target_collection_names:
752
+ for json_coll_doc in json_coll_docs:
753
+ if json_coll_doc["id"] == violation.target_id:
754
+ can_waive_violation = True
755
+ break # stop checking
756
+ if can_waive_violation:
757
+ break # stop checking
758
+ if not can_waive_violation:
759
+ violation_as_str = (
760
+ f"Document '{violation.source_document_id}' "
761
+ f"in collection '{violation.source_collection_name}' "
762
+ f"has a field '{violation.source_field_name}' that "
763
+ f"references a document having id "
764
+ f"'{violation.target_id}', but the latter document "
765
+ f"does not exist in any of the collections the "
766
+ f"NMDC Schema says it can exist in."
767
+ )
768
+ validation_errors[source_collection_name].append(
769
+ violation_as_str
770
+ )
771
+
772
+ # If any collection's error list is not empty, return an error response.
773
+ if any(len(v) > 0 for v in validation_errors.values()):
774
+ return {"result": "errors", "detail": validation_errors}
775
+
634
776
  return {"result": "All Okay!"}
635
777
  else:
636
778
  return {"result": "errors", "detail": validation_errors}
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: nmdc_runtime
3
- Version: 2.2.1
3
+ Version: 2.4.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -11,6 +11,14 @@ Classifier: License :: OSI Approved :: Apache Software License
11
11
  Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Dynamic: author
15
+ Dynamic: author-email
16
+ Dynamic: classifier
17
+ Dynamic: description
18
+ Dynamic: description-content-type
19
+ Dynamic: home-page
20
+ Dynamic: requires-python
21
+ Dynamic: summary
14
22
 
15
23
  A runtime system for NMDC data management and orchestration.
16
24
 
@@ -29,7 +37,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
29
37
  * [nmdc-server](https://github.com/microbiomedata/nmdc-server)
30
38
  houses code specific to the data portal -- its database, back-end API, and front-end application.
31
39
 
32
- * [workflow_documentation](https://nmdc-workflow-documentation.readthedocs.io/en/latest/index.html)
40
+ * [workflow_documentation](https://docs.microbiomedata.org/workflows/)
33
41
  references workflow code spread across several repositories, that take source data and produce computed data.
34
42
 
35
43
  * This repo (nmdc-runtime)
@@ -2,7 +2,7 @@ nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
3
3
  nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
4
4
  nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- nmdc_runtime/util.py,sha256=aMzS8eATEjpXOiuyAFYthx92fb_cgIzWWd5ZQU6ZlAY,22931
5
+ nmdc_runtime/util.py,sha256=HzQsNMYG6Pb-IuBEE9HBzX_lNkII7jiNe65UFk34ZYA,31414
6
6
  nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -28,7 +28,7 @@ nmdc_runtime/lib/nmdc_etl_class.py,sha256=tVh3rKVMkBHQE65_LhKeIjCsaCZQk_HJzbc9K4
28
28
  nmdc_runtime/lib/transform_nmdc_data.py,sha256=hij4lR3IMQRJQdL-rsP_I-m_WyFPsBMchV2MNFUkh0M,39906
29
29
  nmdc_runtime/minter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
30
  nmdc_runtime/minter/bootstrap.py,sha256=5Ej6pJVBRryRIi0ZwEloY78Zky7iE2okF6tPwRI2axM,822
31
- nmdc_runtime/minter/config.py,sha256=WrxX9WmyN7Ft4INRAQbd31jmlm5qwaDDaNS9AktieYA,4112
31
+ nmdc_runtime/minter/config.py,sha256=gsXZropDeeTO5tmLAtRuoocwqL3HgfgqVAENyCbX-Gc,2739
32
32
  nmdc_runtime/minter/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  nmdc_runtime/minter/adapters/repository.py,sha256=I-jmGP38-9kPhkogrwUht_Ir0CfHA9_5ZImw5I_wbcw,8323
34
34
  nmdc_runtime/minter/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
36
36
  nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
37
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
38
38
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- nmdc_runtime/site/graphs.py,sha256=mu4bE8799TItWXaPBfOeFB2XMyYwPZcj-VJQmadN2MA,14171
40
- nmdc_runtime/site/ops.py,sha256=T9_WrwDaySGnu6olwOHQizHQfeofMOaqMcq_vYEIzO0,43140
41
- nmdc_runtime/site/repository.py,sha256=JtHlp6l3UVo0QhV670TGns9bMfht7NOQrNWQtvsYr2g,39183
42
- nmdc_runtime/site/resources.py,sha256=6bmvplgql3KdEXKI49BibSk0Sug96SFJi8eOs2zeKK0,18252
43
- nmdc_runtime/site/util.py,sha256=zAY0oIY7GRf63ecqWelmS27N7PVrAXVwEhtnpescBSw,1415
39
+ nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
40
+ nmdc_runtime/site/ops.py,sha256=p4F5SrDbFdKOrAHu1TUhWQA33QB7hdoQmCCuU-00Eqo,46445
41
+ nmdc_runtime/site/repository.py,sha256=pfx7WAVgdNaPhtfF2pak-tllqPMf4-yUeOXSpr4hu30,43861
42
+ nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
43
+ nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
44
44
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
46
46
  nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -51,19 +51,21 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
51
51
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
52
52
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
53
53
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Vb4rNP3uhnGlHqrwUGgA2DzpOotCf3S8G4sIJml7gl4,25287
55
- nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZynO1yPSLX_aHs,8390
54
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
55
+ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
56
56
  nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
57
57
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
59
+ nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
+ nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
59
61
  nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
62
  nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
61
63
  nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
62
- nmdc_runtime/site/translation/gold_translator.py,sha256=RfAB68dJ9hDep20wETmCNBc0gugZbEKqVimT8h2t0uM,31470
64
+ nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
63
65
  nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
64
66
  nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
65
67
  nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
66
- nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=MQgjIfWPgoRe-bhzyfqHSe2mZwFsjcwjdT8tNqpIhlc,27729
68
+ nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
67
69
  nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
68
70
  nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
69
71
  nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
@@ -73,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
73
75
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
74
76
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
75
77
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
76
- nmdc_runtime-2.2.1.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
77
- nmdc_runtime-2.2.1.dist-info/METADATA,sha256=yIkwZWVw8J1xDqhwVQy2Rxfz7cIc42yT4JkRBdsRBr4,7256
78
- nmdc_runtime-2.2.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
79
- nmdc_runtime-2.2.1.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
80
- nmdc_runtime-2.2.1.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
81
- nmdc_runtime-2.2.1.dist-info/RECORD,,
78
+ nmdc_runtime-2.4.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
79
+ nmdc_runtime-2.4.0.dist-info/METADATA,sha256=CeZZbucd3jrD0ZqGdreH2x7ALrM9pt4ksGV2olkkpPI,7401
80
+ nmdc_runtime-2.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
81
+ nmdc_runtime-2.4.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
82
+ nmdc_runtime-2.4.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
83
+ nmdc_runtime-2.4.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5