nmdc-runtime 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,12 @@
  import logging
  import re
+ from collections import namedtuple
  from datetime import datetime
  from enum import Enum
  from functools import lru_cache
  from importlib import resources
- from typing import Any, List, Optional, Union
+ from typing import Any, List, Optional, Union, Tuple
+ from urllib.parse import urlparse
 
  from linkml_runtime import SchemaView
  from linkml_runtime.linkml_model import SlotDefinition
@@ -13,8 +15,38 @@ from toolz import concat, dissoc, get_in, groupby, valmap
 
  from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
 
+ 
+ DataUrlSet = namedtuple("DataUrlSet", ["url", "md5_checksum"])
+ 
+ READ_1 = DataUrlSet("read_1_url", "read_1_md5_checksum")
+ READ_2 = DataUrlSet("read_2_url", "read_2_md5_checksum")
+ INTERLEAVED = DataUrlSet("interleaved_url", "interleaved_md5_checksum")
+ 
+ DATA_URL_SETS: list[DataUrlSet] = [READ_1, READ_2, INTERLEAVED]
+ 
  BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
 
+ TAB_NAME_KEY = "__tab_name"
+ METAGENOME = nmdc.NucleotideSequencingEnum(nmdc.NucleotideSequencingEnum.metagenome)
+ METATRANSCRIPTOME = nmdc.NucleotideSequencingEnum(
+     nmdc.NucleotideSequencingEnum.metatranscriptome
+ )
+ TAB_NAME_TO_ANALYTE_CATEGORY: dict[str, nmdc.NucleotideSequencingEnum] = {
+     "metagenome_sequencing_non_interleaved_data": METAGENOME,
+     "metagenome_sequencing_interleaved_data": METAGENOME,
+     "metatranscriptome_sequencing_non_interleaved_data": METATRANSCRIPTOME,
+     "metatranscriptome_sequencing_interleaved_data": METATRANSCRIPTOME,
+ }
+ 
+ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str] = {
+     (READ_1, str(METAGENOME)): "Metagenome Raw Read 1",
+     (READ_2, str(METAGENOME)): "Metagenome Raw Read 2",
+     (INTERLEAVED, str(METAGENOME)): "Metagenome Raw Reads",
+     (READ_1, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 1",
+     (READ_2, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 2",
+     (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
+ }
+ 
 
  class EnvironmentPackage(Enum):
      r"""
@@ -75,6 +107,18 @@ def group_dicts_by_key(key: str, seq: Optional[list[dict]]) -> Optional[dict]:
      return grouped
 
 
+ def split_strip(string: str | None, sep: str) -> list[str] | None:
+     """Split a string by a separator and strip whitespace from each part.
+ 
+     :param string: string to split
+     :param sep: separator to split by
+     :return: list of stripped strings
+     """
+     if string is None:
+         return None
+     return [s.strip() for s in string.split(sep)]
+ 
+ 
  class SubmissionPortalTranslator(Translator):
      """A Translator subclass for handling submission portal entries
 
@@ -86,17 +130,15 @@ class SubmissionPortalTranslator(Translator):
 
      def __init__(
          self,
-         metadata_submission: JSON_OBJECT = {},
+         metadata_submission: Optional[JSON_OBJECT] = None,
          *args,
          nucleotide_sequencing_mapping: Optional[list] = None,
          data_object_mapping: Optional[list] = None,
+         illumina_instrument_mapping: Optional[dict[str, str]] = None,
          # Additional study-level metadata not captured by the submission portal currently
          # See: https://github.com/microbiomedata/submission-schema/issues/162
-         study_doi_category: Optional[str] = None,
-         study_doi_provider: Optional[str] = None,
          study_category: Optional[str] = None,
          study_pi_image_url: Optional[str] = None,
-         study_funding_sources: Optional[list[str]] = None,
          # Additional biosample-level metadata with optional column mapping information not captured
          # by the submission portal currently.
          # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -106,23 +148,17 @@ class SubmissionPortalTranslator(Translator):
      ) -> None:
          super().__init__(*args, **kwargs)
 
-         self.metadata_submission = metadata_submission
+         self.metadata_submission: JSON_OBJECT = metadata_submission or {}
          self.nucleotide_sequencing_mapping = nucleotide_sequencing_mapping
          self.data_object_mapping = data_object_mapping
- 
-         self.study_doi_category = (
-             nmdc.DoiCategoryEnum(study_doi_category)
-             if study_doi_category
-             else nmdc.DoiCategoryEnum.dataset_doi
-         )
-         self.study_doi_provider = (
-             nmdc.DoiProviderEnum(study_doi_provider) if study_doi_provider else None
+         self.illumina_instrument_mapping: dict[str, str] = (
+             illumina_instrument_mapping or {}
          )
+ 
          self.study_category = (
              nmdc.StudyCategoryEnum(study_category) if study_category else None
          )
          self.study_pi_image_url = study_pi_image_url
-         self.study_funding_sources = study_funding_sources
 
          self.biosample_extras = group_dicts_by_key(
              BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
@@ -153,28 +189,6 @@ class SubmissionPortalTranslator(Translator):
              type=nmdc.PersonValue.class_class_curie,
          )
 
-     def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[nmdc.Doi], None]:
-         """Get DOI information from the context form data
- 
-         :param metadata_submission: submission portal entry
-         :return: list of strings or None
-         """
-         dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
-         if not dataset_doi:
-             return None
- 
-         if not dataset_doi.startswith("doi:"):
-             dataset_doi = f"doi:{dataset_doi}"
- 
-         return [
-             nmdc.Doi(
-                 doi_value=dataset_doi,
-                 doi_provider=self.study_doi_provider,
-                 doi_category=self.study_doi_category,
-                 type="nmdc:Doi",
-             )
-         ]
- 
      def _get_has_credit_associations(
          self, metadata_submission: JSON_OBJECT
      ) -> Union[List[nmdc.CreditAssociation], None]:
@@ -203,21 +217,34 @@ class SubmissionPortalTranslator(Translator):
      def _get_gold_study_identifiers(
          self, metadata_submission: JSON_OBJECT
      ) -> Union[List[str], None]:
-         """Construct a GOLD CURIE from the multiomics from data
+         """Construct a GOLD CURIE from the study form data
 
          :param metadata_submission: submission portal entry
          :return: GOLD CURIE
          """
-         gold_study_id = get_in(["multiOmicsForm", "GOLDStudyId"], metadata_submission)
+         gold_study_id = get_in(["studyForm", "GOLDStudyId"], metadata_submission)
          if not gold_study_id:
              return None
 
          return [self._ensure_curie(gold_study_id, default_prefix="gold")]
 
+     def _get_ncbi_bioproject_identifiers(
+         self, metadata_submission: JSON_OBJECT
+     ) -> Union[List[str], None]:
+         """Construct a NCBI Bioproject CURIE from the study form data"""
+ 
+         ncbi_bioproject_id = get_in(
+             ["studyForm", "NCBIBioProjectId"], metadata_submission
+         )
+         if not ncbi_bioproject_id:
+             return None
+ 
+         return [self._ensure_curie(ncbi_bioproject_id, default_prefix="bioproject")]
+ 
      def _get_jgi_study_identifiers(
          self, metadata_submission: JSON_OBJECT
      ) -> Union[List[str], None]:
-         """Construct a JGI proposal CURIE from the multiomics from data
+         """Construct a JGI proposal CURIE from the multiomics form data
 
          :param metadata_submission: submission portal entry
          :return: JGI proposal CURIE
@@ -228,6 +255,20 @@ class SubmissionPortalTranslator(Translator):
 
          return [self._ensure_curie(jgi_study_id, default_prefix="jgi.proposal")]
 
+     def _get_emsl_project_identifiers(
+         self, metadata_submission: JSON_OBJECT
+     ) -> Union[List[str], None]:
+         """Construct an EMSL project CURIE from the multiomics form data
+ 
+         :param metadata_submission: submission portal entry
+         :return: EMSL project CURIE
+         """
+         emsl_project_id = get_in(["multiOmicsForm", "studyNumber"], metadata_submission)
+         if not emsl_project_id:
+             return None
+ 
+         return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
+ 
      def _get_quantity_value(
          self, raw_value: Optional[str], unit: Optional[str] = None
      ) -> Union[nmdc.QuantityValue, None]:
@@ -434,6 +475,75 @@ class SubmissionPortalTranslator(Translator):
 
          return value
 
+     def _get_data_objects_from_fields(
+         self,
+         sample_data: JSON_OBJECT,
+         *,
+         url_field_name: str,
+         md5_checksum_field_name: str,
+         nucleotide_sequencing_id: str,
+         data_object_type: nmdc.FileTypeEnum,
+     ) -> Tuple[List[nmdc.DataObject], nmdc.Manifest | None]:
+         """Get DataObject instances based on the URLs and MD5 checksums in the given fields.
+ 
+         If the field provides multiple URLs, multiple DataObject instances will be created and a
+         Manifest will be created and provided in the second return value.
+ 
+         :param sample_data: sample data
+         :param url_field_name: field name for the URL
+         :param md5_checksum_field_name: field name for the MD5 checksum
+         :param nucleotide_sequencing_id: ID for the nmdc:NucleotideSequencing object that generated the data object(s)
+         :param data_object_type: FileTypeEnum representing the type of the data object
+         :return: list of nmdc.DataObject instances and an optional nmdc.Manifest
+         """
+         data_objects: List[nmdc.DataObject] = []
+         urls = split_strip(sample_data.get(url_field_name), ";")
+         if not urls:
+             return data_objects, None
+ 
+         md5_checksums = split_strip(sample_data.get(md5_checksum_field_name), ";")
+         if md5_checksums and len(urls) != len(md5_checksums):
+             raise ValueError(
+                 f"{url_field_name} and {md5_checksum_field_name} must have the same number of values"
+             )
+ 
+         data_object_ids = self._id_minter("nmdc:DataObject", len(urls))
+         manifest: nmdc.Manifest | None = None
+         if len(urls) > 1:
+             manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
+             manifest = nmdc.Manifest(
+                 id=manifest_id,
+                 manifest_category=nmdc.ManifestCategoryEnum(
+                     nmdc.ManifestCategoryEnum.poolable_replicates
+                 ),
+                 type="nmdc:Manifest",
+             )
+ 
+         for i, url in enumerate(urls):
+             data_object_id = data_object_ids[i]
+             parsed_url = urlparse(url)
+             possible_filename = parsed_url.path.rsplit("/", 1)[-1]
+             data_object_slots = {
+                 "id": data_object_id,
+                 "name": possible_filename,
+                 "description": f"{data_object_type} for {nucleotide_sequencing_id}",
+                 "type": "nmdc:DataObject",
+                 "url": url,
+                 "md5_checksum": md5_checksums[i] if md5_checksums else None,
+                 "in_manifest": [manifest.id] if manifest else None,
+                 "data_category": nmdc.DataCategoryEnum(
+                     nmdc.DataCategoryEnum.instrument_data
+                 ),
+                 "data_object_type": data_object_type,
+                 "was_generated_by": nucleotide_sequencing_id,
+             }
+             data_object_slots.update(
+                 self._transform_dict_for_class(sample_data, "DataObject")
+             )
+             data_objects.append(nmdc.DataObject(**data_object_slots))
+ 
+         return data_objects, manifest
+ 
      def _translate_study(
          self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
      ) -> nmdc.Study:
@@ -448,18 +558,17 @@ class SubmissionPortalTranslator(Translator):
          """
          return nmdc.Study(
              alternative_names=self._get_from(
-                 metadata_submission, ["multiOmicsForm", "alternativeNames"]
+                 metadata_submission, ["studyForm", "alternativeNames"]
              ),
-             associated_dois=self._get_doi(metadata_submission),
              description=self._get_from(
                  metadata_submission, ["studyForm", "description"]
              ),
              funding_sources=self._get_from(
                  metadata_submission, ["studyForm", "fundingSources"]
              ),
-             # emsl_proposal_identifier=self._get_from(
-             #     metadata_submission, ["multiOmicsForm", "studyNumber"]
-             # ),
+             emsl_project_identifiers=self._get_emsl_project_identifiers(
+                 metadata_submission
+             ),
              gold_study_identifiers=self._get_gold_study_identifiers(
                  metadata_submission
              ),
@@ -467,8 +576,8 @@ class SubmissionPortalTranslator(Translator):
                  metadata_submission
              ),
              id=nmdc_study_id,
-             insdc_bioproject_identifiers=self._get_from(
-                 metadata_submission, ["multiOmicsForm", "NCBIBioProjectId"]
+             insdc_bioproject_identifiers=self._get_ncbi_bioproject_identifiers(
+                 metadata_submission
              ),
              jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
                  metadata_submission
@@ -555,7 +664,7 @@ class SubmissionPortalTranslator(Translator):
              if slot_definition.multivalued:
                  value_list = value
                  if isinstance(value, str):
-                     value_list = [v.strip() for v in value.split("|")]
+                     value_list = split_strip(value, "|")
                  transformed_value = [
                      self._transform_value_for_slot(item, slot_definition, unit)
                      for item in value_list
@@ -629,16 +738,18 @@ class SubmissionPortalTranslator(Translator):
          :return: nmdc:Database object
          """
          database = nmdc.Database()
- 
-         nmdc_study_id = self._id_minter("nmdc:Study")[0]
- 
          metadata_submission_data = self.metadata_submission.get(
              "metadata_submission", {}
          )
+ 
+         # Generate one Study instance based on the metadata submission
+         nmdc_study_id = self._id_minter("nmdc:Study")[0]
          database.study_set = [
              self._translate_study(metadata_submission_data, nmdc_study_id)
          ]
 
+         # Automatically populate the `env_package` field in the sample data based on which
+         # environmental data tab the sample data came from.
          sample_data = metadata_submission_data.get("sampleData", {})
          for key in sample_data.keys():
              env = key.removesuffix("_data").upper()
@@ -647,8 +758,16 @@ class SubmissionPortalTranslator(Translator):
                  for sample in sample_data[key]:
                      sample["env_package"] = package_name
              except KeyError:
+                 # This is expected when processing rows from tabs like the JGI/EMSL tabs or external
+                 # sequencing data tabs.
                  pass
 
+         # Before regrouping the data by sample name, record which tab each object came from
+         for tab_name in sample_data.keys():
+             for tab in sample_data[tab_name]:
+                 tab[TAB_NAME_KEY] = tab_name
+ 
+         # Reorganize the sample data by sample name and generate a unique NMDC ID for each
          sample_data_by_id = groupby(
              BIOSAMPLE_UNIQUE_KEY_SLOT,
              concat(sample_data.values()),
@@ -658,6 +777,7 @@ class SubmissionPortalTranslator(Translator):
              zip(sample_data_by_id.keys(), nmdc_biosample_ids)
          )
 
+         # Translate the sample data into nmdc:Biosample objects
          database.biosample_set = [
              self._translate_biosample(
                  sample_data,
@@ -668,6 +788,104 @@ class SubmissionPortalTranslator(Translator):
              if sample_data
          ]
 
+         # This section handles the translation of information in the external sequencing tabs into
+         # various NMDC objects.
+         database.data_generation_set = []
+         database.data_object_set = []
+         database.instrument_set = []
+         database.manifest_set = []
+         today = datetime.now().strftime("%Y-%m-%d")
+         for sample_data_id, sample_data in sample_data_by_id.items():
+             for tab in sample_data:
+                 tab_name = tab.get(TAB_NAME_KEY)
+                 analyte_category = TAB_NAME_TO_ANALYTE_CATEGORY.get(tab_name)
+                 if not analyte_category:
+                     # If the tab name cannot be mapped to an analyte category, that means we're
+                     # not in an external sequencing data tab (e.g. this is an environmental data
+                     # tab or a JGI/EMSL tab). Skip this tab.
+                     continue
+ 
+                 # Start by generating one NucleotideSequencing instance with a has_input
+                 # relationship to the current Biosample instance.
+                 nucleotide_sequencing_id = self._id_minter(
+                     "nmdc:NucleotideSequencing", 1
+                 )[0]
+                 nucleotide_sequencing_slots = {
+                     "id": nucleotide_sequencing_id,
+                     "has_input": sample_data_to_nmdc_biosample_ids[sample_data_id],
+                     "has_output": [],
+                     "associated_studies": [nmdc_study_id],
+                     "add_date": today,
+                     "mod_date": today,
+                     "analyte_category": analyte_category,
+                     "type": "nmdc:NucleotideSequencing",
+                 }
+                 # If the protocol_link column was filled in, expand it into an nmdc:Protocol object
+                 if "protocol_link" in tab:
+                     protocol_link = tab.pop("protocol_link")
+                     nucleotide_sequencing_slots["protocol_link"] = nmdc.Protocol(
+                         url=protocol_link,
+                         type="nmdc:Protocol",
+                     )
+                 # If the model column was filled in, expand it into an nmdc:Instrument object. This is
+                 # done by first checking the provided instrument mapping to see if the model is
+                 # already present. If it is not, a new instrument object is created and added to the
+                 # instrument_set. Currently, we only accept sequencing data in the submission portal
+                 # that was generated by Illumina instruments, so the vendor is hardcoded here.
+                 if "model" in tab:
+                     model = tab.pop("model")
+                     if model not in self.illumina_instrument_mapping:
+                         # If the model is not already in the mapping, create a new record for it
+                         nmdc_instrument_id = self._id_minter("nmdc:Instrument", 1)[0]
+                         database.instrument_set.append(
+                             nmdc.Instrument(
+                                 id=nmdc_instrument_id,
+                                 vendor=nmdc.InstrumentVendorEnum(
+                                     nmdc.InstrumentVendorEnum.illumina
+                                 ),
+                                 model=nmdc.InstrumentModelEnum(model),
+                                 type="nmdc:Instrument",
+                             )
+                         )
+                         self.illumina_instrument_mapping[model] = nmdc_instrument_id
+                     nucleotide_sequencing_slots["instrument_used"] = (
+                         self.illumina_instrument_mapping[model]
+                     )
+                 # Process the remaining columns according to the NucleotideSequencing class
+                 # definition
+                 nucleotide_sequencing_slots.update(
+                     self._transform_dict_for_class(tab, "NucleotideSequencing")
+                 )
+                 nucleotide_sequencing = nmdc.NucleotideSequencing(
+                     **nucleotide_sequencing_slots
+                 )
+                 database.data_generation_set.append(nucleotide_sequencing)
+ 
+                 # Iterate over the columns that contain URLs and MD5 checksums and translate them
+                 # into DataObject instances. Each of these DataObject instances will be connected
+                 # to the NucleotideSequencing instance via the has_output/was_generated_by
+                 # relationships.
+                 for data_url in DATA_URL_SETS:
+                     data_object_type = DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE[
+                         (data_url, str(analyte_category))
+                     ]
+                     data_objects, manifest = self._get_data_objects_from_fields(
+                         tab,
+                         url_field_name=data_url.url,
+                         md5_checksum_field_name=data_url.md5_checksum,
+                         nucleotide_sequencing_id=nucleotide_sequencing_id,
+                         data_object_type=nmdc.FileTypeEnum(data_object_type),
+                     )
+                     if manifest:
+                         database.manifest_set.append(manifest)
+                     for data_object in data_objects:
+                         nucleotide_sequencing.has_output.append(data_object.id)
+                         database.data_object_set.append(data_object)
+ 
+         # This is the older way of attaching NucleotideSequencing and DataObject instances
+         # to the Biosample instances. This should now mainly be handled by the external sequencing
+         # data tabs in the submission portal. This code is being left in place for now in case it is
+         # needed in the future.
          if self.nucleotide_sequencing_mapping:
              # If there is data from an NucleotideSequencing mapping file, process it now. This part
              # assumes that there is a column in that file with the header __biosample_samp_name
nmdc_runtime/site/util.py CHANGED
@@ -1,9 +1,9 @@
  import os
 
- from dagster import op
  from functools import lru_cache
  from pymongo.database import Database as MongoDatabase
  from subprocess import Popen, PIPE, STDOUT, CalledProcessError
+ from toolz import groupby
 
  from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
  from nmdc_runtime.site.resources import mongo_resource
@@ -52,3 +52,10 @@ def get_basename(filename: str) -> str:
 
  def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
      return nmdc_study_id.replace(":", "_").replace("-", "_")
+ 
+ 
+ def get_instruments_by_id(mdb: MongoDatabase) -> dict[str, dict]:
+     """Get all documents from the instrument_set collection in a dict keyed by id."""
+     return {
+         instrument["id"]: instrument for instrument in mdb["instrument_set"].find({})
+     }
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: nmdc_runtime
- Version: 2.4.0
+ Version: 2.6.0
  Summary: A runtime system for NMDC data management and orchestration
  Home-page: https://github.com/microbiomedata/nmdc-runtime
  Author: Donny Winston
@@ -17,6 +17,7 @@ Dynamic: classifier
  Dynamic: description
  Dynamic: description-content-type
  Dynamic: home-page
+ Dynamic: license-file
  Dynamic: requires-python
  Dynamic: summary
 
@@ -37,8 +38,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
  * [nmdc-server](https://github.com/microbiomedata/nmdc-server)
  houses code specific to the data portal -- its database, back-end API, and front-end application.
 
- * [workflow_documentation](https://docs.microbiomedata.org/workflows/)
- references workflow code spread across several repositories, that take source data and produce computed data.
+ * Workflows — documented in the [workflows](https://docs.microbiomedata.org/workflows/) section of the NMDC documentation website — take source data and produce computed data.
 
  * This repo (nmdc-runtime)
  * houses code that takes source data and computed data, and transforms it
@@ -106,10 +106,10 @@ docker compose version
  docker info
  ```
 
- Ensure the permissions of `./mongoKeyFile` are such that only the file's owner can read or write the file.
+ Ensure the permissions of `./.docker/mongoKeyFile` are such that only the file's owner can read or write the file.
 
  ```shell
- chmod 600 ./mongoKeyFile
+ chmod 600 ./.docker/mongoKeyFile
  ```
 
  Ensure you have a `.env` file for the Docker services to source from. You may copy `.env.example` to
@@ -156,6 +156,9 @@ Tests can be found in `tests` and are run with the following commands:
  ```bash
  make up-test
  make test
+ 
+ # Run a specific test file, e.g. tests/test_api/test_endpoints.py
+ make test ARGS="tests/test_api/test_endpoints.py"
  ```
 
  As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
@@ -164,6 +167,16 @@ desired and does not break over time.
  [For hints on how to write tests for solids and pipelines in Dagster, see their documentation
  tutorial on Testing](https://docs.dagster.io/tutorial/testable).
 
+ ### RAM usage
+ 
+ The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
+ the `test` container shows "Error 137," here is something you can try as a workaround: In Docker Desktop, go to
+ "Settings > Resources > Advanced," and increase the memory limit. One of our team members has
+ found **12 GB** to be sufficient for running the tests.
+ 
+ > Dedicating 12 GB of RAM to Docker may be prohibitive for some prospective developers.
+ > There is an open [issue](https://github.com/microbiomedata/nmdc-runtime/issues/928) about the memory requirement.
+ 
  ## Publish to PyPI
 
  This repository contains a GitHub Actions workflow that publishes a Python package to [PyPI](https://pypi.org/project/nmdc-runtime/).
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
  nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
- nmdc_runtime/site/ops.py,sha256=p4F5SrDbFdKOrAHu1TUhWQA33QB7hdoQmCCuU-00Eqo,46445
- nmdc_runtime/site/repository.py,sha256=pfx7WAVgdNaPhtfF2pak-tllqPMf4-yUeOXSpr4hu30,43861
+ nmdc_runtime/site/graphs.py,sha256=DoKK6B6xkSwRcY5PVVo6jV_IA4HI5qL8xW9_n94jVfQ,15990
+ nmdc_runtime/site/ops.py,sha256=atZNkU5mzRRqTnaW39fvq7gVO2sKSH8ztVOp8_dOLbU,48048
+ nmdc_runtime/site/repository.py,sha256=nHu1skayyTjJWwGEf5eToX02cgBNTG_kdSluzJZ6rJc,43695
  nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
- nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
+ nmdc_runtime/site/util.py,sha256=h70UJCT9g-I63EJn0drZjv1iaQ8LHJTbG29R9kqJ04c,1821
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
  nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -51,23 +51,23 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
- nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=4RqaT6qs1LDSiDDfF-JNZL5gOel8m65oCOelfr0blXs,26209
+ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=X35zbkxBxEyCnA9peY9YBAa_0oeoWy3DQEXoAXmc6vg,10100
  nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
  nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
+ nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14j5rBqQWF8R7BheY,11525
  nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
  nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
- nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
+ nmdc_runtime/site/translation/gold_translator.py,sha256=HGbWeuxppqlVfU8F5oKTYIDoC6qaftugJeWFIALB9XE,32720
  nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
- nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
+ nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
  nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
  nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
  nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
- nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
+ nmdc_runtime/site/translation/submission_portal_translator.py,sha256=UEeqlkz_YGqcnx8vomFysetOlXxDu23q0Ryr93SZy78,41684
  nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
  nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
  nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
- nmdc_runtime-2.4.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
- nmdc_runtime-2.4.0.dist-info/METADATA,sha256=CeZZbucd3jrD0ZqGdreH2x7ALrM9pt4ksGV2olkkpPI,7401
- nmdc_runtime-2.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- nmdc_runtime-2.4.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
- nmdc_runtime-2.4.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
- nmdc_runtime-2.4.0.dist-info/RECORD,,
+ nmdc_runtime-2.6.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+ nmdc_runtime-2.6.0.dist-info/METADATA,sha256=RK075FB7BHmZL0fm3elHYjBV7YB4hsG-pQRHfgCUm0g,8155
+ nmdc_runtime-2.6.0.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
+ nmdc_runtime-2.6.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+ nmdc_runtime-2.6.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+ nmdc_runtime-2.6.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.0)
+ Generator: setuptools (80.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 