nmdc-runtime 2.5.0__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -7,7 +7,6 @@ import xml.dom.minidom
7
7
  from typing import Any, List, Union
8
8
  from urllib.parse import urlparse
9
9
  from nmdc_runtime.site.export.ncbi_xml_utils import (
10
- get_instruments,
11
10
  handle_controlled_identified_term_value,
12
11
  handle_controlled_term_value,
13
12
  handle_geolocation_value,
@@ -24,31 +24,6 @@ def get_classname_from_typecode(doc_id):
24
24
  return class_map.get(typecode)
25
25
 
26
26
 
27
- def get_instruments(instrument_set_collection):
28
- # dictionary to capture a list of all instruments
29
- # Structure of dict:
30
- # {"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}
31
- all_instruments = {}
32
-
33
- try:
34
- query = {"type": "nmdc:Instrument"}
35
- cursor = instrument_set_collection.find(query)
36
-
37
- for document in cursor:
38
- instrument_id = document.get("id")
39
- vendor = document.get("vendor")
40
- model = document.get("model")
41
-
42
- if not instrument_id or not vendor or not model:
43
- continue
44
-
45
- all_instruments[instrument_id] = {"vendor": vendor, "model": model}
46
-
47
- return all_instruments
48
- except Exception as e:
49
- raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
50
-
51
-
52
27
  def fetch_data_objects_from_biosamples(
53
28
  all_docs_collection: Collection,
54
29
  data_object_set: Collection,
@@ -61,6 +61,8 @@ from nmdc_runtime.site.ops import (
61
61
  get_database_updater_inputs,
62
62
  post_submission_portal_biosample_ingest_record_stitching_filename,
63
63
  generate_data_generation_set_post_biosample_ingest,
64
+ get_instrument_ids_by_model,
65
+ log_database_ids,
64
66
  )
65
67
  from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
66
68
 
@@ -181,6 +183,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
181
183
  biosample_extras_slot_mapping = get_csv_rows_from_url(
182
184
  biosample_extras_slot_mapping_file_url
183
185
  )
186
+ instrument_mapping = get_instrument_ids_by_model()
184
187
 
185
188
  database = translate_portal_submission_to_nmdc_schema_database(
186
189
  metadata_submission,
@@ -188,10 +191,13 @@ def translate_metadata_submission_to_nmdc_schema_database():
188
191
  data_object_mapping=data_object_mapping,
189
192
  biosample_extras=biosample_extras,
190
193
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
194
+ instrument_mapping=instrument_mapping,
191
195
  )
192
196
 
193
197
  validate_metadata(database)
194
198
 
199
+ log_database_ids(database)
200
+
195
201
  database_dict = nmdc_schema_object_to_dict(database)
196
202
  filename = nmdc_schema_database_export_filename(metadata_submission)
197
203
  outputs = export_json_to_drs(database_dict, filename)
@@ -217,6 +223,7 @@ def ingest_metadata_submission():
217
223
  biosample_extras_slot_mapping = get_csv_rows_from_url(
218
224
  biosample_extras_slot_mapping_file_url
219
225
  )
226
+ instrument_mapping = get_instrument_ids_by_model()
220
227
 
221
228
  database = translate_portal_submission_to_nmdc_schema_database(
222
229
  metadata_submission,
@@ -224,7 +231,11 @@ def ingest_metadata_submission():
224
231
  data_object_mapping=data_object_mapping,
225
232
  biosample_extras=biosample_extras,
226
233
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
234
+ instrument_mapping=instrument_mapping,
227
235
  )
236
+
237
+ log_database_ids(database)
238
+
228
239
  run_id = submit_metadata_to_db(database)
229
240
  poll_for_run_completion(run_id)
230
241
 
nmdc_runtime/site/ops.py CHANGED
@@ -7,6 +7,7 @@ import tempfile
7
7
  from collections import defaultdict
8
8
  from datetime import datetime, timezone
9
9
  from io import BytesIO, StringIO
10
+ from pprint import pformat
10
11
  from toolz.dicttoolz import keyfilter
11
12
  from typing import Tuple
12
13
  from zipfile import ZipFile
@@ -38,7 +39,7 @@ from dagster import (
38
39
  Bool,
39
40
  )
40
41
  from gridfs import GridFS
41
- from linkml_runtime.dumpers import json_dumper
42
+ from linkml_runtime.utils.dictutils import as_simple_dict
42
43
  from linkml_runtime.utils.yamlutils import YAMLRoot
43
44
  from nmdc_runtime.api.db.mongo import get_mongo_db
44
45
  from nmdc_runtime.api.core.idgen import generate_one_id
@@ -69,7 +70,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
69
70
  fetch_data_objects_from_biosamples,
70
71
  fetch_nucleotide_sequencing_from_biosamples,
71
72
  fetch_library_preparation_from_biosamples,
72
- get_instruments,
73
73
  )
74
74
  from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
75
75
  from nmdc_runtime.site.resources import (
@@ -96,6 +96,7 @@ from nmdc_runtime.site.util import (
96
96
  run_and_log,
97
97
  schema_collection_has_index_on_id,
98
98
  nmdc_study_id_to_filename,
99
+ get_instruments_by_id,
99
100
  )
100
101
  from nmdc_runtime.util import (
101
102
  drs_object_in_for,
@@ -720,9 +721,8 @@ def translate_portal_submission_to_nmdc_schema_database(
720
721
  metadata_submission: Dict[str, Any],
721
722
  nucleotide_sequencing_mapping: List,
722
723
  data_object_mapping: List,
724
+ instrument_mapping: Dict[str, str],
723
725
  study_category: Optional[str],
724
- study_doi_category: Optional[str],
725
- study_doi_provider: Optional[str],
726
726
  study_pi_image_url: Optional[str],
727
727
  biosample_extras: Optional[list[dict]],
728
728
  biosample_extras_slot_mapping: Optional[list[dict]],
@@ -739,11 +739,10 @@ def translate_portal_submission_to_nmdc_schema_database(
739
739
  data_object_mapping=data_object_mapping,
740
740
  id_minter=id_minter,
741
741
  study_category=study_category,
742
- study_doi_category=study_doi_category,
743
- study_doi_provider=study_doi_provider,
744
742
  study_pi_image_url=study_pi_image_url,
745
743
  biosample_extras=biosample_extras,
746
744
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
745
+ illumina_instrument_mapping=instrument_mapping,
747
746
  )
748
747
  database = translator.get_database()
749
748
  return database
@@ -761,7 +760,7 @@ def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
761
760
 
762
761
  @op
763
762
  def nmdc_schema_object_to_dict(object: YAMLRoot) -> Dict[str, Any]:
764
- return json_dumper.to_dict(object)
763
+ return as_simple_dict(object)
765
764
 
766
765
 
767
766
  @op(required_resource_keys={"mongo"}, config_schema={"username": str})
@@ -1227,11 +1226,26 @@ def get_library_preparation_from_biosamples(
1227
1226
 
1228
1227
 
1229
1228
  @op(required_resource_keys={"mongo"})
1230
- def get_all_instruments(context: OpExecutionContext):
1229
+ def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
1231
1230
  mdb = context.resources.mongo.db
1232
- instrument_set_collection = mdb["instrument_set"]
1233
- all_instruments = get_instruments(instrument_set_collection)
1234
- return all_instruments
1231
+ return get_instruments_by_id(mdb)
1232
+
1233
+
1234
@op(required_resource_keys={"mongo"})
def get_instrument_ids_by_model(context: OpExecutionContext) -> dict[str, str]:
    """Build a lookup from instrument model to instrument ID.

    Reads every instrument document via ``get_instruments_by_id`` and inverts
    it so that each ``model`` value maps to the ``id`` of an instrument having
    that model.

    Instruments without a ``model`` are skipped with a warning. When several
    instruments share a model, a warning is logged and the instrument seen
    last wins.

    :param context: op execution context providing the ``mongo`` resource
    :return: dict mapping instrument model to instrument ID
    """
    model_to_id: dict[str, str] = {}
    all_instruments = get_instruments_by_id(context.resources.mongo.db)
    for instrument_id, instrument_doc in all_instruments.items():
        model_name = instrument_doc.get("model")
        if model_name is not None:
            if model_name in model_to_id:
                # Later documents overwrite earlier ones; make that visible.
                context.log.warning(f"Instrument model {model_name} is not unique.")
            model_to_id[model_name] = instrument_id
        else:
            context.log.warning(f"Instrument {instrument_id} has no model.")
    context.log.info("Instrument models: %s", pformat(model_to_id))
    return model_to_id
1235
1249
 
1236
1250
 
1237
1251
  @op
@@ -1345,3 +1359,26 @@ def generate_biosample_set_for_nmdc_study_from_gold(
1345
1359
  database = database_updater.generate_biosample_set_from_gold_api_for_study()
1346
1360
 
1347
1361
  return database
1362
+
1363
+
1364
@op
def log_database_ids(
    context: OpExecutionContext,
    database: nmdc.Database,
) -> None:
    """Log a per-collection summary of the document IDs in a database.

    Each list-valued slot of the database is reported with its name and
    document count. Collections with fewer than 10 documents list every ID;
    larger ones list the first four and last four IDs with a count of the
    omitted documents in between. Nothing is logged when the database has no
    list-valued slots.

    :param context: op execution context used for logging
    :param database: nmdc:Database object whose IDs should be logged
    """
    sections: list[str] = []
    for set_name, docs in as_simple_dict(database).items():
        # Only list-valued slots are collections of documents.
        if not isinstance(docs, list):
            continue
        heading = f"{set_name} ({len(docs)}):\n"
        if len(docs) < 10:
            listing = "\n".join(f" {doc['id']}" for doc in docs)
        else:
            first_ids = "\n".join(f" {doc['id']}" for doc in docs[:4])
            last_ids = "\n".join(f" {doc['id']}" for doc in docs[-4:])
            listing = first_ids + f"\n ... {len(docs) - 8} more\n" + last_ids
        sections.append(heading + listing + "\n")
    summary = "".join(sections)
    if summary:
        context.log.info(summary)
@@ -553,8 +553,6 @@ def biosample_submission_ingest():
553
553
  "translate_portal_submission_to_nmdc_schema_database": {
554
554
  "inputs": {
555
555
  "study_category": "research_study",
556
- "study_doi_category": None,
557
- "study_doi_provider": None,
558
556
  "study_pi_image_url": None,
559
557
  }
560
558
  },
@@ -591,8 +589,6 @@ def biosample_submission_ingest():
591
589
  "translate_portal_submission_to_nmdc_schema_database": {
592
590
  "inputs": {
593
591
  "study_category": None,
594
- "study_doi_category": None,
595
- "study_doi_provider": None,
596
592
  "study_pi_image_url": None,
597
593
  }
598
594
  },
@@ -639,6 +639,16 @@ class GoldStudyTranslator(Translator):
639
639
  :return: nmdc:NucleotideSequencing object
640
640
  """
641
641
  gold_project_id = gold_project["projectGoldId"]
642
+ ncbi_bioproject_identifier = gold_project.get("ncbiBioProjectAccession")
643
+ insdc_bioproject_identifiers = []
644
+ if ncbi_bioproject_identifier:
645
+ insdc_bioproject_identifiers.append(
646
+ self._ensure_curie(
647
+ ncbi_bioproject_identifier,
648
+ default_prefix="bioproject",
649
+ )
650
+ )
651
+
642
652
  return nmdc.NucleotideSequencing(
643
653
  id=nmdc_nucleotide_sequencing_id,
644
654
  name=gold_project.get("projectName"),
@@ -650,6 +660,7 @@ class GoldStudyTranslator(Translator):
650
660
  has_input=nmdc_biosample_id,
651
661
  add_date=gold_project.get("addDate"),
652
662
  mod_date=self._get_mod_date(gold_project),
663
+ insdc_bioproject_identifiers=insdc_bioproject_identifiers,
653
664
  principal_investigator=self._get_pi(gold_project),
654
665
  processing_institution=self._get_processing_institution(gold_project),
655
666
  instrument_used=self._get_instrument(gold_project),
@@ -1,10 +1,12 @@
1
1
  import logging
2
2
  import re
3
+ from collections import namedtuple
3
4
  from datetime import datetime
4
5
  from enum import Enum
5
6
  from functools import lru_cache
6
7
  from importlib import resources
7
- from typing import Any, List, Optional, Union
8
+ from typing import Any, List, Optional, Union, Tuple
9
+ from urllib.parse import urlparse
8
10
 
9
11
  from linkml_runtime import SchemaView
10
12
  from linkml_runtime.linkml_model import SlotDefinition
@@ -13,8 +15,38 @@ from toolz import concat, dissoc, get_in, groupby, valmap
13
15
 
14
16
  from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
15
17
 
18
+
19
+ DataUrlSet = namedtuple("DataUrlSet", ["url", "md5_checksum"])
20
+
21
+ READ_1 = DataUrlSet("read_1_url", "read_1_md5_checksum")
22
+ READ_2 = DataUrlSet("read_2_url", "read_2_md5_checksum")
23
+ INTERLEAVED = DataUrlSet("interleaved_url", "interleaved_md5_checksum")
24
+
25
+ DATA_URL_SETS: list[DataUrlSet] = [READ_1, READ_2, INTERLEAVED]
26
+
16
27
  BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
17
28
 
29
+ TAB_NAME_KEY = "__tab_name"
30
+ METAGENOME = nmdc.NucleotideSequencingEnum(nmdc.NucleotideSequencingEnum.metagenome)
31
+ METATRANSCRIPTOME = nmdc.NucleotideSequencingEnum(
32
+ nmdc.NucleotideSequencingEnum.metatranscriptome
33
+ )
34
+ TAB_NAME_TO_ANALYTE_CATEGORY: dict[str, nmdc.NucleotideSequencingEnum] = {
35
+ "metagenome_sequencing_non_interleaved_data": METAGENOME,
36
+ "metagenome_sequencing_interleaved_data": METAGENOME,
37
+ "metatranscriptome_sequencing_non_interleaved_data": METATRANSCRIPTOME,
38
+ "metatranscriptome_sequencing_interleaved_data": METATRANSCRIPTOME,
39
+ }
40
+
41
+ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str] = {
42
+ (READ_1, str(METAGENOME)): "Metagenome Raw Read 1",
43
+ (READ_2, str(METAGENOME)): "Metagenome Raw Read 2",
44
+ (INTERLEAVED, str(METAGENOME)): "Metagenome Raw Reads",
45
+ (READ_1, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 1",
46
+ (READ_2, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 2",
47
+ (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
48
+ }
49
+
18
50
 
19
51
  class EnvironmentPackage(Enum):
20
52
  r"""
@@ -75,6 +107,18 @@ def group_dicts_by_key(key: str, seq: Optional[list[dict]]) -> Optional[dict]:
75
107
  return grouped
76
108
 
77
109
 
110
+ def split_strip(string: str | None, sep: str) -> list[str] | None:
111
+ """Split a string by a separator and strip whitespace from each part.
112
+
113
+ :param string: string to split
114
+ :param sep: separator to split by
115
+ :return: list of stripped strings
116
+ """
117
+ if string is None:
118
+ return None
119
+ return [s.strip() for s in string.split(sep)]
120
+
121
+
78
122
  class SubmissionPortalTranslator(Translator):
79
123
  """A Translator subclass for handling submission portal entries
80
124
 
@@ -86,17 +130,15 @@ class SubmissionPortalTranslator(Translator):
86
130
 
87
131
  def __init__(
88
132
  self,
89
- metadata_submission: JSON_OBJECT = {},
133
+ metadata_submission: Optional[JSON_OBJECT] = None,
90
134
  *args,
91
135
  nucleotide_sequencing_mapping: Optional[list] = None,
92
136
  data_object_mapping: Optional[list] = None,
137
+ illumina_instrument_mapping: Optional[dict[str, str]] = None,
93
138
  # Additional study-level metadata not captured by the submission portal currently
94
139
  # See: https://github.com/microbiomedata/submission-schema/issues/162
95
- study_doi_category: Optional[str] = None,
96
- study_doi_provider: Optional[str] = None,
97
140
  study_category: Optional[str] = None,
98
141
  study_pi_image_url: Optional[str] = None,
99
- study_funding_sources: Optional[list[str]] = None,
100
142
  # Additional biosample-level metadata with optional column mapping information not captured
101
143
  # by the submission portal currently.
102
144
  # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -106,23 +148,17 @@ class SubmissionPortalTranslator(Translator):
106
148
  ) -> None:
107
149
  super().__init__(*args, **kwargs)
108
150
 
109
- self.metadata_submission = metadata_submission
151
+ self.metadata_submission: JSON_OBJECT = metadata_submission or {}
110
152
  self.nucleotide_sequencing_mapping = nucleotide_sequencing_mapping
111
153
  self.data_object_mapping = data_object_mapping
112
-
113
- self.study_doi_category = (
114
- nmdc.DoiCategoryEnum(study_doi_category)
115
- if study_doi_category
116
- else nmdc.DoiCategoryEnum.dataset_doi
117
- )
118
- self.study_doi_provider = (
119
- nmdc.DoiProviderEnum(study_doi_provider) if study_doi_provider else None
154
+ self.illumina_instrument_mapping: dict[str, str] = (
155
+ illumina_instrument_mapping or {}
120
156
  )
157
+
121
158
  self.study_category = (
122
159
  nmdc.StudyCategoryEnum(study_category) if study_category else None
123
160
  )
124
161
  self.study_pi_image_url = study_pi_image_url
125
- self.study_funding_sources = study_funding_sources
126
162
 
127
163
  self.biosample_extras = group_dicts_by_key(
128
164
  BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
@@ -153,28 +189,6 @@ class SubmissionPortalTranslator(Translator):
153
189
  type=nmdc.PersonValue.class_class_curie,
154
190
  )
155
191
 
156
- def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[nmdc.Doi], None]:
157
- """Get DOI information from the context form data
158
-
159
- :param metadata_submission: submission portal entry
160
- :return: list of strings or None
161
- """
162
- dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
163
- if not dataset_doi:
164
- return None
165
-
166
- if not dataset_doi.startswith("doi:"):
167
- dataset_doi = f"doi:{dataset_doi}"
168
-
169
- return [
170
- nmdc.Doi(
171
- doi_value=dataset_doi,
172
- doi_provider=self.study_doi_provider,
173
- doi_category=self.study_doi_category,
174
- type="nmdc:Doi",
175
- )
176
- ]
177
-
178
192
  def _get_has_credit_associations(
179
193
  self, metadata_submission: JSON_OBJECT
180
194
  ) -> Union[List[nmdc.CreditAssociation], None]:
@@ -203,21 +217,34 @@ class SubmissionPortalTranslator(Translator):
203
217
  def _get_gold_study_identifiers(
204
218
  self, metadata_submission: JSON_OBJECT
205
219
  ) -> Union[List[str], None]:
206
- """Construct a GOLD CURIE from the multiomics from data
220
+ """Construct a GOLD CURIE from the study form data
207
221
 
208
222
  :param metadata_submission: submission portal entry
209
223
  :return: GOLD CURIE
210
224
  """
211
- gold_study_id = get_in(["multiOmicsForm", "GOLDStudyId"], metadata_submission)
225
+ gold_study_id = get_in(["studyForm", "GOLDStudyId"], metadata_submission)
212
226
  if not gold_study_id:
213
227
  return None
214
228
 
215
229
  return [self._ensure_curie(gold_study_id, default_prefix="gold")]
216
230
 
231
def _get_ncbi_bioproject_identifiers(
    self, metadata_submission: JSON_OBJECT
) -> Union[List[str], None]:
    """Construct an NCBI BioProject CURIE from the study form data

    :param metadata_submission: submission portal entry
    :return: single-element list holding the BioProject CURIE, or None when
        the study form has no (or an empty) ``NCBIBioProjectId``
    """
    bioproject_id = get_in(["studyForm", "NCBIBioProjectId"], metadata_submission)
    if bioproject_id:
        return [self._ensure_curie(bioproject_id, default_prefix="bioproject")]
    return None
243
+
217
244
  def _get_jgi_study_identifiers(
218
245
  self, metadata_submission: JSON_OBJECT
219
246
  ) -> Union[List[str], None]:
220
- """Construct a JGI proposal CURIE from the multiomics from data
247
+ """Construct a JGI proposal CURIE from the multiomics form data
221
248
 
222
249
  :param metadata_submission: submission portal entry
223
250
  :return: JGI proposal CURIE
@@ -228,6 +255,20 @@ class SubmissionPortalTranslator(Translator):
228
255
 
229
256
  return [self._ensure_curie(jgi_study_id, default_prefix="jgi.proposal")]
230
257
 
258
def _get_emsl_project_identifiers(
    self, metadata_submission: JSON_OBJECT
) -> Union[List[str], None]:
    """Construct an EMSL project CURIE from the multiomics form data

    :param metadata_submission: submission portal entry
    :return: single-element list holding the EMSL project CURIE, or None when
        the multiomics form has no (or an empty) ``studyNumber``
    """
    study_number = get_in(["multiOmicsForm", "studyNumber"], metadata_submission)
    if study_number:
        return [self._ensure_curie(study_number, default_prefix="emsl.project")]
    return None
271
+
231
272
  def _get_quantity_value(
232
273
  self, raw_value: Optional[str], unit: Optional[str] = None
233
274
  ) -> Union[nmdc.QuantityValue, None]:
@@ -434,6 +475,75 @@ class SubmissionPortalTranslator(Translator):
434
475
 
435
476
  return value
436
477
 
478
def _get_data_objects_from_fields(
    self,
    sample_data: JSON_OBJECT,
    *,
    url_field_name: str,
    md5_checksum_field_name: str,
    nucleotide_sequencing_id: str,
    data_object_type: nmdc.FileTypeEnum,
) -> Tuple[List[nmdc.DataObject], nmdc.Manifest | None]:
    """Get DataObject instances based on the URLs and MD5 checksums in the given fields.

    Both fields may hold several values separated by ``;``. If the URL field
    provides multiple URLs, multiple DataObject instances will be created and a
    Manifest will be created and provided in the second return value.

    A URL field that is missing, empty, or made up solely of blank entries
    produces no DataObjects. A checksum field made up solely of blank entries
    is treated as if no checksums were provided.

    :param sample_data: sample data
    :param url_field_name: field name for the URL
    :param md5_checksum_field_name: field name for the MD5 checksum
    :param nucleotide_sequencing_id: ID for the nmdc:NucleotideSequencing object that generated the data object(s)
    :param data_object_type: FileTypeEnum representing the type of the data object
    :return: tuple of (list of nmdc.DataObject, nmdc.Manifest or None); the
        list is empty when no URLs were provided
    :raises ValueError: if checksums are provided but their count differs from
        the number of URLs
    """
    data_objects: List[nmdc.DataObject] = []
    urls = split_strip(sample_data.get(url_field_name), ";")
    # split_strip("") yields [""], which is truthy. Treat a field that only
    # holds blank entries like a missing field so that no DataObject with an
    # empty URL is created.
    if not urls or not any(urls):
        return data_objects, None

    md5_checksums = split_strip(sample_data.get(md5_checksum_field_name), ";")
    # Same reasoning for checksums: a blank field means "not provided" rather
    # than "one empty checksum".
    if md5_checksums is not None and not any(md5_checksums):
        md5_checksums = None
    if md5_checksums and len(urls) != len(md5_checksums):
        raise ValueError(
            f"{url_field_name} and {md5_checksum_field_name} must have the same number of values"
        )

    data_object_ids = self._id_minter("nmdc:DataObject", len(urls))
    manifest: nmdc.Manifest | None = None
    if len(urls) > 1:
        # Several files for one field are grouped under a single Manifest.
        manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
        manifest = nmdc.Manifest(
            id=manifest_id,
            manifest_category=nmdc.ManifestCategoryEnum(
                nmdc.ManifestCategoryEnum.poolable_replicates
            ),
            type="nmdc:Manifest",
        )

    for i, url in enumerate(urls):
        data_object_id = data_object_ids[i]
        parsed_url = urlparse(url)
        # Use the last path segment of the URL as a best-guess file name.
        possible_filename = parsed_url.path.rsplit("/", 1)[-1]
        data_object_slots = {
            "id": data_object_id,
            "name": possible_filename,
            "description": f"{data_object_type} for {nucleotide_sequencing_id}",
            "type": "nmdc:DataObject",
            "url": url,
            "md5_checksum": md5_checksums[i] if md5_checksums else None,
            "in_manifest": [manifest.id] if manifest else None,
            "data_category": nmdc.DataCategoryEnum(
                nmdc.DataCategoryEnum.instrument_data
            ),
            "data_object_type": data_object_type,
            "was_generated_by": nucleotide_sequencing_id,
        }
        # Values derived from the row's own columns take precedence over the
        # defaults assembled above.
        data_object_slots.update(
            self._transform_dict_for_class(sample_data, "DataObject")
        )
        data_objects.append(nmdc.DataObject(**data_object_slots))

    return data_objects, manifest
546
+
437
547
  def _translate_study(
438
548
  self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
439
549
  ) -> nmdc.Study:
@@ -448,18 +558,17 @@ class SubmissionPortalTranslator(Translator):
448
558
  """
449
559
  return nmdc.Study(
450
560
  alternative_names=self._get_from(
451
- metadata_submission, ["multiOmicsForm", "alternativeNames"]
561
+ metadata_submission, ["studyForm", "alternativeNames"]
452
562
  ),
453
- associated_dois=self._get_doi(metadata_submission),
454
563
  description=self._get_from(
455
564
  metadata_submission, ["studyForm", "description"]
456
565
  ),
457
566
  funding_sources=self._get_from(
458
567
  metadata_submission, ["studyForm", "fundingSources"]
459
568
  ),
460
- # emsl_proposal_identifier=self._get_from(
461
- # metadata_submission, ["multiOmicsForm", "studyNumber"]
462
- # ),
569
+ emsl_project_identifiers=self._get_emsl_project_identifiers(
570
+ metadata_submission
571
+ ),
463
572
  gold_study_identifiers=self._get_gold_study_identifiers(
464
573
  metadata_submission
465
574
  ),
@@ -467,8 +576,8 @@ class SubmissionPortalTranslator(Translator):
467
576
  metadata_submission
468
577
  ),
469
578
  id=nmdc_study_id,
470
- insdc_bioproject_identifiers=self._get_from(
471
- metadata_submission, ["multiOmicsForm", "NCBIBioProjectId"]
579
+ insdc_bioproject_identifiers=self._get_ncbi_bioproject_identifiers(
580
+ metadata_submission
472
581
  ),
473
582
  jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
474
583
  metadata_submission
@@ -555,7 +664,7 @@ class SubmissionPortalTranslator(Translator):
555
664
  if slot_definition.multivalued:
556
665
  value_list = value
557
666
  if isinstance(value, str):
558
- value_list = [v.strip() for v in value.split("|")]
667
+ value_list = split_strip(value, "|")
559
668
  transformed_value = [
560
669
  self._transform_value_for_slot(item, slot_definition, unit)
561
670
  for item in value_list
@@ -629,16 +738,18 @@ class SubmissionPortalTranslator(Translator):
629
738
  :return: nmdc:Database object
630
739
  """
631
740
  database = nmdc.Database()
632
-
633
- nmdc_study_id = self._id_minter("nmdc:Study")[0]
634
-
635
741
  metadata_submission_data = self.metadata_submission.get(
636
742
  "metadata_submission", {}
637
743
  )
744
+
745
+ # Generate one Study instance based on the metadata submission
746
+ nmdc_study_id = self._id_minter("nmdc:Study")[0]
638
747
  database.study_set = [
639
748
  self._translate_study(metadata_submission_data, nmdc_study_id)
640
749
  ]
641
750
 
751
+ # Automatically populate the `env_package` field in the sample data based on which
752
+ # environmental data tab the sample data came from.
642
753
  sample_data = metadata_submission_data.get("sampleData", {})
643
754
  for key in sample_data.keys():
644
755
  env = key.removesuffix("_data").upper()
@@ -647,8 +758,16 @@ class SubmissionPortalTranslator(Translator):
647
758
  for sample in sample_data[key]:
648
759
  sample["env_package"] = package_name
649
760
  except KeyError:
761
+ # This is expected when processing rows from tabs like the JGI/EMSL tabs or external
762
+ # sequencing data tabs.
650
763
  pass
651
764
 
765
+ # Before regrouping the data by sample name, record which tab each object came from
766
+ for tab_name in sample_data.keys():
767
+ for tab in sample_data[tab_name]:
768
+ tab[TAB_NAME_KEY] = tab_name
769
+
770
+ # Reorganize the sample data by sample name and generate a unique NMDC ID for each
652
771
  sample_data_by_id = groupby(
653
772
  BIOSAMPLE_UNIQUE_KEY_SLOT,
654
773
  concat(sample_data.values()),
@@ -658,6 +777,7 @@ class SubmissionPortalTranslator(Translator):
658
777
  zip(sample_data_by_id.keys(), nmdc_biosample_ids)
659
778
  )
660
779
 
780
+ # Translate the sample data into nmdc:Biosample objects
661
781
  database.biosample_set = [
662
782
  self._translate_biosample(
663
783
  sample_data,
@@ -668,6 +788,104 @@ class SubmissionPortalTranslator(Translator):
668
788
  if sample_data
669
789
  ]
670
790
 
791
+ # This section handles the translation of information in the external sequencing tabs into
792
+ # various NMDC objects.
793
+ database.data_generation_set = []
794
+ database.data_object_set = []
795
+ database.instrument_set = []
796
+ database.manifest_set = []
797
+ today = datetime.now().strftime("%Y-%m-%d")
798
+ for sample_data_id, sample_data in sample_data_by_id.items():
799
+ for tab in sample_data:
800
+ tab_name = tab.get(TAB_NAME_KEY)
801
+ analyte_category = TAB_NAME_TO_ANALYTE_CATEGORY.get(tab_name)
802
+ if not analyte_category:
803
+ # If the tab name cannot be mapped to an analyte category, that means we're
804
+ # not in one of the external sequencing data tabs (e.g. this is an environmental data
805
+ # tab or a JGI/EMSL tab). Skip this tab.
806
+ continue
807
+
808
+ # Start by generating one NucleotideSequencing instance with a has_input
809
+ # relationship to the current Biosample instance.
810
+ nucleotide_sequencing_id = self._id_minter(
811
+ "nmdc:NucleotideSequencing", 1
812
+ )[0]
813
+ nucleotide_sequencing_slots = {
814
+ "id": nucleotide_sequencing_id,
815
+ "has_input": sample_data_to_nmdc_biosample_ids[sample_data_id],
816
+ "has_output": [],
817
+ "associated_studies": [nmdc_study_id],
818
+ "add_date": today,
819
+ "mod_date": today,
820
+ "analyte_category": analyte_category,
821
+ "type": "nmdc:NucleotideSequencing",
822
+ }
823
+ # If the protocol_link column was filled in, expand it into an nmdc:Protocol object
824
+ if "protocol_link" in tab:
825
+ protocol_link = tab.pop("protocol_link")
826
+ nucleotide_sequencing_slots["protocol_link"] = nmdc.Protocol(
827
+ url=protocol_link,
828
+ type="nmdc:Protocol",
829
+ )
830
+ # If model column was filled in, expand it into an nmdc:Instrument object. This is
831
+ # done by first checking the provided instrument mapping to see if the model is
832
+ # already present. If it is not, a new instrument object is created and added to the
833
+ # instrument_set. Currently, we only accept sequencing data in the submission portal
834
+ # that was generated by Illumina instruments, so the vendor is hardcoded here.
835
+ if "model" in tab:
836
+ model = tab.pop("model")
837
+ if model not in self.illumina_instrument_mapping:
838
+ # If the model is not already in the mapping, create a new record for it
839
+ nmdc_instrument_id = self._id_minter("nmdc:Instrument", 1)[0]
840
+ database.instrument_set.append(
841
+ nmdc.Instrument(
842
+ id=nmdc_instrument_id,
843
+ vendor=nmdc.InstrumentVendorEnum(
844
+ nmdc.InstrumentVendorEnum.illumina
845
+ ),
846
+ model=nmdc.InstrumentModelEnum(model),
847
+ type="nmdc:Instrument",
848
+ )
849
+ )
850
+ self.illumina_instrument_mapping[model] = nmdc_instrument_id
851
+ nucleotide_sequencing_slots["instrument_used"] = (
852
+ self.illumina_instrument_mapping[model]
853
+ )
854
+ # Process the remaining columns according to the NucleotideSequencing class
855
+ # definition
856
+ nucleotide_sequencing_slots.update(
857
+ self._transform_dict_for_class(tab, "NucleotideSequencing")
858
+ )
859
+ nucleotide_sequencing = nmdc.NucleotideSequencing(
860
+ **nucleotide_sequencing_slots
861
+ )
862
+ database.data_generation_set.append(nucleotide_sequencing)
863
+
864
+ # Iterate over the columns that contain URLs and MD5 checksums and translate them
865
+ # into DataObject instances. Each of these DataObject instances will be connected
866
+ # to the NucleotideSequencing instance via the has_output/was_generated_by
867
+ # relationships.
868
+ for data_url in DATA_URL_SETS:
869
+ data_object_type = DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE[
870
+ (data_url, str(analyte_category))
871
+ ]
872
+ data_objects, manifest = self._get_data_objects_from_fields(
873
+ tab,
874
+ url_field_name=data_url.url,
875
+ md5_checksum_field_name=data_url.md5_checksum,
876
+ nucleotide_sequencing_id=nucleotide_sequencing_id,
877
+ data_object_type=nmdc.FileTypeEnum(data_object_type),
878
+ )
879
+ if manifest:
880
+ database.manifest_set.append(manifest)
881
+ for data_object in data_objects:
882
+ nucleotide_sequencing.has_output.append(data_object.id)
883
+ database.data_object_set.append(data_object)
884
+
885
+ # This is the older way of handling attaching NucleotideSequencing and DataObject instances
886
+ # to the Biosample instances. This should now mainly be handled by the external sequencing
887
+ # data tabs in the submission portal. This code is being left in place for now in case it is
888
+ # needed in the future.
671
889
  if self.nucleotide_sequencing_mapping:
672
890
  # If there is data from a NucleotideSequencing mapping file, process it now. This part
673
891
  # assumes that there is a column in that file with the header __biosample_samp_name
nmdc_runtime/site/util.py CHANGED
@@ -1,9 +1,9 @@
1
1
  import os
2
2
 
3
- from dagster import op
4
3
  from functools import lru_cache
5
4
  from pymongo.database import Database as MongoDatabase
6
5
  from subprocess import Popen, PIPE, STDOUT, CalledProcessError
6
+ from toolz import groupby
7
7
 
8
8
  from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
9
9
  from nmdc_runtime.site.resources import mongo_resource
@@ -52,3 +52,10 @@ def get_basename(filename: str) -> str:
52
52
 
53
53
  def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
54
54
  return nmdc_study_id.replace(":", "_").replace("-", "_")
55
+
56
+
57
def get_instruments_by_id(mdb: MongoDatabase) -> dict[str, dict]:
    """Return every document of the ``instrument_set`` collection keyed by its ``id``.

    If two documents share an ``id``, the one returned later by the query
    replaces the earlier one.

    :param mdb: database handle exposing the ``instrument_set`` collection
    :return: dict mapping instrument ID to the full instrument document
    """
    instruments_by_id: dict[str, dict] = {}
    for instrument_doc in mdb["instrument_set"].find({}):
        instruments_by_id[instrument_doc["id"]] = instrument_doc
    return instruments_by_id
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nmdc_runtime
3
- Version: 2.5.0
3
+ Version: 2.7.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -106,10 +106,10 @@ docker compose version
106
106
  docker info
107
107
  ```
108
108
 
109
- Ensure the permissions of `./mongoKeyFile` are such that only the file's owner can read or write the file.
109
+ Ensure the permissions of `./.docker/mongoKeyFile` are such that only the file's owner can read or write the file.
110
110
 
111
111
  ```shell
112
- chmod 600 ./mongoKeyFile
112
+ chmod 600 ./.docker/mongoKeyFile
113
113
  ```
114
114
 
115
115
  Ensure you have a `.env` file for the Docker services to source from. You may copy `.env.example` to
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
36
36
  nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
37
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
38
38
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
40
- nmdc_runtime/site/ops.py,sha256=tg-zRlVSUSJ7B0cJbBsUwmMRmpIUmK5tsL8ABnY0wnY,46626
41
- nmdc_runtime/site/repository.py,sha256=kVCoIMF2rgAMUAf9a6jk0WbejFpmWgxh6nN4U37Mgc8,43919
39
+ nmdc_runtime/site/graphs.py,sha256=DoKK6B6xkSwRcY5PVVo6jV_IA4HI5qL8xW9_n94jVfQ,15990
40
+ nmdc_runtime/site/ops.py,sha256=atZNkU5mzRRqTnaW39fvq7gVO2sKSH8ztVOp8_dOLbU,48048
41
+ nmdc_runtime/site/repository.py,sha256=nHu1skayyTjJWwGEf5eToX02cgBNTG_kdSluzJZ6rJc,43695
42
42
  nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
43
- nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
43
+ nmdc_runtime/site/util.py,sha256=h70UJCT9g-I63EJn0drZjv1iaQ8LHJTbG29R9kqJ04c,1821
44
44
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
46
46
  nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -51,8 +51,8 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
51
51
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
52
52
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
53
53
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
55
- nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
54
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=4RqaT6qs1LDSiDDfF-JNZL5gOel8m65oCOelfr0blXs,26209
55
+ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=X35zbkxBxEyCnA9peY9YBAa_0oeoWy3DQEXoAXmc6vg,10100
56
56
  nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
57
57
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
@@ -61,13 +61,13 @@ nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14
61
61
  nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
62
  nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
63
63
  nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
64
- nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
64
+ nmdc_runtime/site/translation/gold_translator.py,sha256=HGbWeuxppqlVfU8F5oKTYIDoC6qaftugJeWFIALB9XE,32720
65
65
  nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
66
66
  nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
67
67
  nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
68
68
  nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
69
69
  nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
70
- nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
70
+ nmdc_runtime/site/translation/submission_portal_translator.py,sha256=UEeqlkz_YGqcnx8vomFysetOlXxDu23q0Ryr93SZy78,41684
71
71
  nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
72
72
  nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
73
73
  nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
75
75
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
76
76
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
77
77
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
78
- nmdc_runtime-2.5.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
79
- nmdc_runtime-2.5.0.dist-info/METADATA,sha256=tli66QKJC-48TzLXbI9iHMzTLyugbRBKj9CJEeKHXLY,8139
80
- nmdc_runtime-2.5.0.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
81
- nmdc_runtime-2.5.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
82
- nmdc_runtime-2.5.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
83
- nmdc_runtime-2.5.0.dist-info/RECORD,,
78
+ nmdc_runtime-2.7.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
79
+ nmdc_runtime-2.7.0.dist-info/METADATA,sha256=YgD6NKMOIO2FpMKIy7EWaGDTE_XkEM15ZXG2AhgMFFk,8155
80
+ nmdc_runtime-2.7.0.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
81
+ nmdc_runtime-2.7.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
82
+ nmdc_runtime-2.7.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
83
+ nmdc_runtime-2.7.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.0.2)
2
+ Generator: setuptools (80.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5