nmdc-runtime 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,6 @@ import xml.dom.minidom
7
7
  from typing import Any, List, Union
8
8
  from urllib.parse import urlparse
9
9
  from nmdc_runtime.site.export.ncbi_xml_utils import (
10
- get_instruments,
11
10
  handle_controlled_identified_term_value,
12
11
  handle_controlled_term_value,
13
12
  handle_geolocation_value,
@@ -24,31 +24,6 @@ def get_classname_from_typecode(doc_id):
24
24
  return class_map.get(typecode)
25
25
 
26
26
 
27
- def get_instruments(instrument_set_collection):
28
- # dictionary to capture a list of all instruments
29
- # Structure of dict:
30
- # {"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}
31
- all_instruments = {}
32
-
33
- try:
34
- query = {"type": "nmdc:Instrument"}
35
- cursor = instrument_set_collection.find(query)
36
-
37
- for document in cursor:
38
- instrument_id = document.get("id")
39
- vendor = document.get("vendor")
40
- model = document.get("model")
41
-
42
- if not instrument_id or not vendor or not model:
43
- continue
44
-
45
- all_instruments[instrument_id] = {"vendor": vendor, "model": model}
46
-
47
- return all_instruments
48
- except Exception as e:
49
- raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
50
-
51
-
52
27
  def fetch_data_objects_from_biosamples(
53
28
  all_docs_collection: Collection,
54
29
  data_object_set: Collection,
@@ -61,6 +61,8 @@ from nmdc_runtime.site.ops import (
61
61
  get_database_updater_inputs,
62
62
  post_submission_portal_biosample_ingest_record_stitching_filename,
63
63
  generate_data_generation_set_post_biosample_ingest,
64
+ get_instrument_ids_by_model,
65
+ log_database_ids,
64
66
  )
65
67
  from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
66
68
 
@@ -181,6 +183,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
181
183
  biosample_extras_slot_mapping = get_csv_rows_from_url(
182
184
  biosample_extras_slot_mapping_file_url
183
185
  )
186
+ instrument_mapping = get_instrument_ids_by_model()
184
187
 
185
188
  database = translate_portal_submission_to_nmdc_schema_database(
186
189
  metadata_submission,
@@ -188,10 +191,13 @@ def translate_metadata_submission_to_nmdc_schema_database():
188
191
  data_object_mapping=data_object_mapping,
189
192
  biosample_extras=biosample_extras,
190
193
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
194
+ instrument_mapping=instrument_mapping,
191
195
  )
192
196
 
193
197
  validate_metadata(database)
194
198
 
199
+ log_database_ids(database)
200
+
195
201
  database_dict = nmdc_schema_object_to_dict(database)
196
202
  filename = nmdc_schema_database_export_filename(metadata_submission)
197
203
  outputs = export_json_to_drs(database_dict, filename)
@@ -217,6 +223,7 @@ def ingest_metadata_submission():
217
223
  biosample_extras_slot_mapping = get_csv_rows_from_url(
218
224
  biosample_extras_slot_mapping_file_url
219
225
  )
226
+ instrument_mapping = get_instrument_ids_by_model()
220
227
 
221
228
  database = translate_portal_submission_to_nmdc_schema_database(
222
229
  metadata_submission,
@@ -224,7 +231,11 @@ def ingest_metadata_submission():
224
231
  data_object_mapping=data_object_mapping,
225
232
  biosample_extras=biosample_extras,
226
233
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
234
+ instrument_mapping=instrument_mapping,
227
235
  )
236
+
237
+ log_database_ids(database)
238
+
228
239
  run_id = submit_metadata_to_db(database)
229
240
  poll_for_run_completion(run_id)
230
241
 
nmdc_runtime/site/ops.py CHANGED
@@ -7,6 +7,7 @@ import tempfile
7
7
  from collections import defaultdict
8
8
  from datetime import datetime, timezone
9
9
  from io import BytesIO, StringIO
10
+ from pprint import pformat
10
11
  from toolz.dicttoolz import keyfilter
11
12
  from typing import Tuple
12
13
  from zipfile import ZipFile
@@ -38,7 +39,7 @@ from dagster import (
38
39
  Bool,
39
40
  )
40
41
  from gridfs import GridFS
41
- from linkml_runtime.dumpers import json_dumper
42
+ from linkml_runtime.utils.dictutils import as_simple_dict
42
43
  from linkml_runtime.utils.yamlutils import YAMLRoot
43
44
  from nmdc_runtime.api.db.mongo import get_mongo_db
44
45
  from nmdc_runtime.api.core.idgen import generate_one_id
@@ -69,7 +70,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
69
70
  fetch_data_objects_from_biosamples,
70
71
  fetch_nucleotide_sequencing_from_biosamples,
71
72
  fetch_library_preparation_from_biosamples,
72
- get_instruments,
73
73
  )
74
74
  from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
75
75
  from nmdc_runtime.site.resources import (
@@ -96,6 +96,7 @@ from nmdc_runtime.site.util import (
96
96
  run_and_log,
97
97
  schema_collection_has_index_on_id,
98
98
  nmdc_study_id_to_filename,
99
+ get_instruments_by_id,
99
100
  )
100
101
  from nmdc_runtime.util import (
101
102
  drs_object_in_for,
@@ -720,9 +721,8 @@ def translate_portal_submission_to_nmdc_schema_database(
720
721
  metadata_submission: Dict[str, Any],
721
722
  nucleotide_sequencing_mapping: List,
722
723
  data_object_mapping: List,
724
+ instrument_mapping: Dict[str, str],
723
725
  study_category: Optional[str],
724
- study_doi_category: Optional[str],
725
- study_doi_provider: Optional[str],
726
726
  study_pi_image_url: Optional[str],
727
727
  biosample_extras: Optional[list[dict]],
728
728
  biosample_extras_slot_mapping: Optional[list[dict]],
@@ -739,11 +739,10 @@ def translate_portal_submission_to_nmdc_schema_database(
739
739
  data_object_mapping=data_object_mapping,
740
740
  id_minter=id_minter,
741
741
  study_category=study_category,
742
- study_doi_category=study_doi_category,
743
- study_doi_provider=study_doi_provider,
744
742
  study_pi_image_url=study_pi_image_url,
745
743
  biosample_extras=biosample_extras,
746
744
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
745
+ illumina_instrument_mapping=instrument_mapping,
747
746
  )
748
747
  database = translator.get_database()
749
748
  return database
@@ -761,7 +760,7 @@ def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
761
760
 
762
761
  @op
763
762
  def nmdc_schema_object_to_dict(object: YAMLRoot) -> Dict[str, Any]:
764
- return json_dumper.to_dict(object)
763
+ return as_simple_dict(object)
765
764
 
766
765
 
767
766
  @op(required_resource_keys={"mongo"}, config_schema={"username": str})
@@ -1100,7 +1099,12 @@ def materialize_alldocs(context) -> int:
1100
1099
  write_operations = []
1101
1100
  documents_processed_counter = 0
1102
1101
  for doc in mdb[coll_name].find():
1103
- doc_type = doc["type"][5:] # lop off "nmdc:" prefix
1102
+ try:
1103
+ doc_type = doc["type"][5:] # lop off "nmdc:" prefix
1104
+ except KeyError:
1105
+ raise Exception(
1106
+ f"doc {doc['id']} in collection {coll_name} has no 'type'!"
1107
+ )
1104
1108
  slots_to_include = ["id", "type"] + document_reference_ranged_slots[
1105
1109
  doc_type
1106
1110
  ]
@@ -1222,11 +1226,26 @@ def get_library_preparation_from_biosamples(
1222
1226
 
1223
1227
 
1224
1228
  @op(required_resource_keys={"mongo"})
1225
- def get_all_instruments(context: OpExecutionContext):
1229
+ def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
1230
+ mdb = context.resources.mongo.db
1231
+ return get_instruments_by_id(mdb)
1232
+
1233
+
1234
+ @op(required_resource_keys={"mongo"})
1235
+ def get_instrument_ids_by_model(context: OpExecutionContext) -> dict[str, str]:
1226
1236
  mdb = context.resources.mongo.db
1227
- instrument_set_collection = mdb["instrument_set"]
1228
- all_instruments = get_instruments(instrument_set_collection)
1229
- return all_instruments
1237
+ instruments_by_id = get_instruments_by_id(mdb)
1238
+ instruments_by_model: dict[str, str] = {}
1239
+ for inst_id, instrument in instruments_by_id.items():
1240
+ model = instrument.get("model")
1241
+ if model is None:
1242
+ context.log.warning(f"Instrument {inst_id} has no model.")
1243
+ continue
1244
+ if model in instruments_by_model:
1245
+ context.log.warning(f"Instrument model {model} is not unique.")
1246
+ instruments_by_model[model] = inst_id
1247
+ context.log.info("Instrument models: %s", pformat(instruments_by_model))
1248
+ return instruments_by_model
1230
1249
 
1231
1250
 
1232
1251
  @op
@@ -1340,3 +1359,26 @@ def generate_biosample_set_for_nmdc_study_from_gold(
1340
1359
  database = database_updater.generate_biosample_set_from_gold_api_for_study()
1341
1360
 
1342
1361
  return database
1362
+
1363
+
1364
+ @op
1365
+ def log_database_ids(
1366
+ context: OpExecutionContext,
1367
+ database: nmdc.Database,
1368
+ ) -> None:
1369
+ """Log the IDs of the database."""
1370
+ database_dict = as_simple_dict(database)
1371
+ message = ""
1372
+ for collection_name, collection in database_dict.items():
1373
+ if not isinstance(collection, list):
1374
+ continue
1375
+ message += f"{collection_name} ({len(collection)}):\n"
1376
+ if len(collection) < 10:
1377
+ message += "\n".join(f" {doc['id']}" for doc in collection)
1378
+ else:
1379
+ message += "\n".join(f" {doc['id']}" for doc in collection[:4])
1380
+ message += f"\n ... {len(collection) - 8} more\n"
1381
+ message += "\n".join(f" {doc['id']}" for doc in collection[-4:])
1382
+ message += "\n"
1383
+ if message:
1384
+ context.log.info(message)
@@ -199,8 +199,20 @@ class DatabaseUpdater:
199
199
  if gbs.get("biosampleGoldId") not in nmdc_gold_ids
200
200
  ]
201
201
 
202
+ # use the GOLD study id to fetch all sequencing project records associated with the study
203
+ gold_sequencing_projects_for_study = (
204
+ self.gold_api_client.fetch_projects_by_study(gold_study_id)
205
+ )
206
+
207
+ # use the GOLD study id to fetch all analysis project records associated with the study
208
+ gold_analysis_projects_for_study = (
209
+ self.gold_api_client.fetch_analysis_projects_by_study(gold_study_id)
210
+ )
211
+
202
212
  gold_study_translator = GoldStudyTranslator(
203
213
  biosamples=missing_gold_biosamples,
214
+ projects=gold_sequencing_projects_for_study,
215
+ analysis_projects=gold_analysis_projects_for_study,
204
216
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
205
217
  )
206
218
 
@@ -553,8 +553,6 @@ def biosample_submission_ingest():
553
553
  "translate_portal_submission_to_nmdc_schema_database": {
554
554
  "inputs": {
555
555
  "study_category": "research_study",
556
- "study_doi_category": None,
557
- "study_doi_provider": None,
558
556
  "study_pi_image_url": None,
559
557
  }
560
558
  },
@@ -591,8 +589,6 @@ def biosample_submission_ingest():
591
589
  "translate_portal_submission_to_nmdc_schema_database": {
592
590
  "inputs": {
593
591
  "study_category": None,
594
- "study_doi_category": None,
595
- "study_doi_provider": None,
596
592
  "study_pi_image_url": None,
597
593
  }
598
594
  },
@@ -744,7 +740,7 @@ def biosample_submission_ingest():
744
740
  "config": {
745
741
  "benthic_data_product": {
746
742
  "product_id": "DP1.20279.001",
747
- "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
743
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
748
744
  }
749
745
  }
750
746
  },
@@ -771,7 +767,7 @@ def biosample_submission_ingest():
771
767
  "config": {
772
768
  "benthic_data_product": {
773
769
  "product_id": "DP1.20279.001",
774
- "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
770
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
775
771
  }
776
772
  }
777
773
  },
@@ -639,6 +639,16 @@ class GoldStudyTranslator(Translator):
639
639
  :return: nmdc:NucleotideSequencing object
640
640
  """
641
641
  gold_project_id = gold_project["projectGoldId"]
642
+ ncbi_bioproject_identifier = gold_project.get("ncbiBioProjectAccession")
643
+ insdc_bioproject_identifiers = []
644
+ if ncbi_bioproject_identifier:
645
+ insdc_bioproject_identifiers.append(
646
+ self._ensure_curie(
647
+ ncbi_bioproject_identifier,
648
+ default_prefix="bioproject",
649
+ )
650
+ )
651
+
642
652
  return nmdc.NucleotideSequencing(
643
653
  id=nmdc_nucleotide_sequencing_id,
644
654
  name=gold_project.get("projectName"),
@@ -650,6 +660,7 @@ class GoldStudyTranslator(Translator):
650
660
  has_input=nmdc_biosample_id,
651
661
  add_date=gold_project.get("addDate"),
652
662
  mod_date=self._get_mod_date(gold_project),
663
+ insdc_bioproject_identifiers=insdc_bioproject_identifiers,
653
664
  principal_investigator=self._get_pi(gold_project),
654
665
  processing_institution=self._get_processing_institution(gold_project),
655
666
  instrument_used=self._get_instrument(gold_project),