nmdc-runtime 2.6.0__py3-none-any.whl → 2.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

nmdc_runtime/config.py CHANGED
@@ -1 +1,5 @@
1
1
  DATABASE_CLASS_NAME = "Database"
2
+
3
+ # Feature flag that can be used to enable/disable the `/nmdcschema/related_ids`
4
+ # endpoint and the tests that target it.
5
+ IS_RELATED_IDS_ENDPOINT_ENABLED = False
@@ -0,0 +1,90 @@
1
+ from pymongo import MongoClient
2
+ from pymongo.database import Database
3
+ from pymongo.collection import Collection
4
+ from typing import Any, Mapping, Optional, Type, Callable
5
+ from pymongo.client_session import ClientSession
6
+ import inspect
7
+
8
+
9
+ def _wrap_with_session(obj: Any, name: str, session: Optional[ClientSession]) -> Any:
10
+ """
11
+ Wraps a callable attribute of an object to automatically include a session
12
+ if the callable accepts a 'session' keyword argument.
13
+ """
14
+ attr = getattr(obj, name)
15
+ if callable(attr):
16
+ signature = inspect.signature(attr)
17
+ parameters = signature.parameters
18
+ accepts_session = any(
19
+ param.name == "session"
20
+ for param in parameters.values()
21
+ if param.kind
22
+ in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
23
+ )
24
+
25
+ def wrapper(*args, **kwargs):
26
+ if session is not None and accepts_session and "session" not in kwargs:
27
+ kwargs["session"] = session
28
+ return attr(*args, **kwargs)
29
+
30
+ return wrapper
31
+ return attr
32
+
33
+
34
class SessionBoundCollection:
    """
    A wrapper around pymongo.collection.Collection that automatically passes a session
    to methods that accept it.

    Attribute access is delegated to the wrapped collection. Callable attributes
    are wrapped via `_wrap_with_session`; sub-collections (reached by attribute
    or by item access) are returned as `SessionBoundCollection` instances bound
    to the same session.
    """

    def __init__(self, collection: Collection, session: Optional[ClientSession] = None):
        self._collection = collection
        self._session = session

    def __getattr__(self, name: str):
        # `__getattr__` only runs when normal lookup fails. If our own private
        # attributes are missing (e.g. the instance was created without running
        # `__init__`), delegating through `self._collection` would recurse
        # without bound, so fail fast instead.
        if name in ("_collection", "_session"):
            raise AttributeError(name)

        attr = getattr(self._collection, name)
        if isinstance(attr, Collection):
            # PyMongo collections are themselves callable objects, so without
            # this check a dotted sub-collection would come back as a plain
            # wrapper function. Keep it consistent with `__getitem__`.
            return SessionBoundCollection(attr, self._session)
        return _wrap_with_session(self._collection, name, self._session)

    def __getitem__(self, name: str) -> "SessionBoundCollection":
        return SessionBoundCollection(self._collection[name], self._session)
49
+
50
+
51
class SessionBoundDatabase(Database):
    """
    A wrapper around pymongo.database.Database that automatically passes a session
    to methods that accept it.

    Collections obtained through attribute access, item access, or
    `get_collection` are returned as `SessionBoundCollection` instances bound
    to the same session.

    NOTE(review): because this class subclasses `Database`, any attribute that
    `Database` itself defines is found by normal lookup and never reaches
    `__getattr__`, so such inherited methods are presumably called without the
    bound session unless overridden here — confirm this is intended.
    """

    def __init__(self, database: Database, session: Optional[ClientSession] = None):
        # Initialize the base class with the same settings as the wrapped database.
        super().__init__(
            database.client,
            database.name,
            database.codec_options,
            database.read_preference,
            database.write_concern,
            database.read_concern,
        )
        self._database = database
        self._session = session

    def __getattr__(self, name: str):
        # `__getattr__` only runs when normal lookup fails. If our own private
        # attributes are not set yet, delegating through `self._database` would
        # recurse without bound, so fail fast instead.
        if name in ("_database", "_session"):
            raise AttributeError(name)

        attr = getattr(self._database, name)
        if isinstance(attr, Collection):
            # PyMongo collections are themselves callable objects, so without
            # this check `db.some_collection` would come back as a plain wrapper
            # function. Keep it consistent with `__getitem__`/`get_collection`.
            return SessionBoundCollection(attr, self._session)
        return _wrap_with_session(self._database, name, self._session)

    def __getitem__(self, name: str) -> SessionBoundCollection:
        return SessionBoundCollection(self._database[name], self._session)

    def get_collection(self, name: str, **kwargs) -> SessionBoundCollection:
        """Get a session-bound wrapper of the collection with the given name and options."""
        collection = super().get_collection(name, **kwargs)
        return SessionBoundCollection(collection, self._session)

    @property
    def client(self):
        """The client of the wrapped database."""
        return self._database.client

    @property
    def unbounded(self):
        """The wrapped database object, without session binding."""
        return self._database

    @property
    def name(self):
        """The name of the wrapped database."""
        return self._database.name
@@ -27,7 +27,11 @@ class NCBISubmissionXML:
27
27
  self.nmdc_study_id = nmdc_study.get("id")
28
28
  self.nmdc_study_title = nmdc_study.get("title")
29
29
  self.nmdc_study_description = nmdc_study.get("description")
30
- self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")
30
+ # get the first INSDC BioProject ID from the NMDC study
31
+ self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")[0]
32
+ # the value asserted in "insdc_bioproject_identifiers" will be a CURIE, so extract
33
+ # everything after the prefix and delimiter (":")
34
+ self.ncbi_bioproject_id = self.ncbi_bioproject_id.split(":")[-1]
31
35
  self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
32
36
  nmdc_study_pi_name = (
33
37
  nmdc_study.get("principal_investigator", {}).get("name").split()
@@ -251,7 +255,11 @@ class NCBISubmissionXML:
251
255
  children=[
252
256
  self.set_element(
253
257
  "Title",
254
- f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
258
+ attributes.get(
259
+ "name",
260
+ # fallback title if "name" is not present
261
+ f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
262
+ ),
255
263
  ),
256
264
  ],
257
265
  ),
@@ -577,18 +585,45 @@ class NCBISubmissionXML:
577
585
  biosample_library_preparation_list: list,
578
586
  instruments_dict: dict,
579
587
  ):
580
- data_type = None
581
- ncbi_project_id = None
588
+ # data_type = None
589
+
590
+ biosamples_to_exclude = set()
582
591
  for bsm_ntseq in biosample_nucleotide_sequencing_list:
583
- for _, ntseq_list in bsm_ntseq.items():
592
+ for bsm_id, ntseq_list in bsm_ntseq.items():
593
+ # Check if any processing_institution is "JGI"
584
594
  for ntseq in ntseq_list:
585
- if "analyte_category" in ntseq:
586
- data_type = handle_string_value(
587
- ntseq["analyte_category"]
588
- ).capitalize()
589
-
590
- if "ncbi_project_name" in ntseq:
591
- ncbi_project_id = ntseq["ncbi_project_name"]
595
+ if (
596
+ "processing_institution" in ntseq
597
+ and ntseq["processing_institution"] == "JGI"
598
+ ):
599
+ biosamples_to_exclude.add(bsm_id)
600
+ break
601
+
602
+ # Filter biosample_nucleotide_sequencing_list to exclude JGI records
603
+ filtered_nucleotide_sequencing_list = []
604
+ for bsm_ntseq in biosample_nucleotide_sequencing_list:
605
+ filtered_dict = {}
606
+ for bsm_id, ntseq_list in bsm_ntseq.items():
607
+ if bsm_id not in biosamples_to_exclude:
608
+ filtered_dict[bsm_id] = ntseq_list
609
+ if filtered_dict: # Only add non-empty dictionaries
610
+ filtered_nucleotide_sequencing_list.append(filtered_dict)
611
+
612
+ # Filter biosamples_list to exclude JGI-processed biosamples
613
+ filtered_biosamples_list = [
614
+ biosample
615
+ for biosample in biosamples_list
616
+ if biosample.get("id") not in biosamples_to_exclude
617
+ ]
618
+
619
+ # Get data_type from filtered list
620
+ # for bsm_ntseq in filtered_nucleotide_sequencing_list:
621
+ # for _, ntseq_list in bsm_ntseq.items():
622
+ # for ntseq in ntseq_list:
623
+ # if "analyte_category" in ntseq:
624
+ # data_type = handle_string_value(
625
+ # ntseq["analyte_category"]
626
+ # ).capitalize()
592
627
 
593
628
  self.set_description(
594
629
  email=self.nmdc_pi_email,
@@ -597,29 +632,65 @@ class NCBISubmissionXML:
597
632
  org=self.ncbi_submission_metadata.get("organization", ""),
598
633
  )
599
634
 
600
- if not ncbi_project_id:
601
- self.set_bioproject(
602
- title=self.nmdc_study_title,
603
- project_id=ncbi_project_id,
604
- description=self.nmdc_study_description,
605
- data_type=data_type,
606
- org=self.ncbi_submission_metadata.get("organization", ""),
607
- )
635
+ # if not self.ncbi_bioproject_id:
636
+ # self.set_bioproject(
637
+ # title=self.nmdc_study_title,
638
+ # project_id=self.ncbi_bioproject_id,
639
+ # description=self.nmdc_study_description,
640
+ # data_type=data_type,
641
+ # org=self.ncbi_submission_metadata.get("organization", ""),
642
+ # )
608
643
 
609
644
  self.set_biosample(
610
645
  organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
611
646
  org=self.ncbi_submission_metadata.get("organization", ""),
612
- bioproject_id=ncbi_project_id,
613
- nmdc_biosamples=biosamples_list,
647
+ bioproject_id=self.ncbi_bioproject_id,
648
+ nmdc_biosamples=filtered_biosamples_list,
614
649
  )
615
650
 
651
+ # Also filter biosample_data_objects_list
652
+ filtered_data_objects_list = []
653
+ acceptable_extensions = [".fastq.gz", ".fastq"]
654
+
655
+ for entry in biosample_data_objects_list:
656
+ filtered_entry = {}
657
+ for biosample_id, data_objects in entry.items():
658
+ if biosample_id not in biosamples_to_exclude:
659
+ # filter data_objects based on acceptable/allowed extensions
660
+ # for "url" key in data_object
661
+ filtered_objects = []
662
+ for data_object in data_objects:
663
+ if "url" in data_object:
664
+ url = urlparse(data_object["url"])
665
+ file_path = os.path.basename(url.path)
666
+ if any(
667
+ file_path.endswith(ext) for ext in acceptable_extensions
668
+ ):
669
+ filtered_objects.append(data_object)
670
+
671
+ if filtered_objects:
672
+ filtered_entry[biosample_id] = filtered_objects
673
+
674
+ if filtered_entry: # Only add non-empty entries
675
+ filtered_data_objects_list.append(filtered_entry)
676
+
677
+ # Filter library preparation list as well
678
+ filtered_library_preparation_list = []
679
+ for lib_prep_dict in biosample_library_preparation_list:
680
+ filtered_lib_prep = {}
681
+ for biosample_id, lib_prep in lib_prep_dict.items():
682
+ if biosample_id not in biosamples_to_exclude:
683
+ filtered_lib_prep[biosample_id] = lib_prep
684
+ if filtered_lib_prep: # Only add non-empty entries
685
+ filtered_library_preparation_list.append(filtered_lib_prep)
686
+
616
687
  self.set_fastq(
617
- biosample_data_objects=biosample_data_objects_list,
618
- bioproject_id=ncbi_project_id,
688
+ biosample_data_objects=filtered_data_objects_list,
689
+ bioproject_id=self.ncbi_bioproject_id,
619
690
  org=self.ncbi_submission_metadata.get("organization", ""),
620
- nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
621
- nmdc_biosamples=biosamples_list,
622
- nmdc_library_preparation=biosample_library_preparation_list,
691
+ nmdc_nucleotide_sequencing=filtered_nucleotide_sequencing_list,
692
+ nmdc_biosamples=filtered_biosamples_list,
693
+ nmdc_library_preparation=filtered_library_preparation_list,
623
694
  all_instruments=instruments_dict,
624
695
  )
625
696
 
@@ -99,31 +99,36 @@ def fetch_nucleotide_sequencing_from_biosamples(
99
99
  for biosample in biosamples_list:
100
100
  current_ids = [biosample["id"]]
101
101
  collected_ntseq_objects = []
102
+ processed_ids = set() # Track already processed nucleotide sequencing IDs
102
103
 
103
104
  while current_ids:
104
105
  new_current_ids = []
105
106
  for current_id in current_ids:
106
- query = {"has_input": current_id}
107
- document = all_docs_collection.find_one(query)
108
-
109
- if not document:
110
- continue
111
-
112
- has_output = document.get("has_output")
113
- if not has_output:
114
- continue
115
-
116
- for output_id in has_output:
117
- if get_classname_from_typecode(output_id) == "DataObject":
118
- nucleotide_sequencing_doc = data_generation_set.find_one(
119
- {"id": document["id"]}
120
- )
121
- if nucleotide_sequencing_doc:
122
- collected_ntseq_objects.append(
123
- strip_oid(nucleotide_sequencing_doc)
124
- )
125
- else:
126
- new_current_ids.append(output_id)
107
+ # Find all documents with current_id as input instead of just one
108
+ for document in all_docs_collection.find({"has_input": current_id}):
109
+ has_output = document.get("has_output")
110
+ if not has_output:
111
+ continue
112
+
113
+ for output_id in has_output:
114
+ if get_classname_from_typecode(output_id) == "DataObject":
115
+ # Only process if we haven't seen this document ID before
116
+ if document["id"] not in processed_ids:
117
+ nucleotide_sequencing_doc = (
118
+ data_generation_set.find_one(
119
+ {
120
+ "id": document["id"],
121
+ "type": "nmdc:NucleotideSequencing",
122
+ }
123
+ )
124
+ )
125
+ if nucleotide_sequencing_doc:
126
+ collected_ntseq_objects.append(
127
+ strip_oid(nucleotide_sequencing_doc)
128
+ )
129
+ processed_ids.add(document["id"])
130
+ else:
131
+ new_current_ids.append(output_id)
127
132
 
128
133
  current_ids = new_current_ids
129
134
 
@@ -187,10 +192,7 @@ def handle_quantity_value(slot_value):
187
192
  and "has_minimum_numeric_value" in slot_value
188
193
  and "has_unit" in slot_value
189
194
  ):
190
- range_value = (
191
- slot_value["has_maximum_numeric_value"]
192
- - slot_value["has_minimum_numeric_value"]
193
- )
195
+ range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
194
196
  return f"{range_value} {slot_value['has_unit']}"
195
197
  elif "has_raw_value" in slot_value:
196
198
  return slot_value["has_raw_value"]
@@ -1,4 +1,4 @@
1
- from dagster import graph
1
+ from dagster import graph, GraphIn
2
2
 
3
3
  from nmdc_runtime.site.ops import (
4
4
  build_merged_db,
@@ -22,6 +22,7 @@ from nmdc_runtime.site.ops import (
22
22
  filter_ops_done_object_puts,
23
23
  hello,
24
24
  mongo_stats,
25
+ run_script_to_update_insdc_biosample_identifiers,
25
26
  submit_metadata_to_db,
26
27
  filter_ops_undone_expired,
27
28
  construct_jobs,
@@ -50,6 +51,7 @@ from nmdc_runtime.site.ops import (
50
51
  get_df_from_url,
51
52
  site_code_mapping,
52
53
  materialize_alldocs,
54
+ load_ontology,
53
55
  get_ncbi_export_pipeline_study,
54
56
  get_data_objects_from_biosamples,
55
57
  get_nucleotide_sequencing_from_biosamples,
@@ -58,6 +60,7 @@ from nmdc_runtime.site.ops import (
58
60
  get_ncbi_export_pipeline_inputs,
59
61
  ncbi_submission_xml_from_nmdc_study,
60
62
  ncbi_submission_xml_asset,
63
+ render_text,
61
64
  get_database_updater_inputs,
62
65
  post_submission_portal_biosample_ingest_record_stitching_filename,
63
66
  generate_data_generation_set_post_biosample_ingest,
@@ -112,6 +115,16 @@ def ensure_alldocs():
112
115
  materialize_alldocs()
113
116
 
114
117
 
118
@graph
def run_ontology_load():
    """
    A graph for loading ontologies.

    The graph consists of a single invocation of the `load_ontology` op and
    passes it no inputs. The `source_ontology` parameter is provided by the
    job configuration and passed to the `load_ontology` op.
    """
    load_ontology()
126
+
127
+
115
128
  @graph
116
129
  def ensure_jobs():
117
130
  jobs = construct_jobs()
@@ -120,17 +133,24 @@ def ensure_jobs():
120
133
 
121
134
  @graph
122
135
  def apply_changesheet():
136
+ # Note: We use `_` as a "placeholder" variable.
137
+ # It's a variable to whose value we assign no significance. In this case, we use it to
138
+ # tell Dagster that one op depends upon the output of the other (so Dagster runs them
139
+ # in that order), without implying to maintainers that its value is significant to us.
140
+ # Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
141
+ # Reference (`_` variables): https://stackoverflow.com/a/47599668
123
142
  sheet_in = get_changesheet_in()
124
143
  outputs = perform_changesheet_updates(sheet_in)
125
- add_output_run_event(outputs)
126
- materialize_alldocs()
144
+ _ = add_output_run_event(outputs)
145
+ materialize_alldocs(waits_for=_)
127
146
 
128
147
 
129
148
  @graph
130
149
  def apply_metadata_in():
150
+ # Note: We use `_` as a "placeholder" variable.
131
151
  outputs = perform_mongo_updates(get_json_in())
132
- add_output_run_event(outputs)
133
- materialize_alldocs()
152
+ _ = add_output_run_event(outputs)
153
+ materialize_alldocs(waits_for=_)
134
154
 
135
155
 
136
156
  @graph
@@ -515,3 +535,20 @@ def generate_biosample_set_from_samples_in_gold():
515
535
  )
516
536
  outputs = export_json_to_drs(database_dict, filename)
517
537
  add_output_run_event(outputs)
538
+
539
+
540
@graph
def generate_update_script_for_insdc_biosample_identifiers():
    """Build and render an update script for INSDC biosample identifiers.

    Steps wired together by this graph:

    1. `get_database_updater_inputs` yields the study ID and the URL of the
       GOLD-to-NMDC instrument mapping file.
    2. `get_df_from_url` loads that mapping file.
    3. `run_script_to_update_insdc_biosample_identifiers` receives the study ID
       and the loaded mapping and produces the update script.
    4. `render_text` receives the resulting script.
    """
    study_id, mapping_file_url = get_database_updater_inputs()

    render_text(
        run_script_to_update_insdc_biosample_identifiers(
            study_id,
            get_df_from_url(mapping_file_url),
        )
    )