nmdc_runtime-2.7.0-py3-none-any.whl → nmdc_runtime-2.9.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

nmdc_runtime/config.py CHANGED
@@ -1 +1,57 @@
1
- DATABASE_CLASS_NAME = "Database"
1
+ """
2
+ This module acts as a unified interface between the codebase and the environment.
3
+ We will eventually move all of the Runtime's environment variables reads into this
4
+ module, instead of leaving them sprinkled throughout the codebase.
5
+
6
+ TODO: Move all environment variable reads into this module and update references accordingly.
7
+ """
8
+
9
+ from typing import Set
10
+ import os
11
+
12
+
13
+ def is_env_var_true(name: str, default: str = "false") -> bool:
14
+ r"""
15
+ Checks whether the value of the specified environment variable
16
+ meets our criteria for true-ness.
17
+
18
+ Reference: https://docs.python.org/3/library/os.html#os.environ
19
+
20
+ Run doctests via: $ python -m doctest nmdc_runtime/config.py
21
+
22
+ >>> import os
23
+ >>> name = "EXAMPLE_ENV_VAR"
24
+ >>> os.unsetenv(name) # Undefined
25
+ >>> is_env_var_true(name)
26
+ False
27
+ >>> is_env_var_true(name, "true") # Undefined, overridden default
28
+ True
29
+ >>> os.environ[name] = "false" # Defined as false
30
+ >>> is_env_var_true(name)
31
+ False
32
+ >>> os.environ[name] = "true" # Defined as true
33
+ >>> is_env_var_true(name)
34
+ True
35
+ >>> os.environ[name] = "TRUE" # Case-insensitive
36
+ >>> is_env_var_true(name)
37
+ True
38
+ >>> os.environ[name] = "potato" # Non-boolean string
39
+ >>> is_env_var_true(name)
40
+ False
41
+ """
42
+ lowercase_true_strings: Set[str] = {"true"}
43
+ return os.environ.get(name, default).lower() in lowercase_true_strings
44
+
45
+
46
+ # The name of the schema class representing the database. We don't bother to
47
+ # make this customizable via the environment, as we expect it to never change.
48
+ DATABASE_CLASS_NAME: str = "Database"
49
+
50
+ # Feature flag that can be used to enable/disable the `/nmdcschema/related_ids`
51
+ # endpoint and the tests that target it.
52
+ IS_RELATED_IDS_ENDPOINT_ENABLED: bool = is_env_var_true(
53
+ "IS_RELATED_IDS_ENDPOINT_ENABLED", default="true"
54
+ )
55
+
56
+ # Feature flag that can be used to enable/disable the `/scalar` endpoint.
57
+ IS_SCALAR_ENABLED: bool = is_env_var_true("IS_SCALAR_ENABLED", default="true")
@@ -0,0 +1,90 @@
1
+ from pymongo import MongoClient
2
+ from pymongo.database import Database
3
+ from pymongo.collection import Collection
4
+ from typing import Any, Mapping, Optional, Type, Callable
5
+ from pymongo.client_session import ClientSession
6
+ import inspect
7
+
8
+
9
+ def _wrap_with_session(obj: Any, name: str, session: Optional[ClientSession]) -> Any:
10
+ """
11
+ Wraps a callable attribute of an object to automatically include a session
12
+ if the callable accepts a 'session' keyword argument.
13
+ """
14
+ attr = getattr(obj, name)
15
+ if callable(attr):
16
+ signature = inspect.signature(attr)
17
+ parameters = signature.parameters
18
+ accepts_session = any(
19
+ param.name == "session"
20
+ for param in parameters.values()
21
+ if param.kind
22
+ in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
23
+ )
24
+
25
+ def wrapper(*args, **kwargs):
26
+ if session is not None and accepts_session and "session" not in kwargs:
27
+ kwargs["session"] = session
28
+ return attr(*args, **kwargs)
29
+
30
+ return wrapper
31
+ return attr
32
+
33
+
34
+ class SessionBoundCollection:
35
+ """
36
+ A wrapper around pymongo.collection.Collection that automatically passes a session
37
+ to methods that accept it.
38
+ """
39
+
40
+ def __init__(self, collection: Collection, session: Optional[ClientSession] = None):
41
+ self._collection = collection
42
+ self._session = session
43
+
44
+ def __getattr__(self, name: str):
45
+ return _wrap_with_session(self._collection, name, self._session)
46
+
47
+ def __getitem__(self, name: str) -> "SessionBoundCollection":
48
+ return SessionBoundCollection(self._collection[name], self._session)
49
+
50
+
51
+ class SessionBoundDatabase(Database):
52
+ """
53
+ A wrapper around pymongo.database.Database that automatically passes a session
54
+ to methods that accept it.
55
+ """
56
+
57
+ def __init__(self, database: Database, session: Optional[ClientSession] = None):
58
+ super().__init__(
59
+ database.client,
60
+ database.name,
61
+ database.codec_options,
62
+ database.read_preference,
63
+ database.write_concern,
64
+ database.read_concern,
65
+ )
66
+ self._database = database
67
+ self._session = session
68
+
69
+ def __getattr__(self, name: str):
70
+ return _wrap_with_session(self._database, name, self._session)
71
+
72
+ def __getitem__(self, name: str) -> SessionBoundCollection:
73
+ return SessionBoundCollection(self._database[name], self._session)
74
+
75
+ def get_collection(self, name: str, **kwargs) -> SessionBoundCollection:
76
+ """Get a :class:`~pymongo.collection.Collection` with the given name and options."""
77
+ collection = super().get_collection(name, **kwargs)
78
+ return SessionBoundCollection(collection, self._session)
79
+
80
+ @property
81
+ def client(self):
82
+ return self._database.client
83
+
84
+ @property
85
+ def unbounded(self):
86
+ return self._database
87
+
88
+ @property
89
+ def name(self):
90
+ return self._database.name
@@ -27,7 +27,11 @@ class NCBISubmissionXML:
27
27
  self.nmdc_study_id = nmdc_study.get("id")
28
28
  self.nmdc_study_title = nmdc_study.get("title")
29
29
  self.nmdc_study_description = nmdc_study.get("description")
30
- self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")
30
+ # get the first INSDC BioProject ID from the NMDC study
31
+ self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")[0]
32
+ # the value asserted in "insdc_bioproject_identifiers" will be a CURIE, so extract
33
+ # everything after the prefix and delimiter (":")
34
+ self.ncbi_bioproject_id = self.ncbi_bioproject_id.split(":")[-1]
31
35
  self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
32
36
  nmdc_study_pi_name = (
33
37
  nmdc_study.get("principal_investigator", {}).get("name").split()
@@ -251,7 +255,11 @@ class NCBISubmissionXML:
251
255
  children=[
252
256
  self.set_element(
253
257
  "Title",
254
- f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
258
+ attributes.get(
259
+ "name",
260
+ # fallback title if "name" is not present
261
+ f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
262
+ ),
255
263
  ),
256
264
  ],
257
265
  ),
@@ -577,18 +585,45 @@ class NCBISubmissionXML:
577
585
  biosample_library_preparation_list: list,
578
586
  instruments_dict: dict,
579
587
  ):
580
- data_type = None
581
- ncbi_project_id = None
588
+ # data_type = None
589
+
590
+ biosamples_to_exclude = set()
582
591
  for bsm_ntseq in biosample_nucleotide_sequencing_list:
583
- for _, ntseq_list in bsm_ntseq.items():
592
+ for bsm_id, ntseq_list in bsm_ntseq.items():
593
+ # Check if any processing_institution is "JGI"
584
594
  for ntseq in ntseq_list:
585
- if "analyte_category" in ntseq:
586
- data_type = handle_string_value(
587
- ntseq["analyte_category"]
588
- ).capitalize()
589
-
590
- if "ncbi_project_name" in ntseq:
591
- ncbi_project_id = ntseq["ncbi_project_name"]
595
+ if (
596
+ "processing_institution" in ntseq
597
+ and ntseq["processing_institution"] == "JGI"
598
+ ):
599
+ biosamples_to_exclude.add(bsm_id)
600
+ break
601
+
602
+ # Filter biosample_nucleotide_sequencing_list to exclude JGI records
603
+ filtered_nucleotide_sequencing_list = []
604
+ for bsm_ntseq in biosample_nucleotide_sequencing_list:
605
+ filtered_dict = {}
606
+ for bsm_id, ntseq_list in bsm_ntseq.items():
607
+ if bsm_id not in biosamples_to_exclude:
608
+ filtered_dict[bsm_id] = ntseq_list
609
+ if filtered_dict: # Only add non-empty dictionaries
610
+ filtered_nucleotide_sequencing_list.append(filtered_dict)
611
+
612
+ # Filter biosamples_list to exclude JGI-processed biosamples
613
+ filtered_biosamples_list = [
614
+ biosample
615
+ for biosample in biosamples_list
616
+ if biosample.get("id") not in biosamples_to_exclude
617
+ ]
618
+
619
+ # Get data_type from filtered list
620
+ # for bsm_ntseq in filtered_nucleotide_sequencing_list:
621
+ # for _, ntseq_list in bsm_ntseq.items():
622
+ # for ntseq in ntseq_list:
623
+ # if "analyte_category" in ntseq:
624
+ # data_type = handle_string_value(
625
+ # ntseq["analyte_category"]
626
+ # ).capitalize()
592
627
 
593
628
  self.set_description(
594
629
  email=self.nmdc_pi_email,
@@ -597,29 +632,65 @@ class NCBISubmissionXML:
597
632
  org=self.ncbi_submission_metadata.get("organization", ""),
598
633
  )
599
634
 
600
- if not ncbi_project_id:
601
- self.set_bioproject(
602
- title=self.nmdc_study_title,
603
- project_id=ncbi_project_id,
604
- description=self.nmdc_study_description,
605
- data_type=data_type,
606
- org=self.ncbi_submission_metadata.get("organization", ""),
607
- )
635
+ # if not self.ncbi_bioproject_id:
636
+ # self.set_bioproject(
637
+ # title=self.nmdc_study_title,
638
+ # project_id=self.ncbi_bioproject_id,
639
+ # description=self.nmdc_study_description,
640
+ # data_type=data_type,
641
+ # org=self.ncbi_submission_metadata.get("organization", ""),
642
+ # )
608
643
 
609
644
  self.set_biosample(
610
645
  organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
611
646
  org=self.ncbi_submission_metadata.get("organization", ""),
612
- bioproject_id=ncbi_project_id,
613
- nmdc_biosamples=biosamples_list,
647
+ bioproject_id=self.ncbi_bioproject_id,
648
+ nmdc_biosamples=filtered_biosamples_list,
614
649
  )
615
650
 
651
+ # Also filter biosample_data_objects_list
652
+ filtered_data_objects_list = []
653
+ acceptable_extensions = [".fastq.gz", ".fastq"]
654
+
655
+ for entry in biosample_data_objects_list:
656
+ filtered_entry = {}
657
+ for biosample_id, data_objects in entry.items():
658
+ if biosample_id not in biosamples_to_exclude:
659
+ # filter data_objects based on acceptable/allowed extensions
660
+ # for "url" key in data_object
661
+ filtered_objects = []
662
+ for data_object in data_objects:
663
+ if "url" in data_object:
664
+ url = urlparse(data_object["url"])
665
+ file_path = os.path.basename(url.path)
666
+ if any(
667
+ file_path.endswith(ext) for ext in acceptable_extensions
668
+ ):
669
+ filtered_objects.append(data_object)
670
+
671
+ if filtered_objects:
672
+ filtered_entry[biosample_id] = filtered_objects
673
+
674
+ if filtered_entry: # Only add non-empty entries
675
+ filtered_data_objects_list.append(filtered_entry)
676
+
677
+ # Filter library preparation list as well
678
+ filtered_library_preparation_list = []
679
+ for lib_prep_dict in biosample_library_preparation_list:
680
+ filtered_lib_prep = {}
681
+ for biosample_id, lib_prep in lib_prep_dict.items():
682
+ if biosample_id not in biosamples_to_exclude:
683
+ filtered_lib_prep[biosample_id] = lib_prep
684
+ if filtered_lib_prep: # Only add non-empty entries
685
+ filtered_library_preparation_list.append(filtered_lib_prep)
686
+
616
687
  self.set_fastq(
617
- biosample_data_objects=biosample_data_objects_list,
618
- bioproject_id=ncbi_project_id,
688
+ biosample_data_objects=filtered_data_objects_list,
689
+ bioproject_id=self.ncbi_bioproject_id,
619
690
  org=self.ncbi_submission_metadata.get("organization", ""),
620
- nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
621
- nmdc_biosamples=biosamples_list,
622
- nmdc_library_preparation=biosample_library_preparation_list,
691
+ nmdc_nucleotide_sequencing=filtered_nucleotide_sequencing_list,
692
+ nmdc_biosamples=filtered_biosamples_list,
693
+ nmdc_library_preparation=filtered_library_preparation_list,
623
694
  all_instruments=instruments_dict,
624
695
  )
625
696
 
@@ -99,31 +99,36 @@ def fetch_nucleotide_sequencing_from_biosamples(
99
99
  for biosample in biosamples_list:
100
100
  current_ids = [biosample["id"]]
101
101
  collected_ntseq_objects = []
102
+ processed_ids = set() # Track already processed nucleotide sequencing IDs
102
103
 
103
104
  while current_ids:
104
105
  new_current_ids = []
105
106
  for current_id in current_ids:
106
- query = {"has_input": current_id}
107
- document = all_docs_collection.find_one(query)
108
-
109
- if not document:
110
- continue
111
-
112
- has_output = document.get("has_output")
113
- if not has_output:
114
- continue
115
-
116
- for output_id in has_output:
117
- if get_classname_from_typecode(output_id) == "DataObject":
118
- nucleotide_sequencing_doc = data_generation_set.find_one(
119
- {"id": document["id"]}
120
- )
121
- if nucleotide_sequencing_doc:
122
- collected_ntseq_objects.append(
123
- strip_oid(nucleotide_sequencing_doc)
124
- )
125
- else:
126
- new_current_ids.append(output_id)
107
+ # Find all documents with current_id as input instead of just one
108
+ for document in all_docs_collection.find({"has_input": current_id}):
109
+ has_output = document.get("has_output")
110
+ if not has_output:
111
+ continue
112
+
113
+ for output_id in has_output:
114
+ if get_classname_from_typecode(output_id) == "DataObject":
115
+ # Only process if we haven't seen this document ID before
116
+ if document["id"] not in processed_ids:
117
+ nucleotide_sequencing_doc = (
118
+ data_generation_set.find_one(
119
+ {
120
+ "id": document["id"],
121
+ "type": "nmdc:NucleotideSequencing",
122
+ }
123
+ )
124
+ )
125
+ if nucleotide_sequencing_doc:
126
+ collected_ntseq_objects.append(
127
+ strip_oid(nucleotide_sequencing_doc)
128
+ )
129
+ processed_ids.add(document["id"])
130
+ else:
131
+ new_current_ids.append(output_id)
127
132
 
128
133
  current_ids = new_current_ids
129
134
 
@@ -187,10 +192,7 @@ def handle_quantity_value(slot_value):
187
192
  and "has_minimum_numeric_value" in slot_value
188
193
  and "has_unit" in slot_value
189
194
  ):
190
- range_value = (
191
- slot_value["has_maximum_numeric_value"]
192
- - slot_value["has_minimum_numeric_value"]
193
- )
195
+ range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
194
196
  return f"{range_value} {slot_value['has_unit']}"
195
197
  elif "has_raw_value" in slot_value:
196
198
  return slot_value["has_raw_value"]
@@ -1,4 +1,4 @@
1
- from dagster import graph
1
+ from dagster import graph, GraphIn
2
2
 
3
3
  from nmdc_runtime.site.ops import (
4
4
  build_merged_db,
@@ -22,6 +22,7 @@ from nmdc_runtime.site.ops import (
22
22
  filter_ops_done_object_puts,
23
23
  hello,
24
24
  mongo_stats,
25
+ run_script_to_update_insdc_biosample_identifiers,
25
26
  submit_metadata_to_db,
26
27
  filter_ops_undone_expired,
27
28
  construct_jobs,
@@ -50,6 +51,7 @@ from nmdc_runtime.site.ops import (
50
51
  get_df_from_url,
51
52
  site_code_mapping,
52
53
  materialize_alldocs,
54
+ load_ontology,
53
55
  get_ncbi_export_pipeline_study,
54
56
  get_data_objects_from_biosamples,
55
57
  get_nucleotide_sequencing_from_biosamples,
@@ -58,6 +60,7 @@ from nmdc_runtime.site.ops import (
58
60
  get_ncbi_export_pipeline_inputs,
59
61
  ncbi_submission_xml_from_nmdc_study,
60
62
  ncbi_submission_xml_asset,
63
+ render_text,
61
64
  get_database_updater_inputs,
62
65
  post_submission_portal_biosample_ingest_record_stitching_filename,
63
66
  generate_data_generation_set_post_biosample_ingest,
@@ -112,6 +115,16 @@ def ensure_alldocs():
112
115
  materialize_alldocs()
113
116
 
114
117
 
118
+ @graph
119
+ def run_ontology_load():
120
+ """
121
+ A graph for loading ontologies.
122
+ The source_ontology parameter is provided by the job configuration
123
+ and passed to the load_ontology op.
124
+ """
125
+ load_ontology()
126
+
127
+
115
128
  @graph
116
129
  def ensure_jobs():
117
130
  jobs = construct_jobs()
@@ -120,17 +133,24 @@ def ensure_jobs():
120
133
 
121
134
  @graph
122
135
  def apply_changesheet():
136
+ # Note: We use `_` as a "placeholder" variable.
137
+ # It's a variable to whose value we assign no significance. In this case, we use it to
138
+ # tell Dagster that one op depends upon the output of the other (so Dagster runs them
139
+ # in that order), without implying to maintainers that its value is significant to us.
140
+ # Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
141
+ # Reference (`_` variables): https://stackoverflow.com/a/47599668
123
142
  sheet_in = get_changesheet_in()
124
143
  outputs = perform_changesheet_updates(sheet_in)
125
- add_output_run_event(outputs)
126
- materialize_alldocs()
144
+ _ = add_output_run_event(outputs)
145
+ materialize_alldocs(waits_for=_)
127
146
 
128
147
 
129
148
  @graph
130
149
  def apply_metadata_in():
150
+ # Note: We use `_` as a "placeholder" variable.
131
151
  outputs = perform_mongo_updates(get_json_in())
132
- add_output_run_event(outputs)
133
- materialize_alldocs()
152
+ _ = add_output_run_event(outputs)
153
+ materialize_alldocs(waits_for=_)
134
154
 
135
155
 
136
156
  @graph
@@ -140,6 +160,7 @@ def gold_study_to_database():
140
160
  study_type,
141
161
  gold_nmdc_instrument_mapping_file_url,
142
162
  include_field_site_info,
163
+ enable_biosample_filtering,
143
164
  ) = get_gold_study_pipeline_inputs()
144
165
 
145
166
  projects = gold_projects_by_study(study_id)
@@ -156,6 +177,7 @@ def gold_study_to_database():
156
177
  analysis_projects,
157
178
  gold_nmdc_instrument_map_df,
158
179
  include_field_site_info,
180
+ enable_biosample_filtering,
159
181
  )
160
182
  database_dict = nmdc_schema_object_to_dict(database)
161
183
  filename = nmdc_schema_database_export_filename(study)
@@ -486,11 +508,19 @@ def nmdc_study_to_ncbi_submission_export():
486
508
 
487
509
  @graph
488
510
  def generate_data_generation_set_for_biosamples_in_nmdc_study():
489
- (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
511
+ (
512
+ study_id,
513
+ gold_nmdc_instrument_mapping_file_url,
514
+ include_field_site_info,
515
+ enable_biosample_filtering,
516
+ ) = get_database_updater_inputs()
490
517
  gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
491
518
 
492
519
  database = generate_data_generation_set_post_biosample_ingest(
493
- study_id, gold_nmdc_instrument_map_df
520
+ study_id,
521
+ gold_nmdc_instrument_map_df,
522
+ include_field_site_info,
523
+ enable_biosample_filtering,
494
524
  )
495
525
 
496
526
  database_dict = nmdc_schema_object_to_dict(database)
@@ -503,11 +533,19 @@ def generate_data_generation_set_for_biosamples_in_nmdc_study():
503
533
 
504
534
  @graph
505
535
  def generate_biosample_set_from_samples_in_gold():
506
- (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
536
+ (
537
+ study_id,
538
+ gold_nmdc_instrument_mapping_file_url,
539
+ include_field_site_info,
540
+ enable_biosample_filtering,
541
+ ) = get_database_updater_inputs()
507
542
  gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
508
543
 
509
544
  database = generate_biosample_set_for_nmdc_study_from_gold(
510
- study_id, gold_nmdc_instrument_map_df
545
+ study_id,
546
+ gold_nmdc_instrument_map_df,
547
+ include_field_site_info,
548
+ enable_biosample_filtering,
511
549
  )
512
550
  database_dict = nmdc_schema_object_to_dict(database)
513
551
  filename = post_submission_portal_biosample_ingest_record_stitching_filename(
@@ -515,3 +553,28 @@ def generate_biosample_set_from_samples_in_gold():
515
553
  )
516
554
  outputs = export_json_to_drs(database_dict, filename)
517
555
  add_output_run_event(outputs)
556
+
557
+
558
+ @graph
559
+ def generate_update_script_for_insdc_biosample_identifiers():
560
+ """Generate a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.
561
+
562
+ This graph fetches the necessary inputs, then calls the run_script_to_update_insdc_biosample_identifiers op
563
+ to generate a script for updating biosample records with INSDC identifiers obtained from GOLD.
564
+ The script is returned as a dictionary that can be executed against MongoDB.
565
+ """
566
+ (
567
+ study_id,
568
+ gold_nmdc_instrument_mapping_file_url,
569
+ include_field_site_info,
570
+ enable_biosample_filtering,
571
+ ) = get_database_updater_inputs()
572
+ gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
573
+
574
+ update_script = run_script_to_update_insdc_biosample_identifiers(
575
+ study_id,
576
+ gold_nmdc_instrument_map_df,
577
+ include_field_site_info,
578
+ enable_biosample_filtering,
579
+ )
580
+ render_text(update_script)