nmdc-runtime 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -2,8 +2,9 @@ import os
 from functools import lru_cache
 from typing import List
 
-from nmdc_runtime.util import get_nmdc_jsonschema_dict
+from nmdc_schema.id_helpers import get_typecode_for_future_ids
 
+from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from nmdc_runtime.api.db.mongo import get_mongo_db
 
 
@@ -12,55 +13,24 @@ def minting_service_id() -> str | None:
     return os.getenv("MINTING_SERVICE_ID")
 
 
-def extract_typecode_from_pattern(pattern: str) -> str:
-    r"""
-    Returns the typecode portion of the specified string.
-
-    >>> extract_typecode_from_pattern("foo-123-456$")  # original behavior
-    'foo'
-    >>> extract_typecode_from_pattern("(foo)-123-456$")  # returns first and only typecode
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar)-123-456$")  # returns first of 2 typecodes
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$")  # returns first of > 2 typecodes
-    'foo'
-    """
-
-    # Get the portion of the pattern preceding the first hyphen.
-    # e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
-    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
-
-    # If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
-    # e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
-    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
-        inner_pattern = typecode_sub_pattern[1:-1]
-
-        # Finally, get everything before the first `|`, if any.
-        # e.g. "apple|banana|carrot" → "apple"
-        # e.g. "apple" → "apple"
-        typecode = inner_pattern.split("|", maxsplit=1)[0]
-    else:
-        # Note: This is the original behavior, before we added support for multi-typecode patterns.
-        # e.g. "apple" → "apple"
-        typecode = typecode_sub_pattern
-
-    return typecode
-
-
 @lru_cache()
 def typecodes() -> List[dict]:
     r"""
     Returns a list of dictionaries containing typecodes and associated information derived from the schema.
 
-    Preconditions about the schema:
-    - The typecode portion of the pattern is between the pattern prefix and the first subsequent hyphen.
-    - The typecode portion of the pattern either consists of a single typecode verbatim (e.g. "foo");
-      or consists of multiple typecodes in a pipe-delimited list enclosed in parentheses (e.g. "(foo|bar|baz)").
-    - The typecode portion of the pattern does not, itself, contain any hyphens.
-
-    TODO: Get the typecodes in a different way than by extracting them from a larger string, which seems brittle to me.
-          Getting them a different way may require schema authors to _define_ them a different way (e.g. defining them
-          in a dedicated property of a class; for example, one named `typecode`).
+    Note: In this function, we rely on a helper function provided by the `nmdc-schema` package to extract, from a given
+    class's `id` slot's pattern, the typecode that the minter would use when generating an ID for an instance of
+    that class _today_, regardless of what it may have used in the past.
+
+    >>> typecode_descriptors = typecodes()
+    # Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
+    >>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
+    True
+    # Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
+    >>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    True
+    >>> any((td["name"] == "omprc" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    False
     """
     id_pattern_prefix = r"^(nmdc):"
 
@@ -69,16 +39,14 @@ def typecodes() -> List[dict]:
     for cls_name, defn in schema_dict["$defs"].items():
         match defn.get("properties"):
             case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
-                # Get the portion of the pattern following the prefix.
-                # e.g. "^(nmdc):foo-bar-baz" → "foo-bar-baz"
-                index_of_first_character_following_prefix = len(id_pattern_prefix)
-                pattern_without_prefix = p[index_of_first_character_following_prefix:]
+                # Extract the typecode from the pattern.
+                typecode_for_future_ids = get_typecode_for_future_ids(slot_pattern=p)
 
                 rv.append(
                     {
                         "id": "nmdc:" + cls_name + "_" + "typecode",
                         "schema_class": "nmdc:" + cls_name,
-                        "name": extract_typecode_from_pattern(pattern_without_prefix),
+                        "name": typecode_for_future_ids,
                     }
                 )
             case _:
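
For context, the removed `extract_typecode_from_pattern` did this extraction with plain string splitting. A minimal sketch of that removed logic, using hypothetical patterns, shows what the `nmdc-schema` helper now decides for itself:

    def first_typecode(pattern_without_prefix: str) -> str:
        # "sty-..." -> "sty"; "(dgms|omprc)-..." -> "dgms"
        head = pattern_without_prefix.split("-", maxsplit=1)[0]
        if head.startswith("(") and head.endswith(")"):
            head = head[1:-1].split("|", maxsplit=1)[0]
        return head

    assert first_typecode("sty-11-abc$") == "sty"
    assert first_typecode("(dgms|omprc)-11-abc$") == "dgms"

Unlike this first-alternative heuristic, `get_typecode_for_future_ids` is described in the docstring above as returning the typecode the minter would use for newly minted IDs, which need not coincide with the first alternative listed in a multi-typecode pattern.
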
@@ -4,7 +4,7 @@ import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 
-from typing import Any
+from typing import Any, List, Union
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     get_instruments,
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
                 )
                 # Currently, we are making the assumption that only one instrument
                 # is used to sequence a Biosample
-                instrument_id = ntseq.get("instrument_used", "")[0]
+                instrument_used: List[str] = ntseq.get(
+                    "instrument_used", []
+                )
+                if not instrument_used:
+                    instrument_id = None
+                else:
+                    instrument_id = instrument_used[0]
+
                 instrument = all_instruments.get(instrument_id, {})
                 instrument_vendor = instrument.get("vendor", "")
                 instrument_model = instrument.get("model", "")
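
The old line raised an `IndexError` whenever `instrument_used` was absent, since the fallback `""` has no element 0. The replacement spells the guard out; an equivalent one-liner, shown only as an alternative idiom with a hypothetical stand-in record:

    # `ntseq` here is a stand-in dict for the record in the code above.
    ntseq = {"instrument_used": ["nmdc:inst-example"]}  # hypothetical id
    instrument_id = next(iter(ntseq.get("instrument_used", [])), None)  # None when absent/empty
    print(instrument_id)
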
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
                             "Attribute", "NextSeq 550", {"name": "instrument_model"}
                         )
                     )
+                elif instrument_model == "novaseq_6000":
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute",
+                            "NovaSeq 6000",
+                            {"name": "instrument_model"},
+                        )
+                    )
+                elif instrument_model == "hiseq":
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", "HiSeq", {"name": "instrument_model"}
+                        )
+                    )
 
                 if analyte_category == "metagenome":
                     sra_attributes.append(
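
If more sequencer models are added later, the growing elif-chain could become a lookup table. A hedged sketch of that alternative (the `nextseq_550` key is an assumption; only its label appears in this diff):

    # Hypothetical table-driven alternative to the elif chain above.
    INSTRUMENT_MODEL_LABELS = {
        "nextseq_550": "NextSeq 550",   # assumed key; not shown in this diff
        "novaseq_6000": "NovaSeq 6000",
        "hiseq": "HiSeq",
    }

    def sra_instrument_label(instrument_model: str) -> str | None:
        # Returns the SRA display label for a known model key, else None.
        return INSTRUMENT_MODEL_LABELS.get(instrument_model)

    assert sra_instrument_label("novaseq_6000") == "NovaSeq 6000"
    assert sra_instrument_label("unknown") is None
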
@@ -1,6 +1,10 @@
 from io import BytesIO, StringIO
+from typing import Any, Dict, List, Union
+
+from nmdc_runtime.api.endpoints.util import strip_oid
 from nmdc_runtime.minter.config import typecodes
 from lxml import etree
+from pymongo.collection import Collection
 
 import csv
 import requests
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
        raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
 
 
-def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
+def fetch_data_objects_from_biosamples(
+    all_docs_collection: Collection,
+    data_object_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the data objects that are "associated" with (i.e., derived from or
+    products of) their respective biosamples, by iterating over the alldocs collection recursively.
+    The method returns a list of dictionaries with biosample ids as keys and the associated list of
+    data objects as values.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_object_set: reference to the data_object_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated data objects as values
+    """
+    biosample_data_objects = []
+
+    def collect_data_objects(doc_ids, collected_objects, unique_ids):
+        for doc_id in doc_ids:
+            if (
+                get_classname_from_typecode(doc_id) == "DataObject"
+                and doc_id not in unique_ids
+            ):
+                data_obj = data_object_set.find_one({"id": doc_id})
+                if data_obj:
+                    collected_objects.append(strip_oid(data_obj))
+                    unique_ids.add(doc_id)
+
     biosample_data_objects = []
 
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_data_objects = []
+        unique_ids = set()
 
         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-                query = {"has_input": current_id}
-                document = all_docs_collection.find_one(query)
+                for doc in all_docs_collection.find({"has_input": current_id}):
+                    has_output = doc.get("has_output", [])
 
-                if not document:
-                    continue
-
-                has_output = document.get("has_output")
-                if not has_output:
-                    continue
-
-                for output_id in has_output:
-                    if get_classname_from_typecode(output_id) == "DataObject":
-                        data_object_doc = all_docs_collection.find_one(
-                            {"id": output_id}
-                        )
-                        if data_object_doc:
-                            collected_data_objects.append(data_object_doc)
-                    else:
-                        new_current_ids.append(output_id)
+                    collect_data_objects(has_output, collected_data_objects, unique_ids)
+                    new_current_ids.extend(
+                        op
+                        for op in has_output
+                        if get_classname_from_typecode(op) != "DataObject"
+                    )
 
             current_ids = new_current_ids
 
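The rewritten loop is a breadth-first traversal of the provenance graph: every document whose `has_input` references the current frontier is visited, `has_output` IDs that resolve to `DataObject`s are collected (deduplicated via `unique_ids`), and the remaining output IDs become the next frontier. A self-contained sketch of the same walk against plain pymongo collections; the database name, starting biosample ID, and the "dobj" typecode check are assumptions:

    from pymongo import MongoClient

    def walk_outputs(alldocs, data_object_set, start_id, is_data_object):
        """Breadth-first walk over has_input/has_output edges, collecting DataObjects."""
        frontier, seen, collected = [start_id], set(), []
        while frontier:
            next_frontier = []
            for current_id in frontier:
                for doc in alldocs.find({"has_input": current_id}):
                    for out_id in doc.get("has_output", []):
                        if not is_data_object(out_id):
                            next_frontier.append(out_id)  # keep expanding non-DataObjects
                        elif out_id not in seen:
                            obj = data_object_set.find_one({"id": out_id})
                            if obj:
                                collected.append(obj)
                                seen.add(out_id)
            frontier = next_frontier
        return collected

    # Usage sketch; assumes a local MongoDB and that DataObject IDs carry the "dobj" typecode.
    mdb = MongoClient()["nmdc"]
    objs = walk_outputs(
        mdb["alldocs"],
        mdb["data_object_set"],
        "nmdc:bsm-11-abc123",
        lambda i: i.split(":", 1)[-1].startswith("dobj"),
    )
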
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     return biosample_data_objects
 
 
-def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
-    biosample_data_objects = []
+def fetch_nucleotide_sequencing_from_biosamples(
+    all_docs_collection: Collection,
+    data_generation_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the nucleotide sequencing process records that create data objects
+    for biosamples by iterating over the alldocs collection recursively.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_generation_set: reference to the data_generation_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
+        process objects as values
+    """
+    biosample_ntseq_objects = []
 
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
-        collected_data_objects = []
+        collected_ntseq_objects = []
 
         while current_ids:
             new_current_ids = []
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
 
                 for output_id in has_output:
                     if get_classname_from_typecode(output_id) == "DataObject":
-                        nucleotide_sequencing_doc = all_docs_collection.find_one(
+                        nucleotide_sequencing_doc = data_generation_set.find_one(
                             {"id": document["id"]}
                         )
                        if nucleotide_sequencing_doc:
-                            collected_data_objects.append(nucleotide_sequencing_doc)
+                            collected_ntseq_objects.append(
+                                strip_oid(nucleotide_sequencing_doc)
+                            )
                    else:
                        new_current_ids.append(output_id)
 
            current_ids = new_current_ids
 
-        if collected_data_objects:
-            biosample_data_objects.append({biosample["id"]: collected_data_objects})
+        if collected_ntseq_objects:
+            biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
+
+    return biosample_ntseq_objects
 
-    return biosample_data_objects
 
+def fetch_library_preparation_from_biosamples(
+    all_docs_collection: Collection,
+    material_processing_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the library preparation process records that create processed samples,
+    which are subsequently fed (via the `has_input` slot) into a nucleotide sequencing process,
+    for biosamples by iterating over the alldocs collection recursively.
 
-def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
+    :param all_docs_collection: reference to the alldocs collection
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated library preparation process
+        objects as values
+    """
     biosample_lib_prep = []
 
     for biosample in biosamples_list:
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
                         "has_input": output_id,
                         "type": {"$in": ["LibraryPreparation"]},
                     }
-                    lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
+                    lib_prep_doc = material_processing_set.find_one(lib_prep_query)
 
                     if lib_prep_doc:
-                        biosample_lib_prep.append({biosample_id: lib_prep_doc})
+                        biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
                         break  # Stop at the first document that meets the criteria
 
     return biosample_lib_prep
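
With the new signatures, each helper receives the concrete collection it should read process or data-object records from, instead of resolving everything through `alldocs`, and `strip_oid` removes Mongo's `_id` before the documents are returned. A hedged usage sketch (the collection names follow this diff; the connection details and the `biosample_set` sampling are assumptions):

    from pymongo import MongoClient
    from nmdc_runtime.site.export.ncbi_xml_utils import (
        fetch_data_objects_from_biosamples,
        fetch_library_preparation_from_biosamples,
        fetch_nucleotide_sequencing_from_biosamples,
    )

    mdb = MongoClient("mongodb://localhost:27017")["nmdc"]  # assumed connection and db name
    biosamples = list(mdb["biosample_set"].find().limit(5))

    data_objects = fetch_data_objects_from_biosamples(
        mdb["alldocs"], mdb["data_object_set"], biosamples
    )
    ntseq_records = fetch_nucleotide_sequencing_from_biosamples(
        mdb["alldocs"], mdb["data_generation_set"], biosamples
    )
    lib_preps = fetch_library_preparation_from_biosamples(
        mdb["alldocs"], mdb["material_processing_set"], biosamples
    )
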
@@ -2,6 +2,7 @@ from dagster import graph
 
 from nmdc_runtime.site.ops import (
     build_merged_db,
+    generate_biosample_set_for_nmdc_study_from_gold,
     nmdc_schema_database_export_filename,
     nmdc_schema_database_from_gold_study,
     nmdc_schema_object_to_dict,
@@ -57,6 +58,9 @@ from nmdc_runtime.site.ops import (
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
+    get_database_updater_inputs,
+    post_submission_portal_biosample_ingest_record_stitching_filename,
+    generate_data_generation_set_post_biosample_ingest,
 )
 from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
@@ -117,12 +121,14 @@ def apply_changesheet():
     sheet_in = get_changesheet_in()
     outputs = perform_changesheet_updates(sheet_in)
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
 def apply_metadata_in():
     outputs = perform_mongo_updates(get_json_in())
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
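
Both graphs now end by invoking `materialize_alldocs()`, so the materialized `alldocs` view is rebuilt after metadata changes land. Note that Dagster only guarantees op ordering where there is a dependency; whether `materialize_alldocs` declares one is not visible in this diff. A sketch of the standard Dagster idiom for forcing run-after ordering with a `Nothing` input (all names below are hypothetical):

    from dagster import In, Nothing, graph, op

    @op
    def perform_updates():
        # Stand-in for the metadata-update ops above.
        pass

    @op(ins={"start": In(Nothing)})
    def refresh_alldocs():
        # Stand-in for materialize_alldocs(); starts only after perform_updates finishes.
        pass

    @graph
    def apply_then_refresh():
        refresh_alldocs(start=perform_updates())
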
@@ -465,3 +471,36 @@ def nmdc_study_to_ncbi_submission_export():
         all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)
+
+
+@graph
+def generate_data_generation_set_for_biosamples_in_nmdc_study():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_data_generation_set_post_biosample_ingest(
+        study_id, gold_nmdc_instrument_map_df
+    )
+
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
+
+
+@graph
+def generate_biosample_set_from_samples_in_gold():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_biosample_set_for_nmdc_study_from_gold(
+        study_id, gold_nmdc_instrument_map_df
+    )
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
nmdc_runtime/site/ops.py CHANGED
@@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
 from nmdc_runtime.site.translation.submission_portal_translator import (
     SubmissionPortalTranslator,
 )
-from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
+from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
+from nmdc_runtime.site.util import (
+    run_and_log,
+    schema_collection_has_index_on_id,
+    nmdc_study_id_to_filename,
+)
 from nmdc_runtime.util import (
     drs_object_in_for,
     get_names_of_classes_in_effective_range_of_slot,
@@ -1054,10 +1059,7 @@ def materialize_alldocs(context) -> int:
     # TODO include functional_annotation_agg for "real-time" ref integrity checking.
     #  For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
     collection_names = populated_schema_collection_names_with_id_field(mdb)
-    context.log.info(f"{collection_names=}")
-
-    # Build alldocs
-    context.log.info("constructing `alldocs` collection")
+    context.log.info(f"constructing `alldocs` collection using {collection_names=}")
 
     document_class_names = set(
         chain.from_iterable(collection_name_to_class_names.values())
@@ -1070,7 +1072,8 @@ def materialize_alldocs(context) -> int:
         for cls_name in document_class_names
     }
 
-    # Any ancestor of a document class is a document-referenceable range, i.e., a valid range of a document-reference-ranged slot.
+    # Any ancestor of a document class is a document-referenceable range,
+    # i.e., a valid range of a document-reference-ranged slot.
     document_referenceable_ranges = set(
         chain.from_iterable(
             schema_view.class_ancestors(cls_name) for cls_name in document_class_names
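
`class_ancestors` is LinkML's `SchemaView` API: given a class name it returns that class plus its transitive `is_a`/mixin ancestors, which is what makes the `_type_and_ancestors` field (below) queryable as "type X or any subtype". A small sketch (the schema URL is illustrative, and the exact ancestor list depends on the schema version):

    from linkml_runtime import SchemaView

    # Any LinkML schema path or URL can be substituted here.
    sv = SchemaView(
        "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/src/schema/nmdc.yaml"
    )
    print(sv.class_ancestors("Biosample"))
    # e.g. a list such as ['Biosample', 'Sample', 'MaterialEntity', 'NamedThing']
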
@@ -1086,17 +1089,15 @@ def materialize_alldocs(context) -> int:
     ):
         document_reference_ranged_slots[cls_name].append(slot_name)
 
-    # Drop any existing `alldocs` collection (e.g. from previous use of this op).
-    #
-    # FIXME: This "nuke and pave" approach introduces a race condition.
-    #        For example, if someone were to visit an API endpoint that uses the "alldocs" collection,
-    #        the endpoint would fail to perform its job since the "alldocs" collection is temporarily missing.
-    #
-    mdb.alldocs.drop()
+    # Build `alldocs` in a temporary collection, to allow for atomic replacement.
+    # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
+    temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
+    temp_alldocs_collection = mdb[temp_alldocs_collection_name]
+    context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
 
     for coll_name in collection_names:
         context.log.info(f"{coll_name=}")
-        requests = []
+        write_operations = []
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
             doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
@@ -1105,30 +1106,35 @@ def materialize_alldocs(context) -> int:
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
             new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-            requests.append(InsertOne(new_doc))
-            if len(requests) == BULK_WRITE_BATCH_SIZE:
-                _ = mdb.alldocs.bulk_write(requests, ordered=False)
-                requests.clear()
+            write_operations.append(InsertOne(new_doc))
+            if len(write_operations) == BULK_WRITE_BATCH_SIZE:
+                _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+                write_operations.clear()
                 documents_processed_counter += BULK_WRITE_BATCH_SIZE
-        if len(requests) > 0:
-            _ = mdb.alldocs.bulk_write(requests, ordered=False)
-            documents_processed_counter += len(requests)
+        if len(write_operations) > 0:
+            _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+            documents_processed_counter += len(write_operations)
         context.log.info(
             f"Inserted {documents_processed_counter} documents from {coll_name=} "
         )
 
     context.log.info(
-        f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
+        f"produced `{temp_alldocs_collection.name}` collection with"
+        f" {temp_alldocs_collection.estimated_document_count()} docs."
     )
 
-    # Re-idx for `alldocs` collection
-    mdb.alldocs.create_index("id", unique=True)
-    # The indexes were added to improve the performance of the
-    # /data_objects/study/{study_id} endpoint
+    context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
+    # Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
+    # so that `temp_alldocs_collection` will be "good to go" on renaming.
+    temp_alldocs_collection.create_index("id", unique=True)
+    # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
     slots_to_index = ["has_input", "has_output", "was_informed_by"]
-    [mdb.alldocs.create_index(slot) for slot in slots_to_index]
-
+    [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")
+
+    context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
+    temp_alldocs_collection.rename("alldocs", dropTarget=True)
+
     return mdb.alldocs.estimated_document_count()
 
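The new flow builds into a uniquely named scratch collection and then renames it over `alldocs` with `dropTarget=True`. Because `renameCollection` replaces the target in a single server-side operation, and indexes travel with the renamed collection, readers never observe a missing `alldocs`, which is exactly the race the removed FIXME described. A minimal pymongo sketch of the pattern (connection and database name assumed):

    from bson import ObjectId
    from pymongo import MongoClient

    mdb = MongoClient()["nmdc"]  # assumed local connection and database name

    temp = mdb[f"tmp.alldocs.{ObjectId()}"]        # unique scratch name per run
    temp.insert_many([{"id": f"doc-{i}"} for i in range(3)])
    temp.create_index("id", unique=True)           # indexes are preserved across the rename

    # Atomically replace any existing `alldocs` with the freshly built collection.
    temp.rename("alldocs", dropTarget=True)
    print(mdb["alldocs"].estimated_document_count())
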
@@ -1182,8 +1188,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_object_set = mdb["data_object_set"]
     biosample_data_objects = fetch_data_objects_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_object_set, biosamples
     )
     return biosample_data_objects
 
@@ -1194,8 +1201,9 @@ def get_nucleotide_sequencing_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_generation_set = mdb["data_generation_set"]
     biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_generation_set, biosamples
     )
     return biosample_omics_processing
 
@@ -1206,8 +1214,9 @@ def get_library_preparation_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    material_processing_set = mdb["material_processing_set"]
     biosample_lib_prep = fetch_library_preparation_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, material_processing_set, biosamples
     )
     return biosample_lib_prep
 
@@ -1240,3 +1249,94 @@ def ncbi_submission_xml_from_nmdc_study(
         all_instruments,
     )
     return ncbi_xml
+
+
+@op
+def post_submission_portal_biosample_ingest_record_stitching_filename(
+    nmdc_study_id: str,
+) -> str:
+    filename = nmdc_study_id_to_filename(nmdc_study_id)
+    return f"missing_database_records_for_{filename}.json"
+
+
+@op(
+    config_schema={
+        "nmdc_study_id": str,
+        "gold_nmdc_instrument_mapping_file_url": str,
+    },
+    out={
+        "nmdc_study_id": Out(str),
+        "gold_nmdc_instrument_mapping_file_url": Out(str),
+    },
+)
+def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
+    return (
+        context.op_config["nmdc_study_id"],
+        context.op_config["gold_nmdc_instrument_mapping_file_url"],
+    )
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_data_generation_set_post_biosample_ingest(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = (
+        database_updater.generate_data_generation_set_records_from_gold_api_for_study()
+    )
+
+    return database
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_biosample_set_for_nmdc_study_from_gold(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = database_updater.generate_biosample_set_from_gold_api_for_study()
+
+    return database
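
`get_database_updater_inputs` is a pure config-fan-out op: it reads two strings from op config and exposes them as two named outputs for the graphs shown earlier. A hedged sketch of how such an op is exercised with Dagster's in-process executor (the op, job, and config values below are hypothetical stand-ins):

    from typing import Tuple

    from dagster import OpExecutionContext, Out, job, op

    @op(
        config_schema={"nmdc_study_id": str, "gold_nmdc_instrument_mapping_file_url": str},
        out={"nmdc_study_id": Out(str), "gold_nmdc_instrument_mapping_file_url": Out(str)},
    )
    def demo_inputs(context: OpExecutionContext) -> Tuple[str, str]:
        # Returning a tuple maps positionally onto the named outputs above.
        return (
            context.op_config["nmdc_study_id"],
            context.op_config["gold_nmdc_instrument_mapping_file_url"],
        )

    @job
    def demo_job():
        demo_inputs()

    result = demo_job.execute_in_process(
        run_config={
            "ops": {
                "demo_inputs": {
                    "config": {
                        "nmdc_study_id": "nmdc:sty-11-abc123",
                        "gold_nmdc_instrument_mapping_file_url": "https://example.org/map.tsv",
                    }
                }
            }
        }
    )
    print(result.output_for_node("demo_inputs", "nmdc_study_id"))
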