nmdc-runtime 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


nmdc_runtime/minter/config.py CHANGED
@@ -2,8 +2,9 @@ import os
 from functools import lru_cache
 from typing import List
 
-from nmdc_runtime.util import get_nmdc_jsonschema_dict
+from nmdc_schema.id_helpers import get_typecode_for_future_ids
 
+from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from nmdc_runtime.api.db.mongo import get_mongo_db
 
 
@@ -12,55 +13,24 @@ def minting_service_id() -> str | None:
     return os.getenv("MINTING_SERVICE_ID")
 
 
-def extract_typecode_from_pattern(pattern: str) -> str:
-    r"""
-    Returns the typecode portion of the specified string.
-
-    >>> extract_typecode_from_pattern("foo-123-456$")  # original behavior
-    'foo'
-    >>> extract_typecode_from_pattern("(foo)-123-456$")  # returns first and only typecode
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar)-123-456$")  # returns first of 2 typecodes
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$")  # returns first of > 2 typecodes
-    'foo'
-    """
-
-    # Get the portion of the pattern preceding the first hyphen.
-    # e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
-    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
-
-    # If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
-    # e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
-    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
-        inner_pattern = typecode_sub_pattern[1:-1]
-
-        # Finally, get everything before the first `|`, if any.
-        # e.g. "apple|banana|carrot" → "apple"
-        # e.g. "apple" → "apple"
-        typecode = inner_pattern.split("|", maxsplit=1)[0]
-    else:
-        # Note: This is the original behavior, before we added support for multi-typecode patterns.
-        # e.g. "apple" → "apple"
-        typecode = typecode_sub_pattern
-
-    return typecode
-
-
 @lru_cache()
 def typecodes() -> List[dict]:
     r"""
     Returns a list of dictionaries containing typecodes and associated information derived from the schema.
 
-    Preconditions about the schema:
-    - The typecode portion of the pattern is between the pattern prefix and the first subsequent hyphen.
-    - The typecode portion of the pattern either consists of a single typecode verbatim (e.g. "foo");
-      or consists of multiple typecodes in a pipe-delimited list enclosed in parentheses (e.g. "(foo|bar|baz)").
-    - The typecode portion of the pattern does not, itself, contain any hyphens.
-
-    TODO: Get the typecodes in a different way than by extracting them from a larger string, which seems brittle to me.
-          Getting them a different way may require schema authors to _define_ them a different way (e.g. defining them
-          in a dedicated property of a class; for example, one named `typecode`).
+    Note: In this function, we rely on a helper function provided by the `nmdc-schema` package to extract—from a given
+          class's `id` slot's pattern—the typecode that the minter would use when generating an ID for an instance of
+          that class _today_; regardless of what it may have used in the past.
+
+    >>> typecode_descriptors = typecodes()
+    # Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
+    >>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
+    True
+    # Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
+    >>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    True
+    >>> any((td["name"] == "omprc" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    False
     """
     id_pattern_prefix = r"^(nmdc):"
 
@@ -69,16 +39,14 @@ def typecodes() -> List[dict]:
     for cls_name, defn in schema_dict["$defs"].items():
         match defn.get("properties"):
             case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
-                # Get the portion of the pattern following the prefix.
-                # e.g. "^(nmdc):foo-bar-baz" → "foo-bar-baz"
-                index_of_first_character_following_prefix = len(id_pattern_prefix)
-                pattern_without_prefix = p[index_of_first_character_following_prefix:]
+                # Extract the typecode from the pattern.
+                typecode_for_future_ids = get_typecode_for_future_ids(slot_pattern=p)
 
                 rv.append(
                     {
                         "id": "nmdc:" + cls_name + "_" + "typecode",
                         "schema_class": "nmdc:" + cls_name,
-                        "name": extract_typecode_from_pattern(pattern_without_prefix),
+                        "name": typecode_for_future_ids,
                     }
                 )
             case _:
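
The doctests above indicate that, for a class whose `id` pattern carries multiple typecodes (e.g. `nmdc:MassSpectrometry`), the helper returns the typecode the minter uses today (`dgms`) rather than the legacy one (`omprc`). A minimal sketch of that behavior, assuming the pattern has the shape `^(nmdc):(omprc|dgms)-...` and that the last pipe-delimited alternative is the current one (the real implementation lives in `nmdc_schema.id_helpers`):

def sketch_get_typecode_for_future_ids(slot_pattern: str) -> str:
    # "^(nmdc):(omprc|dgms)-123$" -> "(omprc|dgms)" -> "dgms"
    typecode_part = slot_pattern.split(":", maxsplit=1)[1].split("-", maxsplit=1)[0]
    alternatives = typecode_part.strip("()").split("|")
    return alternatives[-1]  # assumption: the last alternative is the one minted today

assert sketch_get_typecode_for_future_ids("^(nmdc):(omprc|dgms)-123$") == "dgms"
assert sketch_get_typecode_for_future_ids("^(nmdc):sty-123$") == "sty"
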
nmdc_runtime/site/graphs.py CHANGED
@@ -2,6 +2,7 @@ from dagster import graph
 
 from nmdc_runtime.site.ops import (
     build_merged_db,
+    generate_biosample_set_for_nmdc_study_from_gold,
     nmdc_schema_database_export_filename,
     nmdc_schema_database_from_gold_study,
     nmdc_schema_object_to_dict,
@@ -57,6 +58,9 @@ from nmdc_runtime.site.ops import (
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
+    get_database_updater_inputs,
+    post_submission_portal_biosample_ingest_record_stitching_filename,
+    generate_data_generation_set_post_biosample_ingest,
 )
 from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
@@ -117,12 +121,14 @@ def apply_changesheet():
     sheet_in = get_changesheet_in()
     outputs = perform_changesheet_updates(sheet_in)
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
 def apply_metadata_in():
     outputs = perform_mongo_updates(get_json_in())
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
@@ -465,3 +471,36 @@ def nmdc_study_to_ncbi_submission_export():
         all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)
+
+
+@graph
+def generate_data_generation_set_for_biosamples_in_nmdc_study():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_data_generation_set_post_biosample_ingest(
+        study_id, gold_nmdc_instrument_map_df
+    )
+
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
+
+
+@graph
+def generate_biosample_set_from_samples_in_gold():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_biosample_set_for_nmdc_study_from_gold(
+        study_id, gold_nmdc_instrument_map_df
+    )
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
nmdc_runtime/site/ops.py CHANGED
@@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
 from nmdc_runtime.site.translation.submission_portal_translator import (
     SubmissionPortalTranslator,
 )
-from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
+from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
+from nmdc_runtime.site.util import (
+    run_and_log,
+    schema_collection_has_index_on_id,
+    nmdc_study_id_to_filename,
+)
 from nmdc_runtime.util import (
     drs_object_in_for,
     get_names_of_classes_in_effective_range_of_slot,
@@ -1054,10 +1059,7 @@ def materialize_alldocs(context) -> int:
     # TODO include functional_annotation_agg for "real-time" ref integrity checking.
     # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
     collection_names = populated_schema_collection_names_with_id_field(mdb)
-    context.log.info(f"{collection_names=}")
-
-    # Build alldocs
-    context.log.info("constructing `alldocs` collection")
+    context.log.info(f"constructing `alldocs` collection using {collection_names=}")
 
     document_class_names = set(
         chain.from_iterable(collection_name_to_class_names.values())
@@ -1070,7 +1072,8 @@ def materialize_alldocs(context) -> int:
         for cls_name in document_class_names
     }
 
-    # Any ancestor of a document class is a document-referenceable range, i.e., a valid range of a document-reference-ranged slot.
+    # Any ancestor of a document class is a document-referencable range,
+    # i.e., a valid range of a document-reference-ranged slot.
     document_referenceable_ranges = set(
         chain.from_iterable(
            schema_view.class_ancestors(cls_name) for cls_name in document_class_names
@@ -1086,17 +1089,15 @@ def materialize_alldocs(context) -> int:
     ):
         document_reference_ranged_slots[cls_name].append(slot_name)
 
-    # Drop any existing `alldocs` collection (e.g. from previous use of this op).
-    #
-    # FIXME: This "nuke and pave" approach introduces a race condition.
-    #        For example, if someone were to visit an API endpoint that uses the "alldocs" collection,
-    #        the endpoint would fail to perform its job since the "alldocs" collection is temporarily missing.
-    #
-    mdb.alldocs.drop()
+    # Build `alldocs` to a temporary collection for atomic replacement
+    # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
+    temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
+    temp_alldocs_collection = mdb[temp_alldocs_collection_name]
+    context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
 
     for coll_name in collection_names:
         context.log.info(f"{coll_name=}")
-        requests = []
+        write_operations = []
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
             doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
@@ -1105,30 +1106,35 @@ def materialize_alldocs(context) -> int:
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
             new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-            requests.append(InsertOne(new_doc))
-            if len(requests) == BULK_WRITE_BATCH_SIZE:
-                _ = mdb.alldocs.bulk_write(requests, ordered=False)
-                requests.clear()
+            write_operations.append(InsertOne(new_doc))
+            if len(write_operations) == BULK_WRITE_BATCH_SIZE:
+                _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+                write_operations.clear()
                 documents_processed_counter += BULK_WRITE_BATCH_SIZE
-        if len(requests) > 0:
-            _ = mdb.alldocs.bulk_write(requests, ordered=False)
-            documents_processed_counter += len(requests)
+        if len(write_operations) > 0:
+            _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+            documents_processed_counter += len(write_operations)
         context.log.info(
             f"Inserted {documents_processed_counter} documents from {coll_name=} "
         )
 
     context.log.info(
-        f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
+        f"produced `{temp_alldocs_collection.name}` collection with"
+        f" {temp_alldocs_collection.estimated_document_count()} docs."
     )
 
-    # Re-idx for `alldocs` collection
-    mdb.alldocs.create_index("id", unique=True)
-    # The indexes were added to improve the performance of the
-    # /data_objects/study/{study_id} endpoint
+    context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
+    # Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
+    # so that `temp_alldocs_collection` will be "good to go" on renaming.
+    temp_alldocs_collection.create_index("id", unique=True)
+    # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
     slots_to_index = ["has_input", "has_output", "was_informed_by"]
-    [mdb.alldocs.create_index(slot) for slot in slots_to_index]
-
+    [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")
+
+    context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
+    temp_alldocs_collection.rename("alldocs", dropTarget=True)
+
     return mdb.alldocs.estimated_document_count()
 
 
@@ -1240,3 +1246,94 @@ def ncbi_submission_xml_from_nmdc_study(
         all_instruments,
     )
     return ncbi_xml
+
+
+@op
+def post_submission_portal_biosample_ingest_record_stitching_filename(
+    nmdc_study_id: str,
+) -> str:
+    filename = nmdc_study_id_to_filename(nmdc_study_id)
+    return f"missing_database_records_for_{filename}.json"
+
+
+@op(
+    config_schema={
+        "nmdc_study_id": str,
+        "gold_nmdc_instrument_mapping_file_url": str,
+    },
+    out={
+        "nmdc_study_id": Out(str),
+        "gold_nmdc_instrument_mapping_file_url": Out(str),
+    },
+)
+def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
+    return (
+        context.op_config["nmdc_study_id"],
+        context.op_config["gold_nmdc_instrument_mapping_file_url"],
+    )
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_data_generation_set_post_biosample_ingest(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = (
+        database_updater.generate_data_generation_set_records_from_gold_api_for_study()
+    )
+
+    return database
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_biosample_set_for_nmdc_study_from_gold(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = database_updater.generate_biosample_set_from_gold_api_for_study()
+
+    return database
nmdc_runtime/site/repair/database_updater.py ADDED
@@ -0,0 +1,230 @@
+from functools import lru_cache
+from typing import Any, Dict, List
+import pandas as pd
+from nmdc_runtime.site.resources import (
+    RuntimeApiUserClient,
+    RuntimeApiSiteClient,
+    GoldApiClient,
+)
+from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
+from nmdc_schema import nmdc
+
+
+class DatabaseUpdater:
+    def __init__(
+        self,
+        runtime_api_user_client: RuntimeApiUserClient,
+        runtime_api_site_client: RuntimeApiSiteClient,
+        gold_api_client: GoldApiClient,
+        study_id: str,
+        gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+    ):
+        """This class serves as an API for repairing connections in the database by
+        adding records that are essentially missing "links"/"connections". As we identify
+        common use cases for adding missing records to the database, we can
+        add helper methods to this class.
+
+        :param runtime_api_user_client: An object of RuntimeApiUserClient which can be
+            used to retrieve instance records from the NMDC database.
+        :param runtime_api_site_client: An object of RuntimeApiSiteClient which can be
+            used to mint new IDs for the repaired records that need to be added into the NMDC database.
+        :param gold_api_client: An object of GoldApiClient which can be used to retrieve
+            records from GOLD via the GOLD API.
+        :param study_id: NMDC study ID for which the missing records need to be added.
+        :param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping file in the
+            NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC instrument_set records.
+        """
+        self.runtime_api_user_client = runtime_api_user_client
+        self.runtime_api_site_client = runtime_api_site_client
+        self.gold_api_client = gold_api_client
+        self.study_id = study_id
+        self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
+
+    @lru_cache
+    def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
+        """Fetch response from GOLD /biosamples API for a given biosample id.
+
+        :param gold_biosample_id: GOLD biosample ID.
+        :return: Dictionary containing the response from the GOLD /biosamples API.
+        """
+        return self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id)
+
+    @lru_cache
+    def _fetch_gold_projects(self, gold_biosample_id: str):
+        """Fetch response from GOLD /projects API for a given biosample id.
+
+        :param gold_biosample_id: GOLD biosample ID
+        :return: Dictionary containing the response from the GOLD /projects API.
+        """
+        return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id)
+
+    def generate_data_generation_set_records_from_gold_api_for_study(
+        self,
+    ) -> nmdc.Database:
+        """This method creates missing data generation records for a given study in the NMDC database using
+        metadata from GOLD. The way the logic works is, it first fetches all the biosamples associated
+        with the study from the NMDC database. Then, it fetches all the biosample and project data
+        associated with the individual biosamples from the GOLD API using the NMDC-GOLD biosample id
+        mappings on the "gold_biosample_identifiers" key/slot. We use the GoldStudyTranslator class
+        to mint the required number of `nmdc:DataGeneration` (`nmdc:NucleotideSequencing`) records based
+        on the number of GOLD sequencing projects, and then reimplement only the part of logic from that
+        class which is responsible for making data_generation_set records.
+
+        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
+        """
+        database = nmdc.Database()
+
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+
+        all_gold_biosamples = []
+        all_gold_projects = []
+        for biosample in biosample_set:
+            gold_biosample_identifiers = biosample.get("gold_biosample_identifiers")
+            if gold_biosample_identifiers:
+                for gold_biosample_id in gold_biosample_identifiers:
+                    gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0]
+                    gold_projects = self._fetch_gold_projects(gold_biosample_id)
+                    gold_biosample["projects"] = gold_projects
+
+                    all_gold_biosamples.append(gold_biosample)
+                    all_gold_projects.extend(gold_projects)
+
+        gold_study_translator = GoldStudyTranslator(
+            biosamples=all_gold_biosamples,
+            projects=all_gold_projects,
+            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+        )
+
+        # The GoldStudyTranslator class has some pre-processing logic which filters out
+        # invalid biosamples and projects (based on `sequencingStrategy`, `projectStatus`, etc.)
+        filtered_biosamples = gold_study_translator.biosamples
+        filtered_projects = gold_study_translator.projects
+
+        gold_project_ids = [project["projectGoldId"] for project in filtered_projects]
+        nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id(
+            "nmdc:NucleotideSequencing", len(gold_project_ids)
+        ).json()
+        gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
+            zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
+        )
+
+        gold_to_nmdc_biosample_ids = {}
+
+        for biosample in biosample_set:
+            gold_ids = biosample.get("gold_biosample_identifiers", [])
+            for gold_id in gold_ids:
+                gold_id_stripped = gold_id.replace("gold:", "")
+                gold_to_nmdc_biosample_ids[gold_id_stripped] = biosample["id"]
+
+        database.data_generation_set = []
+        # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing records
+        # created is based on the number of GOLD sequencing projects
+        for project in filtered_projects:
+            # map the projectGoldId to the NMDC biosample ID
+            biosample_gold_id = next(
+                (
+                    biosample["biosampleGoldId"]
+                    for biosample in filtered_biosamples
+                    if any(
+                        p["projectGoldId"] == project["projectGoldId"]
+                        for p in biosample.get("projects", [])
+                    )
+                ),
+                None,
+            )
+
+            if biosample_gold_id:
+                nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id)
+                if nmdc_biosample_id:
+                    database.data_generation_set.append(
+                        gold_study_translator._translate_nucleotide_sequencing(
+                            project,
+                            nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
+                                project["projectGoldId"]
+                            ],
+                            nmdc_biosample_id=nmdc_biosample_id,
+                            nmdc_study_id=self.study_id,
+                        )
+                    )
+
+        return database
+
+    def generate_biosample_set_from_gold_api_for_study(self) -> nmdc.Database:
+        """This method creates biosample_set records for a given study in the NMDC database using
+        metadata from GOLD. The logic works by first fetching the biosampleGoldId values of all
+        biosamples associated with the study. Then, it fetches the list of all biosamples associated
+        with the GOLD study using the GOLD API. There's pre-processing logic in the GoldStudyTranslator
+        to filter out biosamples based on `sequencingStrategy` and `projectStatus`. On this list of
+        filtered biosamples, we compute a "set difference" (conceptually) between the list of
+        filtered samples and ones that are already in the NMDC database, i.e., we ignore biosamples
+        that are already present in the database, and continue on to create biosample_set records for
+        those that do not have records in the database already.
+
+        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
+        """
+        database = nmdc.Database()
+
+        # get a list of all biosamples associated with a given NMDC study id
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+
+        # get a list of GOLD biosample ids (`biosampleGoldId` values) by iterating
+        # over all the biosample_set records retrieved using the above logic
+        nmdc_gold_ids = set()
+        for biosample in biosample_set:
+            gold_ids = biosample.get("gold_biosample_identifiers", [])
+            for gold_id in gold_ids:
+                nmdc_gold_ids.add(gold_id.replace("gold:", ""))
+
+        # retrieve GOLD study id by looking at the `gold_study_identifiers` key/slot
+        # on the NMDC study record
+        nmdc_study = self.runtime_api_user_client.get_study(self.study_id)[0]
+        gold_study_id = nmdc_study.get("gold_study_identifiers", [])[0].replace(
+            "gold:", ""
+        )
+
+        # use the GOLD study id to fetch all biosample records associated with the study
+        gold_biosamples_for_study = self.gold_api_client.fetch_biosamples_by_study(
+            gold_study_id
+        )
+
+        # part of the code where we are (conceptually) computing a set difference between
+        # the list of filtered samples and ones that are already in the NMDC database
+        missing_gold_biosamples = [
+            gbs
+            for gbs in gold_biosamples_for_study
+            if gbs.get("biosampleGoldId") not in nmdc_gold_ids
+        ]
+
+        gold_study_translator = GoldStudyTranslator(
+            biosamples=missing_gold_biosamples,
+            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+        )
+
+        translated_biosamples = gold_study_translator.biosamples
+
+        # mint new NMDC biosample IDs for the "missing" biosamples
+        gold_biosample_ids = [
+            biosample["biosampleGoldId"] for biosample in translated_biosamples
+        ]
+        nmdc_biosample_ids = self.runtime_api_site_client.mint_id(
+            "nmdc:Biosample", len(translated_biosamples)
+        ).json()
+        gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
+
+        database.biosample_set = [
+            gold_study_translator._translate_biosample(
+                biosample,
+                nmdc_biosample_id=gold_to_nmdc_biosample_ids[
+                    biosample["biosampleGoldId"]
+                ],
+                nmdc_study_id=self.study_id,
+                nmdc_field_site_id=None,
+            )
+            for biosample in translated_biosamples
+        ]
+
+        return database
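
A hedged usage sketch of the new class outside Dagster, assuming the three client objects have already been constructed and authenticated (the study ID and the `mapping_tsv_url` variable are illustrative):

import pandas as pd

updater = DatabaseUpdater(
    runtime_api_user_client,   # RuntimeApiUserClient, constructed elsewhere
    runtime_api_site_client,   # RuntimeApiSiteClient, constructed elsewhere
    gold_api_client,           # GoldApiClient, constructed elsewhere
    study_id="nmdc:sty-00-000001",  # hypothetical study ID
    gold_nmdc_instrument_map_df=pd.read_csv(mapping_tsv_url, sep="\t"),
)
database = updater.generate_biosample_set_from_gold_api_for_study()
print(f"{len(database.biosample_set)} missing biosample records to add")
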
nmdc_runtime/site/repository.py CHANGED
@@ -25,6 +25,7 @@ from nmdc_runtime.api.models.run import _add_run_fail_event
 from nmdc_runtime.api.models.trigger import Trigger
 from nmdc_runtime.site.export.study_metadata import export_study_biosamples_metadata
 from nmdc_runtime.site.graphs import (
+    generate_biosample_set_from_samples_in_gold,
     translate_metadata_submission_to_nmdc_schema_database,
     ingest_metadata_submission,
     gold_study_to_database,
@@ -44,6 +45,7 @@ from nmdc_runtime.site.graphs import (
     ingest_neon_surface_water_metadata,
     ensure_alldocs,
     nmdc_study_to_ncbi_submission_export,
+    generate_data_generation_set_for_biosamples_in_nmdc_study,
 )
 from nmdc_runtime.site.resources import (
     get_mongo,
@@ -113,6 +115,13 @@ housekeeping_weekly = ScheduleDefinition(
     job=housekeeping.to_job(**preset_normal),
 )
 
+ensure_alldocs_daily = ScheduleDefinition(
+    name="daily_ensure_alldocs",
+    cron_schedule="0 3 * * *",
+    execution_timezone="America/New_York",
+    job=ensure_alldocs.to_job(**preset_normal),
+)
+
 
 def asset_materialization_metadata(asset_event, key):
     """Get metadata from an asset materialization event.
@@ -453,7 +462,7 @@ def repo():
         export_study_biosamples_metadata.to_job(**preset_normal),
         ensure_alldocs.to_job(**preset_normal),
     ]
-    schedules = [housekeeping_weekly]
+    schedules = [housekeeping_weekly, ensure_alldocs_daily]
     sensors = [
         done_object_put_ops,
         ensure_gold_translation_job,
@@ -915,6 +924,97 @@ def biosample_export():
     ]
 
 
+@repository
+def database_records_stitching():
+    normal_resources = run_config_frozen__normal_env["resources"]
+    return [
+        generate_data_generation_set_for_biosamples_in_nmdc_study.to_job(
+            description="This job can be used to create a data_generation_set JSON for biosamples that are already present in the NMDC database.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                        }
+                    },
+                    "export_json_to_drs": {"config": {"username": ""}},
+                },
+            },
+        ),
+        generate_biosample_set_from_samples_in_gold.to_job(
+            description="This job can be used to create a biosample_set JSON from samples in GOLD for a given study in NMDC.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                        }
+                    },
+                    "export_json_to_drs": {"config": {"username": ""}},
+                },
+            },
+        ),
+    ]
+
+
 # @repository
 # def validation():
 #     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
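
Both new jobs are registered with blank `nmdc_study_id` and `username` values, to be filled in by an operator at launch time. A sketch of the op-level run config an operator would supply, e.g. in the Dagit launchpad (the study ID and username are placeholders):

run_config = {
    "ops": {
        "get_database_updater_inputs": {
            "config": {
                "nmdc_study_id": "nmdc:sty-00-000001",  # placeholder
                "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
            }
        },
        "export_json_to_drs": {"config": {"username": "someuser"}},  # placeholder
    }
}
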
nmdc_runtime/site/resources.py CHANGED
@@ -129,16 +129,23 @@ class RuntimeApiUserClient(RuntimeApiClient):
         return response.json()["cursor"]["firstBatch"]
 
     def get_biosamples_for_study(self, study_id: str):
+        # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
+        #       The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
+        #       but the tradeoff there is that we would need to make multiple requests to step through
+        #       each of the pages. By picking a large number for max_page_size, we can get all the results
+        #       in a single request.
+        #       This method previously used the /queries:run endpoint, but the problem with that was that
+        #       it used to truncate the number of results returned to 100.
         response = self.request(
-            "POST",
-            f"/queries:run",
+            "GET",
+            f"/nmdcschema/biosample_set",
             {
-                "find": "biosample_set",
-                "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}},
+                "filter": json.dumps({"associated_studies": study_id}),
+                "max_page_size": 10000,
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["resources"]
 
     def get_omics_processing_by_name(self, name: str):
         response = self.request(
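
The TODO above trades pagination for one large request. If a result set ever exceeded `max_page_size`, the `page_token` mechanism the comment mentions would look roughly like this (a sketch; the `next_page_token` response field name is an assumption about the endpoint's response shape):

import json
import requests

def get_all_biosamples_for_study(base_url: str, study_id: str, page_size: int = 1000):
    params = {
        "filter": json.dumps({"associated_studies": study_id}),
        "max_page_size": page_size,
    }
    resources = []
    while True:
        payload = requests.get(f"{base_url}/nmdcschema/biosample_set", params=params).json()
        resources.extend(payload["resources"])
        token = payload.get("next_page_token")  # assumption: token field name
        if not token:
            break
        params["page_token"] = token  # request the next page
    return resources
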
@@ -152,6 +159,18 @@ class RuntimeApiUserClient(RuntimeApiClient):
         response.raise_for_status()
         return response.json()["cursor"]["firstBatch"]
 
+    def get_study(self, study_id: str):
+        response = self.request(
+            "POST",
+            f"/queries:run",
+            {
+                "find": "study_set",
+                "filter": {"id": study_id},
+            },
+        )
+        response.raise_for_status()
+        return response.json()["cursor"]["firstBatch"]
+
 
 class RuntimeApiSiteClient(RuntimeApiClient):
     def __init__(
@@ -370,6 +389,18 @@ class GoldApiClient(BasicAuthClient):
             return None
         return results[0]
 
+    def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/projects", params={"biosampleGoldId": id})
+        return results
+
+    def fetch_biosample_by_biosample_id(
+        self, biosample_id: str
+    ) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/biosamples", params={"biosampleGoldId": id})
+        return results
+
 
 @resource(
     config_schema={
nmdc_runtime/site/translation/gold_translator.py CHANGED
@@ -12,6 +12,29 @@ from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
 SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
 
 
+def _is_valid_project(project: dict) -> bool:
+    """A project is considered valid if:
+    1. `sequencingStrategy` is in {"Metagenome", "Metatranscriptome"}
+    2. if `sequencingCenters` == 'DOE Joint Genome Institute (JGI)' then
+       `projectStatus` must be in ("Permanent Draft", "Complete and Published")
+    3. otherwise, no `projectStatus` filter is applied
+
+    :param project: GOLD project object (structurally similar to response
+        from `/projects` endpoint)
+    :return: True if the project is valid, False otherwise
+    """
+    if project.get("sequencingStrategy") not in SEQUENCING_STRATEGIES:
+        return False
+
+    if project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)":
+        return project.get("projectStatus") in (
+            "Permanent Draft",
+            "Complete and Published",
+        )
+
+    return True
+
+
 class GoldStudyTranslator(Translator):
     def __init__(
         self,
@@ -36,16 +59,15 @@ class GoldStudyTranslator(Translator):
             biosample
             for biosample in biosamples
             if any(
-                project.get("sequencingStrategy") in SEQUENCING_STRATEGIES
-                for project in biosample.get("projects", [])
+                _is_valid_project(project) for project in biosample.get("projects", [])
             )
         ]
         # Fetch the valid projectGoldIds that are associated with filtered
         # biosamples on their `projects` field
         valid_project_ids = {
             project.get("projectGoldId")
-            for biosample in self.biosamples
-            for project in biosample.get("projects", [])
+            for project in projects
+            if _is_valid_project(project)
         }
         # Filter projects to only those with `projectGoldId` in valid_project_ids
         self.projects = [
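
Worked examples of the three docstring rules above (the project dicts and the "Awaiting Sample" status are hypothetical values used only for illustration):

jgi_draft = {
    "sequencingStrategy": "Metagenome",
    "sequencingCenters": "DOE Joint Genome Institute (JGI)",
    "projectStatus": "Permanent Draft",
}
jgi_unfinished = dict(jgi_draft, projectStatus="Awaiting Sample")  # hypothetical status
non_jgi = {"sequencingStrategy": "Metatranscriptome", "sequencingCenters": "Other Center"}

assert _is_valid_project(jgi_draft) is True        # rule 2, accepted status
assert _is_valid_project(jgi_unfinished) is False  # rule 2, rejected status
assert _is_valid_project(non_jgi) is True          # rule 3, no status filter
assert _is_valid_project({"sequencingStrategy": "Amplicon"}) is False  # rule 1
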
nmdc_runtime/site/util.py CHANGED
@@ -1,8 +1,9 @@
 import os
-from functools import lru_cache
-from subprocess import Popen, PIPE, STDOUT, CalledProcessError
 
+from dagster import op
+from functools import lru_cache
 from pymongo.database import Database as MongoDatabase
+from subprocess import Popen, PIPE, STDOUT, CalledProcessError
 
 from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
 from nmdc_runtime.site.resources import mongo_resource
@@ -47,3 +48,7 @@ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
 
 def get_basename(filename: str) -> str:
     return os.path.basename(filename)
+
+
+def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
+    return nmdc_study_id.replace(":", "_").replace("-", "_")
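
Combined with the new `post_submission_portal_biosample_ingest_record_stitching_filename` op in `ops.py`, this yields DRS filenames like the following (study ID is illustrative):

assert nmdc_study_id_to_filename("nmdc:sty-00-000001") == "nmdc_sty_00_000001"
# -> "missing_database_records_for_nmdc_sty_00_000001.json"
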
nmdc_runtime/util.py CHANGED
@@ -24,6 +24,10 @@ from nmdc_schema.get_nmdc_view import ViewGetter
 from pydantic import Field, BaseModel
 from pymongo.database import Database as MongoDatabase
 from pymongo.errors import OperationFailure
+from refscan.lib.helpers import identify_references
+from refscan.lib.Finder import Finder
+from refscan.lib.ReferenceList import ReferenceList
+from refscan.scanner import scan_outgoing_references
 from toolz import merge, unique
 
 from nmdc_runtime.api.core.util import sha256hash_from_file
@@ -120,6 +124,23 @@ def get_class_names_from_collection_spec(
     return class_names
 
 
+@lru_cache
+def get_allowed_references() -> ReferenceList:
+    r"""
+    Returns a `ReferenceList` of all the inter-document references that
+    the NMDC Schema allows a schema-compliant MongoDB database to contain.
+    """
+
+    # Identify the inter-document references that the schema allows a database to contain.
+    print("Identifying schema-allowed references.")
+    references = identify_references(
+        schema_view=nmdc_schema_view(),
+        collection_name_to_class_names=collection_name_to_class_names,
+    )
+
+    return references
+
+
 @lru_cache
 def get_type_collections() -> dict:
     """Returns a dictionary mapping class names to Mongo collection names."""
@@ -353,6 +374,14 @@ def nmdc_database_collection_instance_class_names():
 
 @lru_cache
 def nmdc_database_collection_names():
+    r"""
+    TODO: Document this function.
+
+    TODO: Assuming this function was designed to return a list of names of all Database slots that represent database
+          collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py`
+          instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
+          maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
+    """
     names = []
     view = nmdc_schema_view()
     all_classes = set(view.all_classes())
@@ -513,6 +542,13 @@ class OverlayDB(AbstractContextManager):
     overlay collection, that id is marked as "seen" and will not also be returned when
     subsequently scanning the (unmodified) base-database collection.
 
+    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+          database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+          `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
+          the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
+          "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
+          of the `merge_find` method, which internally accesses both the real database and the overlaying database.
+
     Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
     documents from a base collection to the overlay, and then applying the updates to the overlay,
     so that again, base collections are unmodified, and a "merge_find" call will produce a result
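
A sketch of the access pattern the new note describes (the method names come from the docstring above, but their exact signatures are assumptions; the update spec and filter are illustrative):

with OverlayDB(mdb) as odb:
    # Simulated update: affected documents are copied into the overlay first,
    # so the base collections stay unmodified.
    odb.apply_updates("study_set", [{"q": {"id": "nmdc:sty-00-000001"},
                                     "u": {"$set": {"name": "renamed"}}}])
    # The virtual "merged" view is only reachable through OverlayDB methods:
    merged_docs = list(odb.merge_find("study_set", {"filter": {}}))
    # Arbitrary pymongo queries must target one layer at a time:
    n_base = odb._bottom_db["study_set"].count_documents({})
    n_overlay = odb._top_db["study_set"].count_documents({})
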
@@ -591,7 +627,33 @@ class OverlayDB(AbstractContextManager):
                 yield doc
 
 
-def validate_json(in_docs: dict, mdb: MongoDatabase):
+def validate_json(
+    in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
+):
+    r"""
+    Checks whether the specified dictionary represents a valid instance of the `Database` class
+    defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
+
+    Example dictionary:
+        {
+            "biosample_set": [
+                {"id": "nmdc:bsm-00-000001", ...},
+                {"id": "nmdc:bsm-00-000002", ...}
+            ],
+            "study_set": [
+                {"id": "nmdc:sty-00-000001", ...},
+                {"id": "nmdc:sty-00-000002", ...}
+            ]
+        }
+
+    :param in_docs: The dictionary you want to validate
+    :param mdb: A reference to a MongoDB database
+    :param check_inter_document_references: Whether you want this function to check whether every document that
+                                            is referenced by any of the documents passed in would, indeed, exist
+                                            in the database, if the documents passed in were to be inserted into
+                                            the database. In other words, set this to `True` if you want this
+                                            function to perform referential integrity checks.
+    """
     validator = Draft7Validator(get_nmdc_jsonschema_dict())
     docs = deepcopy(in_docs)
     validation_errors = {}
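
A hedged usage sketch of the new opt-in flag, given a pymongo `Database` handle `mdb` (the payload follows the docstring's example shape; other required `Study` slots are elided, so a real call would report schema errors for them):

payload = {
    "study_set": [
        {"id": "nmdc:sty-00-000001", "type": "nmdc:Study"}
    ]
}
result = validate_json(payload, mdb)  # schema validation only (the default)
result = validate_json(payload, mdb, check_inter_document_references=True)
# -> {"result": "All Okay!"} on success, or {"result": "errors", "detail": {...}}
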
@@ -599,6 +661,8 @@ def validate_json(
     known_coll_names = set(nmdc_database_collection_names())
     for coll_name, coll_docs in docs.items():
         if coll_name not in known_coll_names:
+            # FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name).
+            #        See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
             if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
                 continue
             else:
@@ -631,6 +695,84 @@ def validate_json(
             except Exception as e:
                 return {"result": "errors", "detail": str(e)}
 
+        # Third pass (if enabled): Check inter-document references.
+        if check_inter_document_references is True:
+            # Prepare to use `refscan`.
+            #
+            # Note: We check the inter-document references in two stages, which are:
+            #       1. For each document in the JSON payload, check whether each document it references already exists
+            #          (in the collections the schema says it can exist in) in the database. We use the
+            #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+            #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+            #          check whether that document exists (in the collections the schema says it can exist in) in the
+            #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+            #       The violations that remain after those two stages are the ones we return to the caller.
+            #
+            # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+            #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+            #       is not a drop-in replacement for pymongo's `Database` class, which is the only thing that
+            #       `refscan`'s `Finder` class accepts.
+            #
+            finder = Finder(database=mdb)
+            references = get_allowed_references()
+            reference_field_names_by_source_class_name = (
+                references.get_reference_field_names_by_source_class_name()
+            )
+
+            # Iterate over the collections in the JSON payload.
+            for source_collection_name, documents in in_docs.items():
+                for document in documents:
+                    # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                    source_document = dict(document, _id=None)
+                    violations = scan_outgoing_references(
+                        document=source_document,
+                        schema_view=nmdc_schema_view(),
+                        reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
+                        references=references,
+                        finder=finder,
+                        collection_names=nmdc_database_collection_names(),
+                        source_collection_name=source_collection_name,
+                        user_wants_to_locate_misplaced_documents=False,
+                    )
+
+                    # For each violation, check whether the misplaced document is in the JSON payload, itself.
+                    for violation in violations:
+                        can_waive_violation = False
+                        # Determine which collections can contain the referenced document, based upon
+                        # the schema class of which this source document is an instance.
+                        target_collection_names = (
+                            references.get_target_collection_names(
+                                source_class_name=violation.source_class_name,
+                                source_field_name=violation.source_field_name,
+                            )
+                        )
+                        # Check whether the referenced document exists in any of those collections in the JSON payload.
+                        for json_coll_name, json_coll_docs in in_docs.items():
+                            if json_coll_name in target_collection_names:
+                                for json_coll_doc in json_coll_docs:
+                                    if json_coll_doc["id"] == violation.target_id:
+                                        can_waive_violation = True
+                                        break  # stop checking
+                            if can_waive_violation:
+                                break  # stop checking
+                        if not can_waive_violation:
+                            violation_as_str = (
+                                f"Document '{violation.source_document_id}' "
+                                f"in collection '{violation.source_collection_name}' "
+                                f"has a field '{violation.source_field_name}' that "
+                                f"references a document having id "
+                                f"'{violation.target_id}', but the latter document "
+                                f"does not exist in any of the collections the "
+                                f"NMDC Schema says it can exist in."
+                            )
+                            validation_errors[source_collection_name].append(
+                                violation_as_str
+                            )
+
+            # If any collection's error list is not empty, return an error response.
+            if any(len(v) > 0 for v in validation_errors.values()):
+                return {"result": "errors", "detail": validation_errors}
+
         return {"result": "All Okay!"}
     else:
         return {"result": "errors", "detail": validation_errors}
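
To illustrate the two-stage check above: a payload whose biosample references a study that exists only in the same payload triggers a stage-1 violation that stage 2 then waives, because `study_set` in the payload contains the target id (slot and id values follow the docstring example; other required slots are elided):

payload = {
    "biosample_set": [
        {"id": "nmdc:bsm-00-000001", "type": "nmdc:Biosample",
         "associated_studies": ["nmdc:sty-00-000001"]}  # target not yet in Mongo...
    ],
    "study_set": [
        {"id": "nmdc:sty-00-000001", "type": "nmdc:Study"}  # ...but present here
    ],
}
# Stage 1 (refscan) flags the reference as unresolved against `mdb`;
# stage 2 finds the target in the payload's study_set and waives the violation.
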
nmdc_runtime-2.3.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: nmdc_runtime
-Version: 2.2.1
+Version: 2.3.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -11,6 +11,14 @@ Classifier: License :: OSI Approved :: Apache Software License
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-python
+Dynamic: summary
 
 A runtime system for NMDC data management and orchestration.
 
nmdc_runtime-2.3.0.dist-info/RECORD CHANGED
@@ -2,7 +2,7 @@ nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
 nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
 nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/util.py,sha256=aMzS8eATEjpXOiuyAFYthx92fb_cgIzWWd5ZQU6ZlAY,22931
+nmdc_runtime/util.py,sha256=HzQsNMYG6Pb-IuBEE9HBzX_lNkII7jiNe65UFk34ZYA,31414
 nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -28,7 +28,7 @@ nmdc_runtime/lib/nmdc_etl_class.py,sha256=tVh3rKVMkBHQE65_LhKeIjCsaCZQk_HJzbc9K4
 nmdc_runtime/lib/transform_nmdc_data.py,sha256=hij4lR3IMQRJQdL-rsP_I-m_WyFPsBMchV2MNFUkh0M,39906
 nmdc_runtime/minter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/bootstrap.py,sha256=5Ej6pJVBRryRIi0ZwEloY78Zky7iE2okF6tPwRI2axM,822
-nmdc_runtime/minter/config.py,sha256=WrxX9WmyN7Ft4INRAQbd31jmlm5qwaDDaNS9AktieYA,4112
+nmdc_runtime/minter/config.py,sha256=gsXZropDeeTO5tmLAtRuoocwqL3HgfgqVAENyCbX-Gc,2739
 nmdc_runtime/minter/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/adapters/repository.py,sha256=I-jmGP38-9kPhkogrwUht_Ir0CfHA9_5ZImw5I_wbcw,8323
 nmdc_runtime/minter/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
 nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/graphs.py,sha256=mu4bE8799TItWXaPBfOeFB2XMyYwPZcj-VJQmadN2MA,14171
-nmdc_runtime/site/ops.py,sha256=T9_WrwDaySGnu6olwOHQizHQfeofMOaqMcq_vYEIzO0,43140
-nmdc_runtime/site/repository.py,sha256=JtHlp6l3UVo0QhV670TGns9bMfht7NOQrNWQtvsYr2g,39183
-nmdc_runtime/site/resources.py,sha256=6bmvplgql3KdEXKI49BibSk0Sug96SFJi8eOs2zeKK0,18252
-nmdc_runtime/site/util.py,sha256=zAY0oIY7GRf63ecqWelmS27N7PVrAXVwEhtnpescBSw,1415
+nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
+nmdc_runtime/site/ops.py,sha256=OrTQLSBToih-kI754RtVqjUMRojgYJQmb7B_VRjZWtg,46223
+nmdc_runtime/site/repository.py,sha256=b3UVQznelU8wDOfuc9_vE_eqFGOoFRiHtQJJH7or73E,43875
+nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
+nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
 nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -56,10 +56,12 @@ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZ
 nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
+nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
 nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
 nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
-nmdc_runtime/site/translation/gold_translator.py,sha256=RfAB68dJ9hDep20wETmCNBc0gugZbEKqVimT8h2t0uM,31470
+nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
 nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
@@ -73,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-2.2.1.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-2.2.1.dist-info/METADATA,sha256=yIkwZWVw8J1xDqhwVQy2Rxfz7cIc42yT4JkRBdsRBr4,7256
-nmdc_runtime-2.2.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-nmdc_runtime-2.2.1.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
-nmdc_runtime-2.2.1.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-2.2.1.dist-info/RECORD,,
+nmdc_runtime-2.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-2.3.0.dist-info/METADATA,sha256=BEManThNKOEkfS9woYYiwm1ya6BENBC6vXE6b7L_z2E,7430
+nmdc_runtime-2.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+nmdc_runtime-2.3.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-2.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-2.3.0.dist-info/RECORD,,
nmdc_runtime-2.3.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 