nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

nmdc_runtime/site/ops.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import csv
2
2
  import json
3
+ import logging
3
4
  import mimetypes
4
5
  import os
5
6
  import subprocess
@@ -9,10 +10,10 @@ from datetime import datetime, timezone
9
10
  from io import BytesIO, StringIO
10
11
  from pprint import pformat
11
12
  from toolz.dicttoolz import keyfilter
12
- from typing import Tuple
13
+ from typing import Tuple, Set, Union
13
14
  from zipfile import ZipFile
14
15
  from itertools import chain
15
-
16
+ from ontology_loader.ontology_load_controller import OntologyLoaderController
16
17
  import pandas as pd
17
18
  import requests
18
19
 
@@ -26,6 +27,7 @@ from dagster import (
26
27
  Failure,
27
28
  List,
28
29
  MetadataValue,
30
+ Noneable,
29
31
  OpExecutionContext,
30
32
  Out,
31
33
  Output,
@@ -36,12 +38,13 @@ from dagster import (
36
38
  Optional,
37
39
  Field,
38
40
  Permissive,
39
- Bool,
41
+ In,
42
+ Nothing,
40
43
  )
41
44
  from gridfs import GridFS
42
45
  from linkml_runtime.utils.dictutils import as_simple_dict
43
46
  from linkml_runtime.utils.yamlutils import YAMLRoot
44
- from nmdc_runtime.api.db.mongo import get_mongo_db
47
+ from nmdc_runtime.api.db.mongo import get_mongo_db, validate_json
45
48
  from nmdc_runtime.api.core.idgen import generate_one_id
46
49
  from nmdc_runtime.api.core.metadata import (
47
50
  _validate_changesheet,
@@ -103,7 +106,6 @@ from nmdc_runtime.util import (
103
106
  get_names_of_classes_in_effective_range_of_slot,
104
107
  pluralize,
105
108
  put_object,
106
- validate_json,
107
109
  specialize_activity_set_docs,
108
110
  collection_name_to_class_names,
109
111
  class_hierarchy_as_list,
@@ -113,11 +115,14 @@ from nmdc_runtime.util import (
113
115
  from nmdc_schema import nmdc
114
116
  from nmdc_schema.nmdc import Database as NMDCDatabase
115
117
  from pydantic import BaseModel
116
- from pymongo import InsertOne
118
+ from pymongo import InsertOne, UpdateOne
117
119
  from pymongo.database import Database as MongoDatabase
118
120
  from starlette import status
119
121
  from toolz import assoc, dissoc, get_in, valfilter, identity
120
122
 
123
+ # batch size for writing documents to alldocs
124
+ BULK_WRITE_BATCH_SIZE = 2000
125
+
121
126
 
122
127
  @op
123
128
  def hello(context):
@@ -475,53 +480,6 @@ def get_json_in(context):
475
480
  return rv.json()
476
481
 
477
482
 
478
- def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
479
- """Does not ensure ordering of `docs`."""
480
-
481
- if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
482
- return docs, 0
483
-
484
- do_docs = docs["data_object_set"]
485
-
486
- class FileTypeEnumBase(BaseModel):
487
- name: str
488
- description: str
489
- filter: str # JSON-encoded data_object_set mongo collection filter document
490
-
491
- class FileTypeEnum(FileTypeEnumBase):
492
- id: str
493
-
494
- temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
495
- temp_collection = mdb[temp_collection_name]
496
- temp_collection.insert_many(do_docs)
497
- temp_collection.create_index("id")
498
-
499
- def fte_matches(fte_filter: str):
500
- return [
501
- dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
502
- ]
503
-
504
- do_docs_map = {d["id"]: d for d in do_docs}
505
-
506
- n_docs_with_types_added = 0
507
-
508
- for fte_doc in mdb.file_type_enum.find():
509
- fte = FileTypeEnum(**fte_doc)
510
- docs_matching = fte_matches(fte.filter)
511
- for doc in docs_matching:
512
- if "data_object_type" not in doc:
513
- do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
514
- n_docs_with_types_added += 1
515
-
516
- mdb.drop_collection(temp_collection_name)
517
- return (
518
- assoc(
519
- docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
520
- ),
521
- n_docs_with_types_added,
522
- )
523
-
524
-
525
483
  @op(required_resource_keys={"runtime_api_site_client", "mongo"})
526
484
  def perform_mongo_updates(context, json_in):
527
485
  mongo = context.resources.mongo
@@ -530,8 +488,6 @@ def perform_mongo_updates(context, json_in):
530
488
 
531
489
  docs = json_in
532
490
  docs, _ = specialize_activity_set_docs(docs)
533
- docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
534
- context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
535
491
  context.log.debug(f"{docs}")
536
492
 
537
493
  rv = validate_json(
@@ -600,22 +556,25 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
600
556
  "study_type": str,
601
557
  "gold_nmdc_instrument_mapping_file_url": str,
602
558
  "include_field_site_info": bool,
559
+ "enable_biosample_filtering": bool,
603
560
  },
604
561
  out={
605
562
  "study_id": Out(str),
606
563
  "study_type": Out(str),
607
564
  "gold_nmdc_instrument_mapping_file_url": Out(str),
608
565
  "include_field_site_info": Out(bool),
566
+ "enable_biosample_filtering": Out(bool),
609
567
  },
610
568
  )
611
569
  def get_gold_study_pipeline_inputs(
612
570
  context: OpExecutionContext,
613
- ) -> Tuple[str, str, str, bool]:
571
+ ) -> Tuple[str, str, str, bool, bool]:
614
572
  return (
615
573
  context.op_config["study_id"],
616
574
  context.op_config["study_type"],
617
575
  context.op_config["gold_nmdc_instrument_mapping_file_url"],
618
576
  context.op_config["include_field_site_info"],
577
+ context.op_config["enable_biosample_filtering"],
619
578
  )
620
579
 
621
580
 
@@ -659,6 +618,7 @@ def nmdc_schema_database_from_gold_study(
659
618
  analysis_projects: List[Dict[str, Any]],
660
619
  gold_nmdc_instrument_map_df: pd.DataFrame,
661
620
  include_field_site_info: bool,
621
+ enable_biosample_filtering: bool,
662
622
  ) -> nmdc.Database:
663
623
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
664
624
 
@@ -674,6 +634,7 @@ def nmdc_schema_database_from_gold_study(
674
634
  analysis_projects,
675
635
  gold_nmdc_instrument_map_df,
676
636
  include_field_site_info,
637
+ enable_biosample_filtering,
677
638
  id_minter=id_minter,
678
639
  )
679
640
  database = translator.get_database()
@@ -1043,18 +1004,249 @@ def site_code_mapping() -> dict:
1043
1004
  )
1044
1005
 
1045
1006
 
1046
- @op(required_resource_keys={"mongo"})
1007
+ @op(
1008
+ required_resource_keys={"mongo"},
1009
+ config_schema={
1010
+ "source_ontology": str,
1011
+ "output_directory": Field(Noneable(str), default_value=None, is_required=False),
1012
+ "generate_reports": Field(bool, default_value=True, is_required=False),
1013
+ },
1014
+ )
1015
+ def load_ontology(context: OpExecutionContext):
1016
+ cfg = context.op_config
1017
+ source_ontology = cfg["source_ontology"]
1018
+ output_directory = cfg.get("output_directory")
1019
+ generate_reports = cfg.get("generate_reports", True)
1020
+
1021
+ if output_directory is None:
1022
+ output_directory = os.path.join(os.getcwd(), "ontology_reports")
1023
+
1024
+ # Redirect Python logging to Dagster context
1025
+ handler = logging.Handler()
1026
+ handler.emit = lambda record: context.log.info(record.getMessage())
1027
+
1028
+ # Get logger from ontology-loader package
1029
+ controller_logger = logging.getLogger("ontology_loader.ontology_load_controller")
1030
+ controller_logger.setLevel(logging.INFO)
1031
+ controller_logger.addHandler(handler)
1032
+
1033
+ context.log.info(f"Running Ontology Loader for ontology: {source_ontology}")
1034
+ loader = OntologyLoaderController(
1035
+ source_ontology=source_ontology,
1036
+ output_directory=output_directory,
1037
+ generate_reports=generate_reports,
1038
+ mongo_client=context.resources.mongo.client,
1039
+ db_name=context.resources.mongo.db.name,
1040
+ )
1041
+
1042
+ loader.run_ontology_loader()
1043
+ context.log.info(f"Ontology load for {source_ontology} completed successfully!")
1044
+
1045
+
1046
+ def _add_related_ids_to_alldocs(
1047
+ temp_collection, context, document_reference_ranged_slots_by_type
1048
+ ) -> None:
1049
+ """
1050
+ Adds {`_inbound`,`_outbound`} fields to each document in the temporary alldocs collection.
1051
+
1052
+ The {`_inbound`,`_outbound`} fields each contain an array of subdocuments, each with fields `id` and `type`.
1053
+ Each subdocument represents a link to any other document that either links to or is linked from
1054
+ the document via document-reference-ranged slots.
1055
+
1056
+ Args:
1057
+ temp_collection: The temporary MongoDB collection to process
1058
+ context: The Dagster execution context for logging
1059
+ document_reference_ranged_slots_by_type: Dictionary mapping document types to their reference-ranged slot names
1060
+
1061
+ Returns:
1062
+ None (modifies the documents in place)
1063
+ """
1064
+
1065
+ context.log.info(
1066
+ "Building relationships and adding `_inbound` and `_outbound` fields..."
1067
+ )
1068
+
1069
+ # document ID -> type (with "nmdc:" prefix preserved)
1070
+ id_to_type_map: Dict[str, str] = {}
1071
+
1072
+ # set of (<referencing document ID>, <slot>, <referenced document ID>) 3-tuples.
1073
+ relationship_triples: Set[Tuple[str, str, str]] = set()
1074
+
1075
+ # Collect relationship triples.
1076
+ for doc in temp_collection.find():
1077
+ doc_id = doc["id"]
1078
+ # Store the full type with prefix intact
1079
+ doc_type = doc["type"]
1080
+ # For looking up reference slots, we still need the type without prefix
1081
+ doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
1082
+
1083
+ # Record ID to type mapping - preserve the original type with prefix
1084
+ id_to_type_map[doc_id] = doc_type
1085
+
1086
+ # Find all document references from this document
1087
+ reference_slots = document_reference_ranged_slots_by_type.get(
1088
+ doc_type_no_prefix, []
1089
+ )
1090
+ for slot in reference_slots:
1091
+ if slot in doc:
1092
+ # Handle both single-value and array references
1093
+ refs = doc[slot] if isinstance(doc[slot], list) else [doc[slot]]
1094
+ for ref_doc in temp_collection.find(
1095
+ {"id": {"$in": refs}}, ["id", "type"]
1096
+ ):
1097
+ id_to_type_map[ref_doc["id"]] = ref_doc["type"]
1098
+ for ref_id in refs:
1099
+ relationship_triples.add((doc_id, slot, ref_id))
1100
+
1101
+ context.log.info(
1102
+ f"Found {len(id_to_type_map)} documents, with "
1103
+ f"{len({d for (d, _, _) in relationship_triples})} containing references"
1104
+ )
1105
+
1106
+ # The bifurcation of document-reference-ranged slots as "inbound" and "outbound" is essential
1107
+ # in order to perform graph traversal and collect all entities "related" to a given entity without
1108
+ # recursion "exploding".
1109
+ #
1110
+ # Note: We are hard-coding this "direction" information here in the Runtime
1111
+ # because the NMDC schema does not currently contain or expose it.
1112
+ #
1113
+ # An "inbound" slot is one for which an entity in the domain "was influenced by" (formally,
1114
+ # <https://www.w3.org/ns/prov#wasInfluencedBy>, with typical CURIE prov:wasInfluencedBy) an entity in the range.
1115
+ inbound_document_reference_ranged_slots = [
1116
+ "collected_from", # a `nmdc:Biosample` was influenced by the `nmdc:Site` from which it was collected.
1117
+ "has_chromatography_configuration", # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
1118
+ "has_input", # a `nmdc:PlannedProcess` was influenced by a `nmdc:NamedThing`.
1119
+ "has_mass_spectrometry_configuration", # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
1120
+ "instrument_used", # a `nmdc:PlannedProcess` was influenced by a used `nmdc:Instrument`.
1121
+ "uses_calibration", # a `nmdc:PlannedProcess` was influenced by `nmdc:CalibrationInformation`.
1122
+ "was_generated_by", # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy.
1123
+ "was_informed_by", # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy.
1124
+ ]
1125
+ # An "outbound" slot is one for which an entity in the domain "influences"
1126
+ # (i.e., [owl:inverseOf prov:wasInfluencedBy]) an entity in the range.
1127
+ outbound_document_reference_ranged_slots = [
1128
+ "associated_studies", # a `nmdc:Biosample` influences a `nmdc:Study`.
1129
+ "calibration_object", # `nmdc:CalibrationInformation` generates a `nmdc:DataObject`.
1130
+ "generates_calibration", # a `nmdc:PlannedProcess` generates `nmdc:CalibrationInformation`.
1131
+ "has_output", # a `nmdc:PlannedProcess` generates a `nmdc:NamedThing`.
1132
+ "in_manifest", # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
1133
+ "part_of", # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
1134
+ ]
1135
+
1136
+ unique_document_reference_ranged_slot_names = set()
1137
+ for slot_names in document_reference_ranged_slots_by_type.values():
1138
+ for slot_name in slot_names:
1139
+ unique_document_reference_ranged_slot_names.add(slot_name)
1140
+ context.log.info(f"{unique_document_reference_ranged_slot_names=}")
1141
+ if len(inbound_document_reference_ranged_slots) + len(
1142
+ outbound_document_reference_ranged_slots
1143
+ ) != len(unique_document_reference_ranged_slot_names):
1144
+ raise Failure(
1145
+ "Number of detected unique document-reference-ranged slot names does not match "
1146
+ "sum of accounted-for inbound and outbound document-reference-ranged slot names."
1147
+ )
1148
+
1149
+ # Construct, and update documents with, `_incoming` and `_outgoing` field values.
1150
+ #
1151
+ # manage batching of MongoDB `bulk_write` operations
1152
+ bulk_operations, update_count = [], 0
1153
+ for doc_id, slot, ref_id in relationship_triples:
1154
+
1155
+ # Determine in which respective fields to push this relationship
1156
+ # for the subject (doc) and object (ref) of this triple.
1157
+ if slot in inbound_document_reference_ranged_slots:
1158
+ field_for_doc, field_for_ref = "_inbound", "_outbound"
1159
+ elif slot in outbound_document_reference_ranged_slots:
1160
+ field_for_doc, field_for_ref = "_outbound", "_inbound"
1161
+ else:
1162
+ raise Failure(f"Unknown slot {slot} for document {doc_id}")
1163
+
1164
+ updates = [
1165
+ {
1166
+ "filter": {"id": doc_id},
1167
+ "update": {
1168
+ "$push": {
1169
+ field_for_doc: {
1170
+ "id": ref_id,
1171
+ # TODO existing tests are failing due to `KeyError`s for `id_to_type_map.get[ref_id]` here,
1172
+ # which acts as an implicit referential integrity checker (!). Using `.get` with
1173
+ # "nmdc:NamedThing" as default in order to (for now) allow such tests to continue to pass.
1174
+ "type": id_to_type_map.get(ref_id, "nmdc:NamedThing"),
1175
+ }
1176
+ }
1177
+ },
1178
+ },
1179
+ {
1180
+ "filter": {"id": ref_id},
1181
+ "update": {
1182
+ "$push": {
1183
+ field_for_ref: {"id": doc_id, "type": id_to_type_map[doc_id]}
1184
+ }
1185
+ },
1186
+ },
1187
+ ]
1188
+ for update in updates:
1189
+ bulk_operations.append(UpdateOne(**update))
1190
+
1191
+ # Execute in batches for efficiency
1192
+ if len(bulk_operations) >= BULK_WRITE_BATCH_SIZE:
1193
+ temp_collection.bulk_write(bulk_operations)
1194
+ update_count += len(bulk_operations)
1195
+ context.log.info(
1196
+ f"Pushed {update_count/(2*len(relationship_triples)):.1%} of updates so far..."
1197
+ )
1198
+ bulk_operations = []
1199
+
1200
+ # Execute any remaining operations
1201
+ if bulk_operations:
1202
+ temp_collection.bulk_write(bulk_operations)
1203
+ update_count += len(bulk_operations)
1204
+
1205
+ context.log.info(f"Pushed {update_count} updates in total")
1206
+
1207
+ context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
1208
+ temp_collection.create_index("_inbound.id")
1209
+ temp_collection.create_index("_outbound.id")
1210
+ # Create compound indexes to ensure index-covered queries
1211
+ temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
1212
+ temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
1213
+ context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
1214
+
1215
+
1216
+ # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
1217
+ # pass an argument to the op (in order to specify the order of the ops in the graph)
1218
+ # while also telling Dagster that this op doesn't need the _value_ of that argument.
1219
+ # This is the approach shown on: https://docs.dagster.io/api/dagster/types#dagster.Nothing
1220
+ # Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
1221
+ #
1222
+ @op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
1047
1223
  def materialize_alldocs(context) -> int:
1048
1224
  """
1049
- This function re-creates the alldocs collection to reflect the current state of the Mongo database.
1050
- See nmdc-runtime/docs/nb/bulk_validation_referential_integrity_check.ipynb for more details.
1225
+ This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
1226
+
1227
+ 1. Getting all populated schema collection names with an `id` field.
1228
+ 2. Create a temporary collection to build the new alldocs collection.
1229
+ 3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
1230
+ 4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
1231
+ 5. Add special `_inbound` and `_outbound` fields with subdocuments containing ID and type of related entities.
1232
+ 6. Add indexes for `id`, relationship fields, and `{_inbound,_outbound}.type`/`.id` compound indexes.
1233
+ 7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
1234
+
1235
+ The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
1236
+ `nmdc_runtime.site.repository.ensure_alldocs_daily`. The collection is also updated as part of various workflows,
1237
+ such as when applying a changesheet or metadata updates (see `nmdc_runtime.site.graphs`).
1238
+
1239
+ The `alldocs` collection is used primarily by API endpoints like `/data_objects/study/{study_id}` and
1240
+ `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
1241
+ related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
1242
+
1243
+ The {`_inbound`,`_outbound`} fields enable efficient index-covered queries to find all entities of specific types
1244
+ that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
1245
+ expansions.
1051
1246
  """
1052
1247
  mdb = context.resources.mongo.db
1053
1248
  schema_view = nmdc_schema_view()
1054
1249
 
1055
- # batch size for writing documents to alldocs
1056
- BULK_WRITE_BATCH_SIZE = 2000
1057
-
1058
1250
  # TODO include functional_annotation_agg for "real-time" ref integrity checking.
1059
1251
  # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
1060
1252
  collection_names = populated_schema_collection_names_with_id_field(mdb)
@@ -1100,7 +1292,14 @@ def materialize_alldocs(context) -> int:
1100
1292
  documents_processed_counter = 0
1101
1293
  for doc in mdb[coll_name].find():
1102
1294
  try:
1103
- doc_type = doc["type"][5:] # lop off "nmdc:" prefix
1295
+ # Keep the full type with prefix for document
1296
+ doc_type_full = doc["type"]
1297
+ # Remove prefix for slot lookup and ancestor lookup
1298
+ doc_type = (
1299
+ doc_type_full[5:]
1300
+ if doc_type_full.startswith("nmdc:")
1301
+ else doc_type_full
1302
+ )
1104
1303
  except KeyError:
1105
1304
  raise Exception(
1106
1305
  f"doc {doc['id']} in collection {coll_name} has no 'type'!"
@@ -1109,13 +1308,21 @@ def materialize_alldocs(context) -> int:
1109
1308
  doc_type
1110
1309
  ]
1111
1310
  new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
1311
+
1112
1312
  new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
1313
+ # InsertOne is a method on the py-mongo Client class.
1314
+ # Get ancestors without the prefix, but add prefix to each one in the output
1315
+ ancestors = schema_view.class_ancestors(doc_type)
1316
+ new_doc["_type_and_ancestors"] = [
1317
+ "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
1318
+ ]
1113
1319
  write_operations.append(InsertOne(new_doc))
1114
1320
  if len(write_operations) == BULK_WRITE_BATCH_SIZE:
1115
1321
  _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
1116
1322
  write_operations.clear()
1117
1323
  documents_processed_counter += BULK_WRITE_BATCH_SIZE
1118
1324
  if len(write_operations) > 0:
1325
+ # here bulk_write is a method on the py-mongo db Client class
1119
1326
  _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
1120
1327
  documents_processed_counter += len(write_operations)
1121
1328
  context.log.info(
@@ -1136,10 +1343,20 @@ def materialize_alldocs(context) -> int:
1136
1343
  [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
1137
1344
  context.log.info(f"created indexes on id, {slots_to_index}.")
1138
1345
 
1346
+ # Add related-ids fields to enable efficient relationship traversal
1347
+ context.log.info("Adding fields for related ids to documents...")
1348
+ _add_related_ids_to_alldocs(
1349
+ temp_alldocs_collection, context, document_reference_ranged_slots
1350
+ )
1351
+
1139
1352
  context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
1140
1353
  temp_alldocs_collection.rename("alldocs", dropTarget=True)
1141
1354
 
1142
- return mdb.alldocs.estimated_document_count()
1355
+ n_alldocs_documents = mdb.alldocs.estimated_document_count()
1356
+ context.log.info(
1357
+ f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
1358
+ )
1359
+ return n_alldocs_documents
1143
1360
 
1144
1361
 
1145
1362
  @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
@@ -1282,16 +1499,24 @@ def post_submission_portal_biosample_ingest_record_stitching_filename(
1282
1499
  config_schema={
1283
1500
  "nmdc_study_id": str,
1284
1501
  "gold_nmdc_instrument_mapping_file_url": str,
1502
+ "include_field_site_info": bool,
1503
+ "enable_biosample_filtering": bool,
1285
1504
  },
1286
1505
  out={
1287
1506
  "nmdc_study_id": Out(str),
1288
1507
  "gold_nmdc_instrument_mapping_file_url": Out(str),
1508
+ "include_field_site_info": Out(bool),
1509
+ "enable_biosample_filtering": Out(bool),
1289
1510
  },
1290
1511
  )
1291
- def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
1512
+ def get_database_updater_inputs(
1513
+ context: OpExecutionContext,
1514
+ ) -> Tuple[str, str, bool, bool]:
1292
1515
  return (
1293
1516
  context.op_config["nmdc_study_id"],
1294
1517
  context.op_config["gold_nmdc_instrument_mapping_file_url"],
1518
+ context.op_config["include_field_site_info"],
1519
+ context.op_config["enable_biosample_filtering"],
1295
1520
  )
1296
1521
 
1297
1522
 
@@ -1306,6 +1531,8 @@ def generate_data_generation_set_post_biosample_ingest(
1306
1531
  context: OpExecutionContext,
1307
1532
  nmdc_study_id: str,
1308
1533
  gold_nmdc_instrument_map_df: pd.DataFrame,
1534
+ include_field_site_info: bool,
1535
+ enable_biosample_filtering: bool,
1309
1536
  ) -> nmdc.Database:
1310
1537
  runtime_api_user_client: RuntimeApiUserClient = (
1311
1538
  context.resources.runtime_api_user_client
@@ -1321,6 +1548,8 @@ def generate_data_generation_set_post_biosample_ingest(
1321
1548
  gold_api_client,
1322
1549
  nmdc_study_id,
1323
1550
  gold_nmdc_instrument_map_df,
1551
+ include_field_site_info,
1552
+ enable_biosample_filtering,
1324
1553
  )
1325
1554
  database = (
1326
1555
  database_updater.generate_data_generation_set_records_from_gold_api_for_study()
@@ -1340,6 +1569,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
1340
1569
  context: OpExecutionContext,
1341
1570
  nmdc_study_id: str,
1342
1571
  gold_nmdc_instrument_map_df: pd.DataFrame,
1572
+ include_field_site_info: bool = False,
1573
+ enable_biosample_filtering: bool = False,
1343
1574
  ) -> nmdc.Database:
1344
1575
  runtime_api_user_client: RuntimeApiUserClient = (
1345
1576
  context.resources.runtime_api_user_client
@@ -1355,12 +1586,72 @@ def generate_biosample_set_for_nmdc_study_from_gold(
1355
1586
  gold_api_client,
1356
1587
  nmdc_study_id,
1357
1588
  gold_nmdc_instrument_map_df,
1589
+ include_field_site_info,
1590
+ enable_biosample_filtering,
1358
1591
  )
1359
1592
  database = database_updater.generate_biosample_set_from_gold_api_for_study()
1360
1593
 
1361
1594
  return database
1362
1595
 
1363
1596
 
1597
+ @op(
1598
+ required_resource_keys={
1599
+ "runtime_api_user_client",
1600
+ "runtime_api_site_client",
1601
+ "gold_api_client",
1602
+ },
1603
+ out=Out(Any),
1604
+ )
1605
+ def run_script_to_update_insdc_biosample_identifiers(
1606
+ context: OpExecutionContext,
1607
+ nmdc_study_id: str,
1608
+ gold_nmdc_instrument_map_df: pd.DataFrame,
1609
+ include_field_site_info: bool,
1610
+ enable_biosample_filtering: bool,
1611
+ ):
1612
+ """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
1613
+
1614
+ This op uses the DatabaseUpdater to generate a script that can be used to update biosample
1615
+ records with INSDC identifiers obtained from GOLD.
1616
+
1617
+ Args:
1618
+ context: The execution context
1619
+ nmdc_study_id: The NMDC study ID for which to generate the update script
1620
+ gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
1621
+
1622
+ Returns:
1623
+ A dictionary or list of dictionaries containing the MongoDB update script(s)
1624
+ """
1625
+ runtime_api_user_client: RuntimeApiUserClient = (
1626
+ context.resources.runtime_api_user_client
1627
+ )
1628
+ runtime_api_site_client: RuntimeApiSiteClient = (
1629
+ context.resources.runtime_api_site_client
1630
+ )
1631
+ gold_api_client: GoldApiClient = context.resources.gold_api_client
1632
+
1633
+ database_updater = DatabaseUpdater(
1634
+ runtime_api_user_client,
1635
+ runtime_api_site_client,
1636
+ gold_api_client,
1637
+ nmdc_study_id,
1638
+ gold_nmdc_instrument_map_df,
1639
+ include_field_site_info,
1640
+ enable_biosample_filtering,
1641
+ )
1642
+ update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
1643
+
1644
+ if isinstance(update_script, list):
1645
+ total_updates = sum(len(item.get("updates", [])) for item in update_script)
1646
+ else:
1647
+ total_updates = len(update_script.get("updates", []))
1648
+ context.log.info(
1649
+ f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
1650
+ )
1651
+
1652
+ return update_script
1653
+
1654
+
1364
1655
  @op
1365
1656
  def log_database_ids(
1366
1657
  context: OpExecutionContext,
@@ -1382,3 +1673,55 @@ def log_database_ids(
1382
1673
  message += "\n"
1383
1674
  if message:
1384
1675
  context.log.info(message)
1676
+
1677
+
1678
+ @op(
1679
+ description="Render free text through the Dagit UI",
1680
+ out=Out(description="Text content rendered through Dagit UI"),
1681
+ )
1682
+ def render_text(context: OpExecutionContext, text: Any):
1683
+ """
1684
+ Renders content as a Dagster Asset in the Dagit UI.
1685
+
1686
+ This operation creates a Dagster Asset with the provided content, making it
1687
+ visible in the Dagit UI for easy viewing and sharing.
1688
+
1689
+ Args:
1690
+ context: The execution context
1691
+ text: The content to render (can be a string or a dictionary that will be converted to JSON)
1692
+
1693
+ Returns:
1694
+ The same content that was provided as input
1695
+ """
1696
+ # Convert dictionary to formatted JSON string if needed
1697
+ if isinstance(text, dict):
1698
+ import json
1699
+
1700
+ content = json.dumps(text, indent=2)
1701
+ file_extension = "json"
1702
+ hash_text = json.dumps(text, sort_keys=True)[:20] # For consistent hashing
1703
+ else:
1704
+ content = str(text) # Convert to string in case it's not already
1705
+ file_extension = "txt"
1706
+ hash_text = content[:20]
1707
+
1708
+ filename = f"rendered_text_{context.run_id}.{file_extension}"
1709
+ file_path = os.path.join(context.instance.storage_directory(), filename)
1710
+
1711
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
1712
+
1713
+ with open(file_path, "w") as f:
1714
+ f.write(content)
1715
+
1716
+ context.log_event(
1717
+ AssetMaterialization(
1718
+ asset_key=f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}",
1719
+ description="Rendered Content",
1720
+ metadata={
1721
+ "file_path": MetadataValue.path(file_path),
1722
+ "content": MetadataValue.text(content),
1723
+ },
1724
+ )
1725
+ )
1726
+
1727
+ return Output(text)