nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/ops.py CHANGED
@@ -1,21 +1,21 @@
1
1
  import csv
2
2
  import json
3
- import mimetypes
3
+ import logging
4
4
  import os
5
5
  import subprocess
6
- import tempfile
7
6
  from collections import defaultdict
8
7
  from datetime import datetime, timezone
9
- from io import BytesIO, StringIO
8
+ from io import BytesIO
10
9
  from pprint import pformat
11
10
  from toolz.dicttoolz import keyfilter
12
- from typing import Tuple
11
+ from typing import Tuple, Set
13
12
  from zipfile import ZipFile
14
13
  from itertools import chain
15
-
14
+ from ontology_loader.ontology_load_controller import OntologyLoaderController
16
15
  import pandas as pd
17
16
  import requests
18
-
17
+ from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
18
+ from toolz import dissoc
19
19
 
20
20
  from bson import ObjectId, json_util
21
21
  from dagster import (
@@ -26,6 +26,7 @@ from dagster import (
26
26
  Failure,
27
27
  List,
28
28
  MetadataValue,
29
+ Noneable,
29
30
  OpExecutionContext,
30
31
  Out,
31
32
  Output,
@@ -36,12 +37,13 @@ from dagster import (
36
37
  Optional,
37
38
  Field,
38
39
  Permissive,
39
- Bool,
40
+ In,
41
+ Nothing,
40
42
  )
41
43
  from gridfs import GridFS
42
44
  from linkml_runtime.utils.dictutils import as_simple_dict
43
45
  from linkml_runtime.utils.yamlutils import YAMLRoot
44
- from nmdc_runtime.api.db.mongo import get_mongo_db
46
+ from nmdc_runtime.api.db.mongo import validate_json
45
47
  from nmdc_runtime.api.core.idgen import generate_one_id
46
48
  from nmdc_runtime.api.core.metadata import (
47
49
  _validate_changesheet,
@@ -71,7 +73,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
71
73
  fetch_nucleotide_sequencing_from_biosamples,
72
74
  fetch_library_preparation_from_biosamples,
73
75
  )
74
- from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
75
76
  from nmdc_runtime.site.resources import (
76
77
  NmdcPortalApiClient,
77
78
  GoldApiClient,
@@ -93,30 +94,26 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
93
94
  )
94
95
  from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
95
96
  from nmdc_runtime.site.util import (
96
- run_and_log,
97
97
  schema_collection_has_index_on_id,
98
98
  nmdc_study_id_to_filename,
99
99
  get_instruments_by_id,
100
100
  )
101
101
  from nmdc_runtime.util import (
102
- drs_object_in_for,
103
- get_names_of_classes_in_effective_range_of_slot,
104
102
  pluralize,
105
- put_object,
106
- validate_json,
107
103
  specialize_activity_set_docs,
108
104
  collection_name_to_class_names,
109
- class_hierarchy_as_list,
110
105
  nmdc_schema_view,
111
106
  populated_schema_collection_names_with_id_field,
112
107
  )
113
108
  from nmdc_schema import nmdc
114
- from nmdc_schema.nmdc import Database as NMDCDatabase
115
- from pydantic import BaseModel
116
- from pymongo import InsertOne
109
+ from pymongo import InsertOne, UpdateOne
117
110
  from pymongo.database import Database as MongoDatabase
118
- from starlette import status
119
- from toolz import assoc, dissoc, get_in, valfilter, identity
111
+ from pymongo.collection import Collection as MongoCollection
112
+ from toolz import get_in, valfilter, identity
113
+
114
+
115
+ # batch size for writing documents to alldocs
116
+ BULK_WRITE_BATCH_SIZE = 2000
120
117
 
121
118
 
122
119
  @op
@@ -148,99 +145,6 @@ def mongo_stats(context) -> List[str]:
148
145
  return collection_names
149
146
 
150
147
 
151
- @op(
152
- required_resource_keys={"mongo", "runtime_api_site_client"},
153
- retry_policy=RetryPolicy(max_retries=2),
154
- )
155
- def local_file_to_api_object(context, file_info):
156
- client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
157
- storage_path: str = file_info["storage_path"]
158
- mime_type = file_info.get("mime_type")
159
- if mime_type is None:
160
- mime_type = mimetypes.guess_type(storage_path)[0]
161
- rv = client.put_object_in_site(
162
- {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
163
- )
164
- if not rv.status_code == status.HTTP_200_OK:
165
- raise Failure(description=f"put_object_in_site failed: {rv.content}")
166
- op = rv.json()
167
- context.log.info(f"put_object_in_site: {op}")
168
- rv = put_object(storage_path, op["metadata"]["url"])
169
- if not rv.status_code == status.HTTP_200_OK:
170
- raise Failure(description=f"put_object failed: {rv.content}")
171
- op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
172
- rv = client.update_operation(op["id"], op_patch)
173
- if not rv.status_code == status.HTTP_200_OK:
174
- raise Failure(description="update_operation failed")
175
- op = rv.json()
176
- context.log.info(f"update_operation: {op}")
177
- rv = client.create_object_from_op(op)
178
- if rv.status_code != status.HTTP_201_CREATED:
179
- raise Failure("create_object_from_op failed")
180
- obj = rv.json()
181
- context.log.info(f'Created /objects/{obj["id"]}')
182
- mdb = context.resources.mongo.db
183
- rv = mdb.operations.delete_one({"id": op["id"]})
184
- if rv.deleted_count != 1:
185
- context.log.error("deleting op failed")
186
- yield AssetMaterialization(
187
- asset_key=AssetKey(["object", obj["name"]]),
188
- description="output of metadata-translation run_etl",
189
- metadata={"object_id": MetadataValue.text(obj["id"])},
190
- )
191
- yield Output(obj)
192
-
193
-
194
- @op(
195
- out={
196
- "merged_data_path": Out(
197
- str,
198
- description="path to TSV merging of source metadata",
199
- )
200
- }
201
- )
202
- def build_merged_db(context) -> str:
203
- context.log.info("metadata-translation: running `make build-merged-db`")
204
- run_and_log(
205
- "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
206
- )
207
- storage_path = (
208
- "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
209
- )
210
- yield AssetMaterialization(
211
- asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
212
- description="input to metadata-translation run_etl",
213
- metadata={"path": MetadataValue.path(storage_path)},
214
- )
215
- yield Output(storage_path, "merged_data_path")
216
-
217
-
218
- @op(
219
- required_resource_keys={"runtime_api_site_client"},
220
- )
221
- def run_etl(context, merged_data_path: str):
222
- context.log.info("metadata-translation: running `make run-etl`")
223
- if not os.path.exists(merged_data_path):
224
- raise Failure(description=f"merged_db not present at {merged_data_path}")
225
- run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
226
- storage_path = (
227
- "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
228
- )
229
- with ZipFile(storage_path) as zf:
230
- name = zf.namelist()[0]
231
- with zf.open(name) as f:
232
- rv = json.load(f)
233
- context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
234
- yield AssetMaterialization(
235
- asset_key=AssetKey(["gold_translation", "database.json.zip"]),
236
- description="output of metadata-translation run_etl",
237
- metadata={
238
- "path": MetadataValue.path(storage_path),
239
- },
240
- )
241
- yield Output({"storage_path": storage_path})
242
-
243
-
244
148
  @op(required_resource_keys={"mongo"})
245
149
  def get_operation(context):
246
150
  mdb = context.resources.mongo.db
@@ -465,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
465
369
 
466
370
  @op(required_resource_keys={"runtime_api_site_client"})
467
371
  def get_json_in(context):
372
+ """
373
+ TODO: Document this function.
374
+ """
468
375
  object_id = context.op_config.get("object_id")
469
376
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
470
377
  rv = client.get_object_bytes(object_id)
@@ -475,63 +382,17 @@ def get_json_in(context):
475
382
  return rv.json()
476
383
 
477
384
 
478
- def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
479
- """Does not ensure ordering of `docs`."""
480
-
481
- if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
482
- return docs, 0
483
-
484
- do_docs = docs["data_object_set"]
485
-
486
- class FileTypeEnumBase(BaseModel):
487
- name: str
488
- description: str
489
- filter: str # JSON-encoded data_object_set mongo collection filter document
490
-
491
- class FileTypeEnum(FileTypeEnumBase):
492
- id: str
493
-
494
- temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
495
- temp_collection = mdb[temp_collection_name]
496
- temp_collection.insert_many(do_docs)
497
- temp_collection.create_index("id")
498
-
499
- def fte_matches(fte_filter: str):
500
- return [
501
- dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
502
- ]
503
-
504
- do_docs_map = {d["id"]: d for d in do_docs}
505
-
506
- n_docs_with_types_added = 0
507
-
508
- for fte_doc in mdb.file_type_enum.find():
509
- fte = FileTypeEnum(**fte_doc)
510
- docs_matching = fte_matches(fte.filter)
511
- for doc in docs_matching:
512
- if "data_object_type" not in doc:
513
- do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
514
- n_docs_with_types_added += 1
515
-
516
- mdb.drop_collection(temp_collection_name)
517
- return (
518
- assoc(
519
- docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
520
- ),
521
- n_docs_with_types_added,
522
- )
523
-
524
-
525
385
  @op(required_resource_keys={"runtime_api_site_client", "mongo"})
526
386
  def perform_mongo_updates(context, json_in):
387
+ """
388
+ TODO: Document this function.
389
+ """
527
390
  mongo = context.resources.mongo
528
391
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
529
392
  op_id = context.op_config.get("operation_id")
530
393
 
531
394
  docs = json_in
532
395
  docs, _ = specialize_activity_set_docs(docs)
533
- docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
534
- context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
535
396
  context.log.debug(f"{docs}")
536
397
 
537
398
  rv = validate_json(
@@ -555,6 +416,9 @@ def perform_mongo_updates(context, json_in):
555
416
  def _add_schema_docs_with_or_without_replacement(
556
417
  mongo: MongoDBResource, docs: Dict[str, list]
557
418
  ):
419
+ """
420
+ TODO: Document this function.
421
+ """
558
422
  coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
559
423
  if all(coll_index_on_id_map[coll] for coll in docs.keys()):
560
424
  replace = True
@@ -578,7 +442,13 @@ def _add_schema_docs_with_or_without_replacement(
578
442
  f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
579
443
  )
580
444
  op_result = mongo.add_docs(docs, validate=False, replace=replace)
581
- return mongo_add_docs_result_as_dict(op_result)
445
+
446
+ # Translate the operation result into a dictionary in which each item's key is a collection name
447
+ # and each item's value is the corresponding bulk API result (excluding the "upserted" field).
448
+ return {
449
+ collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
450
+ for collection_name, bulk_write_result in op_result.items()
451
+ }
582
452
 
583
453
 
584
454
  @op(required_resource_keys={"mongo"})
@@ -600,22 +470,25 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
600
470
  "study_type": str,
601
471
  "gold_nmdc_instrument_mapping_file_url": str,
602
472
  "include_field_site_info": bool,
473
+ "enable_biosample_filtering": bool,
603
474
  },
604
475
  out={
605
476
  "study_id": Out(str),
606
477
  "study_type": Out(str),
607
478
  "gold_nmdc_instrument_mapping_file_url": Out(str),
608
479
  "include_field_site_info": Out(bool),
480
+ "enable_biosample_filtering": Out(bool),
609
481
  },
610
482
  )
611
483
  def get_gold_study_pipeline_inputs(
612
484
  context: OpExecutionContext,
613
- ) -> Tuple[str, str, str, bool]:
485
+ ) -> Tuple[str, str, str, bool, bool]:
614
486
  return (
615
487
  context.op_config["study_id"],
616
488
  context.op_config["study_type"],
617
489
  context.op_config["gold_nmdc_instrument_mapping_file_url"],
618
490
  context.op_config["include_field_site_info"],
491
+ context.op_config["enable_biosample_filtering"],
619
492
  )
620
493
 
621
494
 
@@ -659,6 +532,7 @@ def nmdc_schema_database_from_gold_study(
659
532
  analysis_projects: List[Dict[str, Any]],
660
533
  gold_nmdc_instrument_map_df: pd.DataFrame,
661
534
  include_field_site_info: bool,
535
+ enable_biosample_filtering: bool,
662
536
  ) -> nmdc.Database:
663
537
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
664
538
 
@@ -674,6 +548,7 @@ def nmdc_schema_database_from_gold_study(
674
548
  analysis_projects,
675
549
  gold_nmdc_instrument_map_df,
676
550
  include_field_site_info,
551
+ enable_biosample_filtering,
677
552
  id_minter=id_minter,
678
553
  )
679
554
  database = translator.get_database()
@@ -681,27 +556,39 @@ def nmdc_schema_database_from_gold_study(
681
556
 
682
557
 
683
558
  @op(
559
+ required_resource_keys={"mongo"},
684
560
  out={
685
561
  "submission_id": Out(),
686
562
  "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
687
563
  "data_object_mapping_file_url": Out(Optional[str]),
688
564
  "biosample_extras_file_url": Out(Optional[str]),
689
565
  "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
566
+ "study_id": Out(Optional[str]),
690
567
  },
691
568
  )
692
569
  def get_submission_portal_pipeline_inputs(
570
+ context: OpExecutionContext,
693
571
  submission_id: str,
694
572
  nucleotide_sequencing_mapping_file_url: Optional[str],
695
573
  data_object_mapping_file_url: Optional[str],
696
574
  biosample_extras_file_url: Optional[str],
697
575
  biosample_extras_slot_mapping_file_url: Optional[str],
698
- ) -> Tuple[str, str | None, str | None, str | None, str | None]:
576
+ study_id: Optional[str],
577
+ ) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
578
+ # query for studies matching the ID to see if it eists
579
+ if study_id:
580
+ mdb = context.resources.mongo.db
581
+ result = mdb.study_set.find_one({"id": study_id})
582
+ if not result:
583
+ raise Exception(f"Study id: {study_id} does not exist in Mongo.")
584
+
699
585
  return (
700
586
  submission_id,
701
587
  nucleotide_sequencing_mapping_file_url,
702
588
  data_object_mapping_file_url,
703
589
  biosample_extras_file_url,
704
590
  biosample_extras_slot_mapping_file_url,
591
+ study_id,
705
592
  )
706
593
 
707
594
 
@@ -726,6 +613,7 @@ def translate_portal_submission_to_nmdc_schema_database(
726
613
  study_pi_image_url: Optional[str],
727
614
  biosample_extras: Optional[list[dict]],
728
615
  biosample_extras_slot_mapping: Optional[list[dict]],
616
+ study_id: Optional[str],
729
617
  ) -> nmdc.Database:
730
618
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
731
619
 
@@ -743,11 +631,37 @@ def translate_portal_submission_to_nmdc_schema_database(
743
631
  biosample_extras=biosample_extras,
744
632
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
745
633
  illumina_instrument_mapping=instrument_mapping,
634
+ study_id=study_id,
746
635
  )
747
636
  database = translator.get_database()
748
637
  return database
749
638
 
750
639
 
640
+ @op(required_resource_keys={"nmdc_portal_api_client"})
641
+ def add_public_image_urls(
642
+ context: OpExecutionContext, database: nmdc.Database, submission_id: str
643
+ ) -> nmdc.Database:
644
+ client: NmdcPortalApiClient = context.resources.nmdc_portal_api_client
645
+
646
+ if len(database.study_set) != 1:
647
+ raise Failure(
648
+ description="Expected exactly one study in the database to add public image URLs."
649
+ )
650
+
651
+ study_id = database.study_set[0].id
652
+ public_images = client.make_submission_images_public(
653
+ submission_id, study_id=study_id
654
+ )
655
+ SubmissionPortalTranslator.set_study_images(
656
+ database.study_set[0],
657
+ public_images.get("pi_image_url"),
658
+ public_images.get("primary_study_image_url"),
659
+ public_images.get("study_image_urls"),
660
+ )
661
+
662
+ return database
663
+
664
+
751
665
  @op
752
666
  def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
753
667
  source_id = None
@@ -1043,18 +957,246 @@ def site_code_mapping() -> dict:
1043
957
  )
1044
958
 
1045
959
 
1046
- @op(required_resource_keys={"mongo"})
1047
- def materialize_alldocs(context) -> int:
960
+ @op(
961
+ required_resource_keys={"mongo"},
962
+ config_schema={
963
+ "source_ontology": str,
964
+ "output_directory": Field(Noneable(str), default_value=None, is_required=False),
965
+ "generate_reports": Field(bool, default_value=True, is_required=False),
966
+ },
967
+ )
968
+ def load_ontology(context: OpExecutionContext):
969
+ cfg = context.op_config
970
+ source_ontology = cfg["source_ontology"]
971
+ output_directory = cfg.get("output_directory")
972
+ generate_reports = cfg.get("generate_reports", True)
973
+
974
+ if output_directory is None:
975
+ output_directory = os.path.join(os.getcwd(), "ontology_reports")
976
+
977
+ # Redirect Python logging to Dagster context
978
+ handler = logging.Handler()
979
+ handler.emit = lambda record: context.log.info(record.getMessage())
980
+
981
+ # Get logger from ontology-loader package
982
+ controller_logger = logging.getLogger("ontology_loader.ontology_load_controller")
983
+ controller_logger.setLevel(logging.INFO)
984
+ controller_logger.addHandler(handler)
985
+
986
+ context.log.info(f"Running Ontology Loader for ontology: {source_ontology}")
987
+ loader = OntologyLoaderController(
988
+ source_ontology=source_ontology,
989
+ output_directory=output_directory,
990
+ generate_reports=generate_reports,
991
+ mongo_client=context.resources.mongo.client,
992
+ db_name=context.resources.mongo.db.name,
993
+ )
994
+
995
+ loader.run_ontology_loader()
996
+ context.log.info(f"Ontology load for {source_ontology} completed successfully!")
997
+
998
+
999
+ def _add_linked_instances_to_alldocs(
1000
+ temp_collection: MongoCollection,
1001
+ context: OpExecutionContext,
1002
+ document_reference_ranged_slots_by_type: dict,
1003
+ ) -> None:
1004
+ """
1005
+ Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
1006
+
1007
+ The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
1008
+ Each subdocument represents a link to another document that either links to or is linked from the document via
1009
+ document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
1010
+ document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
1011
+ considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
1012
+
1013
+ Args:
1014
+ temp_collection: The temporary MongoDB collection to process
1015
+ context: The Dagster execution context for logging
1016
+ document_reference_ranged_slots_by_type: Dictionary mapping document types to their reference-ranged slot names
1017
+
1018
+ Returns:
1019
+ None (modifies the documents in place)
1020
+ """
1021
+
1022
+ context.log.info(
1023
+ "Building relationships and adding `_upstream` and `_downstream` fields..."
1024
+ )
1025
+
1026
+ # document ID -> type (with "nmdc:" prefix preserved)
1027
+ id_to_type_map: Dict[str, str] = {}
1028
+
1029
+ # set of (<referencing document ID>, <slot>, <referenced document ID>) 3-tuples.
1030
+ relationship_triples: Set[Tuple[str, str, str]] = set()
1031
+
1032
+ # Collect relationship triples.
1033
+ for doc in temp_collection.find():
1034
+ doc_id = doc["id"]
1035
+ # Store the full type with prefix intact
1036
+ doc_type = doc["type"]
1037
+ # For looking up reference slots, we still need the type without prefix
1038
+ doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
1039
+
1040
+ # Record ID to type mapping - preserve the original type with prefix
1041
+ id_to_type_map[doc_id] = doc_type
1042
+
1043
+ # Find all document references from this document
1044
+ reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
1045
+ for slot in reference_slots:
1046
+ if slot in doc:
1047
+ # Handle both single-value and array references
1048
+ refs = doc[slot] if isinstance(doc[slot], list) else [doc[slot]]
1049
+ for ref_doc in temp_collection.find(
1050
+ {"id": {"$in": refs}}, ["id", "type"]
1051
+ ):
1052
+ id_to_type_map[ref_doc["id"]] = ref_doc["type"]
1053
+ for ref_id in refs:
1054
+ relationship_triples.add((doc_id, slot, ref_id))
1055
+
1056
+ context.log.info(
1057
+ f"Found {len(id_to_type_map)} documents, with "
1058
+ f"{len({d for (d, _, _) in relationship_triples})} containing references"
1059
+ )
1060
+
1061
+ # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
1062
+ # in order to perform graph traversal and collect all entities "related" to a given entity without
1063
+ # recursion "exploding".
1064
+ #
1065
+ # Note: We are hard-coding this "direction" information here in the Runtime
1066
+ # because the NMDC schema does not currently contain or expose it.
1067
+ #
1068
+ # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
1069
+ upstream_document_reference_ranged_slots = [
1070
+ "associated_studies", # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
1071
+ "collected_from", # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
1072
+ "has_chromatography_configuration", # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
1073
+ "has_input", # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
1074
+ "has_mass_spectrometry_configuration", # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
1075
+ "instrument_used", # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
1076
+ "part_of", # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
1077
+ "was_generated_by", # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
1078
+ "was_informed_by", # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
1079
+ ]
1080
+ # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
1081
+ downstream_document_reference_ranged_slots = [
1082
+ "calibration_object", # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
1083
+ "generates_calibration", # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
1084
+ "has_output", # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
1085
+ "in_manifest", # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
1086
+ "uses_calibration", # when a `nmdc:CalibrationInformation`is part of a `nmdc:PlannedProcess`.
1087
+ # Note: I don't think of superseding something as being either upstream or downstream of that thing;
1088
+ # but this function requires every document-reference-ranged slot to be accounted for in one
1089
+ # list or the other, and the superseding thing does arise _later_ than the thing it supersedes,
1090
+ # so I have opted to treat the superseding thing as being downstream.
1091
+ "superseded_by", # when a `nmdc:WorkflowExecution` or `nmdc:DataObject` is superseded by a `nmdc:WorkflowExecution`.
1092
+ ]
1093
+
1094
+ unique_document_reference_ranged_slot_names = set()
1095
+ for slot_names in document_reference_ranged_slots_by_type.values():
1096
+ for slot_name in slot_names:
1097
+ unique_document_reference_ranged_slot_names.add(slot_name)
1098
+ context.log.info(f"{unique_document_reference_ranged_slot_names=}")
1099
+ if len(upstream_document_reference_ranged_slots) + len(
1100
+ downstream_document_reference_ranged_slots
1101
+ ) != len(unique_document_reference_ranged_slot_names):
1102
+ raise Failure(
1103
+ "Number of detected unique document-reference-ranged slot names does not match "
1104
+ "sum of accounted-for upstream and downstream document-reference-ranged slot names."
1105
+ )
1106
+
1107
+ # Construct, and update documents with, `_upstream` and `_downstream` field values.
1108
+ #
1109
+ # manage batching of MongoDB `bulk_write` operations
1110
+ bulk_operations, update_count = [], 0
1111
+ for doc_id, slot, ref_id in relationship_triples:
1112
+
1113
+ # Determine in which respective fields to push this relationship
1114
+ # for the subject (doc) and object (ref) of this triple.
1115
+ if slot in upstream_document_reference_ranged_slots:
1116
+ field_for_doc, field_for_ref = "_upstream", "_downstream"
1117
+ elif slot in downstream_document_reference_ranged_slots:
1118
+ field_for_doc, field_for_ref = "_downstream", "_upstream"
1119
+ else:
1120
+ raise Failure(f"Unknown slot {slot} for document {doc_id}")
1121
+
1122
+ updates = [
1123
+ {
1124
+ "filter": {"id": doc_id},
1125
+ "update": {
1126
+ "$push": {
1127
+ field_for_doc: {
1128
+ "id": ref_id,
1129
+ # TODO existing tests are failing due to `KeyError`s for `id_to_type_map.get[ref_id]` here,
1130
+ # which acts as an implicit referential integrity checker (!). Using `.get` with
1131
+ # "nmdc:NamedThing" as default in order to (for now) allow such tests to continue to pass.
1132
+ "type": id_to_type_map.get(ref_id, "nmdc:NamedThing"),
1133
+ }
1134
+ }
1135
+ },
1136
+ },
1137
+ {
1138
+ "filter": {"id": ref_id},
1139
+ "update": {
1140
+ "$push": {
1141
+ field_for_ref: {"id": doc_id, "type": id_to_type_map[doc_id]}
1142
+ }
1143
+ },
1144
+ },
1145
+ ]
1146
+ for update in updates:
1147
+ bulk_operations.append(UpdateOne(**update))
1148
+
1149
+ # Execute in batches for efficiency
1150
+ if len(bulk_operations) >= BULK_WRITE_BATCH_SIZE:
1151
+ temp_collection.bulk_write(bulk_operations)
1152
+ update_count += len(bulk_operations)
1153
+ context.log.info(
1154
+ f"Pushed {update_count/(2*len(relationship_triples)):.1%} of updates so far..."
1155
+ )
1156
+ bulk_operations = []
1157
+
1158
+ # Execute any remaining operations
1159
+ if bulk_operations:
1160
+ temp_collection.bulk_write(bulk_operations)
1161
+ update_count += len(bulk_operations)
1162
+
1163
+ context.log.info(f"Pushed {update_count} updates in total")
1164
+
1165
+
1166
+ # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
1167
+ # pass an argument to the op (in order to specify the order of the ops in the graph)
1168
+ # while also telling Dagster that this op doesn't need the _value_ of that argument.
1169
+ # This is the approach shown on: https://docs.dagster.io/api/dagster/types#dagster.Nothing
1170
+ # Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
1171
+ #
1172
+ @op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
1173
+ def materialize_alldocs(context: OpExecutionContext) -> int:
1048
1174
  """
1049
- This function re-creates the alldocs collection to reflect the current state of the Mongo database.
1050
- See nmdc-runtime/docs/nb/bulk_validation_referential_integrity_check.ipynb for more details.
1175
+ This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
1176
+
1177
+ 1. Getting all populated schema collection names with an `id` field.
1178
+ 2. Create a temporary collection to build the new alldocs collection.
1179
+ 3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
1180
+ 4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
1181
+ 5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
1182
+ 6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
1183
+ 7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
1184
+
1185
+ The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
1186
+ `nmdc_runtime.site.repository.ensure_alldocs_daily`. The collection is also updated as part of various workflows,
1187
+ such as when applying a changesheet or metadata updates (see `nmdc_runtime.site.graphs`).
1188
+
1189
+ The `alldocs` collection is used primarily by API endpoints like `/data_objects/study/{study_id}` and
1190
+ `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
1191
+ related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
1192
+
1193
+ The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
1194
+ that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
1195
+ expansions.
1051
1196
  """
1052
1197
  mdb = context.resources.mongo.db
1053
1198
  schema_view = nmdc_schema_view()
1054
1199
 
1055
- # batch size for writing documents to alldocs
1056
- BULK_WRITE_BATCH_SIZE = 2000
1057
-
1058
1200
  # TODO include functional_annotation_agg for "real-time" ref integrity checking.
1059
1201
  # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
1060
1202
  collection_names = populated_schema_collection_names_with_id_field(mdb)
@@ -1079,14 +1221,16 @@ def materialize_alldocs(context) -> int:
1079
1221
  )
1080
1222
  )
1081
1223
 
1082
- document_reference_ranged_slots = defaultdict(list)
1224
+ document_reference_ranged_slots_by_type = defaultdict(list)
1083
1225
  for cls_name, slot_map in cls_slot_map.items():
1084
1226
  for slot_name, slot in slot_map.items():
1085
1227
  if (
1086
1228
  set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
1087
1229
  & document_referenceable_ranges
1088
1230
  ):
1089
- document_reference_ranged_slots[cls_name].append(slot_name)
1231
+ document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
1232
+ slot_name
1233
+ )
1090
1234
 
1091
1235
  # Build `alldocs` to a temporary collection for atomic replacement
1092
1236
  # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
@@ -1100,22 +1244,31 @@ def materialize_alldocs(context) -> int:
1100
1244
  documents_processed_counter = 0
1101
1245
  for doc in mdb[coll_name].find():
1102
1246
  try:
1103
- doc_type = doc["type"][5:] # lop off "nmdc:" prefix
1247
+ # Keep the full type with prefix for document
1248
+ doc_type_full = doc["type"]
1249
+ # Remove prefix for slot lookup and ancestor lookup
1250
+ doc_type = doc_type_full.removeprefix("nmdc:")
1104
1251
  except KeyError:
1105
1252
  raise Exception(
1106
1253
  f"doc {doc['id']} in collection {coll_name} has no 'type'!"
1107
1254
  )
1108
- slots_to_include = ["id", "type"] + document_reference_ranged_slots[
1109
- doc_type
1255
+ slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
1256
+ doc_type_full
1110
1257
  ]
1111
1258
  new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
1112
- new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
1259
+
1260
+ # Get ancestors without the prefix, but add prefix to each one in the output
1261
+ new_doc["_type_and_ancestors"] = [
1262
+ f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
1263
+ ]
1264
+ # InsertOne is a pymongo representation of a mongo command.
1113
1265
  write_operations.append(InsertOne(new_doc))
1114
1266
  if len(write_operations) == BULK_WRITE_BATCH_SIZE:
1115
1267
  _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
1116
1268
  write_operations.clear()
1117
1269
  documents_processed_counter += BULK_WRITE_BATCH_SIZE
1118
1270
  if len(write_operations) > 0:
1271
+ # here bulk_write is a method on the pymongo db Collection class
1119
1272
  _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
1120
1273
  documents_processed_counter += len(write_operations)
1121
1274
  context.log.info(
@@ -1132,14 +1285,36 @@ def materialize_alldocs(context) -> int:
1132
1285
  # so that `temp_alldocs_collection` will be "good to go" on renaming.
1133
1286
  temp_alldocs_collection.create_index("id", unique=True)
1134
1287
  # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
1135
- slots_to_index = ["has_input", "has_output", "was_informed_by"]
1288
+ slots_to_index = {"_type_and_ancestors"} | {
1289
+ slot
1290
+ for slots in document_reference_ranged_slots_by_type.values()
1291
+ for slot in slots
1292
+ }
1136
1293
  [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
1137
- context.log.info(f"created indexes on id, {slots_to_index}.")
1294
+ context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
1295
+
1296
+ # Add related-ids fields to enable efficient relationship traversal
1297
+ context.log.info("Adding fields for related ids to documents...")
1298
+ _add_linked_instances_to_alldocs(
1299
+ temp_alldocs_collection, context, document_reference_ranged_slots_by_type
1300
+ )
1301
+ context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
1302
+ temp_alldocs_collection.create_index("_upstream.id")
1303
+ temp_alldocs_collection.create_index("_downstream.id")
1304
+ # Create compound indexes to ensure index-covered queries
1305
+ temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
1306
+ temp_alldocs_collection.create_index(
1307
+ [("_downstream.type", 1), ("_downstream.id", 1)]
1308
+ )
1309
+ context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
1138
1310
 
1139
1311
  context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
1140
1312
  temp_alldocs_collection.rename("alldocs", dropTarget=True)
1141
-
1142
- return mdb.alldocs.estimated_document_count()
1313
+ n_alldocs_documents = mdb.alldocs.estimated_document_count()
1314
+ context.log.info(
1315
+ f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
1316
+ )
1317
+ return n_alldocs_documents
1143
1318
 
1144
1319
 
1145
1320
  @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
@@ -1225,6 +1400,42 @@ def get_library_preparation_from_biosamples(
1225
1400
  return biosample_lib_prep
1226
1401
 
1227
1402
 
1403
+ @op(required_resource_keys={"mongo"})
1404
+ def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
1405
+ from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
1406
+
1407
+ mdb = context.resources.mongo.db
1408
+ material_processing_set = mdb["material_processing_set"]
1409
+ pooled_biosamples_data = check_pooling_for_biosamples(
1410
+ material_processing_set, biosamples
1411
+ )
1412
+
1413
+ # Fetch ProcessedSample names from database
1414
+ processed_sample_ids = set()
1415
+ for biosample_id, pooling_info in pooled_biosamples_data.items():
1416
+ if pooling_info and pooling_info.get("processed_sample_id"):
1417
+ processed_sample_ids.add(pooling_info["processed_sample_id"])
1418
+
1419
+ # Query database for ProcessedSample names
1420
+ if processed_sample_ids:
1421
+ processed_sample_set = mdb["processed_sample_set"]
1422
+ cursor = processed_sample_set.find(
1423
+ {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
1424
+ )
1425
+ processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
1426
+
1427
+ # Update pooled_biosamples_data with ProcessedSample names
1428
+ for biosample_id, pooling_info in pooled_biosamples_data.items():
1429
+ if pooling_info and pooling_info.get("processed_sample_id"):
1430
+ processed_sample_id = pooling_info["processed_sample_id"]
1431
+ if processed_sample_id in processed_samples:
1432
+ pooling_info["processed_sample_name"] = processed_samples[
1433
+ processed_sample_id
1434
+ ]
1435
+
1436
+ return pooled_biosamples_data
1437
+
1438
+
1228
1439
  @op(required_resource_keys={"mongo"})
1229
1440
  def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
1230
1441
  mdb = context.resources.mongo.db
@@ -1258,6 +1469,7 @@ def ncbi_submission_xml_from_nmdc_study(
1258
1469
  data_object_records: list,
1259
1470
  library_preparation_records: list,
1260
1471
  all_instruments: dict,
1472
+ pooled_biosamples_data: dict,
1261
1473
  ) -> str:
1262
1474
  ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
1263
1475
  ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1266,6 +1478,7 @@ def ncbi_submission_xml_from_nmdc_study(
1266
1478
  data_object_records,
1267
1479
  library_preparation_records,
1268
1480
  all_instruments,
1481
+ pooled_biosamples_data,
1269
1482
  )
1270
1483
  return ncbi_xml
1271
1484
 
@@ -1282,16 +1495,24 @@ def post_submission_portal_biosample_ingest_record_stitching_filename(
1282
1495
  config_schema={
1283
1496
  "nmdc_study_id": str,
1284
1497
  "gold_nmdc_instrument_mapping_file_url": str,
1498
+ "include_field_site_info": bool,
1499
+ "enable_biosample_filtering": bool,
1285
1500
  },
1286
1501
  out={
1287
1502
  "nmdc_study_id": Out(str),
1288
1503
  "gold_nmdc_instrument_mapping_file_url": Out(str),
1504
+ "include_field_site_info": Out(bool),
1505
+ "enable_biosample_filtering": Out(bool),
1289
1506
  },
1290
1507
  )
1291
- def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
1508
+ def get_database_updater_inputs(
1509
+ context: OpExecutionContext,
1510
+ ) -> Tuple[str, str, bool, bool]:
1292
1511
  return (
1293
1512
  context.op_config["nmdc_study_id"],
1294
1513
  context.op_config["gold_nmdc_instrument_mapping_file_url"],
1514
+ context.op_config["include_field_site_info"],
1515
+ context.op_config["enable_biosample_filtering"],
1295
1516
  )
1296
1517
 
1297
1518
 
@@ -1306,6 +1527,8 @@ def generate_data_generation_set_post_biosample_ingest(
1306
1527
  context: OpExecutionContext,
1307
1528
  nmdc_study_id: str,
1308
1529
  gold_nmdc_instrument_map_df: pd.DataFrame,
1530
+ include_field_site_info: bool,
1531
+ enable_biosample_filtering: bool,
1309
1532
  ) -> nmdc.Database:
1310
1533
  runtime_api_user_client: RuntimeApiUserClient = (
1311
1534
  context.resources.runtime_api_user_client
@@ -1321,6 +1544,8 @@ def generate_data_generation_set_post_biosample_ingest(
1321
1544
  gold_api_client,
1322
1545
  nmdc_study_id,
1323
1546
  gold_nmdc_instrument_map_df,
1547
+ include_field_site_info,
1548
+ enable_biosample_filtering,
1324
1549
  )
1325
1550
  database = (
1326
1551
  database_updater.generate_data_generation_set_records_from_gold_api_for_study()
@@ -1340,6 +1565,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
1340
1565
  context: OpExecutionContext,
1341
1566
  nmdc_study_id: str,
1342
1567
  gold_nmdc_instrument_map_df: pd.DataFrame,
1568
+ include_field_site_info: bool = False,
1569
+ enable_biosample_filtering: bool = False,
1343
1570
  ) -> nmdc.Database:
1344
1571
  runtime_api_user_client: RuntimeApiUserClient = (
1345
1572
  context.resources.runtime_api_user_client
@@ -1355,12 +1582,72 @@ def generate_biosample_set_for_nmdc_study_from_gold(
1355
1582
  gold_api_client,
1356
1583
  nmdc_study_id,
1357
1584
  gold_nmdc_instrument_map_df,
1585
+ include_field_site_info,
1586
+ enable_biosample_filtering,
1358
1587
  )
1359
1588
  database = database_updater.generate_biosample_set_from_gold_api_for_study()
1360
1589
 
1361
1590
  return database
1362
1591
 
1363
1592
 
1593
+ @op(
1594
+ required_resource_keys={
1595
+ "runtime_api_user_client",
1596
+ "runtime_api_site_client",
1597
+ "gold_api_client",
1598
+ },
1599
+ out=Out(Any),
1600
+ )
1601
+ def run_script_to_update_insdc_biosample_identifiers(
1602
+ context: OpExecutionContext,
1603
+ nmdc_study_id: str,
1604
+ gold_nmdc_instrument_map_df: pd.DataFrame,
1605
+ include_field_site_info: bool,
1606
+ enable_biosample_filtering: bool,
1607
+ ):
1608
+ """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
1609
+
1610
+ This op uses the DatabaseUpdater to generate a script that can be used to update biosample
1611
+ records with INSDC identifiers obtained from GOLD.
1612
+
1613
+ Args:
1614
+ context: The execution context
1615
+ nmdc_study_id: The NMDC study ID for which to generate the update script
1616
+ gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
1617
+
1618
+ Returns:
1619
+ A dictionary or list of dictionaries containing the MongoDB update script(s)
1620
+ """
1621
+ runtime_api_user_client: RuntimeApiUserClient = (
1622
+ context.resources.runtime_api_user_client
1623
+ )
1624
+ runtime_api_site_client: RuntimeApiSiteClient = (
1625
+ context.resources.runtime_api_site_client
1626
+ )
1627
+ gold_api_client: GoldApiClient = context.resources.gold_api_client
1628
+
1629
+ database_updater = DatabaseUpdater(
1630
+ runtime_api_user_client,
1631
+ runtime_api_site_client,
1632
+ gold_api_client,
1633
+ nmdc_study_id,
1634
+ gold_nmdc_instrument_map_df,
1635
+ include_field_site_info,
1636
+ enable_biosample_filtering,
1637
+ )
1638
+ update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
1639
+
1640
+ if isinstance(update_script, list):
1641
+ total_updates = sum(len(item.get("updates", [])) for item in update_script)
1642
+ else:
1643
+ total_updates = len(update_script.get("updates", []))
1644
+ context.log.info(
1645
+ f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
1646
+ )
1647
+
1648
+ return update_script
1649
+
1650
+
1364
1651
  @op
1365
1652
  def log_database_ids(
1366
1653
  context: OpExecutionContext,
@@ -1382,3 +1669,55 @@ def log_database_ids(
1382
1669
  message += "\n"
1383
1670
  if message:
1384
1671
  context.log.info(message)
1672
+
1673
+
1674
+ @op(
1675
+ description="Render free text through the Dagit UI",
1676
+ out=Out(description="Text content rendered through Dagit UI"),
1677
+ )
1678
+ def render_text(context: OpExecutionContext, text: Any):
1679
+ """
1680
+ Renders content as a Dagster Asset in the Dagit UI.
1681
+
1682
+ This operation creates a Dagster Asset with the provided content, making it
1683
+ visible in the Dagit UI for easy viewing and sharing.
1684
+
1685
+ Args:
1686
+ context: The execution context
1687
+ text: The content to render (can be a string or a dictionary that will be converted to JSON)
1688
+
1689
+ Returns:
1690
+ The same content that was provided as input
1691
+ """
1692
+ # Convert dictionary to formatted JSON string if needed
1693
+ if isinstance(text, dict):
1694
+ import json
1695
+
1696
+ content = json.dumps(text, indent=2)
1697
+ file_extension = "json"
1698
+ hash_text = json.dumps(text, sort_keys=True)[:20] # For consistent hashing
1699
+ else:
1700
+ content = str(text) # Convert to string in case it's not already
1701
+ file_extension = "txt"
1702
+ hash_text = content[:20]
1703
+
1704
+ filename = f"rendered_text_{context.run_id}.{file_extension}"
1705
+ file_path = os.path.join(context.instance.storage_directory(), filename)
1706
+
1707
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
1708
+
1709
+ with open(file_path, "w") as f:
1710
+ f.write(content)
1711
+
1712
+ context.log_event(
1713
+ AssetMaterialization(
1714
+ asset_key=f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}",
1715
+ description="Rendered Content",
1716
+ metadata={
1717
+ "file_path": MetadataValue.path(file_path),
1718
+ "content": MetadataValue.text(content),
1719
+ },
1720
+ )
1721
+ )
1722
+
1723
+ return Output(text)