nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/site/ops.py CHANGED
@@ -1,16 +1,21 @@
  import csv
  import json
- import mimetypes
+ import logging
  import os
  import subprocess
- import tempfile
  from collections import defaultdict
  from datetime import datetime, timezone
- from io import BytesIO, StringIO
- from typing import Tuple
+ from io import BytesIO
+ from pprint import pformat
+ from toolz.dicttoolz import keyfilter
+ from typing import Tuple, Set
  from zipfile import ZipFile
+ from itertools import chain
+ from ontology_loader.ontology_load_controller import OntologyLoaderController
  import pandas as pd
  import requests
+ from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
+ from toolz import dissoc

  from bson import ObjectId, json_util
  from dagster import (
@@ -21,6 +26,7 @@ from dagster import (
  Failure,
  List,
  MetadataValue,
+ Noneable,
  OpExecutionContext,
  Out,
  Output,
@@ -29,10 +35,15 @@ from dagster import (
  String,
  op,
  Optional,
+ Field,
+ Permissive,
+ In,
+ Nothing,
  )
  from gridfs import GridFS
- from linkml_runtime.dumpers import json_dumper
+ from linkml_runtime.utils.dictutils import as_simple_dict
  from linkml_runtime.utils.yamlutils import YAMLRoot
+ from nmdc_runtime.api.db.mongo import validate_json
  from nmdc_runtime.api.core.idgen import generate_one_id
  from nmdc_runtime.api.core.metadata import (
  _validate_changesheet,
@@ -42,6 +53,7 @@ from nmdc_runtime.api.core.metadata import (
  )
  from nmdc_runtime.api.core.util import dotted_path_for, hash_from_str, json_clean, now
  from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object
+ from nmdc_runtime.api.endpoints.find import find_study_by_id
  from nmdc_runtime.api.models.job import Job, JobOperationMetadata
  from nmdc_runtime.api.models.metadata import ChangesheetIn
  from nmdc_runtime.api.models.operation import (
@@ -55,36 +67,53 @@ from nmdc_runtime.api.models.run import (
  _add_run_complete_event,
  )
  from nmdc_runtime.api.models.util import ResultT
- from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
+ from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
+ from nmdc_runtime.site.export.ncbi_xml_utils import (
+ fetch_data_objects_from_biosamples,
+ fetch_nucleotide_sequencing_from_biosamples,
+ fetch_library_preparation_from_biosamples,
+ )
  from nmdc_runtime.site.resources import (
  NmdcPortalApiClient,
  GoldApiClient,
  RuntimeApiSiteClient,
  RuntimeApiUserClient,
  NeonApiClient,
+ MongoDB as MongoDBResource,
  )
  from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
  from nmdc_runtime.site.translation.neon_soil_translator import NeonSoilDataTranslator
  from nmdc_runtime.site.translation.neon_benthic_translator import (
  NeonBenthicDataTranslator,
  )
+ from nmdc_runtime.site.translation.neon_surface_water_translator import (
+ NeonSurfaceWaterDataTranslator,
+ )
  from nmdc_runtime.site.translation.submission_portal_translator import (
  SubmissionPortalTranslator,
  )
- from nmdc_runtime.site.util import collection_indexed_on_id, run_and_log
+ from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
+ from nmdc_runtime.site.util import (
+ schema_collection_has_index_on_id,
+ nmdc_study_id_to_filename,
+ get_instruments_by_id,
+ )
  from nmdc_runtime.util import (
- drs_object_in_for,
  pluralize,
- put_object,
- validate_json,
  specialize_activity_set_docs,
+ collection_name_to_class_names,
+ nmdc_schema_view,
+ populated_schema_collection_names_with_id_field,
  )
  from nmdc_schema import nmdc
- from pydantic import BaseModel
+ from pymongo import InsertOne, UpdateOne
  from pymongo.database import Database as MongoDatabase
- from starlette import status
- from terminusdb_client.woqlquery import WOQLQuery as WQ
- from toolz import assoc, dissoc, get_in, valfilter, identity
+ from pymongo.collection import Collection as MongoCollection
+ from toolz import get_in, valfilter, identity
+
+
+ # batch size for writing documents to alldocs
+ BULK_WRITE_BATCH_SIZE = 2000


  @op
@@ -108,14 +137,6 @@ def log_env(context):
  context.log.info("\n".join(out))


- @op(required_resource_keys={"terminus"})
- def list_databases(context) -> List[String]:
- client = context.resources.terminus.client
- list_ = client.list_databases()
- context.log.info(f"databases: {list_}")
- return list_
-
-
  @op(required_resource_keys={"mongo"})
  def mongo_stats(context) -> List[str]:
  db = context.resources.mongo.db
@@ -124,134 +145,6 @@ def mongo_stats(context) -> List[str]:
  return collection_names


- @op(required_resource_keys={"terminus"})
- def update_schema(context):
- with tempfile.TemporaryDirectory() as tmpdirname:
- try:
- context.log.info("shallow-cloning nmdc-schema repo")
- subprocess.check_output(
- "git clone https://github.com/microbiomedata/nmdc-schema.git"
- f" --branch main --single-branch {tmpdirname}/nmdc-schema",
- shell=True,
- )
- context.log.info("generating TerminusDB JSON-LD from NMDC LinkML")
- subprocess.check_output(
- f"gen-terminusdb {tmpdirname}/nmdc-schema/src/schema/nmdc.yaml"
- f" > {tmpdirname}/nmdc.terminus.json",
- shell=True,
- )
- except subprocess.CalledProcessError as e:
- if e.stdout:
- context.log.debug(e.stdout.decode())
- if e.stderr:
- context.log.error(e.stderr.decode())
- context.log.debug(str(e.returncode))
- raise e
-
- with open(f"{tmpdirname}/nmdc.terminus.json") as f:
- woql_dict = json.load(f)
-
- context.log.info("Updating terminus schema via WOQLQuery")
- rv = WQ(query=woql_dict).execute(
- context.resources.terminus.client, "update schema via WOQL"
- )
- context.log.info(str(rv))
- return rv
-
-
- @op(
- required_resource_keys={"mongo", "runtime_api_site_client"},
- retry_policy=RetryPolicy(max_retries=2),
- )
- def local_file_to_api_object(context, file_info):
- client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
- storage_path: str = file_info["storage_path"]
- mime_type = file_info.get("mime_type")
- if mime_type is None:
- mime_type = mimetypes.guess_type(storage_path)[0]
- rv = client.put_object_in_site(
- {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
- )
- if not rv.status_code == status.HTTP_200_OK:
- raise Failure(description=f"put_object_in_site failed: {rv.content}")
- op = rv.json()
- context.log.info(f"put_object_in_site: {op}")
- rv = put_object(storage_path, op["metadata"]["url"])
- if not rv.status_code == status.HTTP_200_OK:
- raise Failure(description=f"put_object failed: {rv.content}")
- op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
- rv = client.update_operation(op["id"], op_patch)
- if not rv.status_code == status.HTTP_200_OK:
- raise Failure(description="update_operation failed")
- op = rv.json()
- context.log.info(f"update_operation: {op}")
- rv = client.create_object_from_op(op)
- if rv.status_code != status.HTTP_201_CREATED:
- raise Failure("create_object_from_op failed")
- obj = rv.json()
- context.log.info(f'Created /objects/{obj["id"]}')
- mdb = context.resources.mongo.db
- rv = mdb.operations.delete_one({"id": op["id"]})
- if rv.deleted_count != 1:
- context.log.error("deleting op failed")
- yield AssetMaterialization(
- asset_key=AssetKey(["object", obj["name"]]),
- description="output of metadata-translation run_etl",
- metadata={"object_id": MetadataValue.text(obj["id"])},
- )
- yield Output(obj)
-
-
- @op(
- out={
- "merged_data_path": Out(
- str,
- description="path to TSV merging of source metadata",
- )
- }
- )
- def build_merged_db(context) -> str:
- context.log.info("metadata-translation: running `make build-merged-db`")
- run_and_log(
- "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
- )
- storage_path = (
- "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
- )
- yield AssetMaterialization(
- asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
- description="input to metadata-translation run_etl",
- metadata={"path": MetadataValue.path(storage_path)},
- )
- yield Output(storage_path, "merged_data_path")
-
-
- @op(
- required_resource_keys={"runtime_api_site_client"},
- )
- def run_etl(context, merged_data_path: str):
- context.log.info("metadata-translation: running `make run-etl`")
- if not os.path.exists(merged_data_path):
- raise Failure(description=f"merged_db not present at {merged_data_path}")
- run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
- storage_path = (
- "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
- )
- with ZipFile(storage_path) as zf:
- name = zf.namelist()[0]
- with zf.open(name) as f:
- rv = json.load(f)
- context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
- yield AssetMaterialization(
- asset_key=AssetKey(["gold_translation", "database.json.zip"]),
- description="output of metadata-translation run_etl",
- metadata={
- "path": MetadataValue.path(storage_path),
- },
- )
- yield Output({"storage_path": storage_path})
-
-
  @op(required_resource_keys={"mongo"})
  def get_operation(context):
  mdb = context.resources.mongo.db
@@ -476,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):

  @op(required_resource_keys={"runtime_api_site_client"})
  def get_json_in(context):
+ """
+ TODO: Document this function.
+ """
  object_id = context.op_config.get("object_id")
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
  rv = client.get_object_bytes(object_id)
@@ -486,63 +382,17 @@ def get_json_in(context):
  return rv.json()


- def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
- """Does not ensure ordering of `docs`."""
-
- if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
- return docs, 0
-
- do_docs = docs["data_object_set"]
-
- class FileTypeEnumBase(BaseModel):
- name: str
- description: str
- filter: str # JSON-encoded data_object_set mongo collection filter document
-
- class FileTypeEnum(FileTypeEnumBase):
- id: str
-
- temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
- temp_collection = mdb[temp_collection_name]
- temp_collection.insert_many(do_docs)
- temp_collection.create_index("id")
-
- def fte_matches(fte_filter: str):
- return [
- dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
- ]
-
- do_docs_map = {d["id"]: d for d in do_docs}
-
- n_docs_with_types_added = 0
-
- for fte_doc in mdb.file_type_enum.find():
- fte = FileTypeEnum(**fte_doc)
- docs_matching = fte_matches(fte.filter)
- for doc in docs_matching:
- if "data_object_type" not in doc:
- do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
- n_docs_with_types_added += 1
-
- mdb.drop_collection(temp_collection_name)
- return (
- assoc(
- docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
- ),
- n_docs_with_types_added,
- )
-
-
  @op(required_resource_keys={"runtime_api_site_client", "mongo"})
  def perform_mongo_updates(context, json_in):
+ """
+ TODO: Document this function.
+ """
  mongo = context.resources.mongo
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
  op_id = context.op_config.get("operation_id")

  docs = json_in
  docs, _ = specialize_activity_set_docs(docs)
- docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
- context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
  context.log.debug(f"{docs}")

  rv = validate_json(
@@ -551,29 +401,54 @@ def perform_mongo_updates(context, json_in):
  if rv["result"] == "errors":
  raise Failure(str(rv["detail"]))

- coll_has_id_index = collection_indexed_on_id(mongo.db)
- if all(coll_has_id_index[coll] for coll in docs.keys()):
+ # TODO containing op `perform_mongo_updates` needs test coverage, as below line had trivial bug.
+ # ref: https://github.com/microbiomedata/nmdc-runtime/issues/631
+ add_docs_result = _add_schema_docs_with_or_without_replacement(mongo, docs)
+ op_patch = UpdateOperationRequest(
+ done=True,
+ result=add_docs_result,
+ metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
+ )
+ op_doc = client.update_operation(op_id, op_patch).json()
+ return ["/operations/" + op_doc["id"]]
+
+
+ def _add_schema_docs_with_or_without_replacement(
+ mongo: MongoDBResource, docs: Dict[str, list]
+ ):
+ """
+ TODO: Document this function.
+ """
+ coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
+ if all(coll_index_on_id_map[coll] for coll in docs.keys()):
  replace = True
- elif all(not coll_has_id_index[coll] for coll in docs.keys()):
+ elif all(not coll_index_on_id_map[coll] for coll in docs.keys()):
+ # FIXME: XXX: This is a hack because e.g. <https://w3id.org/nmdc/FunctionalAnnotationAggMember>
+ # documents should be unique with compound key (metagenome_annotation_id, gene_function_id)
+ # and yet this is not explicit in the schema. One potential solution is to auto-generate an `id`
+ # as a deterministic hash of the compound key.
+ #
+ # For now, decision is to potentially re-insert "duplicate" documents, i.e. to interpret
+ # lack of `id` as lack of unique document identity for de-duplication.
  replace = False # wasting time trying to upsert by `id`.
  else:
  colls_not_id_indexed = [
- coll for coll in docs.keys() if not coll_has_id_index[coll]
+ coll for coll in docs.keys() if not coll_index_on_id_map[coll]
  ]
- colls_id_indexed = [coll for coll in docs.keys() if coll_has_id_index[coll]]
+ colls_id_indexed = [coll for coll in docs.keys() if coll_index_on_id_map[coll]]
  raise Failure(
  "Simultaneous addition of non-`id`ed collections and `id`-ed collections"
  " is not supported at this time."
  f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
  )
  op_result = mongo.add_docs(docs, validate=False, replace=replace)
- op_patch = UpdateOperationRequest(
- done=True,
- result=mongo_add_docs_result_as_dict(op_result),
- metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
- )
- op_doc = client.update_operation(op_id, op_patch).json()
- return ["/operations/" + op_doc["id"]]
+
+ # Translate the operation result into a dictionary in which each item's key is a collection name
+ # and each item's value is the corresponding bulk API result (excluding the "upserted" field).
+ return {
+ collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
+ for collection_name, bulk_write_result in op_result.items()
+ }


  @op(required_resource_keys={"mongo"})
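For orientation, the per-collection value returned by `_add_schema_docs_with_or_without_replacement` above (and recorded on the operation) is pymongo's `bulk_api_result` dictionary with the `upserted` list removed; an illustrative, made-up example of the returned mapping:

    {
        "biosample_set": {
            "writeErrors": [],
            "writeConcernErrors": [],
            "nInserted": 0,
            "nUpserted": 25,  # hypothetical counts
            "nMatched": 0,
            "nModified": 0,
            "nRemoved": 0,
        }
    }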
@@ -589,9 +464,32 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
  context.log.info(f"No NMDC RunEvent doc for Dagster Run {context.run_id}")


- @op(config_schema={"study_id": str})
- def get_gold_study_pipeline_inputs(context: OpExecutionContext) -> str:
- return context.op_config["study_id"]
+ @op(
+ config_schema={
+ "study_id": str,
+ "study_type": str,
+ "gold_nmdc_instrument_mapping_file_url": str,
+ "include_field_site_info": bool,
+ "enable_biosample_filtering": bool,
+ },
+ out={
+ "study_id": Out(str),
+ "study_type": Out(str),
+ "gold_nmdc_instrument_mapping_file_url": Out(str),
+ "include_field_site_info": Out(bool),
+ "enable_biosample_filtering": Out(bool),
+ },
+ )
+ def get_gold_study_pipeline_inputs(
+ context: OpExecutionContext,
+ ) -> Tuple[str, str, str, bool, bool]:
+ return (
+ context.op_config["study_id"],
+ context.op_config["study_type"],
+ context.op_config["gold_nmdc_instrument_mapping_file_url"],
+ context.op_config["include_field_site_info"],
+ context.op_config["enable_biosample_filtering"],
+ )


  @op(required_resource_keys={"gold_api_client"})
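With the expanded config schema above, a Dagster run config for `get_gold_study_pipeline_inputs` now supplies five keys; a minimal sketch with placeholder values:

    run_config = {
        "ops": {
            "get_gold_study_pipeline_inputs": {
                "config": {
                    "study_id": "Gs0000000",  # placeholder GOLD study ID
                    "study_type": "research_study",
                    "gold_nmdc_instrument_mapping_file_url": "https://example.org/gold_instrument_map.tsv",
                    "include_field_site_info": False,
                    "enable_biosample_filtering": True,
                }
            }
        }
    }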
@@ -628,9 +526,13 @@ def gold_study(context: OpExecutionContext, study_id: str) -> Dict[str, Any]:
  def nmdc_schema_database_from_gold_study(
  context: OpExecutionContext,
  study: Dict[str, Any],
+ study_type: str,
  projects: List[Dict[str, Any]],
  biosamples: List[Dict[str, Any]],
  analysis_projects: List[Dict[str, Any]],
+ gold_nmdc_instrument_map_df: pd.DataFrame,
+ include_field_site_info: bool,
+ enable_biosample_filtering: bool,
  ) -> nmdc.Database:
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

@@ -639,34 +541,54 @@ def nmdc_schema_database_from_gold_study(
  return response.json()

  translator = GoldStudyTranslator(
- study, biosamples, projects, analysis_projects, id_minter=id_minter
+ study,
+ study_type,
+ biosamples,
+ projects,
+ analysis_projects,
+ gold_nmdc_instrument_map_df,
+ include_field_site_info,
+ enable_biosample_filtering,
+ id_minter=id_minter,
  )
  database = translator.get_database()
  return database


  @op(
+ required_resource_keys={"mongo"},
  out={
  "submission_id": Out(),
- "omics_processing_mapping_file_url": Out(Optional[str]),
+ "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
  "data_object_mapping_file_url": Out(Optional[str]),
  "biosample_extras_file_url": Out(Optional[str]),
  "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
+ "study_id": Out(Optional[str]),
  },
  )
  def get_submission_portal_pipeline_inputs(
+ context: OpExecutionContext,
  submission_id: str,
- omics_processing_mapping_file_url: Optional[str],
+ nucleotide_sequencing_mapping_file_url: Optional[str],
  data_object_mapping_file_url: Optional[str],
  biosample_extras_file_url: Optional[str],
  biosample_extras_slot_mapping_file_url: Optional[str],
- ) -> Tuple[str, str | None, str | None, str | None, str | None]:
+ study_id: Optional[str],
+ ) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
+ # query for studies matching the ID to see if it eists
+ if study_id:
+ mdb = context.resources.mongo.db
+ result = mdb.study_set.find_one({"id": study_id})
+ if not result:
+ raise Exception(f"Study id: {study_id} does not exist in Mongo.")
+
  return (
  submission_id,
- omics_processing_mapping_file_url,
+ nucleotide_sequencing_mapping_file_url,
  data_object_mapping_file_url,
  biosample_extras_file_url,
  biosample_extras_slot_mapping_file_url,
+ study_id,
  )


@@ -684,15 +606,14 @@ def fetch_nmdc_portal_submission_by_id(
  def translate_portal_submission_to_nmdc_schema_database(
  context: OpExecutionContext,
  metadata_submission: Dict[str, Any],
- omics_processing_mapping: List,
+ nucleotide_sequencing_mapping: List,
  data_object_mapping: List,
+ instrument_mapping: Dict[str, str],
  study_category: Optional[str],
- study_doi_category: Optional[str],
- study_doi_provider: Optional[str],
- study_funding_sources: Optional[List[str]],
  study_pi_image_url: Optional[str],
  biosample_extras: Optional[list[dict]],
  biosample_extras_slot_mapping: Optional[list[dict]],
+ study_id: Optional[str],
  ) -> nmdc.Database:
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

@@ -702,21 +623,45 @@ def translate_portal_submission_to_nmdc_schema_database(

  translator = SubmissionPortalTranslator(
  metadata_submission,
- omics_processing_mapping,
- data_object_mapping,
+ nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
+ data_object_mapping=data_object_mapping,
  id_minter=id_minter,
  study_category=study_category,
- study_doi_category=study_doi_category,
- study_doi_provider=study_doi_provider,
- study_funding_sources=study_funding_sources,
  study_pi_image_url=study_pi_image_url,
  biosample_extras=biosample_extras,
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
+ illumina_instrument_mapping=instrument_mapping,
+ study_id=study_id,
  )
  database = translator.get_database()
  return database


+ @op(required_resource_keys={"nmdc_portal_api_client"})
+ def add_public_image_urls(
+ context: OpExecutionContext, database: nmdc.Database, submission_id: str
+ ) -> nmdc.Database:
+ client: NmdcPortalApiClient = context.resources.nmdc_portal_api_client
+
+ if len(database.study_set) != 1:
+ raise Failure(
+ description="Expected exactly one study in the database to add public image URLs."
+ )
+
+ study_id = database.study_set[0].id
+ public_images = client.make_submission_images_public(
+ submission_id, study_id=study_id
+ )
+ SubmissionPortalTranslator.set_study_images(
+ database.study_set[0],
+ public_images.get("pi_image_url"),
+ public_images.get("primary_study_image_url"),
+ public_images.get("study_image_urls"),
+ )
+
+ return database
+
+
  @op
  def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
  source_id = None
@@ -729,7 +674,7 @@ def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:

  @op
  def nmdc_schema_object_to_dict(object: YAMLRoot) -> Dict[str, Any]:
- return json_dumper.to_dict(object)
+ return as_simple_dict(object)


  @op(required_resource_keys={"mongo"}, config_schema={"username": str})
@@ -765,6 +710,33 @@ def export_json_to_drs(
  return ["/objects/" + drs_object["id"]]


+ @op(
+ description="NCBI Submission XML file rendered in a Dagster Asset",
+ out=Out(description="XML content rendered through Dagit UI"),
+ )
+ def ncbi_submission_xml_asset(context: OpExecutionContext, data: str):
+ filename = "ncbi_submission.xml"
+ file_path = os.path.join(context.instance.storage_directory(), filename)
+
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+ with open(file_path, "w") as f:
+ f.write(data)
+
+ context.log_event(
+ AssetMaterialization(
+ asset_key="ncbi_submission_xml",
+ description="NCBI Submission XML Data",
+ metadata={
+ "file_path": MetadataValue.path(file_path),
+ "xml": MetadataValue.text(data),
+ },
+ )
+ )
+
+ return Output(data)
+
+
  def unique_field_values(docs: List[Dict[str, Any]], field: str):
  return {doc[field] for doc in docs if field in doc}

@@ -784,6 +756,11 @@ def get_neon_pipeline_benthic_data_product(context: OpExecutionContext) -> dict:
  return context.op_config["benthic_data_product"]


+ @op(config_schema={"surface_water_data_product": dict})
+ def get_neon_pipeline_surface_water_data_product(context: OpExecutionContext) -> dict:
+ return context.op_config["surface_water_data_product"]
+
+
  @op(required_resource_keys={"neon_api_client"})
  def neon_data_by_product(
  context: OpExecutionContext, data_product: dict
@@ -817,6 +794,7 @@ def nmdc_schema_database_from_neon_soil_data(
  sls_data: Dict[str, pd.DataFrame],
  neon_envo_mappings_file: pd.DataFrame,
  neon_raw_data_file_mappings_file: pd.DataFrame,
+ neon_nmdc_instrument_mapping_file: pd.DataFrame,
  ) -> nmdc.Database:
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

@@ -829,6 +807,7 @@
  sls_data,
  neon_envo_mappings_file,
  neon_raw_data_file_mappings_file,
+ neon_nmdc_instrument_mapping_file,
  id_minter=id_minter,
  )

@@ -843,6 +822,7 @@ def nmdc_schema_database_from_neon_benthic_data(
  site_code_mapping: Dict[str, str],
  neon_envo_mappings_file: pd.DataFrame,
  neon_raw_data_file_mappings_file: pd.DataFrame,
+ neon_nmdc_instrument_mapping_file: pd.DataFrame,
  ) -> nmdc.Database:
  client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

@@ -855,6 +835,35 @@ def nmdc_schema_database_from_neon_benthic_data(
  site_code_mapping,
  neon_envo_mappings_file,
  neon_raw_data_file_mappings_file,
+ neon_nmdc_instrument_mapping_file,
+ id_minter=id_minter,
+ )
+
+ database = translator.get_database()
+ return database
+
+
+ @op(required_resource_keys={"runtime_api_site_client"})
+ def nmdc_schema_database_from_neon_surface_water_data(
+ context: OpExecutionContext,
+ surface_water_data: Dict[str, pd.DataFrame],
+ site_code_mapping: Dict[str, str],
+ neon_envo_mappings_file: pd.DataFrame,
+ neon_raw_data_file_mappings_file: pd.DataFrame,
+ neon_nmdc_instrument_mapping_file: pd.DataFrame,
+ ) -> nmdc.Database:
+ client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
+
+ def id_minter(*args, **kwargs):
+ response = client.mint_id(*args, **kwargs)
+ return response.json()
+
+ translator = NeonSurfaceWaterDataTranslator(
+ surface_water_data,
+ site_code_mapping,
+ neon_envo_mappings_file,
+ neon_raw_data_file_mappings_file,
+ neon_nmdc_instrument_mapping_file,
  id_minter=id_minter,
  )

@@ -866,15 +875,18 @@ def nmdc_schema_database_from_neon_benthic_data(
  out={
  "neon_envo_mappings_file_url": Out(),
  "neon_raw_data_file_mappings_file_url": Out(),
+ "neon_nmdc_instrument_mapping_file_url": Out(),
  }
  )
  def get_neon_pipeline_inputs(
  neon_envo_mappings_file_url: str,
  neon_raw_data_file_mappings_file_url: str,
- ) -> Tuple[str, str]:
+ neon_nmdc_instrument_mapping_file_url: str,
+ ) -> Tuple[str, str, str]:
  return (
  neon_envo_mappings_file_url,
  neon_raw_data_file_mappings_file_url,
+ neon_nmdc_instrument_mapping_file_url,
  )

@@ -943,3 +955,769 @@ def site_code_mapping() -> dict:
  raise Exception(
  f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}"
  )
+
+
+ @op(
+ required_resource_keys={"mongo"},
+ config_schema={
+ "source_ontology": str,
+ "output_directory": Field(Noneable(str), default_value=None, is_required=False),
+ "generate_reports": Field(bool, default_value=True, is_required=False),
+ },
+ )
+ def load_ontology(context: OpExecutionContext):
+ cfg = context.op_config
+ source_ontology = cfg["source_ontology"]
+ output_directory = cfg.get("output_directory")
+ generate_reports = cfg.get("generate_reports", True)
+
+ if output_directory is None:
+ output_directory = os.path.join(os.getcwd(), "ontology_reports")
+
+ # Redirect Python logging to Dagster context
+ handler = logging.Handler()
+ handler.emit = lambda record: context.log.info(record.getMessage())
+
+ # Get logger from ontology-loader package
+ controller_logger = logging.getLogger("ontology_loader.ontology_load_controller")
+ controller_logger.setLevel(logging.INFO)
+ controller_logger.addHandler(handler)
+
+ context.log.info(f"Running Ontology Loader for ontology: {source_ontology}")
+ loader = OntologyLoaderController(
+ source_ontology=source_ontology,
+ output_directory=output_directory,
+ generate_reports=generate_reports,
+ mongo_client=context.resources.mongo.client,
+ db_name=context.resources.mongo.db.name,
+ )
+
+ loader.run_ontology_loader()
+ context.log.info(f"Ontology load for {source_ontology} completed successfully!")
+
+
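As a quick reference, an illustrative run config for the new `load_ontology` op (values are placeholders; `output_directory` may be omitted to default to `./ontology_reports`):

    run_config = {
        "ops": {
            "load_ontology": {
                "config": {
                    "source_ontology": "envo",  # placeholder ontology name
                    "generate_reports": True,
                }
            }
        }
    }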
+ def _add_linked_instances_to_alldocs(
+ temp_collection: MongoCollection,
+ context: OpExecutionContext,
+ document_reference_ranged_slots_by_type: dict,
+ ) -> None:
+ """
+ Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
+
+ The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+ Each subdocument represents a link to another document that either links to or is linked from the document via
+ document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+ document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+ considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
+
+ Args:
+ temp_collection: The temporary MongoDB collection to process
+ context: The Dagster execution context for logging
+ document_reference_ranged_slots_by_type: Dictionary mapping document types to their reference-ranged slot names
+
+ Returns:
+ None (modifies the documents in place)
+ """
+
+ context.log.info(
+ "Building relationships and adding `_upstream` and `_downstream` fields..."
+ )
+
+ # document ID -> type (with "nmdc:" prefix preserved)
+ id_to_type_map: Dict[str, str] = {}
+
+ # set of (<referencing document ID>, <slot>, <referenced document ID>) 3-tuples.
+ relationship_triples: Set[Tuple[str, str, str]] = set()
+
+ # Collect relationship triples.
+ for doc in temp_collection.find():
+ doc_id = doc["id"]
+ # Store the full type with prefix intact
+ doc_type = doc["type"]
+ # For looking up reference slots, we still need the type without prefix
+ doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
+
+ # Record ID to type mapping - preserve the original type with prefix
+ id_to_type_map[doc_id] = doc_type
+
+ # Find all document references from this document
+ reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
+ for slot in reference_slots:
+ if slot in doc:
+ # Handle both single-value and array references
+ refs = doc[slot] if isinstance(doc[slot], list) else [doc[slot]]
+ for ref_doc in temp_collection.find(
+ {"id": {"$in": refs}}, ["id", "type"]
+ ):
+ id_to_type_map[ref_doc["id"]] = ref_doc["type"]
+ for ref_id in refs:
+ relationship_triples.add((doc_id, slot, ref_id))
+
+ context.log.info(
+ f"Found {len(id_to_type_map)} documents, with "
+ f"{len({d for (d, _, _) in relationship_triples})} containing references"
+ )
+
+ # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
+ # in order to perform graph traversal and collect all entities "related" to a given entity without
+ # recursion "exploding".
+ #
+ # Note: We are hard-coding this "direction" information here in the Runtime
+ # because the NMDC schema does not currently contain or expose it.
+ #
+ # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+ upstream_document_reference_ranged_slots = [
+ "associated_studies", # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+ "collected_from", # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+ "has_chromatography_configuration", # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+ "has_input", # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+ "has_mass_spectrometry_configuration", # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+ "instrument_used", # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+ "part_of", # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+ "was_generated_by", # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+ "was_informed_by", # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
+ ]
+ # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+ downstream_document_reference_ranged_slots = [
+ "calibration_object", # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+ "generates_calibration", # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+ "has_output", # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+ "in_manifest", # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+ "uses_calibration", # when a `nmdc:CalibrationInformation`is part of a `nmdc:PlannedProcess`.
+ # Note: I don't think of superseding something as being either upstream or downstream of that thing;
+ # but this function requires every document-reference-ranged slot to be accounted for in one
+ # list or the other, and the superseding thing does arise _later_ than the thing it supersedes,
+ # so I have opted to treat the superseding thing as being downstream.
+ "superseded_by", # when a `nmdc:WorkflowExecution` or `nmdc:DataObject` is superseded by a `nmdc:WorkflowExecution`.
+ ]
+
+ unique_document_reference_ranged_slot_names = set()
+ for slot_names in document_reference_ranged_slots_by_type.values():
+ for slot_name in slot_names:
+ unique_document_reference_ranged_slot_names.add(slot_name)
+ context.log.info(f"{unique_document_reference_ranged_slot_names=}")
+ if len(upstream_document_reference_ranged_slots) + len(
+ downstream_document_reference_ranged_slots
+ ) != len(unique_document_reference_ranged_slot_names):
+ raise Failure(
+ "Number of detected unique document-reference-ranged slot names does not match "
+ "sum of accounted-for upstream and downstream document-reference-ranged slot names."
+ )
+
+ # Construct, and update documents with, `_upstream` and `_downstream` field values.
+ #
+ # manage batching of MongoDB `bulk_write` operations
+ bulk_operations, update_count = [], 0
+ for doc_id, slot, ref_id in relationship_triples:
+
+ # Determine in which respective fields to push this relationship
+ # for the subject (doc) and object (ref) of this triple.
+ if slot in upstream_document_reference_ranged_slots:
+ field_for_doc, field_for_ref = "_upstream", "_downstream"
+ elif slot in downstream_document_reference_ranged_slots:
+ field_for_doc, field_for_ref = "_downstream", "_upstream"
+ else:
+ raise Failure(f"Unknown slot {slot} for document {doc_id}")
+
+ updates = [
+ {
+ "filter": {"id": doc_id},
+ "update": {
+ "$push": {
+ field_for_doc: {
+ "id": ref_id,
+ # TODO existing tests are failing due to `KeyError`s for `id_to_type_map.get[ref_id]` here,
+ # which acts as an implicit referential integrity checker (!). Using `.get` with
+ # "nmdc:NamedThing" as default in order to (for now) allow such tests to continue to pass.
+ "type": id_to_type_map.get(ref_id, "nmdc:NamedThing"),
+ }
+ }
+ },
+ },
+ {
+ "filter": {"id": ref_id},
+ "update": {
+ "$push": {
+ field_for_ref: {"id": doc_id, "type": id_to_type_map[doc_id]}
+ }
+ },
+ },
+ ]
+ for update in updates:
+ bulk_operations.append(UpdateOne(**update))
+
+ # Execute in batches for efficiency
+ if len(bulk_operations) >= BULK_WRITE_BATCH_SIZE:
+ temp_collection.bulk_write(bulk_operations)
+ update_count += len(bulk_operations)
+ context.log.info(
+ f"Pushed {update_count/(2*len(relationship_triples)):.1%} of updates so far..."
+ )
+ bulk_operations = []
+
+ # Execute any remaining operations
+ if bulk_operations:
+ temp_collection.bulk_write(bulk_operations)
+ update_count += len(bulk_operations)
+
+ context.log.info(f"Pushed {update_count} updates in total")
+
+
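To make the resulting document shape concrete, a hypothetical `alldocs` entry after this step might look like the following (IDs, types, and ancestry are illustrative), and queries such as the one shown can then use the indexes created below:

    {
        "id": "nmdc:bsm-11-abc123",
        "type": "nmdc:Biosample",
        "_type_and_ancestors": ["nmdc:Biosample", "nmdc:Sample", "nmdc:MaterialEntity", "nmdc:NamedThing"],
        "_upstream": [{"id": "nmdc:sty-11-xyz789", "type": "nmdc:Study"}],
        "_downstream": [{"id": "nmdc:omprc-11-def456", "type": "nmdc:NucleotideSequencing"}],
    }

    # e.g., all Biosamples directly downstream of a given (hypothetical) study:
    mdb.alldocs.find({"_upstream.id": "nmdc:sty-11-xyz789", "_type_and_ancestors": "nmdc:Biosample"})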
+ # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
+ # pass an argument to the op (in order to specify the order of the ops in the graph)
+ # while also telling Dagster that this op doesn't need the _value_ of that argument.
+ # This is the approach shown on: https://docs.dagster.io/api/dagster/types#dagster.Nothing
+ # Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
+ #
+ @op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
+ def materialize_alldocs(context: OpExecutionContext) -> int:
+ """
+ This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
+
+ 1. Getting all populated schema collection names with an `id` field.
+ 2. Create a temporary collection to build the new alldocs collection.
+ 3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
+ 4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
+ 5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+ 6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
+ 7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
+
+ The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
+ `nmdc_runtime.site.repository.ensure_alldocs_daily`. The collection is also updated as part of various workflows,
+ such as when applying a changesheet or metadata updates (see `nmdc_runtime.site.graphs`).
+
+ The `alldocs` collection is used primarily by API endpoints like `/data_objects/study/{study_id}` and
+ `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
+ related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
+
+ The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
+ that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
+ expansions.
+ """
+ mdb = context.resources.mongo.db
+ schema_view = nmdc_schema_view()
+
+ # TODO include functional_annotation_agg for "real-time" ref integrity checking.
+ # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
+ collection_names = populated_schema_collection_names_with_id_field(mdb)
+ context.log.info(f"constructing `alldocs` collection using {collection_names=}")
+
+ document_class_names = set(
+ chain.from_iterable(collection_name_to_class_names.values())
+ )
+
+ cls_slot_map = {
+ cls_name: {
+ slot.name: slot for slot in schema_view.class_induced_slots(cls_name)
+ }
+ for cls_name in document_class_names
+ }
+
+ # Any ancestor of a document class is a document-referencable range,
+ # i.e., a valid range of a document-reference-ranged slot.
+ document_referenceable_ranges = set(
+ chain.from_iterable(
+ schema_view.class_ancestors(cls_name) for cls_name in document_class_names
+ )
+ )
+
+ document_reference_ranged_slots_by_type = defaultdict(list)
+ for cls_name, slot_map in cls_slot_map.items():
+ for slot_name, slot in slot_map.items():
+ if (
+ set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
+ & document_referenceable_ranges
+ ):
+ document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
+ slot_name
+ )
+
+ # Build `alldocs` to a temporary collection for atomic replacement
+ # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
+ temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
+ temp_alldocs_collection = mdb[temp_alldocs_collection_name]
+ context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
+
+ for coll_name in collection_names:
+ context.log.info(f"{coll_name=}")
+ write_operations = []
+ documents_processed_counter = 0
+ for doc in mdb[coll_name].find():
+ try:
+ # Keep the full type with prefix for document
+ doc_type_full = doc["type"]
+ # Remove prefix for slot lookup and ancestor lookup
+ doc_type = doc_type_full.removeprefix("nmdc:")
+ except KeyError:
+ raise Exception(
+ f"doc {doc['id']} in collection {coll_name} has no 'type'!"
+ )
+ slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
+ doc_type_full
+ ]
+ new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
+
+ # Get ancestors without the prefix, but add prefix to each one in the output
+ new_doc["_type_and_ancestors"] = [
+ f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
+ ]
+ # InsertOne is a pymongo representation of a mongo command.
+ write_operations.append(InsertOne(new_doc))
+ if len(write_operations) == BULK_WRITE_BATCH_SIZE:
+ _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+ write_operations.clear()
+ documents_processed_counter += BULK_WRITE_BATCH_SIZE
+ if len(write_operations) > 0:
+ # here bulk_write is a method on the pymongo db Collection class
+ _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+ documents_processed_counter += len(write_operations)
+ context.log.info(
+ f"Inserted {documents_processed_counter} documents from {coll_name=} "
+ )
+
+ context.log.info(
+ f"produced `{temp_alldocs_collection.name}` collection with"
+ f" {temp_alldocs_collection.estimated_document_count()} docs."
+ )
+
+ context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
+ # Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
+ # so that `temp_alldocs_collection` will be "good to go" on renaming.
+ temp_alldocs_collection.create_index("id", unique=True)
+ # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
+ slots_to_index = {"_type_and_ancestors"} | {
+ slot
+ for slots in document_reference_ranged_slots_by_type.values()
+ for slot in slots
+ }
+ [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
+ context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
+
+ # Add related-ids fields to enable efficient relationship traversal
+ context.log.info("Adding fields for related ids to documents...")
+ _add_linked_instances_to_alldocs(
+ temp_alldocs_collection, context, document_reference_ranged_slots_by_type
+ )
+ context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+ temp_alldocs_collection.create_index("_upstream.id")
+ temp_alldocs_collection.create_index("_downstream.id")
+ # Create compound indexes to ensure index-covered queries
+ temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+ temp_alldocs_collection.create_index(
+ [("_downstream.type", 1), ("_downstream.id", 1)]
+ )
+ context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
+
+ context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
+ temp_alldocs_collection.rename("alldocs", dropTarget=True)
+ n_alldocs_documents = mdb.alldocs.estimated_document_count()
+ context.log.info(
+ f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
+ )
+ return n_alldocs_documents
+
+
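To illustrate the "Nothing dependency" noted above, a graph can order `materialize_alldocs` after another op without passing it a value; a minimal sketch (the upstream op here is hypothetical):

    from dagster import Nothing, Out, graph, op

    @op(out=Out(Nothing))
    def seed_metadata(context):
        context.log.info("hypothetical upstream step; only its completion matters")

    @graph
    def rebuild_alldocs_after_seed():
        # Wiring the Nothing-typed output into `waits_for` orders the ops without
        # handing `materialize_alldocs` an actual value.
        materialize_alldocs(waits_for=seed_metadata())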
+ @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
+ def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
+ nmdc_study = find_study_by_id(
+ context.op_config["nmdc_study_id"], context.resources.mongo.db
+ )
+ return nmdc_study
+
+
+ @op(
+ config_schema={
+ "nmdc_ncbi_attribute_mapping_file_url": str,
+ "ncbi_submission_metadata": Field(
+ Permissive(
+ {
+ "organization": String,
+ }
+ ),
+ is_required=True,
+ description="General metadata about the NCBI submission.",
+ ),
+ "ncbi_biosample_metadata": Field(
+ Permissive(
+ {
+ "organism_name": String,
+ }
+ ),
+ is_required=True,
+ description="Metadata for one or many NCBI BioSample in the Submission.",
+ ),
+ },
+ out=Out(Dict),
+ )
+ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
+ nmdc_ncbi_attribute_mapping_file_url = context.op_config[
+ "nmdc_ncbi_attribute_mapping_file_url"
+ ]
+ ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {})
+ ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {})
+
+ return {
+ "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url,
+ "ncbi_submission_metadata": ncbi_submission_metadata,
+ "ncbi_biosample_metadata": ncbi_biosample_metadata,
+ }
+
+
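For orientation, an illustrative run config for `get_ncbi_export_pipeline_inputs` (all values are placeholders; `Permissive` allows additional keys beyond those shown):

    run_config = {
        "ops": {
            "get_ncbi_export_pipeline_inputs": {
                "config": {
                    "nmdc_ncbi_attribute_mapping_file_url": "https://example.org/nmdc_ncbi_attribute_map.tsv",
                    "ncbi_submission_metadata": {"organization": "Example Organization"},
                    "ncbi_biosample_metadata": {"organism_name": "soil metagenome"},
                }
            }
        }
    }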
1366
+ @op(required_resource_keys={"mongo"})
+ def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
+     mdb = context.resources.mongo.db
+     alldocs_collection = mdb["alldocs"]
+     data_object_set = mdb["data_object_set"]
+     biosample_data_objects = fetch_data_objects_from_biosamples(
+         alldocs_collection, data_object_set, biosamples
+     )
+     return biosample_data_objects
+
+
+ @op(required_resource_keys={"mongo"})
+ def get_nucleotide_sequencing_from_biosamples(
+     context: OpExecutionContext, biosamples: list
+ ):
+     mdb = context.resources.mongo.db
+     alldocs_collection = mdb["alldocs"]
+     data_generation_set = mdb["data_generation_set"]
+     biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
+         alldocs_collection, data_generation_set, biosamples
+     )
+     return biosample_omics_processing
+
+
+ @op(required_resource_keys={"mongo"})
+ def get_library_preparation_from_biosamples(
+     context: OpExecutionContext, biosamples: list
+ ):
+     mdb = context.resources.mongo.db
+     alldocs_collection = mdb["alldocs"]
+     material_processing_set = mdb["material_processing_set"]
+     biosample_lib_prep = fetch_library_preparation_from_biosamples(
+         alldocs_collection, material_processing_set, biosamples
+     )
+     return biosample_lib_prep
+
+
+ @op(required_resource_keys={"mongo"})
+ def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
+     from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
+
+     mdb = context.resources.mongo.db
+     material_processing_set = mdb["material_processing_set"]
+     pooled_biosamples_data = check_pooling_for_biosamples(
+         material_processing_set, biosamples
+     )
+
+     # Fetch ProcessedSample names from database
+     processed_sample_ids = set()
+     for biosample_id, pooling_info in pooled_biosamples_data.items():
+         if pooling_info and pooling_info.get("processed_sample_id"):
+             processed_sample_ids.add(pooling_info["processed_sample_id"])
+
+     # Query database for ProcessedSample names
+     if processed_sample_ids:
+         processed_sample_set = mdb["processed_sample_set"]
+         cursor = processed_sample_set.find(
+             {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
+         )
+         processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
+
+         # Update pooled_biosamples_data with ProcessedSample names
+         for biosample_id, pooling_info in pooled_biosamples_data.items():
+             if pooling_info and pooling_info.get("processed_sample_id"):
+                 processed_sample_id = pooling_info["processed_sample_id"]
+                 if processed_sample_id in processed_samples:
+                     pooling_info["processed_sample_name"] = processed_samples[
+                         processed_sample_id
+                     ]
+
+     return pooled_biosamples_data
+
+
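The enrichment loop above relies only on each entry optionally carrying a `processed_sample_id`; a sketch of the assumed shape (ids and names are placeholders, and the exact keys returned by `check_pooling_for_biosamples` may differ):

    pooled_biosamples_data = {
        "nmdc:bsm-00-000001": {"processed_sample_id": "nmdc:procsm-00-000001"},
        "nmdc:bsm-00-000002": None,  # biosample not involved in any pooling
    }
    # After the lookup, an entry with a processed_sample_id also gains
    # "processed_sample_name": "<name of the corresponding ProcessedSample document>".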
1439
+ @op(required_resource_keys={"mongo"})
+ def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
+     mdb = context.resources.mongo.db
+     return get_instruments_by_id(mdb)
+
+
+ @op(required_resource_keys={"mongo"})
+ def get_instrument_ids_by_model(context: OpExecutionContext) -> dict[str, str]:
+     mdb = context.resources.mongo.db
+     instruments_by_id = get_instruments_by_id(mdb)
+     instruments_by_model: dict[str, str] = {}
+     for inst_id, instrument in instruments_by_id.items():
+         model = instrument.get("model")
+         if model is None:
+             context.log.warning(f"Instrument {inst_id} has no model.")
+             continue
+         if model in instruments_by_model:
+             context.log.warning(f"Instrument model {model} is not unique.")
+         instruments_by_model[model] = inst_id
+     context.log.info("Instrument models: %s", pformat(instruments_by_model))
+     return instruments_by_model
+
+
+ @op
+ def ncbi_submission_xml_from_nmdc_study(
+     context: OpExecutionContext,
+     nmdc_study: Any,
+     ncbi_exporter_metadata: dict,
+     biosamples: list,
+     omics_processing_records: list,
+     data_object_records: list,
+     library_preparation_records: list,
+     all_instruments: dict,
+     pooled_biosamples_data: dict,
+ ) -> str:
+     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
+     ncbi_xml = ncbi_exporter.get_submission_xml(
+         biosamples,
+         omics_processing_records,
+         data_object_records,
+         library_preparation_records,
+         all_instruments,
+         pooled_biosamples_data,
+     )
+     return ncbi_xml
+
+
+ @op
+ def post_submission_portal_biosample_ingest_record_stitching_filename(
+     nmdc_study_id: str,
+ ) -> str:
+     filename = nmdc_study_id_to_filename(nmdc_study_id)
+     return f"missing_database_records_for_{filename}.json"
+
+
+ @op(
+     config_schema={
+         "nmdc_study_id": str,
+         "gold_nmdc_instrument_mapping_file_url": str,
+         "include_field_site_info": bool,
+         "enable_biosample_filtering": bool,
+     },
+     out={
+         "nmdc_study_id": Out(str),
+         "gold_nmdc_instrument_mapping_file_url": Out(str),
+         "include_field_site_info": Out(bool),
+         "enable_biosample_filtering": Out(bool),
+     },
+ )
+ def get_database_updater_inputs(
+     context: OpExecutionContext,
+ ) -> Tuple[str, str, bool, bool]:
+     return (
+         context.op_config["nmdc_study_id"],
+         context.op_config["gold_nmdc_instrument_mapping_file_url"],
+         context.op_config["include_field_site_info"],
+         context.op_config["enable_biosample_filtering"],
+     )
+
+
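Because the op above declares four named outputs, invoking it inside a graph or job body yields values that can be unpacked in declaration order. A hypothetical wiring sketch; `instrument_map_df_from_url` is a made-up helper name standing in for whatever op converts the mapping-file URL into the DataFrame expected downstream:

    # Hypothetical graph body (names are illustrative only).
    (
        nmdc_study_id,
        gold_nmdc_instrument_mapping_file_url,
        include_field_site_info,
        enable_biosample_filtering,
    ) = get_database_updater_inputs()
    database = generate_data_generation_set_post_biosample_ingest(
        nmdc_study_id,
        instrument_map_df_from_url(gold_nmdc_instrument_mapping_file_url),
        include_field_site_info,
        enable_biosample_filtering,
    )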
1519
+ @op(
+     required_resource_keys={
+         "runtime_api_user_client",
+         "runtime_api_site_client",
+         "gold_api_client",
+     }
+ )
+ def generate_data_generation_set_post_biosample_ingest(
+     context: OpExecutionContext,
+     nmdc_study_id: str,
+     gold_nmdc_instrument_map_df: pd.DataFrame,
+     include_field_site_info: bool,
+     enable_biosample_filtering: bool,
+ ) -> nmdc.Database:
+     runtime_api_user_client: RuntimeApiUserClient = (
+         context.resources.runtime_api_user_client
+     )
+     runtime_api_site_client: RuntimeApiSiteClient = (
+         context.resources.runtime_api_site_client
+     )
+     gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+     database_updater = DatabaseUpdater(
+         runtime_api_user_client,
+         runtime_api_site_client,
+         gold_api_client,
+         nmdc_study_id,
+         gold_nmdc_instrument_map_df,
+         include_field_site_info,
+         enable_biosample_filtering,
+     )
+     database = (
+         database_updater.generate_data_generation_set_records_from_gold_api_for_study()
+     )
+
+     return database
+
+
+ @op(
+     required_resource_keys={
+         "runtime_api_user_client",
+         "runtime_api_site_client",
+         "gold_api_client",
+     }
+ )
+ def generate_biosample_set_for_nmdc_study_from_gold(
+     context: OpExecutionContext,
+     nmdc_study_id: str,
+     gold_nmdc_instrument_map_df: pd.DataFrame,
+     include_field_site_info: bool = False,
+     enable_biosample_filtering: bool = False,
+ ) -> nmdc.Database:
+     runtime_api_user_client: RuntimeApiUserClient = (
+         context.resources.runtime_api_user_client
+     )
+     runtime_api_site_client: RuntimeApiSiteClient = (
+         context.resources.runtime_api_site_client
+     )
+     gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+     database_updater = DatabaseUpdater(
+         runtime_api_user_client,
+         runtime_api_site_client,
+         gold_api_client,
+         nmdc_study_id,
+         gold_nmdc_instrument_map_df,
+         include_field_site_info,
+         enable_biosample_filtering,
+     )
+     database = database_updater.generate_biosample_set_from_gold_api_for_study()
+
+     return database
+
+
1593
+ @op(
+     required_resource_keys={
+         "runtime_api_user_client",
+         "runtime_api_site_client",
+         "gold_api_client",
+     },
+     out=Out(Any),
+ )
+ def run_script_to_update_insdc_biosample_identifiers(
+     context: OpExecutionContext,
+     nmdc_study_id: str,
+     gold_nmdc_instrument_map_df: pd.DataFrame,
+     include_field_site_info: bool,
+     enable_biosample_filtering: bool,
+ ):
+     """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
+
+     This op uses the DatabaseUpdater to generate a script that can be used to update biosample
+     records with INSDC identifiers obtained from GOLD.
+
+     Args:
+         context: The execution context
+         nmdc_study_id: The NMDC study ID for which to generate the update script
+         gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
+         include_field_site_info: Whether to include field site information when generating records
+         enable_biosample_filtering: Whether to enable biosample filtering when generating records
+
+     Returns:
+         A dictionary or list of dictionaries containing the MongoDB update script(s)
+     """
+     runtime_api_user_client: RuntimeApiUserClient = (
+         context.resources.runtime_api_user_client
+     )
+     runtime_api_site_client: RuntimeApiSiteClient = (
+         context.resources.runtime_api_site_client
+     )
+     gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+     database_updater = DatabaseUpdater(
+         runtime_api_user_client,
+         runtime_api_site_client,
+         gold_api_client,
+         nmdc_study_id,
+         gold_nmdc_instrument_map_df,
+         include_field_site_info,
+         enable_biosample_filtering,
+     )
+     update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
+
+     if isinstance(update_script, list):
+         total_updates = sum(len(item.get("updates", [])) for item in update_script)
+     else:
+         total_updates = len(update_script.get("updates", []))
+     context.log.info(
+         f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
+     )
+
+     return update_script
+
+
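The tally above only assumes each generated script exposes an `updates` list. One shape that would satisfy it, written in the style of MongoDB's `update` command (collection name, ids, and values are placeholders, not actual DatabaseUpdater output):

    update_script = {
        "update": "biosample_set",
        "updates": [
            {
                "q": {"id": "nmdc:bsm-00-000001"},
                "u": {"$addToSet": {"insdc_biosample_identifiers": "biosample:SAMN00000000"}},
            },
        ],
    }
    # Not a list, so total_updates == len(update_script.get("updates", [])) == 1.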
1651
+ @op
+ def log_database_ids(
+     context: OpExecutionContext,
+     database: nmdc.Database,
+ ) -> None:
+     """Log the IDs of the database."""
+     database_dict = as_simple_dict(database)
+     message = ""
+     for collection_name, collection in database_dict.items():
+         if not isinstance(collection, list):
+             continue
+         message += f"{collection_name} ({len(collection)}):\n"
+         if len(collection) < 10:
+             message += "\n".join(f" {doc['id']}" for doc in collection)
+         else:
+             message += "\n".join(f" {doc['id']}" for doc in collection[:4])
+             message += f"\n ... {len(collection) - 8} more\n"
+             message += "\n".join(f" {doc['id']}" for doc in collection[-4:])
+         message += "\n"
+     if message:
+         context.log.info(message)
+
+
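To make the truncation above concrete: a collection holding 12 documents is logged as its first four ids, a "... 4 more" marker (12 - 8), and its last four ids, roughly as follows (ids are placeholders):

    biosample_set (12):
     nmdc:bsm-00-000001
     nmdc:bsm-00-000002
     nmdc:bsm-00-000003
     nmdc:bsm-00-000004
     ... 4 more
     nmdc:bsm-00-000009
     nmdc:bsm-00-000010
     nmdc:bsm-00-000011
     nmdc:bsm-00-000012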
1674
+ @op(
+     description="Render free text through the Dagit UI",
+     out=Out(description="Text content rendered through Dagit UI"),
+ )
+ def render_text(context: OpExecutionContext, text: Any):
+     """
+     Renders content as a Dagster Asset in the Dagit UI.
+
+     This operation creates a Dagster Asset with the provided content, making it
+     visible in the Dagit UI for easy viewing and sharing.
+
+     Args:
+         context: The execution context
+         text: The content to render (can be a string or a dictionary that will be converted to JSON)
+
+     Returns:
+         The same content that was provided as input
+     """
+     # Convert dictionary to formatted JSON string if needed
+     if isinstance(text, dict):
+         import json
+
+         content = json.dumps(text, indent=2)
+         file_extension = "json"
+         hash_text = json.dumps(text, sort_keys=True)[:20]  # For consistent hashing
+     else:
+         content = str(text)  # Convert to string in case it's not already
+         file_extension = "txt"
+         hash_text = content[:20]
+
+     filename = f"rendered_text_{context.run_id}.{file_extension}"
+     file_path = os.path.join(context.instance.storage_directory(), filename)
+
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+     with open(file_path, "w") as f:
+         f.write(content)
+
+     context.log_event(
+         AssetMaterialization(
+             asset_key=f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}",
+             description="Rendered Content",
+             metadata={
+                 "file_path": MetadataValue.path(file_path),
+                 "content": MetadataValue.text(content),
+             },
+         )
+     )
+
+     return Output(text)
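To make the dict branch above concrete, a short sketch of what is written and how the asset key is derived, assuming `hash_from_str(s, "md5")` returns a hex digest of `s` as its use here implies:

    import json

    payload = {"b": 2, "a": 1}
    content = json.dumps(payload, indent=2)                # written to rendered_text_<run_id>.json
    hash_text = json.dumps(payload, sort_keys=True)[:20]   # stable regardless of key order
    # The asset key becomes f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}".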