nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (131)
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/ops.py CHANGED
@@ -1,22 +1,21 @@
 import csv
 import json
 import logging
-import mimetypes
 import os
 import subprocess
-import tempfile
 from collections import defaultdict
 from datetime import datetime, timezone
-from io import BytesIO, StringIO
+from io import BytesIO
 from pprint import pformat
 from toolz.dicttoolz import keyfilter
-from typing import Tuple, Set, Union
+from typing import Tuple, Set
 from zipfile import ZipFile
 from itertools import chain
 from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests
-
+from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
+from toolz import dissoc
 
 from bson import ObjectId, json_util
 from dagster import (
@@ -44,7 +43,7 @@ from dagster import (
 from gridfs import GridFS
 from linkml_runtime.utils.dictutils import as_simple_dict
 from linkml_runtime.utils.yamlutils import YAMLRoot
-from nmdc_runtime.api.db.mongo import get_mongo_db, validate_json
+from nmdc_runtime.api.db.mongo import validate_json
 from nmdc_runtime.api.core.idgen import generate_one_id
 from nmdc_runtime.api.core.metadata import (
     _validate_changesheet,
@@ -74,7 +73,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
 )
-from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
     GoldApiClient,
@@ -96,29 +94,23 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
 )
 from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
 from nmdc_runtime.site.util import (
-    run_and_log,
     schema_collection_has_index_on_id,
     nmdc_study_id_to_filename,
     get_instruments_by_id,
 )
 from nmdc_runtime.util import (
-    drs_object_in_for,
-    get_names_of_classes_in_effective_range_of_slot,
     pluralize,
-    put_object,
     specialize_activity_set_docs,
     collection_name_to_class_names,
-    class_hierarchy_as_list,
     nmdc_schema_view,
     populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
-from nmdc_schema.nmdc import Database as NMDCDatabase
-from pydantic import BaseModel
 from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
-from starlette import status
-from toolz import assoc, dissoc, get_in, valfilter, identity
+from pymongo.collection import Collection as MongoCollection
+from toolz import get_in, valfilter, identity
+
 
 # batch size for writing documents to alldocs
 BULK_WRITE_BATCH_SIZE = 2000
@@ -153,99 +145,6 @@ def mongo_stats(context) -> List[str]:
     return collection_names
 
 
-@op(
-    required_resource_keys={"mongo", "runtime_api_site_client"},
-    retry_policy=RetryPolicy(max_retries=2),
-)
-def local_file_to_api_object(context, file_info):
-    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-    storage_path: str = file_info["storage_path"]
-    mime_type = file_info.get("mime_type")
-    if mime_type is None:
-        mime_type = mimetypes.guess_type(storage_path)[0]
-    rv = client.put_object_in_site(
-        {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
-    )
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object_in_site failed: {rv.content}")
-    op = rv.json()
-    context.log.info(f"put_object_in_site: {op}")
-    rv = put_object(storage_path, op["metadata"]["url"])
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object failed: {rv.content}")
-    op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
-    rv = client.update_operation(op["id"], op_patch)
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description="update_operation failed")
-    op = rv.json()
-    context.log.info(f"update_operation: {op}")
-    rv = client.create_object_from_op(op)
-    if rv.status_code != status.HTTP_201_CREATED:
-        raise Failure("create_object_from_op failed")
-    obj = rv.json()
-    context.log.info(f'Created /objects/{obj["id"]}')
-    mdb = context.resources.mongo.db
-    rv = mdb.operations.delete_one({"id": op["id"]})
-    if rv.deleted_count != 1:
-        context.log.error("deleting op failed")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["object", obj["name"]]),
-        description="output of metadata-translation run_etl",
-        metadata={"object_id": MetadataValue.text(obj["id"])},
-    )
-    yield Output(obj)
-
-
-@op(
-    out={
-        "merged_data_path": Out(
-            str,
-            description="path to TSV merging of source metadata",
-        )
-    }
-)
-def build_merged_db(context) -> str:
-    context.log.info("metadata-translation: running `make build-merged-db`")
-    run_and_log(
-        "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
-    )
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
-    )
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
-        description="input to metadata-translation run_etl",
-        metadata={"path": MetadataValue.path(storage_path)},
-    )
-    yield Output(storage_path, "merged_data_path")
-
-
-@op(
-    required_resource_keys={"runtime_api_site_client"},
-)
-def run_etl(context, merged_data_path: str):
-    context.log.info("metadata-translation: running `make run-etl`")
-    if not os.path.exists(merged_data_path):
-        raise Failure(description=f"merged_db not present at {merged_data_path}")
-    run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
-    )
-    with ZipFile(storage_path) as zf:
-        name = zf.namelist()[0]
-        with zf.open(name) as f:
-            rv = json.load(f)
-    context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "database.json.zip"]),
-        description="output of metadata-translation run_etl",
-        metadata={
-            "path": MetadataValue.path(storage_path),
-        },
-    )
-    yield Output({"storage_path": storage_path})
-
-
 @op(required_resource_keys={"mongo"})
 def get_operation(context):
     mdb = context.resources.mongo.db
@@ -470,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
 
 @op(required_resource_keys={"runtime_api_site_client"})
 def get_json_in(context):
+    """
+    TODO: Document this function.
+    """
     object_id = context.op_config.get("object_id")
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     rv = client.get_object_bytes(object_id)
@@ -482,6 +384,9 @@ def get_json_in(context):
 
 @op(required_resource_keys={"runtime_api_site_client", "mongo"})
 def perform_mongo_updates(context, json_in):
+    """
+    TODO: Document this function.
+    """
     mongo = context.resources.mongo
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     op_id = context.op_config.get("operation_id")
@@ -511,6 +416,9 @@ def perform_mongo_updates(context, json_in):
 def _add_schema_docs_with_or_without_replacement(
     mongo: MongoDBResource, docs: Dict[str, list]
 ):
+    """
+    TODO: Document this function.
+    """
     coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
     if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
@@ -534,7 +442,13 @@ def _add_schema_docs_with_or_without_replacement(
         f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
     )
     op_result = mongo.add_docs(docs, validate=False, replace=replace)
-    return mongo_add_docs_result_as_dict(op_result)
+
+    # Translate the operation result into a dictionary in which each item's key is a collection name
+    # and each item's value is the corresponding bulk API result (excluding the "upserted" field).
+    return {
+        collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
+        for collection_name, bulk_write_result in op_result.items()
+    }
 
 
 @op(required_resource_keys={"mongo"})
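Note: `dissoc` (from `toolz`) returns a copy of a mapping with the named keys removed, which is how the new return value drops the verbose "upserted" array from each collection's bulk API result. A minimal sketch of that behavior, using a made-up dict shaped like pymongo's `BulkWriteResult.bulk_api_result`:

    from toolz import dissoc

    # Made-up bulk API result for one collection (illustrative values only).
    bulk_api_result = {
        "nInserted": 0,
        "nMatched": 5,
        "nModified": 5,
        "nUpserted": 2,
        "upserted": [{"index": 0, "_id": "nmdc:bsm-11-example"}],  # dropped below
    }

    summary = dissoc(bulk_api_result, "upserted")
    assert "upserted" not in summary
    assert bulk_api_result["nUpserted"] == 2  # the original dict is not mutated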
@@ -642,27 +556,39 @@ def nmdc_schema_database_from_gold_study(
 
 
 @op(
+    required_resource_keys={"mongo"},
     out={
         "submission_id": Out(),
         "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
         "data_object_mapping_file_url": Out(Optional[str]),
         "biosample_extras_file_url": Out(Optional[str]),
         "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
+        "study_id": Out(Optional[str]),
     },
 )
 def get_submission_portal_pipeline_inputs(
+    context: OpExecutionContext,
     submission_id: str,
     nucleotide_sequencing_mapping_file_url: Optional[str],
     data_object_mapping_file_url: Optional[str],
     biosample_extras_file_url: Optional[str],
     biosample_extras_slot_mapping_file_url: Optional[str],
-) -> Tuple[str, str | None, str | None, str | None, str | None]:
+    study_id: Optional[str],
+) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
+    # query for studies matching the ID to see if it exists
+    if study_id:
+        mdb = context.resources.mongo.db
+        result = mdb.study_set.find_one({"id": study_id})
+        if not result:
+            raise Exception(f"Study id: {study_id} does not exist in Mongo.")
+
     return (
         submission_id,
         nucleotide_sequencing_mapping_file_url,
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     )
 
 
@@ -687,6 +613,7 @@ def translate_portal_submission_to_nmdc_schema_database(
     study_pi_image_url: Optional[str],
     biosample_extras: Optional[list[dict]],
     biosample_extras_slot_mapping: Optional[list[dict]],
+    study_id: Optional[str],
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
 
@@ -704,6 +631,7 @@
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         illumina_instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
     database = translator.get_database()
     return database
@@ -1043,15 +971,19 @@ def load_ontology(context: OpExecutionContext):
     context.log.info(f"Ontology load for {source_ontology} completed successfully!")
 
 
-def _add_related_ids_to_alldocs(
-    temp_collection, context, document_reference_ranged_slots_by_type
+def _add_linked_instances_to_alldocs(
+    temp_collection: MongoCollection,
+    context: OpExecutionContext,
+    document_reference_ranged_slots_by_type: dict,
 ) -> None:
     """
-    Adds {`_inbound`,`_outbound`} fields to each document in the temporary alldocs collection.
+    Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
 
-    The {`_inbound`,`_outbound`} fields each contain an array of subdocuments, each with fields `id` and `type`.
-    Each subdocument represents a link to any other document that either links to or is linked from
-    the document via document-reference-ranged slots.
+    The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+    Each subdocument represents a link to another document that either links to or is linked from the document via
+    document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+    document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+    considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
 
     Args:
         temp_collection: The temporary MongoDB collection to process
@@ -1063,7 +995,7 @@ def _add_related_ids_to_alldocs(
     """
 
     context.log.info(
-        "Building relationships and adding `_inbound` and `_outbound` fields..."
+        "Building relationships and adding `_upstream` and `_downstream` fields..."
     )
 
     # document ID -> type (with "nmdc:" prefix preserved)
@@ -1084,9 +1016,7 @@
             id_to_type_map[doc_id] = doc_type
 
             # Find all document references from this document
-            reference_slots = document_reference_ranged_slots_by_type.get(
-                doc_type_no_prefix, []
-            )
+            reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
             for slot in reference_slots:
                 if slot in doc:
                     # Handle both single-value and array references
@@ -1103,34 +1033,32 @@
         f"{len({d for (d, _, _) in relationship_triples})} containing references"
     )
 
-    # The bifurcation of document-reference-ranged slots as "inbound" and "outbound" is essential
+    # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
     # in order to perform graph traversal and collect all entities "related" to a given entity without
     # recursion "exploding".
     #
     # Note: We are hard-coding this "direction" information here in the Runtime
     # because the NMDC schema does not currently contain or expose it.
     #
-    # An "inbound" slot is one for which an entity in the domain "was influenced by" (formally,
-    # <https://www.w3.org/ns/prov#wasInfluencedBy>, with typical CURIE prov:wasInfluencedBy) an entity in the range.
-    inbound_document_reference_ranged_slots = [
-        "collected_from",  # a `nmdc:Biosample` was influenced by the `nmdc:Site` from which it was collected.
-        "has_chromatography_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
-        "has_input",  # a `nmdc:PlannedProcess` was influenced by a `nmdc:NamedThing`.
-        "has_mass_spectrometry_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
-        "instrument_used",  # a `nmdc:PlannedProcess` was influenced by a used `nmdc:Instrument`.
-        "uses_calibration",  # a `nmdc:PlannedProcess` was influenced by `nmdc:CalibrationInformation`.
-        "was_generated_by",  # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy.
-        "was_informed_by",  # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy.
+    # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+    upstream_document_reference_ranged_slots = [
+        "associated_studies",  # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+        "collected_from",  # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+        "has_chromatography_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "has_input",  # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+        "has_mass_spectrometry_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "instrument_used",  # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+        "part_of",  # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+        "was_generated_by",  # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+        "was_informed_by",  # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
     ]
-    # An "outbound" slot is one for which an entity in the domain "influences"
-    # (i.e., [owl:inverseOf prov:wasInfluencedBy]) an entity in the range.
-    outbound_document_reference_ranged_slots = [
-        "associated_studies",  # a `nmdc:Biosample` influences a `nmdc:Study`.
-        "calibration_object",  # `nmdc:CalibrationInformation` generates a `nmdc:DataObject`.
-        "generates_calibration",  # a `nmdc:PlannedProcess` generates `nmdc:CalibrationInformation`.
-        "has_output",  # a `nmdc:PlannedProcess` generates a `nmdc:NamedThing`.
-        "in_manifest",  # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
-        "part_of",  # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
+    # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+    downstream_document_reference_ranged_slots = [
+        "calibration_object",  # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+        "generates_calibration",  # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+        "has_output",  # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+        "in_manifest",  # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+        "uses_calibration",  # when a `nmdc:CalibrationInformation` is part of a `nmdc:PlannedProcess`.
     ]
 
     unique_document_reference_ranged_slot_names = set()
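Note: given the slot lists above, each `alldocs` document ends up carrying `_upstream` and `_downstream` arrays of `{id, type}` subdocuments. A hypothetical document (all IDs invented for illustration) might look like this, expressed as a Python dict:

    # Hypothetical `alldocs` document; IDs are invented for illustration.
    alldocs_doc = {
        "id": "nmdc:bsm-11-abc123",
        "type": "nmdc:Biosample",
        "associated_studies": ["nmdc:sty-11-xyz789"],
        # The Study is upstream of the Biosample even though the
        # `associated_studies` link points from the Biosample to the Study.
        "_upstream": [{"id": "nmdc:sty-11-xyz789", "type": "nmdc:Study"}],
        "_downstream": [
            {"id": "nmdc:dgns-11-def456", "type": "nmdc:NucleotideSequencing"}
        ],
    }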
@@ -1138,15 +1066,15 @@ def _add_related_ids_to_alldocs(
        for slot_name in slot_names:
            unique_document_reference_ranged_slot_names.add(slot_name)
     context.log.info(f"{unique_document_reference_ranged_slot_names=}")
-    if len(inbound_document_reference_ranged_slots) + len(
-        outbound_document_reference_ranged_slots
+    if len(upstream_document_reference_ranged_slots) + len(
+        downstream_document_reference_ranged_slots
     ) != len(unique_document_reference_ranged_slot_names):
         raise Failure(
             "Number of detected unique document-reference-ranged slot names does not match "
-            "sum of accounted-for inbound and outbound document-reference-ranged slot names."
+            "sum of accounted-for upstream and downstream document-reference-ranged slot names."
         )
 
-    # Construct, and update documents with, `_incoming` and `_outgoing` field values.
+    # Construct, and update documents with, `_upstream` and `_downstream` field values.
     #
     # manage batching of MongoDB `bulk_write` operations
     bulk_operations, update_count = [], 0
@@ -1154,10 +1082,10 @@
 
         # Determine in which respective fields to push this relationship
         # for the subject (doc) and object (ref) of this triple.
-        if slot in inbound_document_reference_ranged_slots:
-            field_for_doc, field_for_ref = "_inbound", "_outbound"
-        elif slot in outbound_document_reference_ranged_slots:
-            field_for_doc, field_for_ref = "_outbound", "_inbound"
+        if slot in upstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_upstream", "_downstream"
+        elif slot in downstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_downstream", "_upstream"
         else:
             raise Failure(f"Unknown slot {slot} for document {doc_id}")
 
@@ -1204,14 +1132,6 @@
 
     context.log.info(f"Pushed {update_count} updates in total")
 
-    context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
-    temp_collection.create_index("_inbound.id")
-    temp_collection.create_index("_outbound.id")
-    # Create compound indexes to ensure index-covered queries
-    temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
-    temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
-    context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
-
 
 # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
 # pass an argument to the op (in order to specify the order of the ops in the graph)
@@ -1220,7 +1140,7 @@ def _add_related_ids_to_alldocs(
 # Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
 #
 @op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
-def materialize_alldocs(context) -> int:
+def materialize_alldocs(context: OpExecutionContext) -> int:
     """
     This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
 
@@ -1228,8 +1148,8 @@ def materialize_alldocs(context) -> int:
     2. Create a temporary collection to build the new alldocs collection.
     3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
     4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
-    5. Add special `_inbound` and `_outbound` fields with subdocuments containing ID and type of related entities.
-    6. Add indexes for `id`, relationship fields, and `{_inbound,_outbound}.type`/`.id` compound indexes.
+    5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+    6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
     7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
 
     The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
@@ -1240,7 +1160,7 @@ def materialize_alldocs(context) -> int:
     `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
     related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
 
-    The {`_inbound`,`_outbound`} fields enable efficient index-covered queries to find all entities of specific types
+    The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
     that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
     expansions.
     """
@@ -1271,14 +1191,16 @@ def materialize_alldocs(context) -> int:
         )
     )
 
-    document_reference_ranged_slots = defaultdict(list)
+    document_reference_ranged_slots_by_type = defaultdict(list)
     for cls_name, slot_map in cls_slot_map.items():
         for slot_name, slot in slot_map.items():
             if (
                 set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
                 & document_referenceable_ranges
             ):
-                document_reference_ranged_slots[cls_name].append(slot_name)
+                document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
+                    slot_name
+                )
 
     # Build `alldocs` to a temporary collection for atomic replacement
     # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
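Note: keying the mapping by the prefixed class name means later lookups can use a document's `type` value directly, without stripping the "nmdc:" prefix first. A sketch of the resulting shape (class and slot names here are plausible examples, not an exhaustive listing derived from the schema):

    from collections import defaultdict

    # Illustrative only: the real mapping is computed from the NMDC schema view.
    document_reference_ranged_slots_by_type = defaultdict(list)
    document_reference_ranged_slots_by_type["nmdc:Biosample"] += [
        "associated_studies",
        "collected_from",
    ]
    document_reference_ranged_slots_by_type["nmdc:DataObject"] += [
        "was_generated_by",
        "in_manifest",
    ]

    # Look up by the full, prefixed `type` value found on a document:
    slots = document_reference_ranged_slots_by_type["nmdc:Biosample"]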
@@ -1295,34 +1217,28 @@ def materialize_alldocs(context) -> int:
             # Keep the full type with prefix for document
             doc_type_full = doc["type"]
             # Remove prefix for slot lookup and ancestor lookup
-            doc_type = (
-                doc_type_full[5:]
-                if doc_type_full.startswith("nmdc:")
-                else doc_type_full
-            )
+            doc_type = doc_type_full.removeprefix("nmdc:")
         except KeyError:
             raise Exception(
                 f"doc {doc['id']} in collection {coll_name} has no 'type'!"
             )
         slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
             doc_type_full
         ]
         new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
 
-        new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-        # InsertOne is a method on the py-mongo Client class.
         # Get ancestors without the prefix, but add prefix to each one in the output
-        ancestors = schema_view.class_ancestors(doc_type)
         new_doc["_type_and_ancestors"] = [
-            "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
+            f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
         ]
+        # InsertOne is a pymongo representation of a mongo command.
         write_operations.append(InsertOne(new_doc))
         if len(write_operations) == BULK_WRITE_BATCH_SIZE:
             _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
             write_operations.clear()
             documents_processed_counter += BULK_WRITE_BATCH_SIZE
     if len(write_operations) > 0:
-        # here bulk_write is a method on the py-mongo db Client class
+        # here bulk_write is a method on the pymongo db Collection class
         _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
         documents_processed_counter += len(write_operations)
     context.log.info(
@@ -1339,19 +1255,31 @@ def materialize_alldocs(context) -> int:
     # so that `temp_alldocs_collection` will be "good to go" on renaming.
     temp_alldocs_collection.create_index("id", unique=True)
     # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
-    slots_to_index = ["has_input", "has_output", "was_informed_by"]
+    slots_to_index = {"_type_and_ancestors"} | {
+        slot
+        for slots in document_reference_ranged_slots_by_type.values()
+        for slot in slots
+    }
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
-    context.log.info(f"created indexes on id, {slots_to_index}.")
+    context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
 
     # Add related-ids fields to enable efficient relationship traversal
     context.log.info("Adding fields for related ids to documents...")
-    _add_related_ids_to_alldocs(
-        temp_alldocs_collection, context, document_reference_ranged_slots
+    _add_linked_instances_to_alldocs(
+        temp_alldocs_collection, context, document_reference_ranged_slots_by_type
+    )
+    context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+    temp_alldocs_collection.create_index("_upstream.id")
+    temp_alldocs_collection.create_index("_downstream.id")
+    # Create compound indexes to ensure index-covered queries
+    temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+    temp_alldocs_collection.create_index(
+        [("_downstream.type", 1), ("_downstream.id", 1)]
     )
+    context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
 
     context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
     temp_alldocs_collection.rename("alldocs", dropTarget=True)
-
     n_alldocs_documents = mdb.alldocs.estimated_document_count()
     context.log.info(
         f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
@@ -1442,6 +1370,42 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep
 
 
+@op(required_resource_keys={"mongo"})
+def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
+    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
+
+    mdb = context.resources.mongo.db
+    material_processing_set = mdb["material_processing_set"]
+    pooled_biosamples_data = check_pooling_for_biosamples(
+        material_processing_set, biosamples
+    )
+
+    # Fetch ProcessedSample names from database
+    processed_sample_ids = set()
+    for biosample_id, pooling_info in pooled_biosamples_data.items():
+        if pooling_info and pooling_info.get("processed_sample_id"):
+            processed_sample_ids.add(pooling_info["processed_sample_id"])
+
+    # Query database for ProcessedSample names
+    if processed_sample_ids:
+        processed_sample_set = mdb["processed_sample_set"]
+        cursor = processed_sample_set.find(
+            {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
+        )
+        processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
+
+        # Update pooled_biosamples_data with ProcessedSample names
+        for biosample_id, pooling_info in pooled_biosamples_data.items():
+            if pooling_info and pooling_info.get("processed_sample_id"):
+                processed_sample_id = pooling_info["processed_sample_id"]
+                if processed_sample_id in processed_samples:
+                    pooling_info["processed_sample_name"] = processed_samples[
+                        processed_sample_id
+                    ]
+
+    return pooled_biosamples_data
+
+
 @op(required_resource_keys={"mongo"})
 def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
     mdb = context.resources.mongo.db
@@ -1475,6 +1439,7 @@ def ncbi_submission_xml_from_nmdc_study(
     data_object_records: list,
     library_preparation_records: list,
     all_instruments: dict,
+    pooled_biosamples_data: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1483,6 +1448,7 @@
         data_object_records,
         library_preparation_records,
         all_instruments,
+        pooled_biosamples_data,
     )
     return ncbi_xml