nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.

Files changed (100)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +55 -4
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +33 -28
  76. nmdc_runtime/site/ops.py +97 -237
  77. nmdc_runtime/site/repair/database_updater.py +8 -0
  78. nmdc_runtime/site/repository.py +7 -117
  79. nmdc_runtime/site/resources.py +4 -4
  80. nmdc_runtime/site/translation/gold_translator.py +22 -21
  81. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  82. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  83. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  84. nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
  85. nmdc_runtime/site/translation/translator.py +63 -1
  86. nmdc_runtime/site/util.py +8 -3
  87. nmdc_runtime/site/validation/util.py +10 -5
  88. nmdc_runtime/util.py +9 -321
  89. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  90. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  91. nmdc_runtime/site/translation/emsl.py +0 -43
  92. nmdc_runtime/site/translation/gold.py +0 -53
  93. nmdc_runtime/site/translation/jgi.py +0 -32
  94. nmdc_runtime/site/translation/util.py +0 -132
  95. nmdc_runtime/site/validation/jgi.py +0 -43
  96. nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
  97. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  98. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  99. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  100. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/ops.py CHANGED
@@ -4,10 +4,9 @@ import logging
  import mimetypes
  import os
  import subprocess
- import tempfile
  from collections import defaultdict
  from datetime import datetime, timezone
- from io import BytesIO, StringIO
+ from io import BytesIO
  from pprint import pformat
  from toolz.dicttoolz import keyfilter
  from typing import Tuple, Set
@@ -16,7 +15,7 @@ from itertools import chain
  from ontology_loader.ontology_load_controller import OntologyLoaderController
  import pandas as pd
  import requests
-
+ from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot

  from bson import ObjectId, json_util
  from dagster import (
@@ -44,7 +43,7 @@ from dagster import (
  from gridfs import GridFS
  from linkml_runtime.utils.dictutils import as_simple_dict
  from linkml_runtime.utils.yamlutils import YAMLRoot
- from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.api.db.mongo import validate_json
  from nmdc_runtime.api.core.idgen import generate_one_id
  from nmdc_runtime.api.core.metadata import (
      _validate_changesheet,
@@ -103,23 +102,19 @@ from nmdc_runtime.site.util import (
  )
  from nmdc_runtime.util import (
      drs_object_in_for,
-     get_names_of_classes_in_effective_range_of_slot,
      pluralize,
      put_object,
-     validate_json,
      specialize_activity_set_docs,
      collection_name_to_class_names,
-     class_hierarchy_as_list,
      nmdc_schema_view,
      populated_schema_collection_names_with_id_field,
  )
  from nmdc_schema import nmdc
- from nmdc_schema.nmdc import Database as NMDCDatabase
- from pydantic import BaseModel
  from pymongo import InsertOne, UpdateOne
  from pymongo.database import Database as MongoDatabase
  from starlette import status
- from toolz import assoc, dissoc, get_in, valfilter, identity
+ from toolz import get_in, valfilter, identity
+

  # batch size for writing documents to alldocs
  BULK_WRITE_BATCH_SIZE = 2000
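
Note: `BULK_WRITE_BATCH_SIZE` caps how many queued write operations (e.g. `InsertOne`) are sent per pymongo `bulk_write` call; the same batch-and-flush pattern appears in `materialize_alldocs` below. A minimal sketch of the pattern, using a hypothetical `write_in_batches` helper that is not part of this package:

    from pymongo import InsertOne
    from pymongo.collection import Collection

    BULK_WRITE_BATCH_SIZE = 2000

    def write_in_batches(collection: Collection, docs) -> None:
        """Queue one InsertOne per document, flushing whenever a full batch accumulates."""
        ops = []
        for doc in docs:
            ops.append(InsertOne(doc))
            if len(ops) == BULK_WRITE_BATCH_SIZE:
                collection.bulk_write(ops, ordered=False)  # unordered: one failure doesn't halt the rest
                ops = []
        if ops:  # flush the final, partial batch
            collection.bulk_write(ops, ordered=False)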
@@ -154,99 +149,6 @@ def mongo_stats(context) -> List[str]:
      return collection_names


- @op(
-     required_resource_keys={"mongo", "runtime_api_site_client"},
-     retry_policy=RetryPolicy(max_retries=2),
- )
- def local_file_to_api_object(context, file_info):
-     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-     storage_path: str = file_info["storage_path"]
-     mime_type = file_info.get("mime_type")
-     if mime_type is None:
-         mime_type = mimetypes.guess_type(storage_path)[0]
-     rv = client.put_object_in_site(
-         {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
-     )
-     if not rv.status_code == status.HTTP_200_OK:
-         raise Failure(description=f"put_object_in_site failed: {rv.content}")
-     op = rv.json()
-     context.log.info(f"put_object_in_site: {op}")
-     rv = put_object(storage_path, op["metadata"]["url"])
-     if not rv.status_code == status.HTTP_200_OK:
-         raise Failure(description=f"put_object failed: {rv.content}")
-     op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
-     rv = client.update_operation(op["id"], op_patch)
-     if not rv.status_code == status.HTTP_200_OK:
-         raise Failure(description="update_operation failed")
-     op = rv.json()
-     context.log.info(f"update_operation: {op}")
-     rv = client.create_object_from_op(op)
-     if rv.status_code != status.HTTP_201_CREATED:
-         raise Failure("create_object_from_op failed")
-     obj = rv.json()
-     context.log.info(f'Created /objects/{obj["id"]}')
-     mdb = context.resources.mongo.db
-     rv = mdb.operations.delete_one({"id": op["id"]})
-     if rv.deleted_count != 1:
-         context.log.error("deleting op failed")
-     yield AssetMaterialization(
-         asset_key=AssetKey(["object", obj["name"]]),
-         description="output of metadata-translation run_etl",
-         metadata={"object_id": MetadataValue.text(obj["id"])},
-     )
-     yield Output(obj)
-
-
- @op(
-     out={
-         "merged_data_path": Out(
-             str,
-             description="path to TSV merging of source metadata",
-         )
-     }
- )
- def build_merged_db(context) -> str:
-     context.log.info("metadata-translation: running `make build-merged-db`")
-     run_and_log(
-         "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
-     )
-     storage_path = (
-         "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
-     )
-     yield AssetMaterialization(
-         asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
-         description="input to metadata-translation run_etl",
-         metadata={"path": MetadataValue.path(storage_path)},
-     )
-     yield Output(storage_path, "merged_data_path")
-
-
- @op(
-     required_resource_keys={"runtime_api_site_client"},
- )
- def run_etl(context, merged_data_path: str):
-     context.log.info("metadata-translation: running `make run-etl`")
-     if not os.path.exists(merged_data_path):
-         raise Failure(description=f"merged_db not present at {merged_data_path}")
-     run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
-     storage_path = (
-         "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
-     )
-     with ZipFile(storage_path) as zf:
-         name = zf.namelist()[0]
-         with zf.open(name) as f:
-             rv = json.load(f)
-     context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
-     yield AssetMaterialization(
-         asset_key=AssetKey(["gold_translation", "database.json.zip"]),
-         description="output of metadata-translation run_etl",
-         metadata={
-             "path": MetadataValue.path(storage_path),
-         },
-     )
-     yield Output({"storage_path": storage_path})
-
-
  @op(required_resource_keys={"mongo"})
  def get_operation(context):
      mdb = context.resources.mongo.db
@@ -481,83 +383,6 @@ def get_json_in(context):
      return rv.json()


- def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
-     """
-     Does not ensure ordering of `docs`.
-
-     TODO: Document this function. What _does_ it do (or what was it designed to do)?
-     What, conceptually, did the author design it to receive (as `docs`); a dict
-     having a `data_object_set` item whose value is a list of documents.
-     What, conceptually, did the author design it to return?
-     """
-
-     if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
-         return docs, 0
-
-     do_docs = docs["data_object_set"]
-
-     class FileTypeEnumBase(BaseModel):
-         name: str
-         description: str
-         filter: str  # JSON-encoded data_object_set mongo collection filter document
-
-     class FileTypeEnum(FileTypeEnumBase):
-         id: str
-
-     # Make a temporary collection (which will be dropped below) and insert the
-     # specified `data_object_set` documents into it.
-     temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
-     temp_collection = mdb[temp_collection_name]
-     temp_collection.insert_many(do_docs)
-     temp_collection.create_index("id")
-
-     def fte_matches(fte_filter: str) -> List[dict]:
-         r"""
-         Returns a list of documents—without their `_id` field—that match the specified filter,
-         which is encoded as a JSON string.
-         """
-         return [
-             dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
-         ]
-
-     # Create a mapping from each document's `id` to the document, itself.
-     do_docs_map = {d["id"]: d for d in do_docs}
-
-     n_docs_with_types_added = 0
-
-     # For each `file_type_enum` document in the database, find all the documents (among the
-     # `data_object_set` documents provided by the caller) that match that `file_type_enum`
-     # document's filter.
-     #
-     # If any of those documents lacks a `data_object_type` field, update the original
-     # `data_object_set` document so that its `data_object_type` field is set to
-     # the `file_type_enum` document's `id` (why not its `name`?).
-     #
-     # TODO: I don't know why this sets `data_object_type` to `file_type_enum.id`,
-     # as opposed to `file_type_enum.name`.
-     #
-     for fte_doc in mdb.file_type_enum.find():
-         fte = FileTypeEnum(**fte_doc)
-         docs_matching = fte_matches(fte.filter)
-         for doc in docs_matching:
-             if "data_object_type" not in doc:
-                 do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
-                 n_docs_with_types_added += 1
-
-     mdb.drop_collection(temp_collection_name)
-
-     # Returns a tuple. The first item is the original `docs` dictionary, but with the
-     # `data_object_set` list replaced by the list of the documents that are in the
-     # `do_docs_map` dictionary (with their `_id` fields omitted). The second item is
-     # the number of documents to which this function added a `data_object_type` field.
-     return (
-         assoc(
-             docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
-         ),
-         n_docs_with_types_added,
-     )
-
-
  @op(required_resource_keys={"runtime_api_site_client", "mongo"})
  def perform_mongo_updates(context, json_in):
      mongo = context.resources.mongo
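
Note: for readers tracing this removal, the deleted `ensure_data_object_type` helper back-filled `data_object_type` on `data_object_set` documents by matching each one against the JSON-encoded Mongo filter of every `file_type_enum` document. A condensed, hypothetical restatement of that intent (omitting the pydantic models and the temporary collection's randomized name and index):

    import json

    def infer_data_object_types(mdb, do_docs):
        """Return (docs, n_added), where docs lacking `data_object_type` gain the
        `id` of a `file_type_enum` document whose filter they match."""
        do_docs_map = {d["id"]: d for d in do_docs}
        n_added = 0
        tmp = mdb["tmp.data_object_set"]
        tmp.insert_many([dict(d) for d in do_docs])  # insert copies; inputs stay `_id`-free
        for fte in mdb.file_type_enum.find():
            for doc in tmp.find(json.loads(fte["filter"])):
                if "data_object_type" not in doc:
                    doc.pop("_id", None)
                    do_docs_map[doc["id"]] = {**doc, "data_object_type": fte["id"]}
                    n_added += 1
        mdb.drop_collection("tmp.data_object_set")
        return list(do_docs_map.values()), n_added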
@@ -566,8 +391,6 @@ def perform_mongo_updates(context, json_in):

      docs = json_in
      docs, _ = specialize_activity_set_docs(docs)
-     docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
-     context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
      context.log.debug(f"{docs}")

      rv = validate_json(
@@ -636,22 +459,25 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
          "study_type": str,
          "gold_nmdc_instrument_mapping_file_url": str,
          "include_field_site_info": bool,
+         "enable_biosample_filtering": bool,
      },
      out={
          "study_id": Out(str),
          "study_type": Out(str),
          "gold_nmdc_instrument_mapping_file_url": Out(str),
          "include_field_site_info": Out(bool),
+         "enable_biosample_filtering": Out(bool),
      },
  )
  def get_gold_study_pipeline_inputs(
      context: OpExecutionContext,
- ) -> Tuple[str, str, str, bool]:
+ ) -> Tuple[str, str, str, bool, bool]:
      return (
          context.op_config["study_id"],
          context.op_config["study_type"],
          context.op_config["gold_nmdc_instrument_mapping_file_url"],
          context.op_config["include_field_site_info"],
+         context.op_config["enable_biosample_filtering"],
      )


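Note: with the new `enable_biosample_filtering` key, a Dagster run targeting this op now supplies five config values. A hypothetical `run_config` fragment (the study ID and URL are placeholders):

    run_config = {
        "ops": {
            "get_gold_study_pipeline_inputs": {
                "config": {
                    "study_id": "gold:Gs0000000",  # placeholder
                    "study_type": "research_study",  # placeholder
                    "gold_nmdc_instrument_mapping_file_url": "https://example.org/mapping.tsv",  # placeholder
                    "include_field_site_info": False,
                    "enable_biosample_filtering": True,
                },
            },
        },
    }
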
@@ -695,6 +521,7 @@ def nmdc_schema_database_from_gold_study(
      analysis_projects: List[Dict[str, Any]],
      gold_nmdc_instrument_map_df: pd.DataFrame,
      include_field_site_info: bool,
+     enable_biosample_filtering: bool,
  ) -> nmdc.Database:
      client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

@@ -710,6 +537,7 @@
          analysis_projects,
          gold_nmdc_instrument_map_df,
          include_field_site_info,
+         enable_biosample_filtering,
          id_minter=id_minter,
      )
      database = translator.get_database()
@@ -1110,21 +938,25 @@ def load_ontology(context: OpExecutionContext):
          source_ontology=source_ontology,
          output_directory=output_directory,
          generate_reports=generate_reports,
+         mongo_client=context.resources.mongo.client,
+         db_name=context.resources.mongo.db.name,
      )

      loader.run_ontology_loader()
      context.log.info(f"Ontology load for {source_ontology} completed successfully!")


- def _add_related_ids_to_alldocs(
+ def _add_linked_instances_to_alldocs(
      temp_collection, context, document_reference_ranged_slots_by_type
  ) -> None:
      """
-     Adds {`_inbound`,`_outbound`} fields to each document in the temporary alldocs collection.
+     Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.

-     The {`_inbound`,`_outbound`} fields each contain an array of subdocuments, each with fields `id` and `type`.
-     Each subdocument represents a link to any other document that either links to or is linked from
-     the document via document-reference-ranged slots.
+     The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+     Each subdocument represents a link to another document that either links to or is linked from the document via
+     document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+     document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+     considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.

      Args:
          temp_collection: The temporary MongoDB collection to process
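
Note: the docstring's Study/Biosample case, made concrete with hypothetical documents: the `associated_studies` slot lives on the Biosample, yet the Study is recorded as the upstream party.

    biosample = {
        "id": "nmdc:bsm-00-000001",  # illustrative ID
        "type": "nmdc:Biosample",
        "associated_studies": ["nmdc:sty-00-000001"],
    }
    # Expected outcome after this helper runs (see the slot tables below):
    #   biosample["_upstream"] == [{"id": "nmdc:sty-00-000001", "type": "nmdc:Study"}]
    #   study["_downstream"]   == [{"id": "nmdc:bsm-00-000001", "type": "nmdc:Biosample"}]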
@@ -1136,7 +968,7 @@ def _add_related_ids_to_alldocs(
      """

      context.log.info(
-         "Building relationships and adding `_inbound` and `_outbound` fields..."
+         "Building relationships and adding `_upstream` and `_downstream` fields..."
      )

      # document ID -> type (with "nmdc:" prefix preserved)
@@ -1151,6 +983,7 @@ def _add_related_ids_to_alldocs(
          # Store the full type with prefix intact
          doc_type = doc["type"]
          # For looking up reference slots, we still need the type without prefix
+         # FIXME `document_reference_ranged_slots_by_type` should key on `doc_type`
          doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type

          # Record ID to type mapping - preserve the original type with prefix
@@ -1176,34 +1009,32 @@ def _add_related_ids_to_alldocs(
          f"{len({d for (d, _, _) in relationship_triples})} containing references"
      )

-     # The bifurcation of document-reference-ranged slots as "inbound" and "outbound" is essential
+     # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
      # in order to perform graph traversal and collect all entities "related" to a given entity without
      # recursion "exploding".
      #
      # Note: We are hard-coding this "direction" information here in the Runtime
      # because the NMDC schema does not currently contain or expose it.
      #
-     # An "inbound" slot is one for which an entity in the domain "was influenced by" (formally,
-     # <https://www.w3.org/ns/prov#wasInfluencedBy>, with typical CURIE prov:wasInfluencedBy) an entity in the range.
-     inbound_document_reference_ranged_slots = [
-         "collected_from",  # a `nmdc:Biosample` was influenced by the `nmdc:Site` from which it was collected.
-         "has_chromatography_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
-         "has_input",  # a `nmdc:PlannedProcess` was influenced by a `nmdc:NamedThing`.
-         "has_mass_spectrometry_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
-         "instrument_used",  # a `nmdc:PlannedProcess` was influenced by a used `nmdc:Instrument`.
-         "uses_calibration",  # a `nmdc:PlannedProcess` was influenced by `nmdc:CalibrationInformation`.
-         "was_generated_by",  # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy .
-         "was_informed_by",  # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy .
+     # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+     upstream_document_reference_ranged_slots = [
+         "associated_studies",  # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+         "collected_from",  # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+         "has_chromatography_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+         "has_input",  # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+         "has_mass_spectrometry_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+         "instrument_used",  # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+         "part_of",  # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+         "was_generated_by",  # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+         "was_informed_by",  # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
      ]
-     # An "outbound" slot is one for which an entity in the domain "influences"
-     # (i.e., [owl:inverseOf prov:wasInfluencedBy]) an entity in the range.
-     outbound_document_reference_ranged_slots = [
-         "associated_studies",  # a `nmdc:Biosample` influences a `nmdc:Study`.
-         "calibration_object",  # `nmdc:CalibrationInformation` generates a `nmdc:DataObject`.
-         "generates_calibration",  # a `nmdc:PlannedProcess` generates `nmdc:CalibrationInformation`.
-         "has_output",  # a `nmdc:PlannedProcess` generates a `nmdc:NamedThing`.
-         "in_manifest",  # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
-         "part_of",  # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
+     # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+     downstream_document_reference_ranged_slots = [
+         "calibration_object",  # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+         "generates_calibration",  # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+         "has_output",  # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+         "in_manifest",  # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+         "uses_calibration",  # when a `nmdc:CalibrationInformation` is part of a `nmdc:PlannedProcess`.
      ]

      unique_document_reference_ranged_slot_names = set()
@@ -1211,15 +1042,15 @@ def _add_related_ids_to_alldocs(
          for slot_name in slot_names:
              unique_document_reference_ranged_slot_names.add(slot_name)
      context.log.info(f"{unique_document_reference_ranged_slot_names=}")
-     if len(inbound_document_reference_ranged_slots) + len(
-         outbound_document_reference_ranged_slots
+     if len(upstream_document_reference_ranged_slots) + len(
+         downstream_document_reference_ranged_slots
      ) != len(unique_document_reference_ranged_slot_names):
          raise Failure(
              "Number of detected unique document-reference-ranged slot names does not match "
-             "sum of accounted-for inbound and outbound document-reference-ranged slot names."
+             "sum of accounted-for upstream and downstream document-reference-ranged slot names."
          )

-     # Construct, and update documents with, `_incoming` and `_outgoing` field values.
+     # Construct, and update documents with, `_upstream` and `_downstream` field values.
      #
      # manage batching of MongoDB `bulk_write` operations
      bulk_operations, update_count = [], 0
@@ -1227,10 +1058,10 @@ def _add_related_ids_to_alldocs(

          # Determine in which respective fields to push this relationship
          # for the subject (doc) and object (ref) of this triple.
-         if slot in inbound_document_reference_ranged_slots:
-             field_for_doc, field_for_ref = "_inbound", "_outbound"
-         elif slot in outbound_document_reference_ranged_slots:
-             field_for_doc, field_for_ref = "_outbound", "_inbound"
+         if slot in upstream_document_reference_ranged_slots:
+             field_for_doc, field_for_ref = "_upstream", "_downstream"
+         elif slot in downstream_document_reference_ranged_slots:
+             field_for_doc, field_for_ref = "_downstream", "_upstream"
          else:
              raise Failure(f"Unknown slot {slot} for document {doc_id}")

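Note: the update statement itself falls outside this hunk. A plausible shape for it, assuming documents are keyed on `id` and link subdocuments are pushed with `$addToSet` (both assumptions; neither appears in this diff):

    from pymongo import UpdateOne

    # doc_id, ref_id, ref_type, and field_for_doc come from the loop above.
    bulk_operations.append(
        UpdateOne(
            {"id": doc_id},
            {"$addToSet": {field_for_doc: {"id": ref_id, "type": ref_type}}},
        )
    )
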
@@ -1277,14 +1108,6 @@ def _add_related_ids_to_alldocs(

      context.log.info(f"Pushed {update_count} updates in total")

-     context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
-     temp_collection.create_index("_inbound.id")
-     temp_collection.create_index("_outbound.id")
-     # Create compound indexes to ensure index-covered queries
-     temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
-     temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
-     context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
-


  # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
@@ -1301,8 +1124,8 @@ def materialize_alldocs(context) -> int:
      2. Create a temporary collection to build the new alldocs collection.
      3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
      4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
-     5. Add special `_inbound` and `_outbound` fields with subdocuments containing ID and type of related entities.
-     6. Add indexes for `id`, relationship fields, and `{_inbound,_outbound}.type`/`.id` compound indexes.
+     5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+     6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
      7. Finally, atomically replace the existing `alldocs` collection with the temporary one.

      The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
@@ -1313,7 +1136,7 @@ def materialize_alldocs(context) -> int:
      `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
      related documents. It serves as a denormalized view of the database to make these complex queries more efficient.

-     The {`_inbound`,`_outbound`} fields enable efficient index-covered queries to find all entities of specific types
+     The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
      that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
      expansions.
      """
@@ -1344,6 +1167,9 @@ def materialize_alldocs(context) -> int:
          )
      )

+     # FIXME rename to `document_reference_ranged_slots_by_type`
+     # FIXME key on CURIE, e.g. `nmdc:Study`
+     # (here, not upstream in `cls_slot_map`/`document_referenceable_ranges`, b/c `schema_view` used directly in those)
      document_reference_ranged_slots = defaultdict(list)
      for cls_name, slot_map in cls_slot_map.items():
          for slot_name, slot in slot_map.items():
@@ -1383,12 +1209,12 @@ def materialize_alldocs(context) -> int:
          new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)

          new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-         # InsertOne is a method on the py-mongo Client class.
          # Get ancestors without the prefix, but add prefix to each one in the output
          ancestors = schema_view.class_ancestors(doc_type)
          new_doc["_type_and_ancestors"] = [
              "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
          ]
+         # InsertOne is a pymongo representation of a mongo command.
          write_operations.append(InsertOne(new_doc))
          if len(write_operations) == BULK_WRITE_BATCH_SIZE:
              _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
@@ -1412,19 +1238,28 @@ def materialize_alldocs(context) -> int:
      # so that `temp_alldocs_collection` will be "good to go" on renaming.
      temp_alldocs_collection.create_index("id", unique=True)
      # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
+     # TODO add indexes on each of `set(document_reference_ranged_slots.values())`.
      slots_to_index = ["has_input", "has_output", "was_informed_by"]
      [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
      context.log.info(f"created indexes on id, {slots_to_index}.")

      # Add related-ids fields to enable efficient relationship traversal
      context.log.info("Adding fields for related ids to documents...")
-     _add_related_ids_to_alldocs(
+     _add_linked_instances_to_alldocs(
          temp_alldocs_collection, context, document_reference_ranged_slots
      )
+     context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+     temp_alldocs_collection.create_index("_upstream.id")
+     temp_alldocs_collection.create_index("_downstream.id")
+     # Create compound indexes to ensure index-covered queries
+     temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+     temp_alldocs_collection.create_index(
+         [("_downstream.type", 1), ("_downstream.id", 1)]
+     )
+     context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")

      context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
      temp_alldocs_collection.rename("alldocs", dropTarget=True)
-
      n_alldocs_documents = mdb.alldocs.estimated_document_count()
      context.log.info(
          f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
@@ -1572,16 +1407,24 @@ def post_submission_portal_biosample_ingest_record_stitching_filename(
      config_schema={
          "nmdc_study_id": str,
          "gold_nmdc_instrument_mapping_file_url": str,
+         "include_field_site_info": bool,
+         "enable_biosample_filtering": bool,
      },
      out={
          "nmdc_study_id": Out(str),
          "gold_nmdc_instrument_mapping_file_url": Out(str),
+         "include_field_site_info": Out(bool),
+         "enable_biosample_filtering": Out(bool),
      },
  )
- def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
+ def get_database_updater_inputs(
+     context: OpExecutionContext,
+ ) -> Tuple[str, str, bool, bool]:
      return (
          context.op_config["nmdc_study_id"],
          context.op_config["gold_nmdc_instrument_mapping_file_url"],
+         context.op_config["include_field_site_info"],
+         context.op_config["enable_biosample_filtering"],
      )


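Note: as with the GOLD study op above, the two new keys become required op config. A hypothetical `run_config` fragment (placeholder values):

    run_config = {
        "ops": {
            "get_database_updater_inputs": {
                "config": {
                    "nmdc_study_id": "nmdc:sty-00-000001",  # placeholder
                    "gold_nmdc_instrument_mapping_file_url": "https://example.org/mapping.tsv",  # placeholder
                    "include_field_site_info": False,
                    "enable_biosample_filtering": True,
                },
            },
        },
    }
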
@@ -1596,6 +1439,8 @@ def generate_data_generation_set_post_biosample_ingest(
      context: OpExecutionContext,
      nmdc_study_id: str,
      gold_nmdc_instrument_map_df: pd.DataFrame,
+     include_field_site_info: bool,
+     enable_biosample_filtering: bool,
  ) -> nmdc.Database:
      runtime_api_user_client: RuntimeApiUserClient = (
          context.resources.runtime_api_user_client
@@ -1611,6 +1456,8 @@ def generate_data_generation_set_post_biosample_ingest(
          gold_api_client,
          nmdc_study_id,
          gold_nmdc_instrument_map_df,
+         include_field_site_info,
+         enable_biosample_filtering,
      )
      database = (
          database_updater.generate_data_generation_set_records_from_gold_api_for_study()
@@ -1630,6 +1477,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
      context: OpExecutionContext,
      nmdc_study_id: str,
      gold_nmdc_instrument_map_df: pd.DataFrame,
+     include_field_site_info: bool = False,
+     enable_biosample_filtering: bool = False,
  ) -> nmdc.Database:
      runtime_api_user_client: RuntimeApiUserClient = (
          context.resources.runtime_api_user_client
@@ -1645,6 +1494,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
          gold_api_client,
          nmdc_study_id,
          gold_nmdc_instrument_map_df,
+         include_field_site_info,
+         enable_biosample_filtering,
      )
      database = database_updater.generate_biosample_set_from_gold_api_for_study()

@@ -1656,13 +1507,16 @@ def generate_biosample_set_for_nmdc_study_from_gold(
          "runtime_api_user_client",
          "runtime_api_site_client",
          "gold_api_client",
-     }
+     },
+     out=Out(Any),
  )
  def run_script_to_update_insdc_biosample_identifiers(
      context: OpExecutionContext,
      nmdc_study_id: str,
      gold_nmdc_instrument_map_df: pd.DataFrame,
- ) -> Dict[str, Any]:
+     include_field_site_info: bool,
+     enable_biosample_filtering: bool,
+ ):
      """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.

      This op uses the DatabaseUpdater to generate a script that can be used to update biosample
@@ -1674,7 +1528,7 @@ def run_script_to_update_insdc_biosample_identifiers(
          gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records

      Returns:
-         A dictionary containing the MongoDB update script
+         A dictionary or list of dictionaries containing the MongoDB update script(s)
      """
      runtime_api_user_client: RuntimeApiUserClient = (
          context.resources.runtime_api_user_client
@@ -1690,11 +1544,17 @@ def run_script_to_update_insdc_biosample_identifiers(
          gold_api_client,
          nmdc_study_id,
          gold_nmdc_instrument_map_df,
+         include_field_site_info,
+         enable_biosample_filtering,
      )
      update_script = database_updater.queries_run_script_to_update_insdc_identifiers()

+     if isinstance(update_script, list):
+         total_updates = sum(len(item.get("updates", [])) for item in update_script)
+     else:
+         total_updates = len(update_script.get("updates", []))
      context.log.info(
-         f"Generated update script for study {nmdc_study_id} with {len(update_script.get('updates', []))} updates"
+         f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
      )

      return update_script
nmdc_runtime/site/repair/database_updater.py CHANGED
@@ -18,6 +18,8 @@ class DatabaseUpdater:
          gold_api_client: GoldApiClient,
          study_id: str,
          gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+         include_field_site_info: bool = False,
+         enable_biosample_filtering: bool = True,
      ):
          """This class serves as an API for repairing connections in the database by
          adding records that are essentially missing "links"/"connections". As we identify
@@ -39,6 +41,8 @@ class DatabaseUpdater:
          self.gold_api_client = gold_api_client
          self.study_id = study_id
          self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
+         self.include_field_site_info = include_field_site_info
+         self.enable_biosample_filtering = enable_biosample_filtering

      @lru_cache
      def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
@@ -95,6 +99,8 @@ class DatabaseUpdater:
              biosamples=all_gold_biosamples,
              projects=all_gold_projects,
              gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+             include_field_site_info=self.include_field_site_info,
+             enable_biosample_filtering=self.enable_biosample_filtering,
          )

          # The GoldStudyTranslator class has some pre-processing logic which filters out
@@ -214,6 +220,8 @@ class DatabaseUpdater:
              projects=gold_sequencing_projects_for_study,
              analysis_projects=gold_analysis_projects_for_study,
              gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+             include_field_site_info=self.include_field_site_info,
+             enable_biosample_filtering=self.enable_biosample_filtering,
          )

          translated_biosamples = gold_study_translator.biosamples
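
Note: putting the constructor changes together, direct construction now looks like this hypothetical sketch (client objects are assumed to be configured elsewhere; note that the constructor defaults differ: `include_field_site_info` defaults to False while `enable_biosample_filtering` defaults to True):

    import pandas as pd

    updater = DatabaseUpdater(
        runtime_api_user_client,  # assumed RuntimeApiUserClient instance
        runtime_api_site_client,  # assumed RuntimeApiSiteClient instance
        gold_api_client,          # assumed GoldApiClient instance
        study_id="nmdc:sty-00-000001",  # placeholder
        gold_nmdc_instrument_map_df=pd.DataFrame(),
        include_field_site_info=False,
        enable_biosample_filtering=True,
    )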