nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.

Potentially problematic release: this version of nmdc-runtime might be problematic.

Files changed (98)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +7 -8
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +1 -22
  76. nmdc_runtime/site/ops.py +60 -152
  77. nmdc_runtime/site/repository.py +0 -112
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/util.py +3 -47
  87. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  88. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  89. nmdc_runtime/site/translation/emsl.py +0 -43
  90. nmdc_runtime/site/translation/gold.py +0 -53
  91. nmdc_runtime/site/translation/jgi.py +0 -32
  92. nmdc_runtime/site/translation/util.py +0 -132
  93. nmdc_runtime/site/validation/jgi.py +0 -43
  94. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  95. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  96. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  97. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  98. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/minter/domain/model.py CHANGED
@@ -1,9 +1,11 @@
  from enum import Enum
+ import re
  from typing import Optional

+ from base32_lib import base32
  from pydantic import BaseModel, PositiveInt

- from nmdc_runtime.minter.config import schema_classes
+ from nmdc_runtime.minter.config import schema_classes, typecodes


  class Entity(BaseModel):
@@ -71,3 +73,35 @@ class Identifier(Entity):
  class Typecode(Entity):
      schema_class: str
      name: str
+
+
+ id_prefix_pattern = rf"(?P<prefix>nmdc)"
+ id_typecode_pattern = rf"(?P<typecode>[a-z]{{1,6}})"
+ id_shoulder_pattern = rf"(?P<shoulder>[0-9][a-z]{{0,6}}[0-9])"
+ id_blade_pattern = rf"(?P<blade>[A-Za-z0-9]+)"
+ id_version_pattern = rf"(?P<version>(\.[A-Za-z0-9]+)*)"
+ id_locus_pattern = rf"(?P<locus>_[A-Za-z0-9_\.-]+)?"
+ id_pattern = (
+     rf"^{id_prefix_pattern}:{id_typecode_pattern}-{id_shoulder_pattern}-"
+     rf"{id_blade_pattern}{id_version_pattern}{id_locus_pattern}$"
+ )
+ ID_TYPECODE_VALUES = [t["name"] for t in typecodes()]
+ id_typecode_pattern_strict = rf"(?P<typecode_strict>({'|'.join(ID_TYPECODE_VALUES)}))"
+ id_blade_pattern_strict = rf"(?P<blade_strict>[{base32.ENCODING_CHARS}]+)"
+ id_pattern_strict = (
+     rf"^{id_prefix_pattern}:{id_typecode_pattern_strict}-{id_shoulder_pattern}-"
+     rf"{id_blade_pattern_strict}{id_version_pattern}{id_locus_pattern}$"
+ )
+ id_pattern_strict_compiled = re.compile(id_pattern_strict)
+
+
+ def check_valid_ids(ids: list[str]):
+     for id_ in ids:
+         if not re.match(id_pattern, id_):
+             raise ValueError(
+                 (
+                     f"Invalid ID format for given ID: '{id_}'.\n\nAn ID must match the pattern: '{id_pattern}'.\n\n"
+                     "See: <https://microbiomedata.github.io/nmdc-schema/identifiers/#ids-minted-for-use-within-nmdc>"
+                 )
+             )
+     return ids
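
Note: a minimal usage sketch of the new lenient validator follows; the example IDs are invented for illustration, while `check_valid_ids` and `id_pattern` come from the diff above.

    # Illustrative only -- the example IDs are made up.
    from nmdc_runtime.minter.domain.model import check_valid_ids

    check_valid_ids(["nmdc:bsm-11-abc123"])  # matches id_pattern; returns the list unchanged
    check_valid_ids(["bsm-11-abc123"])       # missing the "nmdc:" prefix; raises ValueError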
nmdc_runtime/minter/entrypoints/fastapi_app.py CHANGED
@@ -8,7 +8,7 @@ from nmdc_runtime.api.core.util import raise404_if_none
  from nmdc_runtime.api.db.mongo import get_mongo_db
  from nmdc_runtime.api.models.site import get_current_client_site, Site
  from nmdc_runtime.minter.adapters.repository import MongoIDStore, MinterError
- from nmdc_runtime.minter.config import minting_service_id, schema_classes
+ from nmdc_runtime.minter.config import minting_service_id
  from nmdc_runtime.minter.domain.model import (
      Identifier,
      AuthenticatedMintingRequest,
nmdc_runtime/mongo_util.py CHANGED
@@ -1,7 +1,6 @@
- from pymongo import MongoClient
  from pymongo.database import Database
  from pymongo.collection import Collection
- from typing import Any, Mapping, Optional, Type, Callable
+ from typing import Any, Optional
  from pymongo.client_session import ClientSession
  import inspect

nmdc_runtime/site/backup/nmdcdb_mongodump.py CHANGED
@@ -6,7 +6,7 @@ $ nmdcdb-mongodump

  import os
  import subprocess
- from datetime import datetime, timezone
+ from datetime import datetime
  from pathlib import Path
  from zoneinfo import ZoneInfo

nmdc_runtime/site/backup/nmdcdb_mongoexport.py CHANGED
@@ -16,9 +16,7 @@ from toolz import assoc

  from nmdc_runtime.api.core.util import pick
  from nmdc_runtime.api.db.mongo import get_mongo_db
- from nmdc_runtime.site.repository import run_config_frozen__normal_env
- from nmdc_runtime.site.resources import get_mongo
- from nmdc_runtime.util import nmdc_jsonschema, schema_collection_names_with_id_field
+ from nmdc_runtime.util import schema_collection_names_with_id_field


  def collection_stats(mdb: MongoDatabase):
nmdc_runtime/site/export/ncbi_xml.py CHANGED
@@ -4,7 +4,7 @@ import datetime
  import xml.etree.ElementTree as ET
  import xml.dom.minidom

- from typing import Any, List, Union
+ from typing import Any, List
  from urllib.parse import urlparse
  from nmdc_runtime.site.export.ncbi_xml_utils import (
      handle_controlled_identified_term_value,
@@ -16,7 +16,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
      handle_float_value,
      handle_string_value,
      load_mappings,
-     validate_xml,
  )


nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED
@@ -1,5 +1,5 @@
  from io import BytesIO, StringIO
- from typing import Any, Dict, List, Union
+ from typing import Any, Dict, List

  from nmdc_runtime.api.endpoints.util import strip_oid
  from nmdc_runtime.minter.config import typecodes
nmdc_runtime/site/graphs.py CHANGED
@@ -1,7 +1,6 @@
- from dagster import graph, GraphIn
+ from dagster import graph

  from nmdc_runtime.site.ops import (
-     build_merged_db,
      generate_biosample_set_for_nmdc_study_from_gold,
      nmdc_schema_database_export_filename,
      nmdc_schema_database_from_gold_study,
@@ -12,8 +11,6 @@ from nmdc_runtime.site.ops import (
      gold_projects_by_study,
      gold_study,
      poll_for_run_completion,
-     run_etl,
-     local_file_to_api_object,
      get_operation,
      produce_curated_db,
      delete_operations,
@@ -70,24 +67,6 @@ from nmdc_runtime.site.ops import (
  from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id


- @graph
- def gold_translation():
-     """
-     Translating an export of the JGI GOLD [1] SQL database to the NMDC database JSON schema.
-
-     [1] Genomes OnLine Database (GOLD) <https://gold.jgi.doe.gov/>.
-     """
-     local_file_to_api_object(run_etl(build_merged_db()))
-
-
- @graph()
- def gold_translation_curation():
-     # TODO
-     # - have produce_curated_db do actual curation (see notebook), persisting to db.
-     # - more steps in pipeline? Or handoff via run_status_sensor on DagsterRunStatus.SUCCESS.
-     produce_curated_db(get_operation())
-
-
  @graph()
  def create_objects_from_site_object_puts():
      delete_operations(
nmdc_runtime/site/ops.py CHANGED
@@ -4,19 +4,18 @@ import logging
  import mimetypes
  import os
  import subprocess
- import tempfile
  from collections import defaultdict
  from datetime import datetime, timezone
- from io import BytesIO, StringIO
+ from io import BytesIO
  from pprint import pformat
  from toolz.dicttoolz import keyfilter
- from typing import Tuple, Set, Union
+ from typing import Tuple, Set
  from zipfile import ZipFile
  from itertools import chain
  from ontology_loader.ontology_load_controller import OntologyLoaderController
  import pandas as pd
  import requests
-
+ from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot

  from bson import ObjectId, json_util
  from dagster import (
@@ -44,7 +43,7 @@ from dagster import (
  from gridfs import GridFS
  from linkml_runtime.utils.dictutils import as_simple_dict
  from linkml_runtime.utils.yamlutils import YAMLRoot
- from nmdc_runtime.api.db.mongo import get_mongo_db, validate_json
+ from nmdc_runtime.api.db.mongo import validate_json
  from nmdc_runtime.api.core.idgen import generate_one_id
  from nmdc_runtime.api.core.metadata import (
      _validate_changesheet,
@@ -103,22 +102,19 @@ from nmdc_runtime.site.util import (
  )
  from nmdc_runtime.util import (
      drs_object_in_for,
-     get_names_of_classes_in_effective_range_of_slot,
      pluralize,
      put_object,
      specialize_activity_set_docs,
      collection_name_to_class_names,
-     class_hierarchy_as_list,
      nmdc_schema_view,
      populated_schema_collection_names_with_id_field,
  )
  from nmdc_schema import nmdc
- from nmdc_schema.nmdc import Database as NMDCDatabase
- from pydantic import BaseModel
  from pymongo import InsertOne, UpdateOne
  from pymongo.database import Database as MongoDatabase
  from starlette import status
- from toolz import assoc, dissoc, get_in, valfilter, identity
+ from toolz import get_in, valfilter, identity
+

  # batch size for writing documents to alldocs
  BULK_WRITE_BATCH_SIZE = 2000
@@ -153,99 +149,6 @@ def mongo_stats(context) -> List[str]:
      return collection_names


- @op(
-     required_resource_keys={"mongo", "runtime_api_site_client"},
-     retry_policy=RetryPolicy(max_retries=2),
- )
- def local_file_to_api_object(context, file_info):
-     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-     storage_path: str = file_info["storage_path"]
-     mime_type = file_info.get("mime_type")
-     if mime_type is None:
-         mime_type = mimetypes.guess_type(storage_path)[0]
-     rv = client.put_object_in_site(
-         {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
-     )
-     if not rv.status_code == status.HTTP_200_OK:
-         raise Failure(description=f"put_object_in_site failed: {rv.content}")
-     op = rv.json()
-     context.log.info(f"put_object_in_site: {op}")
-     rv = put_object(storage_path, op["metadata"]["url"])
-     if not rv.status_code == status.HTTP_200_OK:
-         raise Failure(description=f"put_object failed: {rv.content}")
-     op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
-     rv = client.update_operation(op["id"], op_patch)
-     if not rv.status_code == status.HTTP_200_OK:
-         raise Failure(description="update_operation failed")
-     op = rv.json()
-     context.log.info(f"update_operation: {op}")
-     rv = client.create_object_from_op(op)
-     if rv.status_code != status.HTTP_201_CREATED:
-         raise Failure("create_object_from_op failed")
-     obj = rv.json()
-     context.log.info(f'Created /objects/{obj["id"]}')
-     mdb = context.resources.mongo.db
-     rv = mdb.operations.delete_one({"id": op["id"]})
-     if rv.deleted_count != 1:
-         context.log.error("deleting op failed")
-     yield AssetMaterialization(
-         asset_key=AssetKey(["object", obj["name"]]),
-         description="output of metadata-translation run_etl",
-         metadata={"object_id": MetadataValue.text(obj["id"])},
-     )
-     yield Output(obj)
-
-
- @op(
-     out={
-         "merged_data_path": Out(
-             str,
-             description="path to TSV merging of source metadata",
-         )
-     }
- )
- def build_merged_db(context) -> str:
-     context.log.info("metadata-translation: running `make build-merged-db`")
-     run_and_log(
-         "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
-     )
-     storage_path = (
-         "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
-     )
-     yield AssetMaterialization(
-         asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
-         description="input to metadata-translation run_etl",
-         metadata={"path": MetadataValue.path(storage_path)},
-     )
-     yield Output(storage_path, "merged_data_path")
-
-
- @op(
-     required_resource_keys={"runtime_api_site_client"},
- )
- def run_etl(context, merged_data_path: str):
-     context.log.info("metadata-translation: running `make run-etl`")
-     if not os.path.exists(merged_data_path):
-         raise Failure(description=f"merged_db not present at {merged_data_path}")
-     run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
-     storage_path = (
-         "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
-     )
-     with ZipFile(storage_path) as zf:
-         name = zf.namelist()[0]
-         with zf.open(name) as f:
-             rv = json.load(f)
-     context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
-     yield AssetMaterialization(
-         asset_key=AssetKey(["gold_translation", "database.json.zip"]),
-         description="output of metadata-translation run_etl",
-         metadata={
-             "path": MetadataValue.path(storage_path),
-         },
-     )
-     yield Output({"storage_path": storage_path})
-
-
  @op(required_resource_keys={"mongo"})
  def get_operation(context):
      mdb = context.resources.mongo.db
@@ -1043,15 +946,17 @@ def load_ontology(context: OpExecutionContext):
      context.log.info(f"Ontology load for {source_ontology} completed successfully!")


- def _add_related_ids_to_alldocs(
+ def _add_linked_instances_to_alldocs(
      temp_collection, context, document_reference_ranged_slots_by_type
  ) -> None:
      """
-     Adds {`_inbound`,`_outbound`} fields to each document in the temporary alldocs collection.
+     Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.

-     The {`_inbound`,`_outbound`} fields each contain an array of subdocuments, each with fields `id` and `type`.
-     Each subdocument represents a link to any other document that either links to or is linked from
-     the document via document-reference-ranged slots.
+     The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+     Each subdocument represents a link to another document that either links to or is linked from the document via
+     document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+     document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+     considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.

      Args:
          temp_collection: The temporary MongoDB collection to process
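
Note: to make the new field semantics concrete, a hypothetical `alldocs` document after this pass might look like the following (IDs invented; the field names and the Study/Biosample example come from the docstring above).

    # A Biosample that references a Study via `associated_studies`:
    # the Study appears in the Biosample's `_upstream` array, and the Biosample
    # would appear in the Study document's `_downstream` array.
    {
        "id": "nmdc:bsm-11-abc123",
        "type": "nmdc:Biosample",
        "_upstream": [{"id": "nmdc:sty-11-xyz789", "type": "nmdc:Study"}],
        "_downstream": [],
    }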
@@ -1063,7 +968,7 @@ def _add_related_ids_to_alldocs(
      """

      context.log.info(
-         "Building relationships and adding `_inbound` and `_outbound` fields..."
+         "Building relationships and adding `_upstream` and `_downstream` fields..."
      )

      # document ID -> type (with "nmdc:" prefix preserved)
@@ -1078,6 +983,7 @@
          # Store the full type with prefix intact
          doc_type = doc["type"]
          # For looking up reference slots, we still need the type without prefix
+         # FIXME `document_reference_ranged_slots_by_type` should key on `doc_type`
          doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type

          # Record ID to type mapping - preserve the original type with prefix
@@ -1103,34 +1009,32 @@
          f"{len({d for (d, _, _) in relationship_triples})} containing references"
      )

-     # The bifurcation of document-reference-ranged slots as "inbound" and "outbound" is essential
+     # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
      # in order to perform graph traversal and collect all entities "related" to a given entity without
      # recursion "exploding".
      #
      # Note: We are hard-coding this "direction" information here in the Runtime
      # because the NMDC schema does not currently contain or expose it.
      #
-     # An "inbound" slot is one for which an entity in the domain "was influenced by" (formally,
-     # <https://www.w3.org/ns/prov#wasInfluencedBy>, with typical CURIE prov:wasInfluencedBy) an entity in the range.
-     inbound_document_reference_ranged_slots = [
-         "collected_from",  # a `nmdc:Biosample` was influenced by the `nmdc:Site` from which it was collected.
-         "has_chromatography_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
-         "has_input",  # a `nmdc:PlannedProcess` was influenced by a `nmdc:NamedThing`.
-         "has_mass_spectrometry_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
-         "instrument_used",  # a `nmdc:PlannedProcess` was influenced by a used `nmdc:Instrument`.
-         "uses_calibration",  # a `nmdc:PlannedProcess` was influenced by `nmdc:CalibrationInformation`.
-         "was_generated_by",  # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy.
-         "was_informed_by",  # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy.
+     # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+     upstream_document_reference_ranged_slots = [
+         "associated_studies",  # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+         "collected_from",  # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+         "has_chromatography_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+         "has_input",  # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+         "has_mass_spectrometry_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+         "instrument_used",  # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+         "part_of",  # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+         "was_generated_by",  # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+         "was_informed_by",  # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
      ]
-     # An "outbound" slot is one for which an entity in the domain "influences"
-     # (i.e., [owl:inverseOf prov:wasInfluencedBy]) an entity in the range.
-     outbound_document_reference_ranged_slots = [
-         "associated_studies",  # a `nmdc:Biosample` influences a `nmdc:Study`.
-         "calibration_object",  # `nmdc:CalibrationInformation` generates a `nmdc:DataObject`.
-         "generates_calibration",  # a `nmdc:PlannedProcess` generates `nmdc:CalibrationInformation`.
-         "has_output",  # a `nmdc:PlannedProcess` generates a `nmdc:NamedThing`.
-         "in_manifest",  # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
-         "part_of",  # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
+     # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+     downstream_document_reference_ranged_slots = [
+         "calibration_object",  # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+         "generates_calibration",  # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+         "has_output",  # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+         "in_manifest",  # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+         "uses_calibration",  # when a `nmdc:CalibrationInformation` is part of a `nmdc:PlannedProcess`.
      ]

      unique_document_reference_ranged_slot_names = set()
@@ -1138,15 +1042,15 @@ def _add_related_ids_to_alldocs(
          for slot_name in slot_names:
              unique_document_reference_ranged_slot_names.add(slot_name)
      context.log.info(f"{unique_document_reference_ranged_slot_names=}")
-     if len(inbound_document_reference_ranged_slots) + len(
-         outbound_document_reference_ranged_slots
+     if len(upstream_document_reference_ranged_slots) + len(
+         downstream_document_reference_ranged_slots
      ) != len(unique_document_reference_ranged_slot_names):
          raise Failure(
              "Number of detected unique document-reference-ranged slot names does not match "
-             "sum of accounted-for inbound and outbound document-reference-ranged slot names."
+             "sum of accounted-for upstream and downstream document-reference-ranged slot names."
          )

-     # Construct, and update documents with, `_incoming` and `_outgoing` field values.
+     # Construct, and update documents with, `_upstream` and `_downstream` field values.
      #
      # manage batching of MongoDB `bulk_write` operations
      bulk_operations, update_count = [], 0
@@ -1154,10 +1058,10 @@

          # Determine in which respective fields to push this relationship
          # for the subject (doc) and object (ref) of this triple.
-         if slot in inbound_document_reference_ranged_slots:
-             field_for_doc, field_for_ref = "_inbound", "_outbound"
-         elif slot in outbound_document_reference_ranged_slots:
-             field_for_doc, field_for_ref = "_outbound", "_inbound"
+         if slot in upstream_document_reference_ranged_slots:
+             field_for_doc, field_for_ref = "_upstream", "_downstream"
+         elif slot in downstream_document_reference_ranged_slots:
+             field_for_doc, field_for_ref = "_downstream", "_upstream"
          else:
              raise Failure(f"Unknown slot {slot} for document {doc_id}")

@@ -1204,14 +1108,6 @@

      context.log.info(f"Pushed {update_count} updates in total")

-     context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
-     temp_collection.create_index("_inbound.id")
-     temp_collection.create_index("_outbound.id")
-     # Create compound indexes to ensure index-covered queries
-     temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
-     temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
-     context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
-

  # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
  # pass an argument to the op (in order to specify the order of the ops in the graph)
@@ -1228,8 +1124,8 @@ def materialize_alldocs(context) -> int:
      2. Create a temporary collection to build the new alldocs collection.
      3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
      4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
-     5. Add special `_inbound` and `_outbound` fields with subdocuments containing ID and type of related entities.
-     6. Add indexes for `id`, relationship fields, and `{_inbound,_outbound}.type`/`.id` compound indexes.
+     5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+     6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
      7. Finally, atomically replace the existing `alldocs` collection with the temporary one.

      The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
1240
1136
  `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
1241
1137
  related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
1242
1138
 
1243
- The {`_inbound`,`_outbound`} fields enable efficient index-covered queries to find all entities of specific types
1139
+ The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
1244
1140
  that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
1245
1141
  expansions.
1246
1142
  """
@@ -1271,6 +1167,9 @@ def materialize_alldocs(context) -> int:
          )
      )

+     # FIXME rename to `document_reference_ranged_slots_by_type`
+     # FIXME key on CURIE, e.g. `nmdc:Study`
+     #  (here, not upstream in `cls_slot_map`/`document_referenceable_ranges`, b/c `schema_view` used directly in those)
      document_reference_ranged_slots = defaultdict(list)
      for cls_name, slot_map in cls_slot_map.items():
          for slot_name, slot in slot_map.items():
@@ -1310,12 +1209,12 @@ def materialize_alldocs(context) -> int:
          new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)

          new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-         # InsertOne is a method on the py-mongo Client class.
          # Get ancestors without the prefix, but add prefix to each one in the output
          ancestors = schema_view.class_ancestors(doc_type)
          new_doc["_type_and_ancestors"] = [
              "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
          ]
+         # InsertOne is a pymongo representation of a mongo command.
          write_operations.append(InsertOne(new_doc))
          if len(write_operations) == BULK_WRITE_BATCH_SIZE:
              _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
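
Note: a quick sketch of the prefixing rule added above (the ancestor class names here are placeholders, not necessarily the actual nmdc-schema hierarchy).

    ancestors = ["Biosample", "MaterialEntity", "NamedThing"]
    ["nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors]
    # -> ["nmdc:Biosample", "nmdc:MaterialEntity", "nmdc:NamedThing"]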
@@ -1339,19 +1238,28 @@ def materialize_alldocs(context) -> int:
1339
1238
  # so that `temp_alldocs_collection` will be "good to go" on renaming.
1340
1239
  temp_alldocs_collection.create_index("id", unique=True)
1341
1240
  # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
1241
+ # TODO add indexes on each of `set(document_reference_ranged_slots.values())`.
1342
1242
  slots_to_index = ["has_input", "has_output", "was_informed_by"]
1343
1243
  [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
1344
1244
  context.log.info(f"created indexes on id, {slots_to_index}.")
1345
1245
 
1346
1246
  # Add related-ids fields to enable efficient relationship traversal
1347
1247
  context.log.info("Adding fields for related ids to documents...")
1348
- _add_related_ids_to_alldocs(
1248
+ _add_linked_instances_to_alldocs(
1349
1249
  temp_alldocs_collection, context, document_reference_ranged_slots
1350
1250
  )
1251
+ context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
1252
+ temp_alldocs_collection.create_index("_upstream.id")
1253
+ temp_alldocs_collection.create_index("_downstream.id")
1254
+ # Create compound indexes to ensure index-covered queries
1255
+ temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
1256
+ temp_alldocs_collection.create_index(
1257
+ [("_downstream.type", 1), ("_downstream.id", 1)]
1258
+ )
1259
+ context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
1351
1260
 
1352
1261
  context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
1353
1262
  temp_alldocs_collection.rename("alldocs", dropTarget=True)
1354
-
1355
1263
  n_alldocs_documents = mdb.alldocs.estimated_document_count()
1356
1264
  context.log.info(
1357
1265
  f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."