nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry.

This version of nmdc-runtime has been flagged as potentially problematic.

Files changed (131)
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0

nmdc_runtime/api/endpoints/jobs.py
@@ -0,0 +1,143 @@
+ import json
+ from typing import Optional, Annotated
+
+ from pymongo.database import Database
+ from fastapi import APIRouter, Depends, Query, HTTPException, Path
+ from pymongo.errors import ConnectionFailure, OperationFailure
+ from starlette import status
+
+ from nmdc_runtime.api.core.util import (
+     raise404_if_none,
+ )
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.api.endpoints.util import list_resources, _claim_job
+ from nmdc_runtime.api.models.job import Job, JobClaim
+ from nmdc_runtime.api.models.operation import Operation, MetadataT
+ from nmdc_runtime.api.models.site import (
+     Site,
+     maybe_get_current_client_site,
+     get_current_client_site,
+ )
+ from nmdc_runtime.api.models.util import ListRequest, ListResponse, ResultT
+
+ router = APIRouter()
+
+
+ @router.get(
+     "/jobs", response_model=ListResponse[Job], response_model_exclude_unset=True
+ )
+ def list_jobs(
+     req: Annotated[ListRequest, Query()],
+     mdb: Database = Depends(get_mongo_db),
+     maybe_site: Optional[Site] = Depends(maybe_get_current_client_site),
+ ):
+     """List pre-configured workflow jobs.
+
+     If authenticated as a site client, `req.filter` defaults to fetch unclaimed jobs
+     that are claimable by the site client. This default can be overridden to view all jobs
+     by explicitly passing a `req.filter` of `{}`.
+     """
+     if isinstance(maybe_site, Site) and req.filter is None:
+         req.filter = json.dumps({"claims.site_id": {"$ne": maybe_site.id}})
+     return list_resources(req, mdb, "jobs")
+
+
+ @router.get("/jobs/{job_id}", response_model=Job, response_model_exclude_unset=True)
+ def get_job_info(
+     job_id: str,
+     mdb: Database = Depends(get_mongo_db),
+ ):
+     return raise404_if_none(mdb.jobs.find_one({"id": job_id}))
+
+
+ @router.post("/jobs/{job_id}:claim", response_model=Operation[ResultT, MetadataT])
+ def claim_job(
+     job_id: str,
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ):
+     return _claim_job(job_id, mdb, site)
+
+
+ @router.post("/jobs/{job_id}:release")
+ def release_job(
+     job_id: Annotated[
+         str,
+         Path(
+             title="Job ID",
+             description="The `id` of the job.\n\n_Example_: `nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6`",
+             examples=["nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6"],
+         ),
+     ],
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ) -> Optional[Job]:
+     r"""
+     Release the specified job.
+
+     Releasing a job cancels all the unfinished operations (of that job)
+     claimed by the `site` associated with the logged-in site client.
+
+     Return the updated job, reflecting that the aforementioned operations have been cancelled.
+     """
+     job = Job(**raise404_if_none(mdb.jobs.find_one({"id": job_id})))
+     active_job_claims_by_this_site = list(
+         mdb.operations.find(
+             {
+                 "metadata.job.id": job_id,
+                 "metadata.site_id": site.id,
+                 "done": False,
+             },
+             ["id"],
+         )
+     )
+     job_claims_by_this_site_post_release = [
+         JobClaim(op_id=claim["id"], site_id=site.id, done=True, cancelled=True)
+         for claim in active_job_claims_by_this_site
+     ]
+     job_claims_not_by_this_site = [
+         claim for claim in job.claims if (claim.site_id != site.id)
+     ]
+
+     # Execute MongoDB transaction to ensure atomic change of job document plus relevant set of operations documents.
+     def transactional_update(session):
+         mdb.operations.update_many(
+             {"id": {"$in": [claim["id"] for claim in active_job_claims_by_this_site]}},
+             {"$set": {"metadata.cancelled": True, "metadata.done": True}},
+             session=session,
+         )
+         job_claim_subdocuments_post_release = [
+             claim.model_dump(exclude_unset=True)
+             for claim in (
+                 job_claims_not_by_this_site + job_claims_by_this_site_post_release
+             )
+         ]
+         mdb.jobs.update_one(
+             {"id": job_id},
+             {"$set": {"claims": job_claim_subdocuments_post_release}},
+             session=session,
+         )
+
+     try:
+         with mdb.client.start_session() as session:
+             with session.start_transaction():
+                 transactional_update(session)
+     except (ConnectionFailure, OperationFailure) as e:
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Transaction failed: {e}",
+         )
+
+     # Return the updated `jobs` document.
+     #
+     # TODO: Consider retrieving the document within the transaction
+     #       to ensure it still exists.
+     #
+     updated_job = mdb.jobs.find_one({"id": job_id})
+     if updated_job is None:
+         # Note: We return `None` in this case because that's what the
+         #       endpoint originally did in this case, and we don't want
+         #       to introduce a breaking change as part of this refactor.
+         return None
+     else:
+         return Job(**updated_job)
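
For orientation, here is a minimal sketch of how a site client might drive these endpoints over HTTP. It is not part of the release: the base URL, token, and job `id` are placeholders, and the `resources` key is assumed to follow the `ListResponse` model referenced above.

import requests

BASE_URL = "https://runtime.example.org"  # placeholder deployment URL
headers = {"Authorization": "Bearer <site-client-token>"}  # placeholder token

# List jobs. With site-client auth and no explicit filter, `list_jobs`
# defaults to jobs this site has not yet claimed (see above).
jobs = requests.get(f"{BASE_URL}/jobs", headers=headers).json()["resources"]
job_id = jobs[0]["id"]

# Claim the job; the response is an Operation document for the claim.
operation = requests.post(f"{BASE_URL}/jobs/{job_id}:claim", headers=headers).json()

# Later, release the job; unfinished operations claimed by this site are
# cancelled, and the updated job document is returned.
updated_job = requests.post(f"{BASE_URL}/jobs/{job_id}:release", headers=headers).json()
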
nmdc_runtime/api/endpoints/lib/helpers.py
@@ -0,0 +1,274 @@
+ import json
+ import bson.json_util
+ from typing import List
+
+ from pymongo.database import Database
+ from refscan.lib.Finder import Finder
+ from refscan.lib.helpers import derive_schema_class_name_from_document
+ from refscan.scanner import identify_referring_documents, scan_outgoing_references
+
+ from nmdc_runtime.api.models.lib.helpers import derive_update_specs
+ from nmdc_runtime.api.models.query import UpdateCommand, UpdateSpecs
+ from nmdc_runtime.util import get_allowed_references, nmdc_schema_view
+
+
+ def make_violation_message(
+     collection_name: str,
+     source_document_id: str,
+     source_field_name: str,
+     target_document_id: str,
+ ) -> str:
+     r"""
+     Constructs a violation message that indicates that a document would contain a broken reference.
+
+     :param collection_name: The name of the collection containing the document containing the broken reference
+     :param source_document_id: The `id` of the document containing the broken reference
+     :param source_field_name: The name of the field containing the broken reference
+     :param target_document_id: The `id` of the document that is being referenced
+
+     :return: A formatted string describing the violation
+     """
+     return (
+         f"The document having id='{source_document_id}' in "
+         f"the collection '{collection_name}' contains a "
+         f"reference (in its '{source_field_name}' field, "
+         f"referring to the document having id='{target_document_id}') "
+         f"which would be broken."
+     )
+
+
+ def simulate_updates_and_check_references(
+     db: Database, update_cmd: UpdateCommand
+ ) -> List[str]:
+     r"""
+     Checks whether, if the specified updates were performed on the specified database,
+     both of the following things would be true afterward:
+     1. (Regarding outgoing references): The updated documents do not contain any
+        broken references.
+     2. (Regarding incoming references): The documents that originally _referenced_
+        any of the updated documents do not contain any broken references.
+        This check is necessary because update operations can currently change `id`
+        and `type` values, which can affect what can legally reference those documents.
+
+     This function checks those things by performing the updates within a MongoDB
+     transaction, leaving the transaction in the _pending_ (i.e. not committed) state,
+     and then performing various checks on the database in that _pending_ state.
+
+     :param db: The database on which to simulate performing the updates
+     :param update_cmd: The command that specifies the updates
+
+     :return: List of violation messages. If the list is empty, it means that—if
+              the updates had been performed (instead of only simulated) here—they
+              would not have left behind any broken references.
+     """
+
+     # Initialize the list of violation messages that we will return.
+     violation_messages: List[str] = []
+
+     # Instantiate a `Finder` bound to the Mongo database. This will be
+     # used later, to identify and check inter-document references.
+     finder = Finder(database=db)
+
+     # Extract the collection name from the command.
+     collection_name = update_cmd.update
+
+     # Derive the update specifications from the command.
+     update_specs: UpdateSpecs = derive_update_specs(update_cmd)
+
+     # Get a reference to a `SchemaView` bound to the NMDC schema, so we can
+     # use it to, for example, map `type` field values to schema class names.
+     schema_view = nmdc_schema_view()
+
+     # Get some data structures that indicate which fields of which documents
+     # can legally contain references, according to the NMDC schema.
+     legal_references = get_allowed_references()
+     reference_field_names_by_source_class_name = (
+         legal_references.get_reference_field_names_by_source_class_name()
+     )
+
+     # Start a "throwaway" MongoDB transaction so we can simulate the updates.
+     with db.client.start_session() as session:
+         with session.start_transaction():
+
+             # Make a list of the `_id`, `id`, and `type` values of the documents that
+             # the user wants to update.
+             projection = {"_id": 1, "id": 1, "type": 1}
+             subject_document_summaries_pre_update = list(
+                 db[collection_name].find(
+                     filter={"$or": [spec["filter"] for spec in update_specs]},
+                     projection=projection,
+                     session=session,
+                 )
+             )
+
+             # Make a set of the `_id` values of the subject documents so that (later) we can
+             # check whether a given _referring_ document is also one of the _subject_
+             # documents (i.e. is among the documents the user wants to update).
+             subject_document_object_ids = set(
+                 tdd["_id"] for tdd in subject_document_summaries_pre_update
+             )
+
+             # Identify _all_ documents that reference any of the subject documents.
+             all_referring_document_descriptors_pre_update = []
+             for subject_document_summary in subject_document_summaries_pre_update:
+                 # If the document summary lacks the "id" field, we already know that no
+                 # documents reference it (since they would have to _use_ that "id" value to
+                 # do so); so, we abort this iteration and move on to the next subject document.
+                 if "id" not in subject_document_summary:
+                     continue
+
+                 referring_document_descriptors = identify_referring_documents(
+                     document=subject_document_summary,  # expects at least "id" and "type"
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,
+                 )
+                 all_referring_document_descriptors_pre_update.extend(
+                     referring_document_descriptors
+                 )
+
+             # Simulate the updates (i.e. apply them within the context of the transaction).
+             db.command(
+                 # Note: This expression was copied from the `_run_mdb_cmd` function in `queries.py`.
+                 # TODO: Document this expression (i.e. the Pydantic->JSON->BSON chain).
+                 bson.json_util.loads(
+                     json.dumps(update_cmd.model_dump(exclude_unset=True))
+                 ),
+                 session=session,
+             )
+             # For each referring document, check whether any of its outgoing references
+             # is broken (in the context of the transaction).
+             for descriptor in all_referring_document_descriptors_pre_update:
+                 referring_document_oid = descriptor["source_document_object_id"]
+                 referring_document_id = descriptor["source_document_id"]
+                 referring_collection_name = descriptor["source_collection_name"]
+                 # If the referring document is among the documents that the user wanted to
+                 # update, we skip it for now. We will check its outgoing references later
+                 # (i.e. when we check the outgoing references of _all_ updated documents).
+                 if referring_document_oid in subject_document_object_ids:
+                     continue
+                 # Get the referring document, so we can check its outgoing references.
+                 # Note: We project only the fields that can legally contain references,
+                 #       plus other fields involved in referential integrity checking.
+                 referring_document_reference_field_names = (
+                     reference_field_names_by_source_class_name[
+                         descriptor["source_class_name"]
+                     ]
+                 )
+                 projection = {
+                     field_name: 1
+                     for field_name in referring_document_reference_field_names
+                 } | {
+                     "_id": 1,
+                     "id": 1,
+                     "type": 1,
+                 }  # note: `|` unions the dicts
+                 referring_document = db[referring_collection_name].find_one(
+                     {"_id": referring_document_oid},
+                     projection=projection,
+                     session=session,
+                 )
+                 # Note: We assert that the referring document exists (to satisfy the type checker).
+                 assert (
+                     referring_document is not None
+                 ), "A referring document has vanished."
+                 violations = scan_outgoing_references(
+                     document=referring_document,
+                     source_collection_name=referring_collection_name,
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,  # so it uses the pending transaction's session
+                 )
+                 # For each violation (i.e. broken reference) that exists, add a violation message
+                 # to the list of violation messages.
+                 #
+                 # TODO: The violation might not involve a reference to one of the
+                 #       subject documents. The `scan_outgoing_references` function
+                 #       scans _all_ references emanating from the document.
+                 #
+                 for violation in violations:
+                     source_field_name = violation.source_field_name
+                     target_id = violation.target_id
+                     violation_messages.append(
+                         make_violation_message(
+                             collection_name=referring_collection_name,
+                             source_document_id=referring_document_id,
+                             source_field_name=source_field_name,
+                             target_document_id=target_id,
+                         )
+                     )
+
+             # For each updated document, check whether any of its outgoing references
+             # is broken (in the context of the transaction).
+             for subject_document_summary in subject_document_summaries_pre_update:
+                 subject_document_oid = subject_document_summary["_id"]
+                 subject_document_id = subject_document_summary["id"]
+                 subject_document_class_name = derive_schema_class_name_from_document(
+                     document=subject_document_summary,
+                     schema_view=schema_view,
+                 )
+                 assert (
+                     subject_document_class_name is not None
+                 ), "The updated document does not represent a valid schema class instance."
+                 subject_collection_name = (
+                     collection_name  # makes a disambiguating alias
+                 )
+                 # Get the updated document, so we can check its outgoing references.
+                 # Note: We project only the fields that can legally contain references,
+                 #       plus other fields involved in referential integrity checking.
+                 updated_document_reference_field_names = (
+                     reference_field_names_by_source_class_name[
+                         subject_document_class_name
+                     ]
+                 )
+                 projection = {
+                     field_name: 1
+                     for field_name in updated_document_reference_field_names
+                 } | {
+                     "_id": 1,
+                     "id": 1,
+                     "type": 1,
+                 }  # note: `|` unions the dicts
+                 updated_document = db[subject_collection_name].find_one(
+                     {"_id": subject_document_oid},
+                     projection=projection,
+                     session=session,
+                 )
+                 # Note: We assert that the updated document exists (to satisfy the type checker).
+                 assert updated_document is not None, "An updated document has vanished."
+                 violations = scan_outgoing_references(
+                     document=updated_document,
+                     source_collection_name=subject_collection_name,
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,  # so it uses the pending transaction's session
+                 )
+                 # For each violation (i.e. broken reference) that exists, add a violation message
+                 # to the list of violation messages.
+                 for violation in violations:
+                     source_field_name = violation.source_field_name
+                     target_id = violation.target_id
+                     violation_messages.append(
+                         make_violation_message(
+                             collection_name=subject_collection_name,
+                             source_document_id=subject_document_id,
+                             source_field_name=source_field_name,
+                             target_document_id=target_id,
+                         )
+                     )
+
+             # Whatever happens (i.e. whether there are violations or not), abort the transaction.
+             #
+             # Note: If an exception was raised within this `with` block, the transaction
+             #       will already have been aborted automatically (and execution will not
+             #       have reached this statement). On the other hand, if no exception
+             #       was raised, we explicitly abort the transaction so that the updates
+             #       that we "simulated" in this block do not get applied to the real database.
+             #       Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/client_session.html
+             #
+             session.abort_transaction()
+
+     return violation_messages
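
The load-bearing idea in `simulate_updates_and_check_references` is the deliberately aborted transaction: writes are applied, inspected through the same session, and then rolled back. Here is a minimal standalone sketch of that pattern with plain PyMongo; the connection string, database, and field names are illustrative only, and a replica set is assumed (MongoDB transactions require one).

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/?replicaSet=rs0")  # illustrative
db = client["demo"]

with client.start_session() as session:
    with session.start_transaction():
        # 1. Apply the candidate write inside the transaction.
        db.things.update_one(
            {"id": "thing-1"}, {"$set": {"ref_id": "thing-999"}}, session=session
        )
        # 2. Reads that pass `session=...` see the uncommitted write;
        #    other clients do not.
        pending = db.things.find_one({"id": "thing-1"}, session=session)
        if db.things.count_documents({"id": pending["ref_id"]}, session=session) == 0:
            print("This update would leave a dangling reference.")
        # 3. Abort unconditionally, leaving the database untouched. The
        #    enclosing context manager sees the transaction is no longer
        #    active and does not commit.
        session.abort_transaction()
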
nmdc_runtime/api/endpoints/lib/linked_instances.py
@@ -0,0 +1,180 @@
+ """
+ This module houses logic for the `GET /nmdcschema/linked_instances` endpoint, defined as
+ `nmdc_runtime.api.endpoints.nmdcschema.linked_instances`, to avoid (further) bloating the
+ `nmdc_runtime.api.endpoints.nmdcschema` module.
+ """
+
+ from typing import Literal, Any
+
+ from bson import ObjectId
+ from pymongo.collection import Collection as MongoCollection
+ from pymongo.database import Database as MongoDatabase
+ from toolz import merge
+
+ from nmdc_runtime.api.core.util import hash_from_str
+ from nmdc_runtime.util import get_class_name_to_collection_names_map, nmdc_schema_view
+
+
+ def hash_from_ids_and_types(ids: list[str], types: list[str]) -> str:
+     """A quick hash as a function of `ids` and `types`.
+
+     This will serve as part of a temporary mongo collection name.
+     Because it will only be "part of" the name, avoiding hash collisions isn't a priority.
+
+     Returns a hex digest truncated to 8 characters, so 16**8 ≈ 4 billion possible values.
+     """
+     return hash_from_str(
+         ",".join(sorted(ids)) + "." + ",".join(sorted(types)), algo="md5"
+     )[:8]
+
+
+ def temp_linked_instances_collection_name(ids: list[str], types: list[str]) -> str:
+     """A name for a temporary mongo collection to store linked instances in service of an API request."""
+     return f"_runtime.tmp.linked_instances.{hash_from_ids_and_types(ids=ids, types=types)}.{ObjectId()}"
+
+
+ def gather_linked_instances(
+     alldocs_collection: MongoCollection,
+     ids: list[str],
+     types: list[str],
+ ) -> str:
+     """Collects linked instances and stores them in a new temporary collection.
+
+     Runs an aggregation pipeline over `alldocs_collection` that collects ∈`types` instances linked to `ids`.
+     The pipeline is run twice, once for each of the {"downstream", "upstream"} directions.
+     """
+     merge_into_collection_name = temp_linked_instances_collection_name(
+         ids=ids, types=types
+     )
+     for direction in ["downstream", "upstream"]:
+         _ = list(
+             alldocs_collection.aggregate(
+                 pipeline_for_direction(
+                     ids=ids,
+                     types=types,
+                     direction=direction,
+                     merge_into_collection_name=merge_into_collection_name,
+                 ),
+                 allowDiskUse=True,
+             )
+         )
+     return merge_into_collection_name
+
+
+ def pipeline_for_direction(
+     ids: list[str],
+     types: list[str],
+     direction: Literal["downstream", "upstream"],
+     merge_into_collection_name: str,
+     alldocs_collection_name: str = "alldocs",
+ ) -> list:
+     """A pure function that returns the aggregation pipeline for `direction`.
+
+     The pipeline
+     - collects ∈`types` instances linked to `ids` along `direction`,
+     - retains only those document fields essential to the caller, and
+     - ensures the collected instances are present, and properly updated if applicable, in a merge-target collection.
+     """
+     return pipeline_for_instances_linked_to_ids_by_direction(
+         ids=ids,
+         types=types,
+         direction=direction,
+         alldocs_collection_name=alldocs_collection_name,
+     ) + [
+         {"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}},
+         pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+             merge_into_collection_name=merge_into_collection_name, direction=direction
+         ),
+     ]
+
+
+ def pipeline_for_instances_linked_to_ids_by_direction(
+     ids: list[str],
+     types: list[str],
+     direction: Literal["downstream", "upstream"],
+     alldocs_collection_name: str = "alldocs",
+     slim: bool = True,
+ ) -> list[dict[str, Any]]:
+     """
+     Returns an aggregation pipeline that:
+     - traverses the graph of documents in the alldocs collection, following `direction`-specific relationships
+       to discover documents linked to the documents given by `ids`,
+     - `$unwind`s the collected (via `$graphLookup`) docs,
+     - filters them by the given `types` of interest,
+     - adds bookkeeping information about `direction`ality, and
+     - (optionally) projects only essential fields to reduce response latency and size.
+     """
+     return [
+         {"$match": {"id": {"$in": ids}}},
+         {
+             "$graphLookup": {
+                 "from": alldocs_collection_name,
+                 "startWith": f"$_{direction}.id",
+                 "connectFromField": f"_{direction}.id",
+                 "connectToField": "id",
+                 "as": f"{direction}_docs",
+             }
+         },
+         {"$unwind": {"path": f"${direction}_docs"}},
+         {"$match": {f"{direction}_docs._type_and_ancestors": {"$in": types}}},
+         {"$addFields": {f"{direction}_docs._{direction}_of": ["$id"]}},
+         {"$replaceRoot": {"newRoot": f"${direction}_docs"}},
+     ] + ([{"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}}] if slim else [])
+
+
+ def pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+     merge_into_collection_name: str,
+     direction: Literal["downstream", "upstream"],
+ ) -> dict[str, Any]:
+     """
+     Returns an aggregation-pipeline stage that merges its input document stream into a collection dedicated to
+     serving the caller in a manner amenable to pagination across multiple HTTP requests.
+     """
+     return {
+         "$merge": {
+             "into": merge_into_collection_name,
+             "on": "_id",
+             "whenMatched": [
+                 {
+                     "$set": {
+                         f"_{direction}_of": {
+                             "$setUnion": [
+                                 f"$_{direction}_of",
+                                 f"$$new._{direction}_of",
+                             ]
+                         }
+                     }
+                 }
+             ],
+             "whenNotMatched": "insert",
+         }
+     }
+
+
+ def hydrated(resources: list[dict], mdb: MongoDatabase) -> list[dict]:
+     """Replaces each `dict` in `resources` with a hydrated version.
+
+     Instead of returning the retrieved "full" documents as is, we merge each one with (a copy of) the corresponding
+     original document in `resources`, which includes additional fields, e.g. `_upstream_of` and `_downstream_of`.
+     """
+     class_name_to_collection_names_map = get_class_name_to_collection_names_map(
+         nmdc_schema_view()
+     )
+     types_of_resources = {r["type"] for r in resources}
+     full_docs_by_id = {}
+
+     for type in types_of_resources:
+         resource_ids_of_type = [d["id"] for d in resources if d["type"] == type]
+         schema_collection = mdb.get_collection(
+             # Note: We are assuming that documents of a given type are only allowed (by the schema) to reside in
+             #       one collection. Based on that assumption, we will query only the _first_ collection whose name
+             #       we get from the map. This assumption is continuously verified prior to code deployment via
+             #       `test_get_class_name_to_collection_names_map_has_one_and_only_one_collection_name_per_class_name`.
+             class_name_to_collection_names_map[type.removeprefix("nmdc:")][0]
+         )
+         for doc in schema_collection.find({"id": {"$in": resource_ids_of_type}}):
+             full_docs_by_id[doc["id"]] = doc
+
+     return [merge(r, full_docs_by_id[r["id"]]) for r in resources]
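
Putting the module together: a sketch of how a caller (in the Runtime, the `GET /nmdcschema/linked_instances` handler) might use these helpers. The Mongo URI, ids, and types below are placeholders, and an up-to-date `alldocs` collection is assumed.

from pymongo import MongoClient

from nmdc_runtime.api.endpoints.lib.linked_instances import (
    gather_linked_instances,
    hydrated,
)

mdb = MongoClient("mongodb://localhost:27017")["nmdc"]  # placeholder URI/db

ids = ["nmdc:sty-11-abc123"]  # placeholder study id
types = ["nmdc:Biosample", "nmdc:DataObject"]  # classes of interest

# Traverse `alldocs` in both directions, $merge-ing slim documents
# ({_id, id, type, _<direction>_of}) into a temporary collection.
tmp_name = gather_linked_instances(mdb.alldocs, ids=ids, types=types)

# Page through the temporary collection, then swap each slim document
# for its full counterpart from the collection the schema assigns it to.
page = list(mdb[tmp_name].find(limit=25))
full_docs = hydrated(page, mdb)

# The temporary collection is request-scoped; drop it when finished.
mdb.drop_collection(tmp_name)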