nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/api/endpoints/jobs.py
@@ -0,0 +1,206 @@
+ from datetime import datetime, timezone
+ import json
+ import logging
+ from typing import Optional, Annotated
+
+ from pymongo.database import Database
+ from fastapi import APIRouter, Depends, Query, HTTPException, Path
+ from pymongo.errors import ConnectionFailure, OperationFailure
+ from starlette import status
+ from nmdc_runtime.api.core.util import (
+     raise404_if_none,
+ )
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.api.core.idgen import generate_one_id
+ from nmdc_runtime.api.endpoints.util import list_resources, _claim_job, strip_oid
+ from nmdc_runtime.api.models.job import Job, JobClaim, JobIn
+ from nmdc_runtime.api.models.metadata import Doc
+ from nmdc_runtime.api.models.operation import Operation, MetadataT
+ from nmdc_runtime.api.models.site import (
+     Site,
+     maybe_get_current_client_site,
+     get_current_client_site,
+ )
+ from nmdc_runtime.api.models.util import ListRequest, ListResponse, ResultT
+
+ router = APIRouter()
+
+
+ # Note: We use the generic `Doc` class—instead of the `Job` class—to describe the response
+ # because this endpoint (via `ListRequest`) supports projection, which can be used to omit
+ # fields from the response, even fields the `Job` class says are required.
+ @router.get(
+     "/jobs", response_model=ListResponse[Doc], response_model_exclude_unset=True
+ )
+ def list_jobs(
+     req: Annotated[ListRequest, Query()],
+     mdb: Database = Depends(get_mongo_db),
+     maybe_site: Optional[Site] = Depends(maybe_get_current_client_site),
+ ):
+     """List pre-configured workflow jobs.
+
+     If authenticated as a site client, `req.filter` defaults to fetch unclaimed jobs
+     that are claimable by the site client. This default can be overridden to view all jobs
+     by explicitly passing a `req.filter` of `{}`.
+     """
+     if isinstance(maybe_site, Site) and req.filter is None:
+         req.filter = json.dumps({"claims.site_id": {"$ne": maybe_site.id}})
+     rv = list_resources(req, mdb, "jobs")
+     rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+     return rv
+
+
+ @router.post(
+     "/jobs",
+     status_code=status.HTTP_201_CREATED,
+     response_model_exclude_unset=True,
+ )
+ def create_job(
+     job_in: JobIn,
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ) -> Job:
+     """
+     Create a workflow job.
+
+     A workflow job is a resource that decouples the configuration of a workflow from the execution of that workflow.
+
+     **Permissions:** This endpoint is only accessible to site clients.
+     """
+
+     _ = site  # must be authenticated
+
+     # Generate a unique ID for the job.
+     job_id = generate_one_id(mdb, "jobs")
+
+     # Generate a timestamp for the job's `created_at` field.
+     created_at = datetime.now(timezone.utc)
+
+     # Validate the request payload, combined with the generated ID and timestamp.
+     job_in_dict: dict = job_in.model_dump(exclude_unset=True)
+     try:
+         validated_job = Job(**job_in_dict, id=job_id, created_at=created_at)
+     except Exception as e:
+         error_message = f"Invalid job. Details: {str(e)}"
+         logging.warning(error_message)
+         raise HTTPException(
+             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+             detail=error_message,
+         )
+
+     # Insert the validated job into the database.
+     validated_job_dict: dict = validated_job.model_dump(exclude_unset=True)
+     try:
+         result = mdb.jobs.insert_one(validated_job_dict)
+         if not result.inserted_id:
+             raise Exception("Failed to insert job into database.")
+     except Exception as e:
+         logging.exception(e)
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail="Failed to create job.",
+         )
+
+     # Return the job that was created (i.e. inserted into the database).
+     return validated_job
+
+
+ @router.get("/jobs/{job_id}", response_model=Job, response_model_exclude_unset=True)
+ def get_job_info(
+     job_id: str,
+     mdb: Database = Depends(get_mongo_db),
+ ):
+     return raise404_if_none(mdb.jobs.find_one({"id": job_id}))
+
+
+ @router.post("/jobs/{job_id}:claim", response_model=Operation[ResultT, MetadataT])
+ def claim_job(
+     job_id: str,
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ):
+     return _claim_job(job_id, mdb, site)
+
+
+ @router.post("/jobs/{job_id}:release")
+ def release_job(
+     job_id: Annotated[
+         str,
+         Path(
+             title="Job ID",
+             description="The `id` of the job.\n\n_Example_: `nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6`",
+             examples=["nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6"],
+         ),
+     ],
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ) -> Optional[Job]:
+     r"""
+     Release the specified job.
+
+     Releasing a job cancels all the unfinished operations (of that job)
+     claimed by the `site` associated with the logged-in site client.
+
+     Return the updated job, reflecting that the aforementioned operations have been cancelled.
+     """
+     job = Job(**raise404_if_none(mdb.jobs.find_one({"id": job_id})))
+     active_job_claims_by_this_site = list(
+         mdb.operations.find(
+             {
+                 "metadata.job.id": job_id,
+                 "metadata.site_id": site.id,
+                 "done": False,
+             },
+             ["id"],
+         )
+     )
+     job_claims_by_this_site_post_release = [
+         JobClaim(op_id=claim["id"], site_id=site.id, done=True, cancelled=True)
+         for claim in active_job_claims_by_this_site
+     ]
+     job_claims_not_by_this_site = [
+         claim for claim in job.claims if (claim.site_id != site.id)
+     ]
+
+     # Execute MongoDB transaction to ensure atomic change of job document plus relevant set of operations documents.
+     def transactional_update(session):
+         mdb.operations.update_many(
+             {"id": {"$in": [claim["id"] for claim in active_job_claims_by_this_site]}},
+             {"$set": {"metadata.cancelled": True, "metadata.done": True}},
+             session=session,
+         )
+         job_claim_subdocuments_post_release = [
+             claim.model_dump(exclude_unset=True)
+             for claim in (
+                 job_claims_not_by_this_site + job_claims_by_this_site_post_release
+             )
+         ]
+         mdb.jobs.update_one(
+             {"id": job_id},
+             {"$set": {"claims": job_claim_subdocuments_post_release}},
+             session=session,
+         )
+
+     try:
+         with mdb.client.start_session() as session:
+             with session.start_transaction():
+                 transactional_update(session)
+     except (ConnectionFailure, OperationFailure) as e:
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Transaction failed: {e}",
+         )
+
+     # Return the updated `jobs` document.
+     #
+     # TODO: Consider retrieving the document within the transaction
+     #       to ensure it still exists.
+     #
+     updated_job = mdb.jobs.find_one({"id": job_id})
+     if updated_job is None:
+         # Note: We return `None` in this case because that's what the
+         #       endpoint originally did in this case, and we don't want
+         #       to introduce a breaking change as part of this refactor.
+         return None
+     else:
+         return Job(**updated_job)
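Taken together, the endpoints above define a claim/release lifecycle for site clients. The following sketch (not part of the diff) shows how a site client might drive that lifecycle over HTTP; the `API_BASE` URL and bearer token are assumptions, while the paths and response shapes come from the router definitions above.

```python
import requests

API_BASE = "https://api.example.org"  # assumption: your Runtime deployment
HEADERS = {"Authorization": "Bearer <site-client-token>"}  # assumption: token already obtained

# List jobs. An authenticated site client gets a default filter of
# jobs not yet claimed by that site (see `list_jobs` above).
jobs = requests.get(f"{API_BASE}/jobs", headers=HEADERS).json()["resources"]

if jobs:
    job_id = jobs[0]["id"]

    # Claim the job; the response is an Operation document the site can poll and update.
    operation = requests.post(f"{API_BASE}/jobs/{job_id}:claim", headers=HEADERS).json()

    # Release the job; unfinished operations claimed by this site are cancelled,
    # and the updated job document (or null) is returned.
    released = requests.post(f"{API_BASE}/jobs/{job_id}:release", headers=HEADERS).json()
```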
nmdc_runtime/api/endpoints/lib/helpers.py
@@ -0,0 +1,274 @@
+ import json
+ import bson.json_util
+ from typing import List
+
+ from pymongo.database import Database
+ from refscan.lib.Finder import Finder
+ from refscan.lib.helpers import derive_schema_class_name_from_document
+ from refscan.scanner import identify_referring_documents, scan_outgoing_references
+
+ from nmdc_runtime.api.models.lib.helpers import derive_update_specs
+ from nmdc_runtime.api.models.query import UpdateCommand, UpdateSpecs
+ from nmdc_runtime.util import get_allowed_references, nmdc_schema_view
+
+
+ def make_violation_message(
+     collection_name: str,
+     source_document_id: str,
+     source_field_name: str,
+     target_document_id: str,
+ ) -> str:
+     r"""
+     Constructs a violation message that indicates that a document would contain a broken reference.
+
+     :param collection_name: The name of the collection containing the document containing the broken reference
+     :param source_document_id: The `id` of the document containing the broken reference
+     :param source_field_name: The name of the field containing the broken reference
+     :param target_document_id: The `id` of the document that is being referenced
+
+     :return: A formatted string describing the violation
+     """
+     return (
+         f"The document having id='{source_document_id}' in "
+         f"the collection '{collection_name}' contains a "
+         f"reference (in its '{source_field_name}' field, "
+         f"referring to the document having id='{target_document_id}') "
+         f"which would be broken."
+     )
+
+
+ def simulate_updates_and_check_references(
+     db: Database, update_cmd: UpdateCommand
+ ) -> List[str]:
+     r"""
+     Checks whether, if the specified updates were performed on the specified database,
+     both of the following things would be true afterward:
+     1. (Regarding outgoing references): The updated documents do not contain any
+        broken references.
+     2. (Regarding incoming references): The documents that originally _referenced_
+        any of the updated documents do not contain any broken references.
+     This check is necessary because update operations can currently change `id`
+     and `type` values, which can affect what can legally reference those documents.
+
+     This function checks those things by performing the updates within a MongoDB
+     transaction, leaving the transaction in the _pending_ (i.e. not committed) state,
+     and then performing various checks on the database in that _pending_ state.
+
+     :param db: The database on which to simulate performing the updates
+     :param update_cmd: The command that specifies the updates
+
+     :return: List of violation messages. If the list is empty, it means that—if
+              the updates had been performed (instead of only simulated) here—they
+              would not have left behind any broken references.
+     """
+
+     # Initialize the list of violation messages that we will return.
+     violation_messages: List[str] = []
+
+     # Instantiate a `Finder` bound to the Mongo database. This will be
+     # used later, to identify and check inter-document references.
+     finder = Finder(database=db)
+
+     # Extract the collection name from the command.
+     collection_name = update_cmd.update
+
+     # Derive the update specifications from the command.
+     update_specs: UpdateSpecs = derive_update_specs(update_cmd)
+
+     # Get a reference to a `SchemaView` bound to the NMDC schema, so we can
+     # use it to, for example, map `type` field values to schema class names.
+     schema_view = nmdc_schema_view()
+
+     # Get some data structures that indicate which fields of which documents
+     # can legally contain references, according to the NMDC schema.
+     legal_references = get_allowed_references()
+     reference_field_names_by_source_class_name = (
+         legal_references.get_reference_field_names_by_source_class_name()
+     )
+
+     # Start a "throwaway" MongoDB transaction so we can simulate the updates.
+     with db.client.start_session() as session:
+         with session.start_transaction():
+
+             # Make a list of the `_id`, `id`, and `type` values of the documents that
+             # the user wants to update.
+             projection = {"_id": 1, "id": 1, "type": 1}
+             subject_document_summaries_pre_update = list(
+                 db[collection_name].find(
+                     filter={"$or": [spec["filter"] for spec in update_specs]},
+                     projection=projection,
+                     session=session,
+                 )
+             )
+
+             # Make a set of the `_id` values of the subject documents so that (later) we can
+             # check whether a given _referring_ document is also one of the _subject_
+             # documents (i.e. is among the documents the user wants to update).
+             subject_document_object_ids = set(
+                 tdd["_id"] for tdd in subject_document_summaries_pre_update
+             )
+
+             # Identify _all_ documents that reference any of the subject documents.
+             all_referring_document_descriptors_pre_update = []
+             for subject_document_summary in subject_document_summaries_pre_update:
+                 # If the document summary lacks the "id" field, we already know that no
+                 # documents reference it (since they would have to _use_ that "id" value to
+                 # do so); so, we abort this iteration and move on to the next subject document.
+                 if "id" not in subject_document_summary:
+                     continue
+
+                 referring_document_descriptors = identify_referring_documents(
+                     document=subject_document_summary,  # expects at least "id" and "type"
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,
+                 )
+                 all_referring_document_descriptors_pre_update.extend(
+                     referring_document_descriptors
+                 )
+
+             # Simulate the updates (i.e. apply them within the context of the transaction).
+             db.command(
+                 # Note: This expression was copied from the `_run_mdb_cmd` function in `queries.py`.
+                 # TODO: Document this expression (i.e. the Pydantic->JSON->BSON chain).
+                 bson.json_util.loads(
+                     json.dumps(update_cmd.model_dump(exclude_unset=True))
+                 ),
+                 session=session,
+             )
+             # For each referring document, check whether any of its outgoing references
+             # is broken (in the context of the transaction).
+             for descriptor in all_referring_document_descriptors_pre_update:
+                 referring_document_oid = descriptor["source_document_object_id"]
+                 referring_document_id = descriptor["source_document_id"]
+                 referring_collection_name = descriptor["source_collection_name"]
+                 # If the referring document is among the documents that the user wanted to
+                 # update, we skip it for now. We will check its outgoing references later
+                 # (i.e. when we check the outgoing references of _all_ updated documents).
+                 if referring_document_oid in subject_document_object_ids:
+                     continue
+                 # Get the referring document, so we can check its outgoing references.
+                 # Note: We project only the fields that can legally contain references,
+                 #       plus other fields involved in referential integrity checking.
+                 referring_document_reference_field_names = (
+                     reference_field_names_by_source_class_name[
+                         descriptor["source_class_name"]
+                     ]
+                 )
+                 projection = {
+                     field_name: 1
+                     for field_name in referring_document_reference_field_names
+                 } | {
+                     "_id": 1,
+                     "id": 1,
+                     "type": 1,
+                 }  # note: `|` unions the dicts
+                 referring_document = db[referring_collection_name].find_one(
+                     {"_id": referring_document_oid},
+                     projection=projection,
+                     session=session,
+                 )
+                 # Note: We assert that the referring document exists (to satisfy the type checker).
+                 assert (
+                     referring_document is not None
+                 ), "A referring document has vanished."
+                 violations = scan_outgoing_references(
+                     document=referring_document,
+                     source_collection_name=referring_collection_name,
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,  # so it uses the pending transaction's session
+                 )
+                 # For each violation (i.e. broken reference) that exists, add a violation message
+                 # to the list of violation messages.
+                 #
+                 # TODO: The violation might not involve a reference to one of the
+                 #       subject documents. The `scan_outgoing_references` function
+                 #       scans _all_ references emanating from the document.
+                 #
+                 for violation in violations:
+                     source_field_name = violation.source_field_name
+                     target_id = violation.target_id
+                     violation_messages.append(
+                         make_violation_message(
+                             collection_name=referring_collection_name,
+                             source_document_id=referring_document_id,
+                             source_field_name=source_field_name,
+                             target_document_id=target_id,
+                         )
+                     )
+
+             # For each updated document, check whether any of its outgoing references
+             # is broken (in the context of the transaction).
+             for subject_document_summary in subject_document_summaries_pre_update:
+                 subject_document_oid = subject_document_summary["_id"]
+                 subject_document_id = subject_document_summary["id"]
+                 subject_document_class_name = derive_schema_class_name_from_document(
+                     document=subject_document_summary,
+                     schema_view=schema_view,
+                 )
+                 assert (
+                     subject_document_class_name is not None
+                 ), "The updated document does not represent a valid schema class instance."
+                 subject_collection_name = (
+                     collection_name  # makes a disambiguating alias
+                 )
+                 # Get the updated document, so we can check its outgoing references.
+                 # Note: We project only the fields that can legally contain references,
+                 #       plus other fields involved in referential integrity checking.
+                 updated_document_reference_field_names = (
+                     reference_field_names_by_source_class_name[
+                         subject_document_class_name
+                     ]
+                 )
+                 projection = {
+                     field_name: 1
+                     for field_name in updated_document_reference_field_names
+                 } | {
+                     "_id": 1,
+                     "id": 1,
+                     "type": 1,
+                 }  # note: `|` unions the dicts
+                 updated_document = db[subject_collection_name].find_one(
+                     {"_id": subject_document_oid},
+                     projection=projection,
+                     session=session,
+                 )
+                 # Note: We assert that the updated document exists (to satisfy the type checker).
+                 assert updated_document is not None, "An updated document has vanished."
+                 violations = scan_outgoing_references(
+                     document=updated_document,
+                     source_collection_name=subject_collection_name,
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,  # so it uses the pending transaction's session
+                 )
+                 # For each violation (i.e. broken reference) that exists, add a violation message
+                 # to the list of violation messages.
+                 for violation in violations:
+                     source_field_name = violation.source_field_name
+                     target_id = violation.target_id
+                     violation_messages.append(
+                         make_violation_message(
+                             collection_name=subject_collection_name,
+                             source_document_id=subject_document_id,
+                             source_field_name=source_field_name,
+                             target_document_id=target_id,
+                         )
+                     )
+
+             # Whatever happens (i.e. whether there are violations or not), abort the transaction.
+             #
+             # Note: If an exception was raised within this `with` block, the transaction
+             #       will already have been aborted automatically (and execution will not
+             #       have reached this statement). On the other hand, if no exception
+             #       was raised, we explicitly abort the transaction so that the updates
+             #       that we "simulated" in this block do not get applied to the real database.
+             # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/client_session.html
+             #
+             session.abort_transaction()
+
+     return violation_messages
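For orientation, here is a minimal sketch of how a caller (such as the `/queries:run` handler) might use `simulate_updates_and_check_references` to vet an update before applying it. It assumes a MongoDB deployment that supports transactions (i.e. a replica set), and it assumes `UpdateCommand` mirrors MongoDB's raw `update` command shape (an `update` collection name plus a list of `updates` with `q`/`u` clauses), which is what the `db.command(...)` call above implies; the collection name and document `id`s are hypothetical.

```python
from pymongo import MongoClient

from nmdc_runtime.api.endpoints.lib.helpers import simulate_updates_and_check_references
from nmdc_runtime.api.models.query import UpdateCommand

# Assumption: a local replica set (multi-document transactions require one).
db = MongoClient("mongodb://localhost:27017")["nmdc"]

# Assumption: `UpdateCommand` serializes to a raw MongoDB `update` command document.
update_cmd = UpdateCommand(
    update="biosample_set",  # hypothetical collection
    updates=[
        # Changing an `id` is exactly the kind of edit that can break incoming references.
        {"q": {"id": "nmdc:bsm-11-abc123"}, "u": {"$set": {"id": "nmdc:bsm-11-def456"}}}
    ],
)

violation_messages = simulate_updates_and_check_references(db, update_cmd)
if violation_messages:
    # The update would leave broken references behind; reject it and report why.
    print("\n".join(violation_messages))
# Otherwise, the update is referentially safe to run for real.
```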
nmdc_runtime/api/endpoints/lib/linked_instances.py
@@ -0,0 +1,193 @@
+ """
+
+ This module houses logic for the `GET /nmdcschema/linked_instances` endpoint, defined as
+ `nmdc_runtime.api.endpoints.nmdcschema.linked_instances`, to avoid (further) bloating the
+ `nmdc_runtime.api.endpoints.nmdcschema` module.
+
+ """
+
+ from datetime import timedelta
+ from typing import Literal, Any
+
+ from bson import ObjectId
+ from pymongo.collection import Collection as MongoCollection
+ from pymongo.database import Database as MongoDatabase
+ from toolz import merge
+
+ from nmdc_runtime.api.core.util import hash_from_str, now
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.util import get_class_name_to_collection_names_map, nmdc_schema_view
+
+
+ def hash_from_ids_and_types(ids: list[str], types: list[str]) -> str:
+     """A quick hash as a function of `ids` and `types`.
+
+     This will serve as part of a temporary mongo collection name.
+     Because it will only be "part of" the name, avoiding hash collisions isn't a priority.
+
+     Returns a hex digest truncated to 8 characters, so 16**8 ≈ 4.3 billion possible values.
+     """
+     return hash_from_str(
+         ",".join(sorted(ids)) + "." + ",".join(sorted(types)), algo="md5"
+     )[:8]
+
+
+ def temp_linked_instances_collection_name(ids: list[str], types: list[str]) -> str:
+     """A name for a temporary mongo collection to store linked instances in service of an API request."""
+     return f"_runtime.tmp.linked_instances.{hash_from_ids_and_types(ids=ids, types=types)}.{ObjectId()}"
+
+
+ def drop_stale_temp_linked_instances_collections() -> None:
+     """Drop any temporary linked-instances collections that were generated earlier than one day ago."""
+     mdb = get_mongo_db()
+     one_day_ago = now() - timedelta(days=1)
+     for collection_name in mdb.list_collection_names(
+         filter={"name": {"$regex": r"^_runtime.tmp.linked_instances\..*"}}
+     ):
+         if ObjectId(collection_name.split(".")[-1]).generation_time < one_day_ago:
+             mdb.drop_collection(collection_name)
+
+
+ def gather_linked_instances(
+     alldocs_collection: MongoCollection,
+     ids: list[str],
+     types: list[str],
+ ) -> str:
+     """Collect linked instances and store them in a new temporary collection.
+
+     Run an aggregation pipeline over `alldocs_collection` that collects ∈`types` instances linked to `ids`.
+     The pipeline is run twice, once for each of the {"downstream", "upstream"} directions.
+     """
+     merge_into_collection_name = temp_linked_instances_collection_name(
+         ids=ids, types=types
+     )
+     for direction in ["downstream", "upstream"]:
+         _ = list(
+             alldocs_collection.aggregate(
+                 pipeline_for_direction(
+                     ids=ids,
+                     types=types,
+                     direction=direction,
+                     merge_into_collection_name=merge_into_collection_name,
+                 ),
+                 allowDiskUse=True,
+             )
+         )
+     return merge_into_collection_name
+
+
+ def pipeline_for_direction(
+     ids: list[str],
+     types: list[str],
+     direction: Literal["downstream", "upstream"],
+     merge_into_collection_name: str,
+     alldocs_collection_name: str = "alldocs",
+ ) -> list:
+     """A pure function that returns the aggregation pipeline for `direction`.
+
+     The pipeline
+     - collects ∈`types` instances linked to `ids` along `direction`,
+     - retains only those document fields essential to the caller, and
+     - ensures the collected instances are present, and properly updated if applicable, in a merge-target collection.
+     """
+     return pipeline_for_instances_linked_to_ids_by_direction(
+         ids=ids,
+         types=types,
+         direction=direction,
+         alldocs_collection_name=alldocs_collection_name,
+     ) + [
+         {"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}},
+         pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+             merge_into_collection_name=merge_into_collection_name, direction=direction
+         ),
+     ]
+
+
+ def pipeline_for_instances_linked_to_ids_by_direction(
+     ids: list[str],
+     types: list[str],
+     direction: Literal["downstream", "upstream"],
+     alldocs_collection_name: str = "alldocs",
+     slim: bool = True,
+ ) -> list[dict[str, Any]]:
+     """
+     Returns an aggregation pipeline that:
+     - traverses the graph of documents in the alldocs collection, following `direction`-specific relationships
+       to discover documents linked to the documents given by `ids`,
+     - `$unwind`s the collected (via `$graphLookup`) docs,
+     - filters them by the given `types` of interest,
+     - adds bookkeeping information about `direction`ality, and
+     - (optionally) projects only essential fields to reduce response latency and size.
+     """
+     return [
+         {"$match": {"id": {"$in": ids}}},
+         {
+             "$graphLookup": {
+                 "from": alldocs_collection_name,
+                 "startWith": f"$_{direction}.id",
+                 "connectFromField": f"_{direction}.id",
+                 "connectToField": "id",
+                 "as": f"{direction}_docs",
+             }
+         },
+         {"$unwind": {"path": f"${direction}_docs"}},
+         {"$match": {f"{direction}_docs._type_and_ancestors": {"$in": types}}},
+         {"$addFields": {f"{direction}_docs._{direction}_of": ["$id"]}},
+         {"$replaceRoot": {"newRoot": f"${direction}_docs"}},
+     ] + ([{"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}}] if slim else [])
+
+
+ def pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+     merge_into_collection_name: str,
+     direction: Literal["downstream", "upstream"],
+ ) -> dict[str, Any]:
+     """
+     Returns an aggregation-pipeline stage that merges its input document stream into a collection dedicated to
+     serving the caller in a manner amenable to pagination across multiple HTTP requests.
+     """
+     return {
+         "$merge": {
+             "into": merge_into_collection_name,
+             "on": "_id",
+             "whenMatched": [
+                 {
+                     "$set": {
+                         f"_{direction}_of": {
+                             "$setUnion": [
+                                 f"$_{direction}_of",
+                                 f"$$new._{direction}_of",
+                             ]
+                         }
+                     }
+                 }
+             ],
+             "whenNotMatched": "insert",
+         }
+     }
+
+
+ def hydrated(resources: list[dict], mdb: MongoDatabase) -> list[dict]:
+     """Replace each `dict` in `resources` with a hydrated version.
+
+     Instead of returning the retrieved "full" documents as is, we merge each one with (a copy of) the corresponding
+     original document in `resources`, which includes additional fields, e.g. `_upstream_of` and `_downstream_of`.
+     """
+     class_name_to_collection_names_map = get_class_name_to_collection_names_map(
+         nmdc_schema_view()
+     )
+     types_of_resources = {r["type"] for r in resources}
+     full_docs_by_id = {}
+
+     for type in types_of_resources:
+         resource_ids_of_type = [d["id"] for d in resources if d["type"] == type]
+         schema_collection = mdb.get_collection(
+             # Note: We are assuming that documents of a given type are only allowed (by the schema) to reside in one
+             #       collection. Based on that assumption, we will query only the _first_ collection whose name we get
+             #       from the map. This assumption is continuously verified prior to code deployment via
+             #       `test_get_class_name_to_collection_names_map_has_one_and_only_one_collection_name_per_class_name`.
+             class_name_to_collection_names_map[type.removeprefix("nmdc:")][0]
+         )
+         for doc in schema_collection.find({"id": {"$in": resource_ids_of_type}}):
+             full_docs_by_id[doc["id"]] = doc
+
+     return [merge(r, full_docs_by_id[r["id"]]) for r in resources]
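The intended call sequence, as a sketch under stated assumptions: `gather_linked_instances` materializes slim `{id, type, _<direction>_of}` documents into a temporary collection, which the endpoint can then page through and pass to `hydrated` to obtain full documents. The connection string and the study `id` below are hypothetical.

```python
from pymongo import MongoClient

from nmdc_runtime.api.endpoints.lib.linked_instances import (
    drop_stale_temp_linked_instances_collections,
    gather_linked_instances,
    hydrated,
)

mdb = MongoClient("mongodb://localhost:27017")["nmdc"]  # assumption: the Runtime's database

# Collect biosamples and data objects linked (upstream or downstream) to a study.
tmp_collection_name = gather_linked_instances(
    alldocs_collection=mdb["alldocs"],
    ids=["nmdc:sty-11-abc123"],  # hypothetical study `id`
    types=["nmdc:Biosample", "nmdc:DataObject"],
)

# Page through the slim documents, then hydrate the page into full schema documents.
page = list(mdb[tmp_collection_name].find().sort("_id", 1).limit(20))
full_docs = hydrated(page, mdb)

# Housekeeping: temp collections older than one day are dropped by this call.
drop_stale_temp_linked_instances_collections()
```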