nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (100)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +55 -4
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +33 -28
  76. nmdc_runtime/site/ops.py +97 -237
  77. nmdc_runtime/site/repair/database_updater.py +8 -0
  78. nmdc_runtime/site/repository.py +7 -117
  79. nmdc_runtime/site/resources.py +4 -4
  80. nmdc_runtime/site/translation/gold_translator.py +22 -21
  81. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  82. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  83. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  84. nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
  85. nmdc_runtime/site/translation/translator.py +63 -1
  86. nmdc_runtime/site/util.py +8 -3
  87. nmdc_runtime/site/validation/util.py +10 -5
  88. nmdc_runtime/util.py +9 -321
  89. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  90. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  91. nmdc_runtime/site/translation/emsl.py +0 -43
  92. nmdc_runtime/site/translation/gold.py +0 -53
  93. nmdc_runtime/site/translation/jgi.py +0 -32
  94. nmdc_runtime/site/translation/util.py +0 -132
  95. nmdc_runtime/site/validation/jgi.py +0 -43
  96. nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
  97. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  98. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  99. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  100. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,192 @@
1
+ """
2
+ `router` here is deprecated in favor of `nmdc_runtime.minter.entrypoints.fastapi_app.router`
3
+ """
4
+
5
+ import re
6
+ from typing import List, Dict, Any
7
+
8
+ from fastapi import APIRouter, Depends, HTTPException
9
+ from pydantic import ValidationError
10
+ from pymongo.database import Database as MongoDatabase
11
+ from starlette import status
12
+ from toolz import dissoc
13
+
14
+ from nmdc_runtime.api.core.idgen import (
15
+ generate_ids,
16
+ decode_id,
17
+ collection_name,
18
+ )
19
+ from nmdc_runtime.api.core.util import raise404_if_none, pick
20
+ from nmdc_runtime.api.db.mongo import get_mongo_db
21
+ from nmdc_runtime.api.models.id import (
22
+ MintRequest,
23
+ pattern_shoulder,
24
+ AssignedBaseName,
25
+ pattern_assigned_base_name,
26
+ IdBindingRequest,
27
+ pattern_base_object_name,
28
+ IdThreeParts,
29
+ pattern_naa,
30
+ )
31
+ from nmdc_runtime.api.models.site import get_current_client_site, Site
32
+
33
+ router = APIRouter()
34
+
35
+
36
@router.post("/ids/mint", response_model=List[str])
def mint_ids(
    mint_req: MintRequest,
    mdb: MongoDatabase = Depends(get_mongo_db),
    site: Site = Depends(get_current_client_site),
):
    """Generate one or more identifiers.

    Leaving `populator` blank will set it to the site ID of the request client.
    """
    # The authenticated site is always recorded as the owner; it also acts as
    # the populator unless the request supplies a (truthy) populator itself.
    owner_id = site.id
    populator_id = mint_req.populator if mint_req.populator else owner_id

    return generate_ids(
        mdb,
        owner=owner_id,
        populator=populator_id,
        number=mint_req.number,
        naa=mint_req.naa,
        shoulder=mint_req.shoulder,
    )
55
+
56
+
57
@router.post("/ids/bindings", response_model=List[Dict[str, Any]])
def set_id_bindings(
    binding_requests: List[IdBindingRequest],
    mdb: MongoDatabase = Depends(get_mongo_db),
    site: Site = Depends(get_current_client_site),
):
    # Apply a batch of binding operations ("purge", "rm", "set", "addToSet")
    # to identifiers managed by the authenticated site client.
    #
    # NOTE: Documented with `#` comments rather than a docstring, because
    #       FastAPI publishes an endpoint's docstring as its OpenAPI
    #       description.
    #
    # Responses:
    #   400 - an identifier does not match `pattern_base_object_name`, or the
    #         operation is not recognised.
    #   403 - an identifier's `__ao` value differs from the client's site id,
    #         or an attribute name begins with the reserved prefix '__a'.
    #   404 - an identifier has no stored document.
    #
    # Returns one document per request (without its `_id`), as returned by the
    # corresponding `find_one_and_*` call.

    # Split every requested identifier into its NAA / shoulder / blade parts.
    ids: List[IdThreeParts] = []
    for r in binding_requests:
        m = re.match(pattern_base_object_name, r.i)
        if m is None:
            # Without this guard, `m.group(...)` would raise AttributeError
            # and surface as an HTTP 500.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid ID '{r.i}'.",
            )
        ids.append(
            IdThreeParts(
                naa=m.group("naa"),
                shoulder=m.group("shoulder"),
                blade=m.group("blade"),
            )
        )

    # Ensure that user owns all supplied identifiers. While doing so, remember
    # each identifier's collection and `_id` filter so the blade is decoded
    # only once per request.
    targets = []
    for id_, r in zip(ids, binding_requests):
        collection = mdb.get_collection(collection_name(id_.naa, id_.shoulder))
        filter_ = {"_id": decode_id(str(id_.blade))}
        doc = collection.find_one(filter_, ["__ao"])
        if doc is None:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"id {r.i} not found",
            )
        elif doc.get("__ao") != site.id:
            # `detail` must be a single string. (A trailing comma inside the
            # parentheses would silently turn it into a 1-tuple, which is
            # serialised as a JSON list.)
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail=(
                    f"authenticated site client does not manage {r.i} "
                    f"(client represents site {site.id})."
                ),
            )
        targets.append((collection, filter_))

    # Ensure no attempts to set reserved attributes.
    if any(r.a.startswith("__a") for r in binding_requests):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Cannot set attribute names beginning with '__a'.",
        )

    # Process binding requests
    docs = []
    for (collection, filter_), r in zip(targets, binding_requests):
        if r.o == "purge":
            docs.append(collection.find_one_and_delete(filter_))
        elif r.o == "rm":
            docs.append(collection.find_one_and_update(filter_, {"$unset": {r.a: ""}}))
        elif r.o == "set":
            docs.append(collection.find_one_and_update(filter_, {"$set": {r.a: r.v}}))
        elif r.o == "addToSet":
            docs.append(
                collection.find_one_and_update(filter_, {"$addToSet": {r.a: r.v}})
            )
        else:
            # Note: IdBindingRequest root_validator methods should preclude this.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid operation 'o'."
            )

    return [dissoc(d, "_id") for d in docs]
120
+
121
+
122
@router.get("/ids/bindings/{rest:path}", response_model=Dict[str, Any])
def get_id_bindings(
    rest: str,
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    # Look up the stored bindings of one identifier, optionally narrowed to a
    # single attribute.
    #
    # NOTE: Documented with `#` comments rather than a docstring, because
    #       FastAPI publishes an endpoint's docstring as its OpenAPI
    #       description.
    #
    # `rest` has the form "<naa>:<assigned base name>[/<attribute>]"; hyphens
    # are ignored.
    #
    # Responses:
    #   400 - malformed identifier (missing colon, bad NAA, bad shoulder,
    #         characters outside of base32, or failed checksum).
    #   404 - no stored document, or the requested attribute is absent.

    # Hyphens are removed before parsing.
    cleaned = rest.replace("-", "")
    parts = cleaned.split(":")
    if len(parts) != 2:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=(
                "Invalid ID - needs both name assigning authority (NAA) part "
                "(e.g. 'nmdc') and name part (e.g. 'fk4ra92'), separated by a colon (':')."
            ),
        )
    naa = parts[0]

    # An attribute is requested only when the name part contains exactly one
    # '/' followed by a non-empty segment. In every other case (no '/', a
    # trailing '/', or several '/'), only the first segment is used.
    suffix_parts = parts[1].split("/")
    if len(suffix_parts) == 2 and suffix_parts[-1] != "":
        assigned_base_name, attribute = suffix_parts
    else:
        assigned_base_name = suffix_parts[0]
        attribute = None

    if re.match(pattern_naa, naa) is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid ID - invalid name assigning authority (NAA) '{naa}'.",
        )
    if re.match(pattern_shoulder, assigned_base_name) is None:
        # `detail` must be a single string. (A trailing comma inside the
        # parentheses would silently turn it into a 1-tuple, which is
        # serialised as a JSON list.)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=(
                "Invalid ID - invalid shoulder. "
                "Every name part begins with a 'shoulder', a "
                "sequence of letters followed by a number, "
                "for example 'fk4'. "
                "Did you forget to include the shoulder?"
            ),
        )
    try:
        m = re.match(pattern_assigned_base_name, AssignedBaseName(assigned_base_name))
        # If the pattern did not match, `m` is None and `.group` raises
        # AttributeError, which is handled below.
        shoulder, blade = m.group("shoulder"), m.group("blade")
        id_decoded = decode_id(blade)
    except (AttributeError, ValidationError):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid ID - characters used outside of base32.",
        )
    except ValueError:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid ID - failed checksum. Did you copy it incorrectly?",
        )

    collection = mdb.get_collection(collection_name(naa, shoulder))
    d = raise404_if_none(collection.find_one({"_id": id_decoded}))
    d = dissoc(d, "_id")
    if attribute is not None:
        if attribute not in d:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=(
                    f"attribute '{attribute}' not found in "
                    f"{naa}:{assigned_base_name}."
                ),
            )
        # Return the requested attribute together with the "where" field.
        rv = pick(["where", attribute], d)
    else:
        rv = d
    return rv
@@ -0,0 +1,143 @@
1
+ import json
2
+ from typing import Optional, Annotated
3
+
4
+ from pymongo.database import Database
5
+ from fastapi import APIRouter, Depends, Query, HTTPException, Path
6
+ from pymongo.errors import ConnectionFailure, OperationFailure
7
+ from starlette import status
8
+
9
+ from nmdc_runtime.api.core.util import (
10
+ raise404_if_none,
11
+ )
12
+ from nmdc_runtime.api.db.mongo import get_mongo_db
13
+ from nmdc_runtime.api.endpoints.util import list_resources, _claim_job
14
+ from nmdc_runtime.api.models.job import Job, JobClaim
15
+ from nmdc_runtime.api.models.operation import Operation, MetadataT
16
+ from nmdc_runtime.api.models.site import (
17
+ Site,
18
+ maybe_get_current_client_site,
19
+ get_current_client_site,
20
+ )
21
+ from nmdc_runtime.api.models.util import ListRequest, ListResponse, ResultT
22
+
23
+ router = APIRouter()
24
+
25
+
26
@router.get(
    "/jobs", response_model=ListResponse[Job], response_model_exclude_unset=True
)
def list_jobs(
    req: Annotated[ListRequest, Query()],
    mdb: Database = Depends(get_mongo_db),
    maybe_site: Optional[Site] = Depends(maybe_get_current_client_site),
):
    """List pre-configured workflow jobs.

    If authenticated as a site client, `req.filter` defaults to fetch unclaimed jobs
    that are claimable by the site client. This default can be overridden to view all jobs
    by explicitly passing a `req.filter` of `{}`.
    """
    caller_is_site_client = isinstance(maybe_site, Site)
    if caller_is_site_client and req.filter is None:
        # Default filter: jobs that carry no claim by the calling site.
        default_filter = {"claims.site_id": {"$ne": maybe_site.id}}
        req.filter = json.dumps(default_filter)

    jobs_listing = list_resources(req, mdb, "jobs")
    return jobs_listing
43
+
44
+
45
@router.get("/jobs/{job_id}", response_model=Job, response_model_exclude_unset=True)
def get_job_info(
    job_id: str,
    mdb: Database = Depends(get_mongo_db),
):
    # Fetch the job document having the given `id`; `raise404_if_none`
    # handles the case where no such document exists.
    job_document = mdb.jobs.find_one({"id": job_id})
    return raise404_if_none(job_document)
51
+
52
+
53
@router.post("/jobs/{job_id}:claim", response_model=Operation[ResultT, MetadataT])
def claim_job(
    job_id: str,
    mdb: Database = Depends(get_mongo_db),
    site: Site = Depends(get_current_client_site),
):
    # Delegate to the shared `_claim_job` helper on behalf of the
    # authenticated site client.
    claim_operation = _claim_job(job_id, mdb, site)
    return claim_operation
60
+
61
+
62
@router.post("/jobs/{job_id}:release")
def release_job(
    job_id: Annotated[
        str,
        Path(
            title="Job ID",
            description="The `id` of the job.\n\n_Example_: `nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6`",
            examples=["nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6"],
        ),
    ],
    mdb: Database = Depends(get_mongo_db),
    site: Site = Depends(get_current_client_site),
) -> Optional[Job]:
    r"""
    Release the specified job.

    Releasing a job cancels all the unfinished operations (of that job)
    claimed by the `site` associated with the logged-in site client.

    Return the updated job, reflecting that the aforementioned operations have been cancelled.
    """
    job_document = raise404_if_none(mdb.jobs.find_one({"id": job_id}))
    job = Job(**job_document)

    # Find this site's not-yet-done operations for the job (only their `id`s
    # are needed).
    open_operations_query = {
        "metadata.job.id": job_id,
        "metadata.site_id": site.id,
        "done": False,
    }
    open_operations = list(mdb.operations.find(open_operations_query, ["id"]))
    open_operation_ids = [operation["id"] for operation in open_operations]

    # Build the job's claim list as it should look after the release: every
    # claim held by another site is kept as-is, and this site's open claims
    # are re-created as done + cancelled.
    cancelled_claims = [
        JobClaim(op_id=operation_id, site_id=site.id, done=True, cancelled=True)
        for operation_id in open_operation_ids
    ]
    claims_of_other_sites = [
        existing_claim
        for existing_claim in job.claims
        if existing_claim.site_id != site.id
    ]

    # Execute MongoDB transaction to ensure atomic change of job document plus
    # relevant set of operations documents.
    try:
        with mdb.client.start_session() as session, session.start_transaction():
            # NOTE(review): the query above filters on the top-level `done`
            #               field, whereas this update sets `metadata.done`.
            #               Confirm that this difference is intentional.
            mdb.operations.update_many(
                {"id": {"$in": open_operation_ids}},
                {"$set": {"metadata.cancelled": True, "metadata.done": True}},
                session=session,
            )
            claims_after_release = [
                claim.model_dump(exclude_unset=True)
                for claim in claims_of_other_sites + cancelled_claims
            ]
            mdb.jobs.update_one(
                {"id": job_id},
                {"$set": {"claims": claims_after_release}},
                session=session,
            )
    except (ConnectionFailure, OperationFailure) as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Transaction failed: {e}",
        )

    # Return the updated `jobs` document.
    #
    # TODO: Consider retrieving the document within the transaction
    #       to ensure it still exists.
    #
    # Note: `None` is returned when the document can no longer be found,
    #       because that is what the endpoint originally did in that case.
    refreshed_job_document = mdb.jobs.find_one({"id": job_id})
    return None if refreshed_job_document is None else Job(**refreshed_job_document)
File without changes
@@ -0,0 +1,274 @@
1
+ import json
2
+ import bson.json_util
3
+ from typing import List
4
+
5
+ from pymongo.database import Database
6
+ from refscan.lib.Finder import Finder
7
+ from refscan.lib.helpers import derive_schema_class_name_from_document
8
+ from refscan.scanner import identify_referring_documents, scan_outgoing_references
9
+
10
+ from nmdc_runtime.api.models.lib.helpers import derive_update_specs
11
+ from nmdc_runtime.api.models.query import UpdateCommand, UpdateSpecs
12
+ from nmdc_runtime.util import get_allowed_references, nmdc_schema_view
13
+
14
+
15
def make_violation_message(
    collection_name: str,
    source_document_id: str,
    source_field_name: str,
    target_document_id: str,
) -> str:
    r"""
    Builds the message that reports a reference which would be broken.

    :param collection_name: The name of the collection that holds the referring document
    :param source_document_id: The `id` of the referring document
    :param source_field_name: The name of the field (of the referring document) that holds the reference
    :param target_document_id: The `id` of the document being referred to

    :return: A single-sentence description of the violation
    """
    # The sentence is assembled from clauses, each of which ends with the
    # space that separates it from the next one.
    clauses = (
        f"The document having id='{source_document_id}' in ",
        f"the collection '{collection_name}' contains a ",
        f"reference (in its '{source_field_name}' field, ",
        f"referring to the document having id='{target_document_id}') ",
        "which would be broken.",
    )
    return "".join(clauses)
38
+
39
+
40
def _collect_broken_reference_messages(
    *,
    db: Database,
    session,
    collection_name: str,
    document_object_id,
    document_id: str,
    reference_field_names,
    schema_view,
    references,
    finder: Finder,
    vanished_message: str,
) -> List[str]:
    r"""
    Fetches one document (within the specified session) and returns a violation
    message for each of its outgoing references that is reported as broken.

    :param db: The database containing the document
    :param session: The client session whose pending transaction is to be used
    :param collection_name: The name of the collection containing the document
    :param document_object_id: The `_id` of the document
    :param document_id: The `id` value to show in the violation messages
    :param reference_field_names: Names of the fields that can contain references
    :param schema_view: The `SchemaView` passed through to `scan_outgoing_references`
    :param references: The legal references passed through to `scan_outgoing_references`
    :param finder: The `Finder` passed through to `scan_outgoing_references`
    :param vanished_message: The assertion message to use if the document is not found

    :return: List of violation messages (empty if no violations were reported)
    """
    # Project only the fields that can legally contain references, plus the
    # other fields involved in referential integrity checking.
    projection = {field_name: 1 for field_name in reference_field_names} | {
        "_id": 1,
        "id": 1,
        "type": 1,
    }  # note: `|` unions the dicts
    document = db[collection_name].find_one(
        {"_id": document_object_id},
        projection=projection,
        session=session,
    )
    # Note: We assert that the document exists (to satisfy the type checker).
    assert document is not None, vanished_message
    violations = scan_outgoing_references(
        document=document,
        source_collection_name=collection_name,
        schema_view=schema_view,
        references=references,
        finder=finder,
        client_session=session,  # so it uses the pending transaction's session
    )
    return [
        make_violation_message(
            collection_name=collection_name,
            source_document_id=document_id,
            source_field_name=violation.source_field_name,
            target_document_id=violation.target_id,
        )
        for violation in violations
    ]


def simulate_updates_and_check_references(
    db: Database, update_cmd: UpdateCommand
) -> List[str]:
    r"""
    Checks whether, if the specified updates were performed on the specified database,
    both of the following things would be true afterward:
    1. (Regarding outgoing references): The updated documents do not contain any
       broken references.
    2. (Regarding incoming references): The documents that originally _referenced_
       any of the updated documents do not contain any broken references.
       This check is necessary because update operations can currently change `id`
       and `type` values, which can affect what can legally reference those documents.

    This function checks those things by performing the updates within a MongoDB
    transaction, leaving the transaction in the _pending_ (i.e. not committed) state,
    and then performing various checks on the database in that _pending_ state.
    The transaction is always aborted, so the updates are never applied.

    Note: A subject document that lacks an `id` field is still checked for broken
          outgoing references; any violation message about it shows id='None'.

    :param db: The database on which to simulate performing the updates
    :param update_cmd: The command that specifies the updates

    :return: List of violation messages. If the list is empty, it means that—if
             the updates had been performed (instead of only simulated) here—they
             would not have left behind any broken references.
    """

    # Initialize the list of violation messages that we will return.
    violation_messages: List[str] = []

    # Instantiate a `Finder` bound to the Mongo database. This will be
    # used later, to identify and check inter-document references.
    finder = Finder(database=db)

    # Extract the collection name from the command.
    collection_name = update_cmd.update

    # Derive the update specifications from the command.
    update_specs: UpdateSpecs = derive_update_specs(update_cmd)

    # Get a reference to a `SchemaView` bound to the NMDC schema, so we can
    # use it to, for example, map `type` field values to schema class names.
    schema_view = nmdc_schema_view()

    # Get some data structures that indicate which fields of which documents
    # can legally contain references, according to the NMDC schema.
    legal_references = get_allowed_references()
    reference_field_names_by_source_class_name = (
        legal_references.get_reference_field_names_by_source_class_name()
    )

    # Start a "throwaway" MongoDB transaction so we can simulate the updates.
    with db.client.start_session() as session:
        with session.start_transaction():

            # Make a list of the `_id`, `id`, and `type` values of the documents that
            # the user wants to update.
            subject_document_summaries_pre_update = list(
                db[collection_name].find(
                    filter={"$or": [spec["filter"] for spec in update_specs]},
                    projection={"_id": 1, "id": 1, "type": 1},
                    session=session,
                )
            )

            # Make a set of the `_id` values of the subject documents so that (later) we can
            # check whether a given _referring_ document is also one of the _subject_
            # documents (i.e. is among the documents the user wants to update).
            subject_document_object_ids = {
                summary["_id"] for summary in subject_document_summaries_pre_update
            }

            # Identify _all_ documents that reference any of the subject documents.
            all_referring_document_descriptors_pre_update = []
            for subject_document_summary in subject_document_summaries_pre_update:
                # If the document summary lacks the "id" field, we already know that no
                # documents reference it (since they would have to _use_ that "id" value to
                # do so); so, we abort this iteration and move on to the next subject document.
                if "id" not in subject_document_summary:
                    continue

                all_referring_document_descriptors_pre_update.extend(
                    identify_referring_documents(
                        document=subject_document_summary,  # expects at least "id" and "type"
                        schema_view=schema_view,
                        references=legal_references,
                        finder=finder,
                        client_session=session,
                    )
                )

            # Simulate the updates (i.e. apply them within the context of the transaction).
            db.command(
                # Note: This expression was copied from the `_run_mdb_cmd` function in `queries.py`.
                # TODO: Document this expression (i.e. the Pydantic->JSON->BSON chain).
                bson.json_util.loads(
                    json.dumps(update_cmd.model_dump(exclude_unset=True))
                ),
                session=session,
            )

            # For each referring document, check whether any of its outgoing references
            # is broken (in the context of the transaction).
            #
            # TODO: A reported violation might not involve a reference to one of the
            #       subject documents. The `scan_outgoing_references` function
            #       scans _all_ references emanating from the document.
            #
            for descriptor in all_referring_document_descriptors_pre_update:
                referring_document_oid = descriptor["source_document_object_id"]
                referring_document_id = descriptor["source_document_id"]
                referring_collection_name = descriptor["source_collection_name"]

                # If the referring document is among the documents that the user wanted to
                # update, we skip it for now. We will check its outgoing references later
                # (i.e. when we check the outgoing references of _all_ updated documents).
                if referring_document_oid in subject_document_object_ids:
                    continue

                violation_messages.extend(
                    _collect_broken_reference_messages(
                        db=db,
                        session=session,
                        collection_name=referring_collection_name,
                        document_object_id=referring_document_oid,
                        document_id=referring_document_id,
                        reference_field_names=reference_field_names_by_source_class_name[
                            descriptor["source_class_name"]
                        ],
                        schema_view=schema_view,
                        references=legal_references,
                        finder=finder,
                        vanished_message="A referring document has vanished.",
                    )
                )

            # For each updated document, check whether any of its outgoing references
            # is broken (in the context of the transaction).
            for subject_document_summary in subject_document_summaries_pre_update:
                subject_document_class_name = derive_schema_class_name_from_document(
                    document=subject_document_summary,
                    schema_view=schema_view,
                )
                assert (
                    subject_document_class_name is not None
                ), "The updated document does not represent a valid schema class instance."

                violation_messages.extend(
                    _collect_broken_reference_messages(
                        db=db,
                        session=session,
                        collection_name=collection_name,
                        document_object_id=subject_document_summary["_id"],
                        # Use `.get` here: the loop that identifies referring
                        # documents already allows for documents without an `id`,
                        # so indexing with ["id"] would raise a KeyError for them.
                        document_id=str(subject_document_summary.get("id")),
                        reference_field_names=reference_field_names_by_source_class_name[
                            subject_document_class_name
                        ],
                        schema_view=schema_view,
                        references=legal_references,
                        finder=finder,
                        vanished_message="An updated document has vanished.",
                    )
                )

            # Whatever happens (i.e. whether there are violations or not), abort the transaction.
            #
            # Note: If an exception was raised within this `with` block, the transaction
            #       will already have been aborted automatically (and execution will not
            #       have reached this statement). On the other hand, if no exception
            #       was raised, we explicitly abort the transaction so that the updates
            #       that we "simulated" in this block do not get applied to the real database.
            #       Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/client_session.html
            #
            session.abort_transaction()

    return violation_messages