nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,4 @@
1
+ <svg width="32" height="32" fill="none" xmlns="http://www.w3.org/2000/svg">
2
+ <path fill-rule="evenodd" clip-rule="evenodd" d="M32 16c0 8.837-7.163 16-16 16-8.838 0-16-7.163-16-16C0 7.162 7.162 0 16 0c8.837 0 16 7.162 16 16Z" fill="#A6CE39"/>
3
+ <path fill-rule="evenodd" clip-rule="evenodd" d="M18.813 9.637h-5.45v13.9h5.474c4.555 0 7.35-3.378 7.35-6.95 0-1.635-.562-3.372-1.77-4.704-1.215-1.336-3.065-2.246-5.605-2.246ZM18.6 21.3h-2.813v-9.425H18.5c1.823 0 3.12.552 3.96 1.4.842.849 1.252 2.021 1.252 3.312 0 .784-.239 1.967-.993 2.948-.745.969-2.01 1.765-4.119 1.765Zm5.311-4.026c-.251 1.74-1.494 4.276-5.311 4.276h-3.063H18.6c3.817 0 5.06-2.536 5.311-4.276Zm1.812-2.405c-.657-2.601-2.85-4.982-6.91-4.982h-5.2 5.2c4.06 0 6.253 2.38 6.91 4.982Zm.215 1.718ZM8.363 9.675v13.887h2.425V9.675H8.363Zm2.175 13.637H8.612h1.925ZM9.575 8.65c.84 0 1.513-.689 1.513-1.513 0-.823-.673-1.512-1.513-1.512-.838 0-1.512.674-1.512 1.513 0 .823.672 1.512 1.512 1.512Z" fill="#fff"/>
4
+ </svg>
@@ -0,0 +1,5 @@
1
+ # Static
2
+
3
+ This document contains information about the origins of the other files in this directory.
4
+
5
+ - `ORCID-iD_icon_vector.svg`: On September 27, 2025, we downloaded this SVG file from ORCID's [Brand Library](https://orcid.filecamp.com/s/o/3CCuLloCl73Knntn/VU19wHSMUnX9TD4R), which we found a link to on the [Brand Guidelines](https://info.orcid.org/brand-guidelines/) page of ORCID's website.
Binary file
nmdc_runtime/util.py CHANGED
@@ -1,80 +1,34 @@
1
+ import importlib.resources
1
2
  import json
2
3
  import mimetypes
3
4
  import os
4
- import pkgutil
5
+ from collections import defaultdict
5
6
  from collections.abc import Iterable
6
- from contextlib import AbstractContextManager
7
- from copy import deepcopy
8
7
  from datetime import datetime, timezone
9
8
  from functools import lru_cache
10
- from io import BytesIO
11
9
  from itertools import chain
12
10
  from pathlib import Path
13
- from uuid import uuid4
14
- from typing import List, Optional, Set, Dict
11
+ from typing import Callable, List, Optional, Set, Dict
15
12
 
16
- import fastjsonschema
17
13
  import requests
14
+ from bson.son import SON
18
15
  from frozendict import frozendict
19
- from jsonschema.validators import Draft7Validator
20
- from linkml_runtime import linkml_model
21
- from linkml_runtime.utils.schemaview import SchemaView
22
- from nmdc_schema.nmdc import Database as NMDCDatabase
16
+ from linkml.validator import Validator
17
+ from linkml.validator.plugins import JsonschemaValidationPlugin
18
+ from linkml_runtime import SchemaView
19
+ from nmdc_schema import NmdcSchemaValidationPlugin
23
20
  from nmdc_schema.get_nmdc_view import ViewGetter
24
- from pydantic import Field, BaseModel
25
21
  from pymongo.database import Database as MongoDatabase
26
22
  from pymongo.errors import OperationFailure
27
- from refscan.lib.helpers import identify_references
28
- from refscan.lib.Finder import Finder
23
+ from refscan.lib.helpers import (
24
+ identify_references,
25
+ get_collection_name_to_class_names_map,
26
+ )
29
27
  from refscan.lib.ReferenceList import ReferenceList
30
- from refscan.scanner import scan_outgoing_references
31
- from toolz import merge, unique
28
+ from toolz import merge
32
29
 
33
30
  from nmdc_runtime.api.core.util import sha256hash_from_file
34
31
  from nmdc_runtime.api.models.object import DrsObjectIn
35
- from typing_extensions import Annotated
36
-
37
-
38
- def get_names_of_classes_in_effective_range_of_slot(
39
- schema_view: SchemaView, slot_definition: linkml_model.SlotDefinition
40
- ) -> List[str]:
41
- r"""
42
- Determine the slot's "effective" range, by taking into account its `any_of` constraints (if defined).
43
-
44
- Note: The `any_of` constraints constrain the slot's "effective" range beyond that described by the
45
- induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result
46
- of applying those additional constraints, so we do it manually here (if any are defined).
47
- Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646
48
-
49
- Reference: https://linkml.io/linkml-model/latest/docs/any_of/
50
- """
51
-
52
- # Initialize the list to be empty.
53
- names_of_eligible_target_classes = []
54
-
55
- # If the `any_of` constraint is defined on this slot, use that instead of the `range`.
56
- if "any_of" in slot_definition and len(slot_definition.any_of) > 0:
57
- for slot_expression in slot_definition.any_of:
58
- # Use the slot expression's `range` to get the specified eligible class name
59
- # and the names of all classes that inherit from that eligible class.
60
- if slot_expression.range in schema_view.all_classes():
61
- own_and_descendant_class_names = schema_view.class_descendants(
62
- slot_expression.range
63
- )
64
- names_of_eligible_target_classes.extend(own_and_descendant_class_names)
65
- else:
66
- # Use the slot's `range` to get the specified eligible class name
67
- # and the names of all classes that inherit from that eligible class.
68
- if slot_definition.range in schema_view.all_classes():
69
- own_and_descendant_class_names = schema_view.class_descendants(
70
- slot_definition.range
71
- )
72
- names_of_eligible_target_classes.extend(own_and_descendant_class_names)
73
-
74
- # Remove duplicate class names.
75
- names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))
76
-
77
- return names_of_eligible_target_classes
78
32
 
79
33
 
80
34
  def get_class_names_from_collection_spec(
@@ -157,41 +111,23 @@ def get_type_collections() -> dict:
157
111
  return mappings
158
112
 
159
113
 
160
- def without_id_patterns(nmdc_jsonschema):
161
- rv = deepcopy(nmdc_jsonschema)
162
- for cls_, spec in rv["$defs"].items():
163
- if "properties" in spec:
164
- if "id" in spec["properties"]:
165
- spec["properties"]["id"].pop("pattern", None)
166
- return rv
167
-
168
-
169
114
  @lru_cache
170
- def get_nmdc_jsonschema_dict(enforce_id_patterns=True):
171
- """Get NMDC JSON Schema with materialized patterns (for identifier regexes)."""
172
- d = json.loads(
173
- BytesIO(
174
- pkgutil.get_data("nmdc_schema", "nmdc_materialized_patterns.schema.json")
175
- )
176
- .getvalue()
177
- .decode("utf-8")
178
- )
179
- return d if enforce_id_patterns else without_id_patterns(d)
115
+ def get_nmdc_jsonschema_path() -> Path:
116
+ """Get path to NMDC JSON Schema file."""
117
+ with importlib.resources.path(
118
+ "nmdc_schema", "nmdc_materialized_patterns.schema.json"
119
+ ) as p:
120
+ return p
180
121
 
181
122
 
182
- @lru_cache
183
- def get_nmdc_jsonschema_validator(enforce_id_patterns=True):
184
- return fastjsonschema.compile(
185
- get_nmdc_jsonschema_dict(enforce_id_patterns=enforce_id_patterns)
186
- )
123
+ @lru_cache()
124
+ def get_nmdc_jsonschema_dict() -> dict:
125
+ """Get NMDC JSON Schema with materialized patterns (for identifier regexes)."""
126
+ with open(get_nmdc_jsonschema_path(), "r") as f:
127
+ return json.load(f)
187
128
 
188
129
 
189
130
  nmdc_jsonschema = get_nmdc_jsonschema_dict()
190
- nmdc_jsonschema_validator = get_nmdc_jsonschema_validator()
191
- nmdc_jsonschema_noidpatterns = get_nmdc_jsonschema_dict(enforce_id_patterns=False)
192
- nmdc_jsonschema_validator_noidpatterns = get_nmdc_jsonschema_validator(
193
- enforce_id_patterns=False
194
- )
195
131
 
196
132
  REPO_ROOT_DIR = Path(__file__).parent.parent
197
133
 
@@ -332,9 +268,9 @@ def find_one(k_v: dict, entities: Iterable[dict]):
332
268
  """Find the first entity with key-value pair k_v, if any?
333
269
 
334
270
  >>> find_one({"id": "foo"}, [{"id": "foo"}])
271
+ {'id': 'foo'}
272
+ >>> find_one({"id": "foo"}, [{"id": "bar"}]) is None
335
273
  True
336
- >>> find_one({"id": "foo"}, [{"id": "bar"}])
337
- False
338
274
  """
339
275
  if len(k_v) > 1:
340
276
  raise Exception("Supports only one key-value pair")
@@ -360,6 +296,49 @@ def nmdc_schema_view():
360
296
  return ViewGetter().get_view()
361
297
 
362
298
 
299
+ @lru_cache()
300
+ def get_nmdc_schema_validator() -> Validator:
301
+ schema_view = nmdc_schema_view()
302
+ return Validator(
303
+ schema_view.schema,
304
+ validation_plugins=[
305
+ JsonschemaValidationPlugin(
306
+ closed=True,
307
+ # Since the `nmdc-schema` package exports a pre-built JSON Schema file, use that
308
+ # instead of relying on the plugin to generate one on the fly.
309
+ json_schema_path=get_nmdc_jsonschema_path(),
310
+ ),
311
+ NmdcSchemaValidationPlugin(),
312
+ ],
313
+ )
314
+
315
+
316
+ @lru_cache
317
+ def get_class_name_to_collection_names_map(
318
+ schema_view: SchemaView,
319
+ ) -> Dict[str, List[str]]:
320
+ """
321
+ Returns a mapping of class names to the names of the collections that can store instances of those classes/types,
322
+ according to the specified `SchemaView`.
323
+
324
+ Example output:
325
+ ```
326
+ {
327
+ "Study": ["study_set"],
328
+ "Biosample": ["biosample_set"],
329
+ ...
330
+ }
331
+ ```
332
+ """
333
+ class_name_to_collection_names = defaultdict(list)
334
+ for collection_name, class_names in get_collection_name_to_class_names_map(
335
+ schema_view
336
+ ).items():
337
+ for class_name in class_names:
338
+ class_name_to_collection_names[class_name].append(collection_name)
339
+ return class_name_to_collection_names
340
+
341
+
363
342
  @lru_cache
364
343
  def nmdc_database_collection_instance_class_names():
365
344
  names = []
@@ -378,7 +357,7 @@ def nmdc_database_collection_names():
378
357
  TODO: Document this function.
379
358
 
380
359
  TODO: Assuming this function was designed to return a list of names of all Database slots that represent database
381
- collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py`
360
+ collections, import/use the function named `get_collection_names_from_schema` from `refscan.lib.helpers`
382
361
  instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
383
362
  maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
384
363
  """
@@ -414,6 +393,12 @@ def all_docs_have_unique_id(coll) -> bool:
414
393
 
415
394
 
416
395
  def specialize_activity_set_docs(docs):
396
+ """
397
+ TODO: Document this function.
398
+
399
+ TODO: Check whether this function is still necessary, given that the `Database` class
400
+ in `nmdc-schema` does not have a slot named `activity_set`.
401
+ """
417
402
  validation_errors = {}
418
403
  type_collections = get_type_collections()
419
404
  if "activity_set" in docs:
@@ -497,8 +482,56 @@ def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[
497
482
  return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]
498
483
 
499
484
 
485
+ def does_collection_have_unique_index_on_id_field(
486
+ collection_name: str, db: MongoDatabase
487
+ ) -> bool:
488
+ """Check whether the specified MongoDB collection has a unique index on its `id` field (not `_id`).
489
+
490
+ Note: If the specified MongoDB collection either does not exist or is a _view_ instead of a collection,
491
+ this function will return `False`.
492
+
493
+ References:
494
+ - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes
495
+ - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.index_information
496
+ """
497
+ # Check whether the specified collection actually exists in the database; and, if it does,
498
+ # whether it is really a _collection_ (as opposed to being a _view_). If it doesn't exist,
499
+ # or it is a view, return `False` right away.
500
+ collection_infos_cursor = db.list_collections(filter={"name": collection_name})
501
+ collection_infos = list(collection_infos_cursor)
502
+ if len(collection_infos) == 0:
503
+ return False
504
+ collection_info = collection_infos[0]
505
+ if collection_info["type"] != "collection":
506
+ return False
507
+
508
+ # Now that we know we're dealing with a collection, get information about each of its indexes.
509
+ collection = db.get_collection(collection_name)
510
+ for index_information in collection.list_indexes():
511
+ # Get the "field_name-direction" pairs that make up this index.
512
+ field_name_and_direction_pairs: SON = index_information["key"]
513
+
514
+ # If this index involves a number of fields other than one, skip it.
515
+ # We're only interested in indexes that involve the `id` field by itself.
516
+ if len(field_name_and_direction_pairs.keys()) != 1:
517
+ continue
518
+
519
+ # Check whether the field this index involves is the `id` field,
520
+ # and whether this index is `unique`.
521
+ field_name = list(field_name_and_direction_pairs.keys())[0]
522
+ if field_name == "id" and index_information.get("unique", False):
523
+ return True
524
+
525
+ return False
526
+
527
+
500
528
  def ensure_unique_id_indexes(mdb: MongoDatabase):
501
529
  """Ensure that any collections with an "id" field have an index on "id"."""
530
+
531
+ # Note: The pipe (i.e. `|`) operator performs a union of the two sets. In this case,
532
+ # it creates a set (i.e. `candidate_names`) consisting of the names of both
533
+ # (a) all collections in the real database, and (b) all collections that
534
+ # the NMDC schema says can contain instances of classes that have an "id" slot.
502
535
  candidate_names = (
503
536
  set(mdb.list_collection_names()) | schema_collection_names_with_id_field()
504
537
  )
@@ -506,273 +539,67 @@ def ensure_unique_id_indexes(mdb: MongoDatabase):
506
539
  if collection_name.startswith("system."): # reserved by mongodb
507
540
  continue
508
541
 
542
+ # If the collection already has a unique index on `id`, there's no need
543
+ # to check anything else about the collection.
544
+ if does_collection_have_unique_index_on_id_field(collection_name, mdb):
545
+ continue
546
+
509
547
  if (
510
548
  collection_name in schema_collection_names_with_id_field()
511
549
  or all_docs_have_unique_id(mdb[collection_name])
512
550
  ):
513
- mdb[collection_name].create_index("id", unique=True)
514
-
515
-
516
- class UpdateStatement(BaseModel):
517
- q: dict
518
- u: dict
519
- upsert: bool = False
520
- multi: bool = False
521
-
522
-
523
- class DeleteStatement(BaseModel):
524
- q: dict
525
- limit: Annotated[int, Field(ge=0, le=1)] = 1
526
-
527
-
528
- class OverlayDBError(Exception):
529
- pass
530
-
531
-
532
- class OverlayDB(AbstractContextManager):
533
- """Provides a context whereby a base Database is overlaid with a temporary one.
534
-
535
- If you need to run basic simulations of updates to a base database,
536
- you don't want to actually commit transactions to the base database.
537
-
538
- For example, to insert or replace (matching on "id") many documents into a collection in order
539
- to then validate the resulting total set of collection documents, an OverlayDB writes to
540
- an overlay collection that "shadows" the base collection during a "find" query
541
- (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
542
- overlay collection, that id is marked as "seen" and will not also be returned when
543
- subsequently scanning the (unmodified) base-database collection.
544
-
545
- Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
546
- database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
547
- `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
548
- the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
549
- "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
550
- of the `merge_find` method, which internally accesses both the real database and the overlaying database.
551
-
552
- Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
553
- documents from a base collection to the overlay, and then applying the updates to the overlay,
554
- so that again, base collections are unmodified, and a "merge_find" call will produce a result
555
- *as if* the base collection(s) were modified.
551
+ # Check if index already exists, and if so, drop it if not unique
552
+ try:
553
+ existing_indexes = list(mdb[collection_name].list_indexes())
554
+ id_index = next(
555
+ (idx for idx in existing_indexes if idx["name"] == "id_1"), None
556
+ )
556
557
 
557
- Mongo deletions (as the "delete" method) also copy affected documents from the base collection
558
- to the overlay collection, and flag them using the "_deleted" field. In this way, a `merge_find`
559
- call will match a relevant document given a suitable filter, and will mark the document's id
560
- as "seen" *without* returning the document. Thus, the result is as if the document were deleted.
558
+ if id_index:
559
+ # If index exists but isn't unique, drop it so we can recreate
560
+ if not id_index.get("unique", False):
561
+ mdb[collection_name].drop_index("id_1")
562
+
563
+ # Create index with unique constraint
564
+ mdb[collection_name].create_index("id", unique=True)
565
+ except OperationFailure as e:
566
+ # If error is about index with same name, just continue
567
+ if "An existing index has the same name" in str(e):
568
+ continue
569
+ else:
570
+ # Re-raise other errors
571
+ raise
561
572
 
562
- Usage:
563
- ````
564
- with OverlayDB(mdb) as odb:
565
- # do stuff, e.g. `odb.replace_or_insert_many(...)`
566
- ```
567
- """
568
573
 
569
- def __init__(self, mdb: MongoDatabase):
570
- self._bottom_db = mdb
571
- self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
572
- ensure_unique_id_indexes(self._top_db)
573
-
574
- def __enter__(self):
575
- return self
576
-
577
- def __exit__(self, exc_type, exc_value, traceback):
578
- self._bottom_db.client.drop_database(self._top_db.name)
579
-
580
- def replace_or_insert_many(self, coll_name, documents: list):
581
- try:
582
- self._top_db[coll_name].insert_many(documents)
583
- except OperationFailure as e:
584
- raise OverlayDBError(str(e.details))
585
-
586
- def apply_updates(self, coll_name, updates: list):
587
- """prepare overlay db and apply updates to it."""
588
- assert all(UpdateStatement(**us) for us in updates)
589
- for update_spec in updates:
590
- for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
591
- self._top_db[coll_name].insert_one(bottom_doc)
592
- try:
593
- self._top_db.command({"update": coll_name, "updates": updates})
594
- except OperationFailure as e:
595
- raise OverlayDBError(str(e.details))
596
-
597
- def delete(self, coll_name, deletes: list):
598
- """ "apply" delete command by flagging docs in overlay database"""
599
- assert all(DeleteStatement(**us) for us in deletes)
600
- for delete_spec in deletes:
601
- for bottom_doc in self._bottom_db[coll_name].find(
602
- delete_spec["q"], limit=delete_spec.get("limit", 1)
603
- ):
604
- bottom_doc["_deleted"] = True
605
- self._top_db[coll_name].insert_one(bottom_doc)
606
-
607
- def merge_find(self, coll_name, find_spec: dict):
608
- """Yield docs first from overlay and then from base db, minding deletion flags."""
609
- # ensure projection of "id" and "_deleted"
610
- if "projection" in find_spec:
611
- proj = find_spec["projection"]
612
- if isinstance(proj, dict):
613
- proj = merge(proj, {"id": 1, "_deleted": 1})
614
- elif isinstance(proj, list):
615
- proj = list(unique(proj + ["id", "_deleted"]))
616
-
617
- top_docs = self._top_db[coll_name].find(**find_spec)
618
- bottom_docs = self._bottom_db[coll_name].find(**find_spec)
619
- top_seen_ids = set()
620
- for doc in top_docs:
621
- if not doc.get("_deleted"):
622
- yield doc
623
- top_seen_ids.add(doc["id"])
624
-
625
- for doc in bottom_docs:
626
- if doc["id"] not in top_seen_ids:
627
- yield doc
628
-
629
-
630
- def validate_json(
631
- in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
632
- ):
574
+ def decorate_if(condition: bool = False) -> Callable:
633
575
  r"""
634
- Checks whether the specified dictionary represents a valid instance of the `Database` class
635
- defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
636
-
637
- Example dictionary:
638
- {
639
- "biosample_set": [
640
- {"id": "nmdc:bsm-00-000001", ...},
641
- {"id": "nmdc:bsm-00-000002", ...}
642
- ],
643
- "study_set": [
644
- {"id": "nmdc:sty-00-000001", ...},
645
- {"id": "nmdc:sty-00-000002", ...}
646
- ]
647
- }
648
-
649
- :param in_docs: The dictionary you want to validate
650
- :param mdb: A reference to a MongoDB database
651
- :param check_inter_document_references: Whether you want this function to check whether every document that
652
- is referenced by any of the documents passed in would, indeed, exist
653
- in the database, if the documents passed in were to be inserted into
654
- the database. In other words, set this to `True` if you want this
655
- function to perform referential integrity checks.
576
+ Decorator that applies another decorator only when `condition` is `True`.
577
+
578
+ Note: We implemented this so we could conditionally register
579
+ endpoints with FastAPI's `@router`.
580
+
581
+ Example usages:
582
+ A. Apply the `@router.get` decorator:
583
+ ```python
584
+ @decorate_if(True)(router.get("/me"))
585
+ def get_me(...):
586
+ ...
587
+ ```
588
+ B. Bypass the `@router.get` decorator:
589
+ ```python
590
+ @decorate_if(False)(router.get("/me"))
591
+ def get_me(...):
592
+ ...
593
+ ```
656
594
  """
657
- validator = Draft7Validator(get_nmdc_jsonschema_dict())
658
- docs = deepcopy(in_docs)
659
- validation_errors = {}
660
595
 
661
- known_coll_names = set(nmdc_database_collection_names())
662
- for coll_name, coll_docs in docs.items():
663
- if coll_name not in known_coll_names:
664
- # FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name).
665
- # See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
666
- if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
667
- continue
596
+ def apply_original_decorator(original_decorator: Callable) -> Callable:
597
+ def check_condition(original_function: Callable) -> Callable:
598
+ if condition:
599
+ return original_decorator(original_function)
668
600
  else:
669
- validation_errors[coll_name] = [
670
- f"'{coll_name}' is not a known schema collection name"
671
- ]
672
- continue
601
+ return original_function
673
602
 
674
- errors = list(validator.iter_errors({coll_name: coll_docs}))
675
- validation_errors[coll_name] = [e.message for e in errors]
676
- if coll_docs:
677
- if not isinstance(coll_docs, list):
678
- validation_errors[coll_name].append("value must be a list")
679
- elif not all(isinstance(d, dict) for d in coll_docs):
680
- validation_errors[coll_name].append(
681
- "all elements of list must be dicts"
682
- )
683
- if not validation_errors[coll_name]:
684
- try:
685
- with OverlayDB(mdb) as odb:
686
- odb.replace_or_insert_many(coll_name, coll_docs)
687
- except OverlayDBError as e:
688
- validation_errors[coll_name].append(str(e))
689
-
690
- if all(len(v) == 0 for v in validation_errors.values()):
691
- # Second pass. Try instantiating linkml-sourced dataclass
692
- in_docs.pop("@type", None)
693
- try:
694
- NMDCDatabase(**in_docs)
695
- except Exception as e:
696
- return {"result": "errors", "detail": str(e)}
697
-
698
- # Third pass (if enabled): Check inter-document references.
699
- if check_inter_document_references is True:
700
- # Prepare to use `refscan`.
701
- #
702
- # Note: We check the inter-document references in two stages, which are:
703
- # 1. For each document in the JSON payload, check whether each document it references already exists
704
- # (in the collections the schema says it can exist in) in the database. We use the
705
- # `refscan` package to do this, which returns violation details we'll use in the second stage.
706
- # 2. For each violation found in the first stage (i.e. each reference to a not-found document), we
707
- # check whether that document exists (in the collections the schema says it can exist in) in the
708
- # JSON payload. If it does, then we "waive" (i.e. discard) that violation.
709
- # The violations that remain after those two stages are the ones we return to the caller.
710
- #
711
- # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
712
- # does not provide a means to perform arbitrary queries against its virtual "merged" database. It
713
- # is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
714
- # `refscan`'s `Finder` class accepts.
715
- #
716
- finder = Finder(database=mdb)
717
- references = get_allowed_references()
718
- reference_field_names_by_source_class_name = (
719
- references.get_reference_field_names_by_source_class_name()
720
- )
603
+ return check_condition
721
604
 
722
- # Iterate over the collections in the JSON payload.
723
- for source_collection_name, documents in in_docs.items():
724
- for document in documents:
725
- # Add an `_id` field to the document, since `refscan` requires the document to have one.
726
- source_document = dict(document, _id=None)
727
- violations = scan_outgoing_references(
728
- document=source_document,
729
- schema_view=nmdc_schema_view(),
730
- reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
731
- references=references,
732
- finder=finder,
733
- collection_names=nmdc_database_collection_names(),
734
- source_collection_name=source_collection_name,
735
- user_wants_to_locate_misplaced_documents=False,
736
- )
737
-
738
- # For each violation, check whether the misplaced document is in the JSON payload, itself.
739
- for violation in violations:
740
- can_waive_violation = False
741
- # Determine which collections can contain the referenced document, based upon
742
- # the schema class of which this source document is an instance.
743
- target_collection_names = (
744
- references.get_target_collection_names(
745
- source_class_name=violation.source_class_name,
746
- source_field_name=violation.source_field_name,
747
- )
748
- )
749
- # Check whether the referenced document exists in any of those collections in the JSON payload.
750
- for json_coll_name, json_coll_docs in in_docs.items():
751
- if json_coll_name in target_collection_names:
752
- for json_coll_doc in json_coll_docs:
753
- if json_coll_doc["id"] == violation.target_id:
754
- can_waive_violation = True
755
- break # stop checking
756
- if can_waive_violation:
757
- break # stop checking
758
- if not can_waive_violation:
759
- violation_as_str = (
760
- f"Document '{violation.source_document_id}' "
761
- f"in collection '{violation.source_collection_name}' "
762
- f"has a field '{violation.source_field_name}' that "
763
- f"references a document having id "
764
- f"'{violation.target_id}', but the latter document "
765
- f"does not exist in any of the collections the "
766
- f"NMDC Schema says it can exist in."
767
- )
768
- validation_errors[source_collection_name].append(
769
- violation_as_str
770
- )
771
-
772
- # If any collection's error list is not empty, return an error response.
773
- if any(len(v) > 0 for v in validation_errors.values()):
774
- return {"result": "errors", "detail": validation_errors}
775
-
776
- return {"result": "All Okay!"}
777
- else:
778
- return {"result": "errors", "detail": validation_errors}
605
+ return apply_original_decorator
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.4
2
+ Name: nmdc-runtime
3
+ Version: 2.12.0
4
+ Summary: A runtime system for NMDC data management and orchestration
5
+ Project-URL: Changelog, https://github.com/microbiomedata/nmdc-runtime/releases
6
+ Project-URL: Documentation, https://docs.microbiomedata.org/runtime
7
+ Project-URL: Issues, https://github.com/microbiomedata/nmdc-runtime/issues
8
+ Project-URL: Repository, https://github.com/microbiomedata/nmdc-runtime
9
+ License-File: LICENSE
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: base32-lib
12
+ Requires-Dist: boto3
13
+ Requires-Dist: click
14
+ Requires-Dist: dagit
15
+ Requires-Dist: dagster
16
+ Requires-Dist: dagster-graphql
17
+ Requires-Dist: dagster-postgres
18
+ Requires-Dist: fastapi>=0.115.0
19
+ Requires-Dist: frozendict
20
+ Requires-Dist: git-root
21
+ Requires-Dist: jq
22
+ Requires-Dist: jsonasobj2
23
+ Requires-Dist: linkml
24
+ Requires-Dist: linkml-runtime
25
+ Requires-Dist: lxml
26
+ Requires-Dist: nmdc-schema==11.13.0
27
+ Requires-Dist: ontology-loader==0.2.2
28
+ Requires-Dist: pandas
29
+ Requires-Dist: passlib[bcrypt]
30
+ Requires-Dist: pydantic[email]>=1.10.0
31
+ Requires-Dist: pyinstrument
32
+ Requires-Dist: pymongo
33
+ Requires-Dist: python-dotenv
34
+ Requires-Dist: python-jose[cryptography]
35
+ Requires-Dist: python-multipart>=0.0.18
36
+ Requires-Dist: pyyaml
37
+ Requires-Dist: refscan==0.3.2
38
+ Requires-Dist: requests
39
+ Requires-Dist: requests-cache
40
+ Requires-Dist: scalar-fastapi<2.0.0,>=1.4.1
41
+ Requires-Dist: tenacity
42
+ Requires-Dist: toolz
43
+ Requires-Dist: tqdm
44
+ Requires-Dist: unidecode
45
+ Requires-Dist: uvicorn[standard]