nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg ADDED
@@ -0,0 +1,4 @@
+ <svg width="32" height="32" fill="none" xmlns="http://www.w3.org/2000/svg">
+ <path fill-rule="evenodd" clip-rule="evenodd" d="M32 16c0 8.837-7.163 16-16 16-8.838 0-16-7.163-16-16C0 7.162 7.162 0 16 0c8.837 0 16 7.162 16 16Z" fill="#A6CE39"/>
+ <path fill-rule="evenodd" clip-rule="evenodd" d="M18.813 9.637h-5.45v13.9h5.474c4.555 0 7.35-3.378 7.35-6.95 0-1.635-.562-3.372-1.77-4.704-1.215-1.336-3.065-2.246-5.605-2.246ZM18.6 21.3h-2.813v-9.425H18.5c1.823 0 3.12.552 3.96 1.4.842.849 1.252 2.021 1.252 3.312 0 .784-.239 1.967-.993 2.948-.745.969-2.01 1.765-4.119 1.765Zm5.311-4.026c-.251 1.74-1.494 4.276-5.311 4.276h-3.063H18.6c3.817 0 5.06-2.536 5.311-4.276Zm1.812-2.405c-.657-2.601-2.85-4.982-6.91-4.982h-5.2 5.2c4.06 0 6.253 2.38 6.91 4.982Zm.215 1.718ZM8.363 9.675v13.887h2.425V9.675H8.363Zm2.175 13.637H8.612h1.925ZM9.575 8.65c.84 0 1.513-.689 1.513-1.513 0-.823-.673-1.512-1.513-1.512-.838 0-1.512.674-1.512 1.513 0 .823.672 1.512 1.512 1.512Z" fill="#fff"/>
+ </svg>
nmdc_runtime/static/README.md ADDED
@@ -0,0 +1,5 @@
+ # Static
+
+ This document contains information about the origins of the other files in this directory.
+
+ - `ORCID-iD_icon_vector.svg`: On September 27, 2025, we downloaded this SVG file from ORCID's [Brand Library](https://orcid.filecamp.com/s/o/3CCuLloCl73Knntn/VU19wHSMUnX9TD4R), which we found a link to on the [Brand Guidelines](https://info.orcid.org/brand-guidelines/) page of ORCID's website.
nmdc_runtime/static/favicon.ico ADDED
Binary file
nmdc_runtime/util.py CHANGED
@@ -1,31 +1,34 @@
+ import importlib.resources
  import json
  import mimetypes
  import os
- import pkgutil
+ from collections import defaultdict
  from collections.abc import Iterable
- from contextlib import AbstractContextManager
- from copy import deepcopy
  from datetime import datetime, timezone
  from functools import lru_cache
- from io import BytesIO
+ from itertools import chain
  from pathlib import Path
- from uuid import uuid4
- from typing import List, Optional, Set, Dict
+ from typing import Callable, List, Optional, Set, Dict

- import fastjsonschema
  import requests
+ from bson.son import SON
  from frozendict import frozendict
- from jsonschema.validators import Draft7Validator
- from nmdc_schema.nmdc_schema_accepting_legacy_ids import Database as NMDCDatabase
+ from linkml.validator import Validator
+ from linkml.validator.plugins import JsonschemaValidationPlugin
+ from linkml_runtime import SchemaView
+ from nmdc_schema import NmdcSchemaValidationPlugin
  from nmdc_schema.get_nmdc_view import ViewGetter
- from pydantic import Field, BaseModel
  from pymongo.database import Database as MongoDatabase
  from pymongo.errors import OperationFailure
- from toolz import merge, unique
+ from refscan.lib.helpers import (
+     identify_references,
+     get_collection_name_to_class_names_map,
+ )
+ from refscan.lib.ReferenceList import ReferenceList
+ from toolz import merge

  from nmdc_runtime.api.core.util import sha256hash_from_file
  from nmdc_runtime.api.models.object import DrsObjectIn
- from typing_extensions import Annotated


  def get_class_names_from_collection_spec(
@@ -75,6 +78,23 @@ def get_class_names_from_collection_spec(
      return class_names


+ @lru_cache
+ def get_allowed_references() -> ReferenceList:
+     r"""
+     Returns a `ReferenceList` of all the inter-document references that
+     the NMDC Schema allows a schema-compliant MongoDB database to contain.
+     """
+
+     # Identify the inter-document references that the schema allows a database to contain.
+     print("Identifying schema-allowed references.")
+     references = identify_references(
+         schema_view=nmdc_schema_view(),
+         collection_name_to_class_names=collection_name_to_class_names,
+     )
+
+     return references
+
+
  @lru_cache
  def get_type_collections() -> dict:
      """Returns a dictionary mapping class names to Mongo collection names."""
@@ -91,41 +111,23 @@ def get_type_collections() -> dict:
      return mappings


- def without_id_patterns(nmdc_jsonschema):
-     rv = deepcopy(nmdc_jsonschema)
-     for cls_, spec in rv["$defs"].items():
-         if "properties" in spec:
-             if "id" in spec["properties"]:
-                 spec["properties"]["id"].pop("pattern", None)
-     return rv
-
-
  @lru_cache
- def get_nmdc_jsonschema_dict(enforce_id_patterns=True):
-     """Get NMDC JSON Schema with materialized patterns (for identifier regexes)."""
-     d = json.loads(
-         BytesIO(
-             pkgutil.get_data("nmdc_schema", "nmdc_materialized_patterns.schema.json")
-         )
-         .getvalue()
-         .decode("utf-8")
-     )
-     return d if enforce_id_patterns else without_id_patterns(d)
+ def get_nmdc_jsonschema_path() -> Path:
+     """Get path to NMDC JSON Schema file."""
+     with importlib.resources.path(
+         "nmdc_schema", "nmdc_materialized_patterns.schema.json"
+     ) as p:
+         return p


- @lru_cache
- def get_nmdc_jsonschema_validator(enforce_id_patterns=True):
-     return fastjsonschema.compile(
-         get_nmdc_jsonschema_dict(enforce_id_patterns=enforce_id_patterns)
-     )
+ @lru_cache()
+ def get_nmdc_jsonschema_dict() -> dict:
+     """Get NMDC JSON Schema with materialized patterns (for identifier regexes)."""
+     with open(get_nmdc_jsonschema_path(), "r") as f:
+         return json.load(f)


  nmdc_jsonschema = get_nmdc_jsonschema_dict()
- nmdc_jsonschema_validator = get_nmdc_jsonschema_validator()
- nmdc_jsonschema_noidpatterns = get_nmdc_jsonschema_dict(enforce_id_patterns=False)
- nmdc_jsonschema_validator_noidpatterns = get_nmdc_jsonschema_validator(
-     enforce_id_patterns=False
- )


  REPO_ROOT_DIR = Path(__file__).parent.parent
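
Note: the materialized JSON Schema is now located with `importlib.resources` and parsed from disk once, replacing the `pkgutil.get_data` / `BytesIO` round-trip and the `enforce_id_patterns` variants. A short sketch of how a caller might use the cached dictionary, assuming 2.12.0 and its bundled `nmdc-schema` dependency:

```python
# Sketch only: inspect the packaged, pattern-materialized NMDC JSON Schema.
from nmdc_runtime.util import get_nmdc_jsonschema_path, get_nmdc_jsonschema_dict

schema_path = get_nmdc_jsonschema_path()  # pathlib.Path inside the installed nmdc_schema package
schema_dict = get_nmdc_jsonschema_dict()  # parsed once, then reused via lru_cache

# The Database definition lists every collection slot a schema-compliant database may contain.
database_spec = schema_dict["$defs"]["Database"]["properties"]
print(schema_path.name)                        # nmdc_materialized_patterns.schema.json
print(len(database_spec), "collection slots")  # count of Database slots in this schema release
```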
@@ -266,9 +268,9 @@ def find_one(k_v: dict, entities: Iterable[dict]):
      """Find the first entity with key-value pair k_v, if any?

      >>> find_one({"id": "foo"}, [{"id": "foo"}])
+     {'id': 'foo'}
+     >>> find_one({"id": "foo"}, [{"id": "bar"}]) is None
      True
-     >>> find_one({"id": "foo"}, [{"id": "bar"}])
-     False
      """
      if len(k_v) > 1:
          raise Exception("Supports only one key-value pair")
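
Note: the corrected doctest reflects `find_one`'s actual contract: it returns the first matching entity (a dict) or `None`, not a boolean. A sketch of an equivalent implementation that satisfies the corrected doctest; the real function lives in `nmdc_runtime/util.py`, this stand-in is only for illustration:

```python
from typing import Iterable, Optional

def find_one_sketch(k_v: dict, entities: Iterable[dict]) -> Optional[dict]:
    """Return the first entity whose value for the given key matches, else None."""
    if len(k_v) > 1:
        raise Exception("Supports only one key-value pair")
    (key, value), = k_v.items()
    return next((e for e in entities if e.get(key) == value), None)

assert find_one_sketch({"id": "foo"}, [{"id": "foo"}]) == {"id": "foo"}
assert find_one_sketch({"id": "foo"}, [{"id": "bar"}]) is None
```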
@@ -294,6 +296,49 @@ def nmdc_schema_view():
      return ViewGetter().get_view()


+ @lru_cache()
+ def get_nmdc_schema_validator() -> Validator:
+     schema_view = nmdc_schema_view()
+     return Validator(
+         schema_view.schema,
+         validation_plugins=[
+             JsonschemaValidationPlugin(
+                 closed=True,
+                 # Since the `nmdc-schema` package exports a pre-built JSON Schema file, use that
+                 # instead of relying on the plugin to generate one on the fly.
+                 json_schema_path=get_nmdc_jsonschema_path(),
+             ),
+             NmdcSchemaValidationPlugin(),
+         ],
+     )
+
+
+ @lru_cache
+ def get_class_name_to_collection_names_map(
+     schema_view: SchemaView,
+ ) -> Dict[str, List[str]]:
+     """
+     Returns a mapping of class names to the names of the collections that can store instances of those classes/types,
+     according to the specified `SchemaView`.
+
+     Example output:
+     ```
+     {
+         "Study": ["study_set"],
+         "Biosample": ["biosample_set"],
+         ...
+     }
+     ```
+     """
+     class_name_to_collection_names = defaultdict(list)
+     for collection_name, class_names in get_collection_name_to_class_names_map(
+         schema_view
+     ).items():
+         for class_name in class_names:
+             class_name_to_collection_names[class_name].append(collection_name)
+     return class_name_to_collection_names
+
+
  @lru_cache
  def nmdc_database_collection_instance_class_names():
      names = []
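
Note: validation now goes through LinkML's `Validator` instead of `fastjsonschema`/`Draft7Validator`, with the packaged JSON Schema feeding the `JsonschemaValidationPlugin` and `NmdcSchemaValidationPlugin` layering on NMDC-specific checks. A hedged usage sketch, assuming the `linkml.validator` report API (`results`, `severity`, `message`); the document id and field values below are made up for illustration:

```python
# Sketch only: validate a hypothetical document against the NMDC schema's Biosample class.
from nmdc_runtime.util import get_nmdc_schema_validator

validator = get_nmdc_schema_validator()  # built once, cached by lru_cache
candidate_doc = {"id": "nmdc:bsm-00-000001", "name": "example"}  # hypothetical values

# An empty `results` list means the instance passed all configured plugins.
report = validator.validate(candidate_doc, target_class="Biosample")
for result in report.results:
    print(result.severity, result.message)
```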
@@ -308,6 +353,14 @@ def nmdc_database_collection_instance_class_names():

  @lru_cache
  def nmdc_database_collection_names():
+     r"""
+     TODO: Document this function.
+
+     TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
+           collections, import/use the function named `get_collection_names_from_schema` from `refscan.lib.helpers`
+           instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
+           maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
+     """
      names = []
      view = nmdc_schema_view()
      all_classes = set(view.all_classes())
@@ -340,6 +393,12 @@ def all_docs_have_unique_id(coll) -> bool:


  def specialize_activity_set_docs(docs):
+     """
+     TODO: Document this function.
+
+     TODO: Check whether this function is still necessary, given that the `Database` class
+           in `nmdc-schema` does not have a slot named `activity_set`.
+     """
      validation_errors = {}
      type_collections = get_type_collections()
      if "activity_set" in docs:
@@ -369,13 +428,38 @@ def specialize_activity_set_docs(docs):

  # Define a mapping from collection name to a list of class names allowable for that collection's documents.
  collection_name_to_class_names: Dict[str, List[str]] = {
-     collection_name: get_class_names_from_collection_spec(spec)
+     collection_name: list(
+         set(
+             chain.from_iterable(
+                 nmdc_schema_view().class_descendants(cls_name)
+                 for cls_name in get_class_names_from_collection_spec(spec)
+             )
+         )
+     )
      for collection_name, spec in nmdc_jsonschema["$defs"]["Database"][
          "properties"
      ].items()
  }


+ def class_hierarchy_as_list(obj) -> list[str]:
+     """
+     get list of inherited classes for each concrete class
+     """
+     rv = []
+     current_class = obj.__class__
+
+     def recurse_through_bases(cls):
+         if cls.__name__ == "YAMLRoot":
+             return rv
+         rv.append(cls.__name__)
+         for base in cls.__bases__:
+             recurse_through_bases(base)
+         return rv
+
+     return recurse_through_bases(current_class)
+
+
  @lru_cache
  def schema_collection_names_with_id_field() -> Set[str]:
      """
@@ -393,169 +477,129 @@ def schema_collection_names_with_id_field() -> Set[str]:
      return target_collection_names


- def ensure_unique_id_indexes(mdb: MongoDatabase):
-     """Ensure that any collections with an "id" field have an index on "id"."""
-     candidate_names = (
-         set(mdb.list_collection_names()) | schema_collection_names_with_id_field()
-     )
-     for collection_name in candidate_names:
-         if collection_name.startswith("system."):  # reserved by mongodb
-             continue
+ def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[str]:
+     collection_names = sorted(schema_collection_names_with_id_field())
+     return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]

-         if (
-             collection_name in schema_collection_names_with_id_field()
-             or all_docs_have_unique_id(mdb[collection_name])
-         ):
-             mdb[collection_name].create_index("id", unique=True)

+ def does_collection_have_unique_index_on_id_field(
+     collection_name: str, db: MongoDatabase
+ ) -> bool:
+     """Check whether the specified MongoDB collection has a unique index on its `id` field (not `_id`).

- class UpdateStatement(BaseModel):
-     q: dict
-     u: dict
-     upsert: bool = False
-     multi: bool = False
+     Note: If the specified MongoDB collection either does not exist or is a _view_ instead of a collection,
+           this function will return `False`.

+     References:
+     - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes
+     - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.index_information
+     """
+     # Check whether the specified collection actually exists in the database; and, if it does,
+     # whether it is really a _collection_ (as opposed to being a _view_). If it doesn't exist,
+     # or it is a view, return `False` right away.
+     collection_infos_cursor = db.list_collections(filter={"name": collection_name})
+     collection_infos = list(collection_infos_cursor)
+     if len(collection_infos) == 0:
+         return False
+     collection_info = collection_infos[0]
+     if collection_info["type"] != "collection":
+         return False

- class DeleteStatement(BaseModel):
-     q: dict
-     limit: Annotated[int, Field(ge=0, le=1)] = 1
+     # Now that we know we're dealing with a collection, get information about each of its indexes.
+     collection = db.get_collection(collection_name)
+     for index_information in collection.list_indexes():
+         # Get the "field_name-direction" pairs that make up this index.
+         field_name_and_direction_pairs: SON = index_information["key"]

+         # If this index involves a number of fields other than one, skip it.
+         # We're only interested in indexes that involve the `id` field by itself.
+         if len(field_name_and_direction_pairs.keys()) != 1:
+             continue

- class OverlayDBError(Exception):
-     pass
+         # Check whether the field this index involves is the `id` field,
+         # and whether this index is `unique`.
+         field_name = list(field_name_and_direction_pairs.keys())[0]
+         if field_name == "id" and index_information.get("unique", False):
+             return True

+     return False

- class OverlayDB(AbstractContextManager):
-     """Provides a context whereby a base Database is overlaid with a temporary one.

-     If you need to run basic simulations of updates to a base database,
-     you don't want to actually commit transactions to the base database.
+ def ensure_unique_id_indexes(mdb: MongoDatabase):
+     """Ensure that any collections with an "id" field have an index on "id"."""

-     For example, to insert or replace (matching on "id") many documents into a collection in order
-     to then validate the resulting total set of collection documents, an OverlayDB writes to
-     an overlay collection that "shadows" the base collection during a "find" query
-     (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
-     overlay collection, that id is marked as "seen" and will not also be returned when
-     subsequently scanning the (unmodified) base-database collection.
+     # Note: The pipe (i.e. `|`) operator performs a union of the two sets. In this case,
+     #       it creates a set (i.e. `candidate_names`) consisting of the names of both
+     #       (a) all collections in the real database, and (b) all collections that
+     #       the NMDC schema says can contain instances of classes that have an "id" slot.
+     candidate_names = (
+         set(mdb.list_collection_names()) | schema_collection_names_with_id_field()
+     )
+     for collection_name in candidate_names:
+         if collection_name.startswith("system."):  # reserved by mongodb
+             continue

-     Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
-     documents from a base collection to the overlay, and then applying the updates to the overlay,
-     so that again, base collections are unmodified, and a "merge_find" call will produce a result
-     *as if* the base collection(s) were modified.
+         # If the collection already has a unique index on `id`, there's no need
+         # to check anything else about the collection.
+         if does_collection_have_unique_index_on_id_field(collection_name, mdb):
+             continue

-     Mongo deletions (as the "delete" method) also copy affected documents from the base collection
-     to the overlay collection, and flag them using the "_deleted" field. In this way, a `merge_find`
-     call will match a relevant document given a suitable filter, and will mark the document's id
-     as "seen" *without* returning the document. Thus, the result is as if the document were deleted.
+         if (
+             collection_name in schema_collection_names_with_id_field()
+             or all_docs_have_unique_id(mdb[collection_name])
+         ):
+             # Check if index already exists, and if so, drop it if not unique
+             try:
+                 existing_indexes = list(mdb[collection_name].list_indexes())
+                 id_index = next(
+                     (idx for idx in existing_indexes if idx["name"] == "id_1"), None
+                 )

-     Usage:
-     ````
-     with OverlayDB(mdb) as odb:
-         # do stuff, e.g. `odb.replace_or_insert_many(...)`
-     ```
+                 if id_index:
+                     # If index exists but isn't unique, drop it so we can recreate
+                     if not id_index.get("unique", False):
+                         mdb[collection_name].drop_index("id_1")
+
+                 # Create index with unique constraint
+                 mdb[collection_name].create_index("id", unique=True)
+             except OperationFailure as e:
+                 # If error is about index with same name, just continue
+                 if "An existing index has the same name" in str(e):
+                     continue
+                 else:
+                     # Re-raise other errors
+                     raise
+
+
+ def decorate_if(condition: bool = False) -> Callable:
+     r"""
+     Decorator that applies another decorator only when `condition` is `True`.
+
+     Note: We implemented this so we could conditionally register
+           endpoints with FastAPI's `@router`.
+
+     Example usages:
+     A. Apply the `@router.get` decorator:
+        ```python
+        @decorate_if(True)(router.get("/me"))
+        def get_me(...):
+            ...
+        ```
+     B. Bypass the `@router.get` decorator:
+        ```python
+        @decorate_if(False)(router.get("/me"))
+        def get_me(...):
+            ...
+        ```
      """

-     def __init__(self, mdb: MongoDatabase):
-         self._bottom_db = mdb
-         self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
-         ensure_unique_id_indexes(self._top_db)
-
-     def __enter__(self):
-         return self
-
-     def __exit__(self, exc_type, exc_value, traceback):
-         self._bottom_db.client.drop_database(self._top_db.name)
-
-     def replace_or_insert_many(self, coll_name, documents: list):
-         try:
-             self._top_db[coll_name].insert_many(documents)
-         except OperationFailure as e:
-             raise OverlayDBError(str(e.details))
-
-     def apply_updates(self, coll_name, updates: list):
-         """prepare overlay db and apply updates to it."""
-         assert all(UpdateStatement(**us) for us in updates)
-         for update_spec in updates:
-             for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
-                 self._top_db[coll_name].insert_one(bottom_doc)
-         try:
-             self._top_db.command({"update": coll_name, "updates": updates})
-         except OperationFailure as e:
-             raise OverlayDBError(str(e.details))
-
-     def delete(self, coll_name, deletes: list):
-         """ "apply" delete command by flagging docs in overlay database"""
-         assert all(DeleteStatement(**us) for us in deletes)
-         for delete_spec in deletes:
-             for bottom_doc in self._bottom_db[coll_name].find(
-                 delete_spec["q"], limit=delete_spec.get("limit", 1)
-             ):
-                 bottom_doc["_deleted"] = True
-                 self._top_db[coll_name].insert_one(bottom_doc)
-
-     def merge_find(self, coll_name, find_spec: dict):
-         """Yield docs first from overlay and then from base db, minding deletion flags."""
-         # ensure projection of "id" and "_deleted"
-         if "projection" in find_spec:
-             proj = find_spec["projection"]
-             if isinstance(proj, dict):
-                 proj = merge(proj, {"id": 1, "_deleted": 1})
-             elif isinstance(proj, list):
-                 proj = list(unique(proj + ["id", "_deleted"]))
-
-         top_docs = self._top_db[coll_name].find(**find_spec)
-         bottom_docs = self._bottom_db[coll_name].find(**find_spec)
-         top_seen_ids = set()
-         for doc in top_docs:
-             if not doc.get("_deleted"):
-                 yield doc
-             top_seen_ids.add(doc["id"])
-
-         for doc in bottom_docs:
-             if doc["id"] not in top_seen_ids:
-                 yield doc
-
-
- def validate_json(in_docs: dict, mdb: MongoDatabase):
-     validator = Draft7Validator(get_nmdc_jsonschema_dict())
-     docs = deepcopy(in_docs)
-     validation_errors = {}
-
-     known_coll_names = set(nmdc_database_collection_names())
-     for coll_name, coll_docs in docs.items():
-         if coll_name not in known_coll_names:
-             if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
-                 continue
+     def apply_original_decorator(original_decorator: Callable) -> Callable:
+         def check_condition(original_function: Callable) -> Callable:
+             if condition:
+                 return original_decorator(original_function)
              else:
-                 validation_errors[coll_name] = [
-                     f"'{coll_name}' is not a known schema collection name"
-                 ]
-                 continue
+                 return original_function

-         errors = list(validator.iter_errors({coll_name: coll_docs}))
-         validation_errors[coll_name] = [e.message for e in errors]
-         if coll_docs:
-             if not isinstance(coll_docs, list):
-                 validation_errors[coll_name].append("value must be a list")
-             elif not all(isinstance(d, dict) for d in coll_docs):
-                 validation_errors[coll_name].append(
-                     "all elements of list must be dicts"
-                 )
-         if not validation_errors[coll_name]:
-             try:
-                 with OverlayDB(mdb) as odb:
-                     odb.replace_or_insert_many(coll_name, coll_docs)
-             except OverlayDBError as e:
-                 validation_errors[coll_name].append(str(e))
-
-     if all(len(v) == 0 for v in validation_errors.values()):
-         # Second pass. Try instantiating linkml-sourced dataclass
-         in_docs.pop("@type", None)
-         try:
-             NMDCDatabase(**in_docs)
-         except Exception as e:
-             return {"result": "errors", "detail": str(e)}
-
-         return {"result": "All Okay!"}
-     else:
-         return {"result": "errors", "detail": validation_errors}
+         return check_condition
+
+     return apply_original_decorator
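
Note: the new `decorate_if` helper lets the API register routes conditionally; when the condition is false, the route decorator is skipped and the function is left as a plain, unrouted callable. A hedged sketch of how it could gate an endpoint behind a configuration flag (the environment variable and endpoint path below are hypothetical, not taken from the package):

```python
# Sketch only: conditionally expose a FastAPI route using decorate_if.
import os
from fastapi import APIRouter
from nmdc_runtime.util import decorate_if

router = APIRouter()
ENABLE_DEBUG_ROUTES = os.getenv("ENABLE_DEBUG_ROUTES") == "1"  # hypothetical flag

@decorate_if(ENABLE_DEBUG_ROUTES)(router.get("/debug/ping"))
def debug_ping():
    # Registered with the router only when ENABLE_DEBUG_ROUTES is truthy;
    # otherwise the function is returned unchanged and no route is created.
    return {"ok": True}
```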
nmdc_runtime-2.12.0.dist-info/METADATA ADDED
@@ -0,0 +1,45 @@
+ Metadata-Version: 2.4
+ Name: nmdc-runtime
+ Version: 2.12.0
+ Summary: A runtime system for NMDC data management and orchestration
+ Project-URL: Changelog, https://github.com/microbiomedata/nmdc-runtime/releases
+ Project-URL: Documentation, https://docs.microbiomedata.org/runtime
+ Project-URL: Issues, https://github.com/microbiomedata/nmdc-runtime/issues
+ Project-URL: Repository, https://github.com/microbiomedata/nmdc-runtime
+ License-File: LICENSE
+ Requires-Python: >=3.10
+ Requires-Dist: base32-lib
+ Requires-Dist: boto3
+ Requires-Dist: click
+ Requires-Dist: dagit
+ Requires-Dist: dagster
+ Requires-Dist: dagster-graphql
+ Requires-Dist: dagster-postgres
+ Requires-Dist: fastapi>=0.115.0
+ Requires-Dist: frozendict
+ Requires-Dist: git-root
+ Requires-Dist: jq
+ Requires-Dist: jsonasobj2
+ Requires-Dist: linkml
+ Requires-Dist: linkml-runtime
+ Requires-Dist: lxml
+ Requires-Dist: nmdc-schema==11.13.0
+ Requires-Dist: ontology-loader==0.2.2
+ Requires-Dist: pandas
+ Requires-Dist: passlib[bcrypt]
+ Requires-Dist: pydantic[email]>=1.10.0
+ Requires-Dist: pyinstrument
+ Requires-Dist: pymongo
+ Requires-Dist: python-dotenv
+ Requires-Dist: python-jose[cryptography]
+ Requires-Dist: python-multipart>=0.0.18
+ Requires-Dist: pyyaml
+ Requires-Dist: refscan==0.3.2
+ Requires-Dist: requests
+ Requires-Dist: requests-cache
+ Requires-Dist: scalar-fastapi<2.0.0,>=1.4.1
+ Requires-Dist: tenacity
+ Requires-Dist: toolz
+ Requires-Dist: tqdm
+ Requires-Dist: unidecode
+ Requires-Dist: uvicorn[standard]