nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (98) hide show
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +7 -8
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +1 -22
  76. nmdc_runtime/site/ops.py +60 -152
  77. nmdc_runtime/site/repository.py +0 -112
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/util.py +3 -47
  87. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  88. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  89. nmdc_runtime/site/translation/emsl.py +0 -43
  90. nmdc_runtime/site/translation/gold.py +0 -53
  91. nmdc_runtime/site/translation/jgi.py +0 -32
  92. nmdc_runtime/site/translation/util.py +0 -132
  93. nmdc_runtime/site/validation/jgi.py +0 -43
  94. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  95. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  96. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  97. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  98. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,581 @@
1
+ from importlib.metadata import version
2
+ import re
3
+ from typing import List, Dict, Annotated
4
+
5
+ import pymongo
6
+ from fastapi import APIRouter, Depends, HTTPException, Path, Query
7
+ from pydantic import AfterValidator
8
+ from refscan.lib.helpers import (
9
+ get_collection_names_from_schema,
10
+ get_names_of_classes_eligible_for_collection,
11
+ )
12
+
13
+ from nmdc_runtime.config import IS_LINKED_INSTANCES_ENDPOINT_ENABLED
14
+ from nmdc_runtime.minter.config import typecodes
15
+ from nmdc_runtime.minter.domain.model import check_valid_ids
16
+ from nmdc_runtime.util import (
17
+ decorate_if,
18
+ nmdc_database_collection_names,
19
+ nmdc_schema_view,
20
+ )
21
+ from pymongo.database import Database as MongoDatabase
22
+ from starlette import status
23
+
24
+ from nmdc_runtime.api.core.metadata import map_id_to_collection, get_collection_for_id
25
+ from nmdc_runtime.api.core.util import raise404_if_none
26
+ from nmdc_runtime.api.db.mongo import (
27
+ get_mongo_db,
28
+ )
29
+ from nmdc_runtime.api.endpoints.util import (
30
+ list_resources,
31
+ strip_oid,
32
+ comma_separated_values,
33
+ )
34
+ from nmdc_runtime.api.models.metadata import Doc
35
+ from nmdc_runtime.api.models.util import ListRequest, ListResponse
36
+
37
+ router = APIRouter()
38
+
39
+
40
def ensure_collection_name_is_known_to_schema(collection_name: str):
    r"""
    Verifies that the specified string is the name of a collection described by the NMDC Schema.

    Returns `None` when the name is known. Otherwise, raises an `HTTPException` (HTTP 400 Bad Request)
    whose detail lists the known collection names in sorted order.
    """
    known_names = get_collection_names_from_schema(nmdc_schema_view())

    # Guard clause: nothing to do when the name is one the schema describes.
    if collection_name in known_names:
        return

    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail=f"Collection name must be one of {sorted(known_names)}",
    )
51
+
52
+
53
@router.get("/nmdcschema/version")
def get_nmdc_schema_version():
    r"""
    Returns a string indicating which version of the [NMDC Schema](https://microbiomedata.github.io/nmdc-schema/)
    the Runtime is using.

    **Note:** The same information—and more—is also available via the `/version` endpoint.
    """
    # Ask the installed distribution's metadata for the version of the `nmdc_schema` package.
    installed_schema_version = version("nmdc_schema")
    return installed_schema_version
62
+
63
+
64
@router.get("/nmdcschema/typecodes")
def get_nmdc_schema_typecodes() -> List[Dict[str, str]]:
    r"""
    Returns a list of objects, each of which indicates (a) a schema class, and (b) the typecode
    that the minter would use when generating a new ID for an instance of that schema class.

    Each object has three properties:
    - `id`: a string that consists of "nmdc:" + the class name + "_typecode"
    - `schema_class`: a string that consists of "nmdc:" + the class name
    - `name`: the typecode the minter would use when minting an ID for an instance of that class
    """
    # The minter configuration is the single source of these records; pass them through unchanged.
    typecode_records = typecodes()
    return typecode_records
76
+
77
+
78
@router.get("/nmdcschema/collection_stats")
def get_nmdc_database_collection_stats(
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    To get the NMDC Database MongoDB collection statistics, like the total count of records in a collection or the size
    of the collection, try executing the GET /nmdcschema/collection_stats endpoint

    Field reference: <https://www.mongodb.com/docs/manual/reference/command/collStats/#std-label-collStats-output>.
    """
    # Take set intersection of
    #   (1) all collections defined by the NMDC schema, and
    #   (2) all runtime collections
    # Thus, only retrieve collections from the schema that are present (i.e. having actual documents) in the runtime.
    present_collection_names = set(nmdc_database_collection_names()) & set(
        mdb.list_collection_names()
    )

    # The pipeline does not depend on the collection, so build it once instead of once per iteration.
    pipeline = [
        {"$collStats": {"storageStats": {}}},
        {
            "$project": {
                "ns": 1,
                "storageStats.size": 1,
                "storageStats.count": 1,
                "storageStats.avgObjSize": 1,
                "storageStats.storageSize": 1,
                "storageStats.totalIndexSize": 1,
                "storageStats.totalSize": 1,
                "storageStats.scaleFactor": 1,
            }
        },
    ]

    # Iterate in sorted order: iterating the set directly would make the order of the
    # response depend on string hashing, which varies from one server process to another.
    stats = []
    for name in sorted(present_collection_names):
        stats.extend(mdb[name].aggregate(pipeline))
    return stats
116
+
117
+
118
def _linked_docs_pipeline(ids: list[str], types: list[str], direction: str) -> list[dict]:
    r"""
    Builds the `alldocs` aggregation pipeline that collects the documents linked to `ids` in one direction.

    The pipeline traverses the graph of documents in the `alldocs` collection, following the
    `_{direction}.id` relationships (via `$graphLookup`) starting from the documents identified by `ids`.
    It unwinds the collected docs, filters them by the given `types` of interest, projects only essential
    fields to reduce latency and size, and groups them by each of the given `ids`, i.e. re-winding the
    `$unwind`-ed docs into an array (named `{direction}_docs`) for each given ID.

    :param ids: The `id` values of the nexus documents.
    :param types: The types (matched against `_type_and_ancestors`) that linked documents must have.
    :param direction: Either "upstream" or "downstream".
    :return: A list of aggregation stages.
    """
    link_field = f"_{direction}.id"
    docs_field = f"{direction}_docs"
    return [
        {"$match": {"id": {"$in": ids}}},
        {
            "$graphLookup": {
                "from": "alldocs",
                "startWith": f"${link_field}",
                "connectFromField": link_field,
                "connectToField": "id",
                "as": docs_field,
            }
        },
        {"$unwind": {"path": f"${docs_field}"}},
        {"$match": {f"{docs_field}._type_and_ancestors": {"$in": types}}},
        # Drop every field the remaining stages do not read.
        {"$project": {"id": 1, docs_field: f"${docs_field}"}},
        {
            "$group": {
                "_id": "$id",
                docs_field: {
                    "$addToSet": {
                        "id": f"${docs_field}.id",
                        "type": f"${docs_field}.type",
                    }
                },
            }
        },
        # Join each nexus document back in, so that its own `type` can be reported.
        {
            "$lookup": {
                "from": "alldocs",
                "localField": "_id",
                "foreignField": "id",
                "as": "selves",
            }
        },
        {
            "$project": {
                "_id": 0,
                "id": "$_id",
                docs_field: 1,
                "type": {"$arrayElemAt": ["$selves.type", 0]},
            }
        },
    ]


@decorate_if(condition=IS_LINKED_INSTANCES_ENDPOINT_ENABLED)(
    router.get(
        "/nmdcschema/linked_instances",
        response_model=ListResponse,
        response_model_exclude_unset=True,
    )
)
def get_linked_instances(
    ids: Annotated[
        list[str],
        Query(
            title="Instance (aka Document) IDs",
            description=(
                "The `ids` you want to serve as the nexus for graph traversal to collect linked instances."
                "\n\n_Example_: [`nmdc:dobj-11-nf3t6f36`]"
            ),
            examples=["nmdc:dobj-11-nf3t6f36"],
        ),
        AfterValidator(check_valid_ids),
    ],
    types: Annotated[
        list[str] | None,
        Query(
            title="Instance (aka Document) types",
            description=(
                "The `types` of instances you want to return. Can be abstract types such as `nmdc:InformationObject` "
                "or instantiated ones such as `nmdc:DataObject`. Defaults to [`nmdc:NamedThing`]."
                "\n\n_Example_: [`nmdc:PlannedProcess`]"
            ),
            # The example must be a type (matching the description above), not a document ID.
            examples=["nmdc:PlannedProcess"],
        ),
    ] = None,
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    Retrieves database instances that are both (a) linked to any of `ids`, and (b) of a type in `types`.

    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to
    a class definition ([linkml:ClassDefinition](https://w3id.org/linkml/ClassDefinition))
    in our database ([nmdc:Database](https://w3id.org/nmdc/Database)).
    While a [nmdc:Database](https://w3id.org/nmdc/Database) is organized into collections,
    every item in every database collection -- that is, every instance -- knows its `type`, so we can
    (and here do)<sup>&dagger;</sup>
    return a simple list of instances
    ([a LinkML CollectionInstance](https://linkml.io/linkml-model/latest/docs/specification/02instances/#collections)),
    which a client may use to construct a corresponding [nmdc:Database](https://w3id.org/nmdc/Database).

    From the nexus instance IDs given in `ids`, both "upstream" and "downstream" links are followed (transitively) in
    order to collect the set of all instances linked to these `ids`.

    * A link "upstream" is represented by a slot ([linkml:SlotDefinition](https://w3id.org/linkml/SlotDefinition))
    for which the
    range ([linkml:range](https://w3id.org/linkml/range)) instance has originated, or helped produce,
    the domain ([linkml:domain](https://w3id.org/linkml/domain)) instance.
    For example, we consider [nmdc:associated_studies](https://w3id.org/nmdc/associated_studies) to be
    an "upstream" slot because we consider a [nmdc:Study](https://w3id.org/nmdc/Study) (the slot's range)
    to be upstream of a [nmdc:Biosample](https://w3id.org/nmdc/Biosample) (the slot's domain).

    * A link "downstream" is represented by a slot for which the
    range instance has originated from, or was in part produced by, the domain instance.
    For example, [nmdc:has_output](https://w3id.org/nmdc/has_output) is
    a "downstream" slot because its [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing) range
    is downstream of its [nmdc:PlannedProcess](https://w3id.org/nmdc/PlannedProcess) domain.

    Acceptable values for `types` are not limited only to the ones embedded in concrete instances, e.g.
    the `schema_class` field values returned by the [`GET /nmdcschema/typecodes`](/nmdcschema/typecodes) API endpoint.
    Rather, any subclass (of any depth) of [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing) --
    [nmdc:DataEmitterProcess](https://w3id.org/nmdc/DataEmitterProcess),
    [nmdc:InformationObject](https://w3id.org/nmdc/InformationObject),
    [nmdc:Sample](https://w3id.org/nmdc/Sample), etc. -- may be given.
    If no value for `types` is given, then all [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing)s are returned.

    <sup>&dagger;</sup>: actually, we do not (yet).
    For now (see [microbiomedata/nmdc-runtime#1118](https://github.com/microbiomedata/nmdc-runtime/issues/1118)),
    we return a short list of "fat" documents, each of which represents one of the `ids` and presents
    representations of that id's downstream and upstream instances (currently just each instance's `id` and `type`)
    as separate subdocument array fields.
    """
    # TODO move logic from endpoint to unit-testable handler
    # TODO ListResponse[SimplifiedNMDCDatabase]
    # TODO ensure pagination for responses

    # Confirm that every requested ID exists. Fetch each document's `type` in the same query, so that
    # the response can report the type of an ID even when nothing turns out to be linked to it.
    type_by_id = {
        d["id"]: d.get("type")
        for d in mdb.alldocs.find({"id": {"$in": ids}}, {"id": 1, "type": 1})
    }
    ids_not_found = sorted(set(ids) - set(type_by_id))
    if ids_not_found:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Some IDs not found: {ids_not_found}.",
        )

    types = types or ["nmdc:NamedThing"]
    types_possible = {f"nmdc:{name}" for name in nmdc_schema_view().all_classes()}
    types_not_found = sorted(set(types) - types_possible)
    if types_not_found:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=(
                f"Some types not found: {types_not_found}. "
                "You may need to prefix with `nmdc:`. "
                "If you don't supply any types, the set {'nmdc:NamedThing'} will be used. "
                # Sorted, so that the message is the same from one request (and process) to the next.
                f"Types possible: {sorted(types_possible)}"
            ),
        )

    # Documents for entities that originated, or helped produce, the entities identified by `ids`.
    upstream_docs = list(
        mdb.alldocs.aggregate(
            _linked_docs_pipeline(ids, types, "upstream"),
            allowDiskUse=True,
        )
    )

    # Documents for entities that originated from, or are considered part of, the entities identified by `ids`.
    downstream_docs = list(
        mdb.alldocs.aggregate(
            _linked_docs_pipeline(ids, types, "downstream"),
            allowDiskUse=True,
        )
    )

    # Start with one entry per requested ID, so that IDs having no linked documents still appear in the response.
    relations_by_id = {}
    for id_ in ids:
        relation = {"id": id_, "upstream_docs": [], "downstream_docs": []}
        if type_by_id.get(id_) is not None:
            relation["type"] = type_by_id[id_]
        relations_by_id[id_] = relation

    # For each subject document that has any upstream or downstream documents, record the subject document's
    # `type` and the `id`/`type` pairs of the documents that are upstream or downstream of it.
    for d in upstream_docs + downstream_docs:
        relation = relations_by_id[d["id"]]
        if "type" in d:
            relation["type"] = d["type"]
        relation["upstream_docs"] += d.get("upstream_docs", [])
        relation["downstream_docs"] += d.get("downstream_docs", [])

    return {"resources": list(relations_by_id.values())}
345
+
346
+
347
@router.get(
    "/nmdcschema/ids/{doc_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def get_by_id(
    doc_id: Annotated[
        str,
        Path(
            title="Document ID",
            description="The `id` of the document you want to retrieve.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Retrieves the document having the specified `id`, regardless of which schema-described collection it resides in.
    """
    id_to_collection = map_id_to_collection(mdb)
    collection_name = get_collection_for_id(doc_id, id_to_collection)

    if collection_name:
        document = mdb[collection_name].find_one({"id": doc_id})
    else:
        # No collection was resolved; hand the falsy value itself to the 404 check below.
        document = collection_name

    found_document = raise404_if_none(document)
    return strip_oid(found_document)
373
+
374
+
375
@router.get("/nmdcschema/ids/{doc_id}/collection-name")
def get_collection_name_by_doc_id(
    doc_id: Annotated[
        str,
        Path(
            title="Document ID",
            description="The `id` of the document.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Returns the name of the collection, if any, containing the document having the specified `id`.

    This endpoint uses the NMDC Schema to determine the schema class of which an instance could have
    the specified value as its `id`; and then uses the NMDC Schema to determine the names of the
    `Database` slots (i.e. Mongo collection names) that could contain instances of that schema class.

    This endpoint then searches those Mongo collections for a document having that `id`.
    If it finds one, it responds with the name of the collection containing the document.
    If it does not find one, it responds with an `HTTP 404 Not Found` response.
    """
    # Note: The `nmdc_runtime.api.core.metadata.map_id_to_collection` function is
    #       not used here because that function (a) only processes collections whose
    #       names end with `_set` and (b) only works for `id` values that are in
    #       use in the database (as opposed to hypothetical `id` values).

    # Extract the typecode portion, if any, of the specified `id`.
    #
    # Examples:
    # - "nmdc:foo-123-456" → "foo"
    # - "foo:nmdc-123-456" → `None`
    #
    match = re.search(r"^nmdc:(\w+)?-", doc_id)
    typecode_portion = match.group(1) if match else None

    if typecode_portion is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The typecode portion of the specified `id` is invalid.",
        )

    # Determine the schema class, if any, of which the specified `id` could belong to an instance.
    # The first typecode record whose `name` matches wins; its `schema_class` is stripped of the `nmdc:` prefix.
    schema_class_name = next(
        (
            typecode["schema_class"].replace("nmdc:", "", 1)
            for typecode in typecodes()
            if typecode["name"] == typecode_portion
        ),
        None,
    )

    if schema_class_name is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any schema classes.",
        )

    # Determine the Mongo collection(s) in which instances of that schema class can reside.
    schema_view = nmdc_schema_view()
    collection_names = [
        collection_name
        for collection_name in get_collection_names_from_schema(schema_view=schema_view)
        if schema_class_name
        in get_names_of_classes_eligible_for_collection(
            schema_view=schema_view, collection_name=collection_name
        )
    ]

    if not collection_names:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any database collections.",
        )

    # Use the Mongo database to determine which of those collections a document having that `id` actually
    # resides in, if any. If multiple collections contain such a document, report only the first one.
    # Note: `limit=1` lets the database stop counting as soon as it finds a single match.
    containing_collection_name = next(
        (
            collection_name
            for collection_name in collection_names
            if mdb.get_collection(name=collection_name).count_documents(
                dict(id=doc_id), limit=1
            )
            > 0
        ),
        None,
    )

    if containing_collection_name is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` does not belong to any documents.",
        )

    return {
        "id": doc_id,
        "collection_name": containing_collection_name,
    }
467
+
468
+
469
@router.get(
    "/nmdcschema/collection_names",
    response_model=List[str],
    status_code=status.HTTP_200_OK,
)
def get_collection_names():
    """
    Return all valid NMDC Schema collection names, i.e. the names of the slots of [the nmdc:Database class](
    https://w3id.org/nmdc/Database/) that describe database collections.
    """
    # Copy the names into a list, then order that list in place.
    collection_names = list(get_collection_names_from_schema(nmdc_schema_view()))
    collection_names.sort()
    return collection_names
481
+
482
+
483
@router.get(
    "/nmdcschema/{collection_name}",
    response_model=ListResponse[Doc],
    response_model_exclude_unset=True,
)
def list_from_collection(
    collection_name: Annotated[
        str,
        Path(
            title="Collection name",
            description="The name of the collection.\n\n_Example_: `biosample_set`",
            examples=["biosample_set"],
        ),
    ],
    req: Annotated[ListRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Retrieves resources that match the specified filter criteria and reside in the specified collection.

    Searches the specified collection for documents matching the specified `filter` criteria.
    If the `projection` parameter is used, each document in the response will only include
    the fields specified by that parameter (plus the `id` field).

    Use the [`GET /nmdcschema/collection_names`](/nmdcschema/collection_names) API endpoint to return all valid
    collection names, i.e. the names of the slots of [the nmdc:Database class](https://w3id.org/nmdc/Database/) that
    describe database collections.

    Note: If the specified maximum page size is a number greater than zero, and _more than that number of resources_
          in the collection match the filter criteria, this endpoint will paginate the resources. Pagination can take
          a long time—especially for collections that contain a lot of documents (e.g. millions).

    **Tips:**
    1. When the filter includes a regex and you're using that regex to match the beginning of a string, try to ensure
       the regex is a [prefix expression](https://www.mongodb.com/docs/manual/reference/operator/query/regex/#index-use),
       That will allow MongoDB to optimize the way it uses the regex, making this API endpoint respond faster.
    """

    # Reject (with HTTP 400 Bad Request) any collection name the schema does not describe.
    ensure_collection_name_is_known_to_schema(collection_name)

    # Fetch the page of resources, then remove the Mongo-specific `_id` from each of them.
    listing = list_resources(req, mdb, collection_name)
    listing["resources"] = list(map(strip_oid, listing["resources"]))
    return listing
527
+
528
+
529
@router.get(
    "/nmdcschema/{collection_name}/{doc_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def get_from_collection_by_id(
    collection_name: Annotated[
        str,
        Path(
            title="Collection name",
            description="The name of the collection.\n\n_Example_: `biosample_set`",
            examples=["biosample_set"],
        ),
    ],
    doc_id: Annotated[
        str,
        Path(
            title="Document ID",
            description="The `id` of the document you want to retrieve.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    projection: Annotated[
        str | None,
        Query(
            title="Projection",
            description="""Comma-delimited list of the names of the fields you want the document in the response to
                           include.\n\n_Example_: `id,name,ecosystem_type`""",
            examples=[
                "id,name,ecosystem_type",
            ],
        ),
    ] = None,
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Retrieves the document having the specified `id`, from the specified collection; optionally, including only the
    fields specified via the `projection` parameter.
    """
    # raise HTTP_400_BAD_REQUEST on invalid collection_name
    ensure_collection_name_is_known_to_schema(collection_name)

    # An absent (or empty) `projection` means "return every field".
    projection = comma_separated_values(projection) if projection else None
    try:
        return strip_oid(
            raise404_if_none(
                mdb[collection_name].find_one({"id": doc_id}, projection=projection)
            )
        )
    except pymongo.errors.OperationFailure as e:
        # MongoDB rejected the operation (e.g. an invalid projection). Report that as a client error,
        # chaining the original exception so that its traceback is preserved for debugging.
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
        ) from e
@@ -0,0 +1,38 @@
1
+ from typing import List
2
+
3
+ import pymongo
4
+ from fastapi import APIRouter, Depends
5
+
6
+ from nmdc_runtime.api.core.util import raise404_if_none
7
+ from nmdc_runtime.api.db.mongo import get_mongo_db
8
+ from nmdc_runtime.api.models.object_type import ObjectType
9
+ from nmdc_runtime.api.models.workflow import Workflow
10
+
11
+ router = APIRouter()
12
+
13
+
14
# Lists every document in the `object_types` collection.
@router.get("/object_types", response_model=List[ObjectType])
def list_object_types(
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    object_type_cursor = mdb.object_types.find()
    # Exhaust the cursor, so that a plain list is returned.
    return [object_type for object_type in object_type_cursor]
19
+
20
+
21
# Retrieves the `object_types` document having the specified `id`; the 404 helper handles a missing one.
@router.get("/object_types/{object_type_id}", response_model=ObjectType)
def get_object_type(
    object_type_id: str,
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    object_type_doc = mdb.object_types.find_one({"id": object_type_id})
    return raise404_if_none(object_type_doc)
27
+
28
+
29
# Lists the workflows referenced by the triggers that are associated with the specified object type.
@router.get("/object_types/{object_type_id}/workflows", response_model=List[Workflow])
def list_object_type_workflows(
    object_type_id: str,
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    # Each trigger associated with this object type names one workflow.
    matching_triggers = mdb.triggers.find({"object_type_id": object_type_id})
    workflow_ids = []
    for trigger in matching_triggers:
        workflow_ids.append(trigger["workflow_id"])

    matching_workflows = mdb.workflows.find({"id": {"$in": workflow_ids}})
    return list(matching_workflows)