nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,515 @@
1
+ from importlib.metadata import version
2
+ import re
3
+ from typing import List, Dict, Annotated
4
+
5
+ import pymongo
6
+ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Path, Query
7
+ from pydantic import AfterValidator
8
+ from refscan.lib.helpers import (
9
+ get_collection_names_from_schema,
10
+ get_names_of_classes_eligible_for_collection,
11
+ )
12
+
13
+ from nmdc_runtime.api.endpoints.lib.linked_instances import (
14
+ gather_linked_instances,
15
+ hydrated,
16
+ drop_stale_temp_linked_instances_collections,
17
+ )
18
+ from nmdc_runtime.config import IS_LINKED_INSTANCES_ENDPOINT_ENABLED
19
+ from nmdc_runtime.minter.config import typecodes
20
+ from nmdc_runtime.minter.domain.model import check_valid_ids
21
+ from nmdc_runtime.util import (
22
+ decorate_if,
23
+ nmdc_database_collection_names,
24
+ nmdc_schema_view,
25
+ )
26
+ from pymongo.database import Database as MongoDatabase
27
+ from starlette import status
28
+
29
+ from nmdc_runtime.api.core.metadata import map_id_to_collection, get_collection_for_id
30
+ from nmdc_runtime.api.core.util import raise404_if_none
31
+ from nmdc_runtime.api.db.mongo import (
32
+ get_mongo_db,
33
+ )
34
+ from nmdc_runtime.api.endpoints.util import (
35
+ list_resources,
36
+ strip_oid,
37
+ comma_separated_values,
38
+ )
39
+ from nmdc_runtime.api.models.metadata import Doc
40
+ from nmdc_runtime.api.models.util import ListRequest, ListResponse
41
+
42
+ router = APIRouter()
43
+
44
+
45
def ensure_collection_name_is_known_to_schema(collection_name: str):
    r"""
    Guard that rejects any collection name the NMDC Schema does not describe.

    Returns `None` when `collection_name` is among the names reported by
    `get_collection_names_from_schema` for the current schema view. Otherwise,
    raises an `HTTPException` with status `400 Bad Request` whose detail lists
    the accepted names in sorted order.
    """
    known_names = get_collection_names_from_schema(nmdc_schema_view())
    if collection_name in known_names:
        return

    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail=f"Collection name must be one of {sorted(known_names)}",
    )
56
+
57
+
58
@router.get("/nmdcschema/version")
def get_nmdc_schema_version():
    r"""
    Returns a string indicating which version of the [NMDC Schema](https://microbiomedata.github.io/nmdc-schema/)
    the Runtime is using.

    **Note:** The same information—and more—is also available via the `/version` endpoint.
    """
    # Ask the packaging metadata for the version of the installed `nmdc_schema` distribution.
    installed_schema_version = version("nmdc_schema")
    return installed_schema_version
67
+
68
+
69
@router.get("/nmdcschema/typecodes")
def get_nmdc_schema_typecodes() -> List[Dict[str, str]]:
    r"""
    Returns a list of objects, each of which indicates (a) a schema class, and (b) the typecode
    that the minter would use when generating a new ID for an instance of that schema class.

    Each object has three properties:
    - `id`: a string that consists of "nmdc:" + the class name + "_typecode"
    - `schema_class`: a string that consists of "nmdc:" + the class name
    - `name`: the typecode the minter would use when minting an ID for an instance of that class
    """
    # The minter configuration is the single source of these records; pass them through untouched.
    typecode_records = typecodes()
    return typecode_records
81
+
82
+
83
@router.get("/nmdcschema/collection_stats")
def get_nmdc_database_collection_stats(
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    To get the NMDC Database MongoDB collection statistics, like the total count of records in a collection or the size
    of the collection, try executing the GET /nmdcschema/collection_stats endpoint

    Field reference: <https://www.mongodb.com/docs/manual/reference/command/collStats/#std-label-collStats-output>.
    """
    # The same aggregation pipeline is run against every collection: ask for the
    # storage statistics, then keep only the namespace and a fixed set of fields.
    stats_pipeline = [
        {"$collStats": {"storageStats": {}}},
        {
            "$project": {
                "ns": 1,
                "storageStats.size": 1,
                "storageStats.count": 1,
                "storageStats.avgObjSize": 1,
                "storageStats.storageSize": 1,
                "storageStats.totalIndexSize": 1,
                "storageStats.totalSize": 1,
                "storageStats.scaleFactor": 1,
            }
        },
    ]

    # Only report on collections that are both (1) named by the NMDC schema and
    # (2) listed by the Mongo database the Runtime is connected to.
    schema_collection_names = set(nmdc_database_collection_names())
    database_collection_names = set(mdb.list_collection_names())

    collected_stats = []
    for name in schema_collection_names & database_collection_names:
        collected_stats.extend(mdb[name].aggregate(stats_pipeline))
    return collected_stats
121
+
122
+
123
def _finalize_linked_instances_page(rv, mdb, hydrate):
    # Shared post-processing for one page of linked-instance results: optionally swap
    # the "slim" documents for "full" ones, then strip the Mongo `_id` from each document.
    if hydrate:
        rv["resources"] = hydrated(rv["resources"], mdb)
    rv["resources"] = [strip_oid(d) for d in rv["resources"]]
    return rv


@decorate_if(condition=IS_LINKED_INSTANCES_ENDPOINT_ENABLED)(
    router.get(
        "/nmdcschema/linked_instances",
        responses={
            status.HTTP_200_OK: {
                "model": ListResponse[Doc],
            }
        },
        response_model_exclude_unset=True,
    )
)
def get_linked_instances(
    ids: Annotated[
        list[str],
        Query(
            title="Instance (aka Document) IDs",
            description=(
                "The `ids` you want to serve as the nexus for graph traversal to collect linked instances."
                "\n\n_Example_: [`nmdc:dobj-11-nf3t6f36`]"
            ),
            examples=["nmdc:dobj-11-nf3t6f36"],
        ),
        AfterValidator(check_valid_ids),
    ],
    types: Annotated[
        list[str] | None,
        Query(
            title="Instance (aka Document) types",
            description=(
                "The `types` of instances you want to return. Can be abstract types such as `nmdc:InformationObject` "
                "or instantiated ones such as `nmdc:DataObject`. Defaults to [`nmdc:NamedThing`]."
                "\n\n_Example_: [`nmdc:PlannedProcess`]"
            ),
            # The example must be a type (matching the description above), not a document ID.
            examples=["nmdc:PlannedProcess"],
        ),
    ] = None,
    hydrate: Annotated[
        bool,
        Query(
            title="Hydrate",
            description="Whether to include full documents in the response. The default is to include slim documents.",
        ),
    ] = False,
    page_token: Annotated[
        str | None,
        Query(
            title="Next page token",
            description="""A bookmark you can use to fetch the _next_ page of resources. You can get this from the
                    `next_page_token` field in a previous response from this endpoint.\n\n_Example_:
                    `nmdc:sys0zr0fbt71`""",
            examples=[
                "nmdc:sys0zr0fbt71",
            ],
        ),
    ] = None,
    max_page_size: Annotated[
        int,
        Query(
            title="Resources per page",
            description="How many resources you want _each page_ to contain, formatted as a positive integer.",
            examples=[20],
        ),
    ] = 20,
    mdb: MongoDatabase = Depends(get_mongo_db),
    # FastAPI will inject this `background_tasks` argument, to which we can add background tasks
    # for FastAPI to run after it returns the HTTP response.
    # References:
    # - https://fastapi.tiangolo.com/tutorial/background-tasks/ (RE: `BackgroundTasks`)
    # - https://stackoverflow.com/a/68807219 (RE: how to specify it after some optional parameters)
    background_tasks: BackgroundTasks = BackgroundTasks(),
):
    """
    Retrieves database instances that are both (a) linked to any of `ids`, and (b) of a type in `types`.

    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to a
    class definition ([linkml:ClassDefinition](https://w3id.org/linkml/ClassDefinition)) in our database ([
    nmdc:Database](https://w3id.org/nmdc/Database)). While a [nmdc:Database](https://w3id.org/nmdc/Database) is
    organized into collections, every item in every database collection -- that is, every instance -- knows its
    `type`, so we can (and here do) return a simple list of instances ([a LinkML CollectionInstance](
    https://linkml.io/linkml-model/latest/docs/specification/02instances/#collections)). If hydrate is `False` (the
    default), then the returned list contains "slim" documents that include only the `id` and `type` of each
    instance. If hydrate is `True`, then the returned list contains "full" (aka <a
    href="https://en.wikipedia.org/wiki/Hydration_(web_development)">"hydrated"</a>) documents of each instance,
    suitable e.g. for a client to subsequently use to construct a corresponding
    [nmdc:Database](https://w3id.org/nmdc/Database) instance with schema-compliant documents.
    Both "slim" and "full" documents include (optional) `_upstream_of` and `_downstream_of` fields,
    to indicate the returned document's relationship to `ids`.

    From the nexus instance IDs given in `ids`, both "upstream" and "downstream" links are followed (transitively)
    to collect the set of all instances linked to these `ids`.

    * A link "upstream" is represented by a slot ([linkml:SlotDefinition](https://w3id.org/linkml/SlotDefinition))
    for which the
    range ([linkml:range](https://w3id.org/linkml/range)) instance has originated, or helped produce,
    the domain ([linkml:domain](https://w3id.org/linkml/domain)) instance.
    For example, we consider [nmdc:associated_studies](https://w3id.org/nmdc/associated_studies) to be
    an "upstream" slot because we consider a [nmdc:Study](https://w3id.org/nmdc/Study) (the slot's range)
    to be upstream of a [nmdc:Biosample](https://w3id.org/nmdc/Biosample) (the slot's domain).

    * A link "downstream" is represented by a slot for which the
    range instance has originated from, or was in part produced by, the domain instance.
    For example, [nmdc:has_output](https://w3id.org/nmdc/has_output) is
    a "downstream" slot because its [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing) range
    is downstream of its [nmdc:PlannedProcess](https://w3id.org/nmdc/PlannedProcess) domain.

    Acceptable values for `types` are not limited only to the ones embedded in concrete instances, e.g.
    the `schema_class` field values returned by the [`GET /nmdcschema/typecodes`](/nmdcschema/typecodes) API endpoint.
    Rather, any subclass (of any depth) of [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing) --
    [nmdc:DataEmitterProcess](https://w3id.org/nmdc/DataEmitterProcess),
    [nmdc:InformationObject](https://w3id.org/nmdc/InformationObject),
    [nmdc:Sample](https://w3id.org/nmdc/Sample), etc. -- may be given.
    If no value for `types` is given, then all [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing)s are returned.
    """
    # Housekeeping is deferred until after the response has been sent.
    background_tasks.add_task(drop_stale_temp_linked_instances_collections)

    # A page token means the caller is continuing an earlier traversal, so the
    # `ids`/`types` validation and the gathering step are skipped on this path.
    if page_token is not None:
        rv = list_resources(
            req=ListRequest(page_token=page_token, max_page_size=max_page_size), mdb=mdb
        )
        return _finalize_linked_instances_page(rv, mdb, hydrate)

    # Every requested nexus ID must exist in the `alldocs` collection.
    ids_found = [d["id"] for d in mdb.alldocs.find({"id": {"$in": ids}}, {"id": 1})]
    ids_not_found = list(set(ids) - set(ids_found))
    if ids_not_found:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Some IDs not found: {ids_not_found}.",
        )

    # Every requested type must be the (prefixed) name of a class in the schema.
    types = types or ["nmdc:NamedThing"]
    types_possible = {f"nmdc:{name}" for name in nmdc_schema_view().all_classes()}
    types_not_found = list(set(types) - types_possible)
    if types_not_found:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=(
                f"Some types not found: {types_not_found}. "
                "You may need to prefix with `nmdc:`. "
                "If you don't supply any types, the set {'nmdc:NamedThing'} will be used. "
                f"Types possible: {types_possible}"
            ),
        )

    # Gather the linked instances; the returned name identifies the collection
    # that the first page of results is then listed from.
    merge_into_collection_name = gather_linked_instances(
        alldocs_collection=mdb.alldocs, ids=ids, types=types
    )

    rv = list_resources(
        ListRequest(page_token=page_token, max_page_size=max_page_size),
        mdb,
        merge_into_collection_name,
    )
    return _finalize_linked_instances_page(rv, mdb, hydrate)
279
+
280
+
281
@router.get(
    "/nmdcschema/ids/{doc_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def get_by_id(
    doc_id: Annotated[
        str,
        Path(
            title="Document ID",
            description="The `id` of the document you want to retrieve.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Retrieves the document having the specified `id`, regardless of which schema-described collection it resides in.
    """
    # Resolve which collection (if any) holds a document with this `id`.
    collection_name = get_collection_for_id(doc_id, map_id_to_collection(mdb))

    if collection_name:
        candidate = mdb[collection_name].find_one({"id": doc_id})
    else:
        # No collection was resolved: hand the falsy lookup result itself to the 404 check.
        candidate = collection_name

    found_document = raise404_if_none(candidate)
    return strip_oid(found_document)
307
+
308
+
309
@router.get("/nmdcschema/ids/{doc_id}/collection-name")
def get_collection_name_by_doc_id(
    doc_id: Annotated[
        str,
        Path(
            title="Document ID",
            description="The `id` of the document.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Returns the name of the collection, if any, containing the document having the specified `id`.

    This endpoint uses the NMDC Schema to determine the schema class of which an instance could have
    the specified value as its `id`; and then uses the NMDC Schema to determine the names of the
    `Database` slots (i.e. Mongo collection names) that could contain instances of that schema class.

    This endpoint then searches those Mongo collections for a document having that `id`.
    If it finds one, it responds with the name of the collection containing the document.
    If it does not find one, it responds with an `HTTP 404 Not Found` response.
    """
    # Note: The `nmdc_runtime.api.core.metadata.map_id_to_collection` function is
    #       not used here because that function (a) only processes collections whose
    #       names end with `_set` and (b) only works for `id` values that are in
    #       use in the database (as opposed to hypothetical `id` values).

    # Extract the typecode portion, if any, of the specified `id`.
    #
    # Examples:
    # - "nmdc:foo-123-456" → "foo"
    # - "foo:nmdc-123-456" → `None`
    #
    # The capture group is optional, so an `id` like "nmdc:-123" matches the pattern
    # but still yields `None` as its typecode portion.
    match = re.match(r"^nmdc:(\w+)?-", doc_id)
    typecode_portion = match.group(1) if match else None

    if typecode_portion is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The typecode portion of the specified `id` is invalid.",
        )

    # Determine the schema class, if any, of which the specified `id` could belong to an instance.
    # Only the first typecode record with a matching name is considered.
    schema_class_name = next(
        (
            typecode["schema_class"].replace("nmdc:", "", 1)
            for typecode in typecodes()
            if typecode_portion == typecode["name"]
        ),
        None,
    )

    if schema_class_name is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any schema classes.",
        )

    # Determine the Mongo collection(s) in which instances of that schema class can reside.
    schema_view = nmdc_schema_view()
    collection_names = [
        collection_name
        for collection_name in get_collection_names_from_schema(schema_view=schema_view)
        if schema_class_name
        in get_names_of_classes_eligible_for_collection(
            schema_view=schema_view, collection_name=collection_name
        )
    ]

    if not collection_names:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` is not compatible with any database collections.",
        )

    # Use the Mongo database to determine which of those collections a document having that `id` actually
    # resides in, if any. If multiple collections contain such a document, report only the first one.
    # `limit=1` lets the count stop at the first matching document.
    containing_collection_name = next(
        (
            collection_name
            for collection_name in collection_names
            if mdb.get_collection(name=collection_name).count_documents(
                {"id": doc_id}, limit=1
            )
            > 0
        ),
        None,
    )

    if containing_collection_name is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="The specified `id` does not belong to any documents.",
        )

    return {
        "id": doc_id,
        "collection_name": containing_collection_name,
    }
401
+
402
+
403
@router.get(
    "/nmdcschema/collection_names",
    response_model=List[str],
    status_code=status.HTTP_200_OK,
)
def get_collection_names():
    """
    Return all valid NMDC Schema collection names, i.e. the names of the slots of [the nmdc:Database class](
    https://w3id.org/nmdc/Database/) that describe database collections.
    """
    # Respond with the schema's collection names in sorted order.
    collection_names = get_collection_names_from_schema(nmdc_schema_view())
    return sorted(collection_names)
415
+
416
+
417
@router.get(
    "/nmdcschema/{collection_name}",
    response_model=ListResponse[Doc],
    response_model_exclude_unset=True,
)
def list_from_collection(
    collection_name: Annotated[
        str,
        Path(
            title="Collection name",
            description="The name of the collection.\n\n_Example_: `biosample_set`",
            examples=["biosample_set"],
        ),
    ],
    req: Annotated[ListRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Retrieves resources that match the specified filter criteria and reside in the specified collection.

    Searches the specified collection for documents matching the specified `filter` criteria.
    If the `projection` parameter is used, each document in the response will only include
    the fields specified by that parameter (plus the `id` field).

    Use the [`GET /nmdcschema/collection_names`](/nmdcschema/collection_names) API endpoint to return all valid
    collection names, i.e. the names of the slots of [the nmdc:Database class](https://w3id.org/nmdc/Database/) that
    describe database collections.

    Note: If the specified maximum page size is a number greater than zero, and _more than that number of resources_
    in the collection match the filter criteria, this endpoint will paginate the resources. Pagination can take
    a long time—especially for collections that contain a lot of documents (e.g. millions).

    **Tips:**
    1. When the filter includes a regex and you're using that regex to match the beginning of a string, try to ensure
    the regex is a [prefix expression](https://www.mongodb.com/docs/manual/reference/operator/query/regex/#index-use).
    That will allow MongoDB to optimize the way it uses the regex, making this API endpoint respond faster.
    """

    # Reject collection names the schema does not describe (raises HTTP_400_BAD_REQUEST).
    ensure_collection_name_is_known_to_schema(collection_name)

    # Fetch the requested page, then strip the Mongo `_id` from every returned document.
    rv = list_resources(req, mdb, collection_name)
    rv["resources"] = [strip_oid(d) for d in rv["resources"]]
    return rv
461
+
462
+
463
@router.get(
    "/nmdcschema/{collection_name}/{doc_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def get_from_collection_by_id(
    collection_name: Annotated[
        str,
        Path(
            title="Collection name",
            description="The name of the collection.\n\n_Example_: `biosample_set`",
            examples=["biosample_set"],
        ),
    ],
    doc_id: Annotated[
        str,
        Path(
            title="Document ID",
            description="The `id` of the document you want to retrieve.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    projection: Annotated[
        str | None,
        Query(
            title="Projection",
            description="""Comma-delimited list of the names of the fields you want the document in the response to
                    include.\n\n_Example_: `id,name,ecosystem_type`""",
            examples=[
                "id,name,ecosystem_type",
            ],
        ),
    ] = None,
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Retrieves the document having the specified `id`, from the specified collection; optionally, including only the
    fields specified via the `projection` parameter.
    """
    # Reject collection names the schema does not describe (raises HTTP_400_BAD_REQUEST).
    ensure_collection_name_is_known_to_schema(collection_name)

    # An absent or empty `projection` means "no projection"; otherwise the
    # comma-delimited string is parsed by `comma_separated_values`.
    projected_fields = comma_separated_values(projection) if projection else None

    # Only the database call is guarded, since it is the step expected to raise
    # `OperationFailure`; the failure is surfaced as a 422 with the driver's message.
    try:
        document = mdb[collection_name].find_one(
            {"id": doc_id}, projection=projected_fields
        )
    except pymongo.errors.OperationFailure as e:
        # Chain the original exception so its traceback is preserved for debugging.
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
        ) from e

    return strip_oid(raise404_if_none(document))
@@ -0,0 +1,38 @@
1
+ from typing import List
2
+
3
+ import pymongo
4
+ from fastapi import APIRouter, Depends
5
+
6
+ from nmdc_runtime.api.core.util import raise404_if_none
7
+ from nmdc_runtime.api.db.mongo import get_mongo_db
8
+ from nmdc_runtime.api.models.object_type import ObjectType
9
+ from nmdc_runtime.api.models.workflow import Workflow
10
+
11
+ router = APIRouter()
12
+
13
+
14
@router.get("/object_types", response_model=List[ObjectType])
def list_object_types(
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    # Return every document in the `object_types` collection, materialized as a list.
    object_type_cursor = mdb.object_types.find()
    return list(object_type_cursor)
19
+
20
+
21
@router.get("/object_types/{object_type_id}", response_model=ObjectType)
def get_object_type(
    object_type_id: str,
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    # Look up a single object type by its `id`; a missing document is handed to the 404 helper.
    object_type_doc = mdb.object_types.find_one({"id": object_type_id})
    return raise404_if_none(object_type_doc)
27
+
28
+
29
@router.get("/object_types/{object_type_id}/workflows", response_model=List[Workflow])
def list_object_type_workflows(
    object_type_id: str,
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    # Step 1: collect the `workflow_id` of every trigger registered for this object type.
    matching_triggers = mdb.triggers.find({"object_type_id": object_type_id})
    workflow_ids = [trigger["workflow_id"] for trigger in matching_triggers]

    # Step 2: fetch the workflows whose `id` is among those collected above.
    workflow_cursor = mdb.workflows.find({"id": {"$in": workflow_ids}})
    return list(workflow_cursor)