nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/api/__init__.py +0 -0
- nmdc_runtime/api/analytics.py +70 -0
- nmdc_runtime/api/boot/__init__.py +0 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/__init__.py +0 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +170 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/__init__.py +0 -0
- nmdc_runtime/api/db/mongo.py +447 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/__init__.py +0 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +794 -0
- nmdc_runtime/api/endpoints/ids.py +192 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +105 -0
- nmdc_runtime/api/endpoints/queries.py +679 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +229 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +774 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/main.py +401 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/__init__.py +0 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/__init__.py +0 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/minter.py +0 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +253 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +242 -0
- nmdc_runtime/config.py +55 -4
- nmdc_runtime/core/db/Database.py +1 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -9
- nmdc_runtime/lib/extract_nmdc_data.py +0 -8
- nmdc_runtime/lib/nmdc_dataframes.py +3 -7
- nmdc_runtime/lib/nmdc_etl_class.py +1 -7
- nmdc_runtime/minter/adapters/repository.py +1 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +35 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/export/ncbi_xml.py +1 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
- nmdc_runtime/site/graphs.py +33 -28
- nmdc_runtime/site/ops.py +97 -237
- nmdc_runtime/site/repair/database_updater.py +8 -0
- nmdc_runtime/site/repository.py +7 -117
- nmdc_runtime/site/resources.py +4 -4
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/util.py +9 -321
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
- nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
from typing import TypeVar, List, Optional, Generic, Annotated
|
|
2
|
+
|
|
3
|
+
from pydantic import model_validator, Field, BaseModel
|
|
4
|
+
|
|
5
|
+
# Type variable representing the type of resource contained in a `ListResponse`.
ResultT = TypeVar("ResultT")


class ListResponse(BaseModel, Generic[ResultT]):
    r"""
    A generic envelope for one page of results from a "list" endpoint.

    The optional `next_page_token` can be passed back to the endpoint (as its
    `page_token` parameter) to fetch the subsequent page.
    """

    # The resources making up this page of results.
    resources: List[ResultT]
    # Opaque bookmark for fetching the next page; `None` when there are no more pages.
    next_page_token: Optional[str] = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ListRequest(BaseModel):
    r"""
    An encapsulation of a set of parameters accepted by API endpoints related to listing things.

    Note: This class was documented after the `FindRequest` class was documented. You can refer to the documentation of
          the latter class for additional context about the usage of Pydantic's `Field` constructor in this class.
    """

    # MongoDB-style filter document (JSON), passed through to `db.collection.find()`.
    filter: Optional[str] = Field(
        default=None,
        title="Filter",
        description="""The criteria by which you want to filter the resources, in the same format as the [`query`
            parameter](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query)
            of MongoDB's `db.collection.find()` method.\n\n_Example:_
            `{"lat_lon.latitude": {"$gt": 45.0}, "ecosystem_category": "Plants"}`""",
        examples=[
            r'{"ecosystem_type": "Freshwater"}',
            r'{"lat_lon.latitude": {"$gt": 45.0}, "ecosystem_category": "Plants"}',
        ],
    )
    # TODO: Document why the optional type here is `int` as opposed to `PerPageRange` (`FindRequest` uses the latter).
    max_page_size: Optional[int] = Field(
        default=20,
        title="Resources per page",
        description="How many resources you want _each page_ to contain, formatted as a positive integer.",
        examples=[20],
    )
    # Opaque pagination bookmark, obtained from a prior response's `next_page_token`.
    page_token: Optional[str] = Field(
        default=None,
        title="Next page token",
        description="""A bookmark you can use to fetch the _next_ page of resources. You can get this from the
            `next_page_token` field in a previous response from this endpoint.\n\n_Example_:
            `nmdc:sys0zr0fbt71`""",
        examples=[
            "nmdc:sys0zr0fbt71",
        ],
    )
    # TODO: Document the endpoint's behavior when a projection includes a _nested_ field identifier (i.e. `foo.bar`),
    #       and ensure the endpoint doesn't break when the projection includes field descriptors that contain commas.
    projection: Optional[str] = Field(
        default=None,
        title="Projection",
        description="""Comma-delimited list of the names of the fields you want the resources in the response to
            include. Note: In addition to those fields, the response will also include the `id`
            field.\n\n_Example_: `name, ecosystem_type`""",
        examples=[
            "name, ecosystem_type",
        ],
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# A positive integer no greater than 2,000 — used to bound page sizes for `FindRequest.per_page`.
PerPageRange = Annotated[int, Field(gt=0, le=2_000)]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class FindRequest(BaseModel):
    r"""
    An encapsulation of a set of parameters accepted by API endpoints related to finding things.

    Notes:
    - The "Query Parameter Models" section of the FastAPI docs says that this way of encapsulating
      a set of query parameter definitions in a Pydantic model — so that Swagger UI displays a given
      parameter's _description_ — was introduced in FastAPI 0.115.0.
      Reference: https://fastapi.tiangolo.com/tutorial/query-param-models/
    - While Swagger UI does show the parameter's _description_, specifically, it does not currently show the
      parameter's _title_ or example value(s). The approach shown in the "Classes as Dependencies" section
      of the FastAPI docs (i.e. https://fastapi.tiangolo.com/tutorial/dependencies/classes-as-dependencies/)
      does result in Swagger UI showing those additional things, but the approach involves not inheriting
      from Pydantic's `BaseModel` class and involves defining an `__init__` method for the class. That is
      further than I want to take these classes from their existing selves at this point. To compensate
      for that, I have included examples _within_ some of the descriptions.
      Reference: https://github.com/fastapi/fastapi/issues/318#issuecomment-507043221
    - The "Fields" section of the Pydantic docs says:
      > "The `Field` function is used to customize and add metadata to fields of models."
      References: https://docs.pydantic.dev/latest/concepts/fields/
    """

    # Compact `attribute:value` filter syntax (distinct from `ListRequest.filter`'s MongoDB JSON syntax).
    filter: Optional[str] = Field(
        default=None,
        title="Filter",
        description="""The criteria by which you want to filter the resources, formatted as a comma-separated list of
            `attribute:value` pairs. The `value` can include a comparison operator (e.g. `>=`). If the attribute
            is of type _string_ and you append `.search` to its name, the server will perform a full-text
            search.\n\n_Example:_ `ecosystem_category:Plants, lat_lon.latitude:>35.0`""",
        examples=[
            "ecosystem_category:Plants",
            "ecosystem_category:Plants, lat_lon.latitude:>35.0",
        ],
    )
    search: Optional[str] = Field(
        default=None,
        title="Search",
        description="N/A _(not implemented yet)_",
    )
    sort: Optional[str] = Field(
        default=None,
        title="Sort",
        description="""How you want the resources to be ordered in the response, formatted as a comma-separated list of
            `attribute:value` pairs. Each `attribute` is the name of a field you want the resources to be
            ordered by, and each `value` is the direction you want the values in that field to be ordered
            (i.e. `asc` or no value for _ascending_ order, and `desc` for _descending_ order).\n\n_Example:_
            `depth.has_numeric_value:desc, ecosystem_type`""",
        examples=[
            "depth.has_numeric_value:desc",
            "depth.has_numeric_value:desc, ecosystem_type",
        ],
    )
    # Page-number-based pagination; mutually exclusive with `cursor` (enforced by the validator below).
    page: Optional[int] = Field(
        default=None,
        title="Page number",
        description="""_Which page_ of resources you want to retrieve, when using page number-based pagination.
            This is the page number formatted as an integer ≥ 1.""",
        examples=[1],
    )
    per_page: Optional[PerPageRange] = Field(
        default=25,
        title="Resources per page",
        description="How many resources you want _each page_ to contain, formatted as a positive integer ≤ 2000.",
        examples=[25],
    )
    # Cursor-based pagination; mutually exclusive with `page` (enforced by the validator below).
    cursor: Optional[str] = Field(
        default=None,
        title="Cursor",
        description="""A bookmark you can use to fetch the _next_ page of resources, when using cursor-based pagination.
            To use cursor-based pagination, set the `cursor` parameter to `*`. The response's `meta` object will
            include a `next_cursor` field, whose value can be used as the `cursor` parameter in a subsequent
            request.\n\n_Example_: `nmdc:sys0zr0fbt71`""",
        examples=[
            "*",
            "nmdc:sys0zr0fbt71",
        ],
    )
    group_by: Optional[str] = Field(
        default=None,
        title="Group by",
        description="N/A _(not implemented yet)_",
    )
    fields: Optional[str] = Field(
        default=None,
        title="Fields",
        description="""The fields you want the resources to include in the response, formatted as a comma-separated list
            of field names. This can be used to reduce the size and complexity of the response.\n\n_Example:_
            `name, ess_dive_datasets`""",
        examples=[
            "name",
            "name, ess_dive_datasets",
        ],
    )

    # Reference: https://docs.pydantic.dev/latest/concepts/validators/#model-validators
    # Rejects requests that specify both pagination styles, and defaults to page 1
    # when neither style is specified.
    # NOTE(review): with `mode="before"`, `values` is the raw input — this code
    # assumes it is a dict (has `.get`); confirm callers never pass another form.
    @model_validator(mode="before")
    def set_page_if_cursor_unset(cls, values):
        page, cursor = values.get("page"), values.get("cursor")
        if page is not None and cursor is not None:
            raise ValueError("cannot use cursor- and page-based pagination together")
        if page is None and cursor is None:
            values["page"] = 1
        return values
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class FindResponse(BaseModel):
    r"""
    Response envelope for "find" endpoints.
    """

    # Metadata about the response (e.g. pagination info such as `next_cursor`).
    meta: dict
    # The documents matching the request.
    results: List[dict]
    # Grouped results; populated by the (not-yet-implemented) `group_by` feature.
    group_by: List[dict]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class DeleteResponse(BaseModel):
    r"""
    Response model for "delete" operations. It summarizes the result of the
    operation and it lists identifiers of the documents that were deleted.
    """

    message: str = Field(
        description="Success message describing the deletion operation"
    )
    deleted_workflow_execution_ids: List[str] = Field(
        # Note: `default_factory=list` sets this to an empty list by default.
        default_factory=list,
        description="The `id`s of the `WorkflowExecution`s that were deleted",
    )
    deleted_data_object_ids: List[str] = Field(
        default_factory=list,
        description="The `id`s of the `DataObject`s that were deleted",
    )
    deleted_functional_annotation_agg_oids: List[str] = Field(
        default_factory=list,
        description="The internal MongoDB `ObjectId`s of the `FunctionalAnnotationAggMember`s that were deleted",
    )
    deleted_job_ids: List[str] = Field(
        default_factory=list,
        description="The `id`s of the `jobs` documents that were deleted",
    )
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# Mapping from MongoDB collection name to the set of document attributes for which
# an index should be ensured in that collection.
# Note: For MongoDB, a single collection can have no more than 64 indexes
# Note: Each collection has a unique index set on "id" elsewhere.
entity_attributes_to_index = {
    "biosample_set": {
        "alternative_identifiers",
        "env_broad_scale.has_raw_value",
        "env_local_scale.has_raw_value",
        "env_medium.has_raw_value",
        "collection_date.has_raw_value",
        "ecosystem",
        "ecosystem_category",
        "ecosystem_type",
        "ecosystem_subtype",
        "specific_ecosystem",
        # Note: if `lat_lon` was GeoJSON, i.e. {type,coordinates}, MongoDB has a "2dsphere" index
        "lat_lon.latitude",
        "lat_lon.longitude",
    },
    "study_set": {
        "has_credit_associations.applied_roles",
        "has_credit_associations.applies_to_person.name",
        "has_credit_associations.applies_to_person.orcid",
    },
    "data_object_set": {
        "data_object_type",
        "file_size_bytes",
        "md5_checksum",
        "url",
    },
    # TODO: Refrain from ensuring indexes exist in the `omics_processing_set` collection,
    #       since that collection was deleted as part of the "Berkeley schema" refactor.
    #       Reference: https://microbiomedata.github.io/nmdc-schema/v10-vs-v11-retrospective/#slots-removed-from-database
    "omics_processing_set": {
        "has_input",
        "has_output",
        "instrument_name",
        "alternative_identifiers",
    },
    "functional_annotation_agg": {"was_generated_by"},
    "workflow_execution_set": {
        "has_input",
        "has_output",
    },
    # Note: The `jobs` collection is not described by the NMDC schema.
    "jobs": {
        "config.activity_id",
    },
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from typing import Optional, List
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class WorkflowBase(BaseModel):
    r"""
    Fields shared by all workflow representations. All fields are optional.
    """

    name: Optional[str] = None
    description: Optional[str] = None
    # Identifiers of capabilities a site must have in order to claim this workflow's jobs.
    capability_ids: Optional[List[str]] = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Workflow(WorkflowBase):
    r"""
    A registered workflow, extending `WorkflowBase` with its identifier
    and (optional) creation timestamp.
    """

    id: str
    created_at: Optional[datetime.datetime] = None
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
This module contains the definitions of constants and functions related to
|
|
3
|
+
generating the API's OpenAPI schema (a.k.a. Swagger schema).
|
|
4
|
+
|
|
5
|
+
References:
|
|
6
|
+
- FastAPI Documentation: https://fastapi.tiangolo.com/tutorial/metadata/
|
|
7
|
+
|
|
8
|
+
Notes:
|
|
9
|
+
- The tag descriptions in this file were cut/pasted from `nmdc_runtime/api/main.py`.
|
|
10
|
+
Now that they are in a separate module, we will be able to edit them more easily.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from html import escape
|
|
14
|
+
from typing import List, Dict
|
|
15
|
+
|
|
16
|
+
# Mapping from tag names to their (Markdown-formatted) descriptions.
tag_descriptions: Dict[str, str] = {}

tag_descriptions["sites"] = r"""
A site corresponds to a physical place that may participate in job execution.

A site may register data objects and capabilities with NMDC. It may claim jobs to execute, and it may
update job operations with execution info.

A site must be able to service requests for any data objects it has registered.

A site may expose a "put object" custom method for authorized users. This method facilitates an
operation to upload an object to the site and have the site register that object with the runtime
system.
"""

tag_descriptions["workflows"] = r"""
A workflow is a template for creating jobs.

Workflow jobs are typically created by the system via trigger associations between
workflows and object types. A workflow may also require certain capabilities of sites
in order for those sites to claim workflow jobs.
"""

tag_descriptions["users"] = r"""
Endpoints for user identification.

Currently, accounts for use with the Runtime API are created manually by system administrators.
"""

tag_descriptions["capabilities"] = r"""
A workflow may require an executing site to have particular capabilities.

These capabilities go beyond the simple ability to access the data object resources registered with
the runtime system. Sites register their capabilities, and sites are only able to claim workflow
jobs if they are known to have the capabilities required by the workflow.
"""

tag_descriptions["object types"] = r"""
An object type is an object annotation that is useful for triggering workflows.

A data object may be annotated with one or more types, which in turn can be associated with
workflows through trigger resources.

The data-object type system may be used to trigger workflow jobs on a subset of data objects when a
new version of a workflow is deployed. This could be done by minting a special object type for the
occasion, annotating the subset of data objects with that type, and registering the association of
object type to workflow via a trigger resource.
"""

tag_descriptions["triggers"] = r"""
A trigger is an association between a workflow and a data object type.

When a data object is annotated with a type, perhaps shortly after object registration, the NMDC
Runtime will check, via trigger associations, for potential new jobs to create for any workflows.
"""

tag_descriptions["jobs"] = r"""
A job is a resource that isolates workflow configuration from execution.

Rather than directly creating a workflow operation by supplying a workflow ID along with
configuration, NMDC creates a job that pairs a workflow with configuration. Then, a site can claim a
job ID, allowing the site to execute the intended workflow without additional configuration.

A job can have multiple executions, and a workflow's executions are precisely the executions of all
jobs created for that workflow.

A site that already has a compatible job execution result can preempt the unnecessary creation of a
job by pre-claiming it. This will return like a claim, and now the site can register known data
object inputs for the job without the risk of the runtime system creating a claimable job of the
pre-claimed type.
"""

tag_descriptions["objects"] = r"""
A [Data Repository Service (DRS)
object](https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.1.0/docs/#_drs_datatypes)
represents content necessary for a workflow job to execute, and/or output from a job execution.

An object may be a *blob*, analogous to a file, or a *bundle*, analogous to a folder. Sites register
objects, and sites must ensure that these objects are accessible to the NMDC data broker.

An object may be associated with one or more object types, useful for triggering workflows.
"""

tag_descriptions["operations"] = r"""
An operation is a resource for tracking the execution of a job.

When a job is claimed by a site for execution, an operation resource is created.

An operation is akin to a "promise" or "future" in that it should eventually resolve to either a
successful result, i.e. an execution resource, or to an error.

An operation is parameterized to return a result type, and a metadata type for storing progress
information, that are both particular to the job type.

Operations may be paused, resumed, and/or cancelled.

Operations may expire, i.e. not be stored indefinitely. In this case, it is recommended that
execution resources have longer lifetimes / not expire, so that information about successful results
of operations are available.
"""

tag_descriptions["queries"] = r"""
A query is an operation (find, update, etc.) against the metadata store.

Metadata -- for studies, biosamples, omics processing, etc. -- is used by sites to execute jobs,
as the parameterization of job executions may depend not only on the content of data objects, but
also on objects' associated metadata.

Also, the function of many workflows is to extract or produce new metadata. Such metadata products
should be registered as data objects, and they may also be supplied by sites to the runtime system
as an update query (if the latter is not done, the runtime system will sense the new metadata and
issue an update query).
"""

tag_descriptions["metadata"] = r"""
The [metadata endpoints](https://api.microbiomedata.org/docs#/metadata) can be used to get and filter
metadata from collection set types (including
[studies](https://w3id.org/nmdc/Study/),
[biosamples](https://w3id.org/nmdc/Biosample/),
[planned processes](https://w3id.org/nmdc/PlannedProcess/), and
[data objects](https://w3id.org/nmdc/DataObject/)
as discussed in the __find__ section).
<br/>

The __metadata__ endpoints allow users to retrieve metadata from the data portal using the various
GET endpoints that are slightly different than the __find__ endpoints, but some can be used similarly.
As with the __find__ endpoints, parameters for the __metadata__ endpoints that do not have a
red ___* required___ next to them are optional. <br/>

Unlike the compact syntax used in the __find__ endpoints, the syntax for the filter parameter of
the metadata endpoints
uses [MongoDB-like language querying](https://www.mongodb.com/docs/manual/tutorial/query-documents/).
"""

tag_descriptions["find"] = r"""
The [find endpoints](https://api.microbiomedata.org/docs#/find) are provided with NMDC metadata entities
already specified - where metadata about [studies](https://w3id.org/nmdc/Study),
[biosamples](https://w3id.org/nmdc/Biosample), [data objects](https://w3id.org/nmdc/DataObject/),
and [planned processes](https://w3id.org/nmdc/PlannedProcess/) can be retrieved using GET requests.
<br/>

Each endpoint is unique and requires the applicable attribute names to be known in order to structure a query
in a meaningful way. Parameters that do not have a red ___* required___ label next to them are optional.
"""

tag_descriptions["runs"] = r"""
**WORK IN PROGRESS**

Run simple jobs.

For off-site job runs, keep the Runtime appraised of run events.
"""

# Remove leading and trailing whitespace from each description.
tag_descriptions = {
    name: description.strip() for name, description in tag_descriptions.items()
}
|
|
199
|
+
|
|
200
|
+
# The order of this list determines the order in which the tags appear in the API docs.
_tag_names_in_display_order: List[str] = [
    "sites",
    "users",
    "workflows",
    "capabilities",
    "object types",
    "triggers",
    "jobs",
    "objects",
    "operations",
    "queries",
    "metadata",
    "find",
    "runs",
]

# One descriptor per tag, pairing the tag's name with its description.
ordered_tag_descriptors: List[Dict[str, str]] = [
    {"name": tag_name, "description": tag_descriptions[tag_name]}
    for tag_name in _tag_names_in_display_order
]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def make_api_description(schema_version: str, orcid_login_url: str) -> str:
    r"""
    Returns an API description into which the specified schema version and
    ORCID login URL have been incorporated.

    Args:
        schema_version (str): The version of `nmdc-schema` the Runtime is using.
        orcid_login_url (str): The URL at which a user could login via ORCID.

    Returns:
        str: The Markdown-formatted API description.
    """
    intro = (
        "The NMDC Runtime API, via on-demand functions and via schedule-based and sensor-based automation,\n"
        "supports validation and submission of metadata, as well as orchestration of workflow executions."
    )
    schema_note = (
        f"[NMDC Schema](https://microbiomedata.github.io/nmdc-schema/) version: `{schema_version}`"
    )
    docs_link = "[Documentation](https://docs.microbiomedata.org/runtime/)"
    # The login URL is HTML-escaped since it is embedded in an HTML attribute.
    login_snippet = (
        '<img src="/static/ORCIDiD_icon128x128.png" height="18" width="18"/>\n'
        f'<a href="{escape(orcid_login_url)}" title="Login with ORCID">\n'
        "Login with ORCID\n"
        "</a>"
    )
    return "\n\n".join([intro, schema_note, docs_link, login_snippet])
|
nmdc_runtime/config.py
CHANGED
|
@@ -1,5 +1,56 @@
|
|
|
1
|
-
|
|
1
|
+
"""
|
|
2
|
+
This module acts as a unified interface between the codebase and the environment.
|
|
3
|
+
We will eventually move all of the Runtime's environment variables reads into this
|
|
4
|
+
module, instead of leaving them sprinkled throughout the codebase.
|
|
2
5
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
+
TODO: Move all environment variable reads into this module and update references accordingly.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Set
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def is_env_var_true(name: str, default: str = "false") -> bool:
    r"""
    Checks whether the value of the specified environment variable
    meets our criteria for true-ness.

    Only the string "true" (compared case-insensitively) counts as true; any
    other value — including non-boolean strings — counts as false. The
    `default` string is used when the environment variable is not defined.

    Reference: https://docs.python.org/3/library/os.html#os.environ

    Run doctests via: $ python -m doctest nmdc_runtime/config.py

    >>> import os
    >>> name = "EXAMPLE_ENV_VAR"
    >>> _ = os.environ.pop(name, None)  # Undefined
    >>> is_env_var_true(name)
    False
    >>> is_env_var_true(name, "true")  # Undefined, overridden default
    True
    >>> os.environ[name] = "false"  # Defined as false
    >>> is_env_var_true(name)
    False
    >>> os.environ[name] = "true"  # Defined as true
    >>> is_env_var_true(name)
    True
    >>> os.environ[name] = "TRUE"  # Case-insensitive
    >>> is_env_var_true(name)
    True
    >>> os.environ[name] = "potato"  # Non-boolean string
    >>> is_env_var_true(name)
    False
    """
    # Note: The doctest above uses `os.environ.pop(name, None)` (not
    # `os.unsetenv`) to undefine the variable, because `os.unsetenv` does not
    # update the `os.environ` mapping that this function reads.
    lowercase_true_strings: Set[str] = {"true"}
    return os.environ.get(name, default).lower() in lowercase_true_strings
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Feature flag to enable/disable the `/nmdcschema/linked_instances` endpoint and the tests that target it.
# Defaults to enabled unless the environment variable says otherwise.
IS_LINKED_INSTANCES_ENDPOINT_ENABLED: bool = is_env_var_true(
    "IS_LINKED_INSTANCES_ENDPOINT_ENABLED", default="true"
)

# Feature flag that can be used to enable/disable the `/scalar` endpoint.
# Defaults to enabled.
IS_SCALAR_ENABLED: bool = is_env_var_true("IS_SCALAR_ENABLED", default="true")

# Feature flag that can be used to enable/disable performance profiling,
# which can be activated via the `?profile=true` URL query parameter.
# Defaults to disabled.
IS_PROFILING_ENABLED: bool = is_env_var_true("IS_PROFILING_ENABLED", default="false")
|
nmdc_runtime/core/db/Database.py
CHANGED
|
@@ -3,14 +3,6 @@
|
|
|
3
3
|
|
|
4
4
|
## system level modules
|
|
5
5
|
import pandas as pds
|
|
6
|
-
import jq
|
|
7
|
-
import jsonasobj
|
|
8
|
-
import json
|
|
9
|
-
import zipfile
|
|
10
|
-
import yaml
|
|
11
|
-
from yaml import CLoader as Loader, CDumper as Dumper
|
|
12
|
-
from dotted_dict import DottedDict
|
|
13
|
-
from collections import namedtuple
|
|
14
6
|
|
|
15
7
|
|
|
16
8
|
def extract_table(merged_df, table_name):
|
|
@@ -3,13 +3,9 @@
|
|
|
3
3
|
|
|
4
4
|
## system level modules
|
|
5
5
|
import pandas as pds
|
|
6
|
-
import jsonasobj
|
|
7
|
-
import json
|
|
8
6
|
import zipfile
|
|
9
7
|
import yaml
|
|
10
|
-
from
|
|
11
|
-
from yaml import CLoader as Loader, CDumper as Dumper
|
|
12
|
-
from dotted_dict import DottedDict
|
|
8
|
+
from yaml import CLoader as Loader
|
|
13
9
|
from collections import namedtuple
|
|
14
10
|
|
|
15
11
|
|
|
@@ -309,10 +305,10 @@ def make_study_dataframe(study_table, contact_table, proposals_table, result_col
|
|
|
309
305
|
|
|
310
306
|
## make sure the contact ids are strings with the ".0" removed from the end (i.e., the strings aren't floats)
|
|
311
307
|
study_table["contact_id"] = (
|
|
312
|
-
study_table["contact_id"].astype(str).replace("\.0", "", regex=True)
|
|
308
|
+
study_table["contact_id"].astype(str).replace(r"\.0", "", regex=True)
|
|
313
309
|
)
|
|
314
310
|
contact_table_splice["contact_id"] = (
|
|
315
|
-
contact_table_splice["contact_id"].astype(str).replace("\.0", "", regex=True)
|
|
311
|
+
contact_table_splice["contact_id"].astype(str).replace(r"\.0", "", regex=True)
|
|
316
312
|
)
|
|
317
313
|
# print(study_table[['contact_id', 'principal_investigator_name']].head())
|
|
318
314
|
|
|
@@ -8,14 +8,8 @@ import nmdc_runtime.lib.load_nmdc_data as lx
|
|
|
8
8
|
import nmdc_runtime.lib.nmdc_dataframes as nmdc_dfs
|
|
9
9
|
from nmdc_schema import nmdc
|
|
10
10
|
import pandas as pds
|
|
11
|
-
import jq
|
|
12
|
-
import jsonasobj
|
|
13
|
-
import json
|
|
14
|
-
import zipfile
|
|
15
11
|
import yaml
|
|
16
|
-
from yaml import CLoader as Loader
|
|
17
|
-
from dotted_dict import DottedDict
|
|
18
|
-
from collections import namedtuple
|
|
12
|
+
from yaml import CLoader as Loader
|
|
19
13
|
|
|
20
14
|
|
|
21
15
|
class NMDC_ETL:
|
nmdc_runtime/minter/config.py
CHANGED
|
@@ -23,9 +23,11 @@ def typecodes() -> List[dict]:
|
|
|
23
23
|
that class _today_; regardless of what it may have used in the past.
|
|
24
24
|
|
|
25
25
|
>>> typecode_descriptors = typecodes()
|
|
26
|
+
|
|
26
27
|
# Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
|
|
27
28
|
>>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
|
|
28
29
|
True
|
|
30
|
+
|
|
29
31
|
# Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
|
|
30
32
|
>>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
|
|
31
33
|
True
|