nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Optional, Any, Dict, List, Union, TypedDict
|
|
4
|
+
|
|
5
|
+
import bson
|
|
6
|
+
import bson.json_util
|
|
7
|
+
from pydantic import (
|
|
8
|
+
model_validator,
|
|
9
|
+
Field,
|
|
10
|
+
BaseModel,
|
|
11
|
+
PositiveInt,
|
|
12
|
+
NonNegativeInt,
|
|
13
|
+
field_validator,
|
|
14
|
+
WrapSerializer,
|
|
15
|
+
)
|
|
16
|
+
from toolz import assoc, assoc_in
|
|
17
|
+
from typing_extensions import Annotated
|
|
18
|
+
|
|
19
|
+
from nmdc_runtime.api.core.util import pick
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def bson_to_json(doc: Any, handler) -> dict:
    """Round-trip a document through MongoDB extended JSON so BSON values serialize.

    `handler` is the pydantic WrapSerializer callback; it is intentionally unused
    because the whole document is re-encoded via `bson.json_util`.
    """
    extended_json = bson.json_util.dumps(doc)
    return json.loads(extended_json)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# A plain dict standing in for a mongo document; BSON-specific values (e.g.
# ObjectId) are serialized as JSON via `bson_to_json` when dumped by pydantic.
Document = Annotated[Dict[str, Any], WrapSerializer(bson_to_json)]

# Integer constrained to {0, 1} (e.g. projection include/exclude, `ok` status).
OneOrZero = Annotated[int, Field(ge=0, le=1)]
# Integer constrained to exactly 1 (ascending sort direction).
One = Annotated[int, Field(ge=1, le=1)]
# Integer constrained to exactly -1 (descending sort direction).
MinusOne = Annotated[int, Field(ge=-1, le=-1)]
# Sort/hint direction: 1 (ascending) or -1 (descending).
OneOrMinusOne = Union[One, MinusOne]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class CommandBase(BaseModel):
    """Base for all MongoDB command models; carries the optional `comment`
    field that mongo accepts on every command."""

    comment: Optional[Any] = None


class CollStatsCommand(CommandBase):
    """Model for mongo's `collStats` command (collection statistics)."""

    collStats: str  # name of the target collection
    scale: Optional[int] = 1  # scale factor applied to reported sizes

class CountCommand(CommandBase):
    """Model for mongo's `count` command."""

    count: str  # name of the target collection
    query: Optional[Document] = None  # filter restricting which docs are counted


class FindCommand(CommandBase):
    """Model for mongo's `find` command."""

    find: str  # name of the target collection
    filter: Optional[Document] = None
    projection: Optional[Dict[str, OneOrZero]] = None  # per-field include/exclude
    allowPartialResults: Optional[bool] = True
    batchSize: Optional[PositiveInt] = 101
    sort: Optional[Dict[str, OneOrMinusOne]] = None
    limit: Optional[NonNegativeInt] = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class AggregateCommand(CommandBase):
    """Model for mongo's `aggregate` command.

    Pipeline stages that would write to the database (`$out`, `$merge`) are
    rejected, and a default cursor batch size of 25 is injected when the
    client does not supply a `cursor` document.
    """

    aggregate: str  # name of the target collection
    pipeline: List[Document]
    allowDiskUse: Optional[bool] = False
    cursor: Optional[Document] = None

    @field_validator("pipeline")
    @classmethod
    def disallow_invalid_pipeline_stages(
        cls, pipeline: List[Document]
    ) -> List[Document]:
        """Reject pipelines that contain write stages.

        Raises:
            ValueError: if any stage is `$out` or `$merge`.
        """
        deny_list = ["$out", "$merge"]

        if any(
            key in deny_list for pipeline_stage in pipeline for key in pipeline_stage
        ):
            # Fix: message previously said "$Out", which does not match the
            # actual stage name `$out`.
            raise ValueError("$out and $merge pipeline stages are not allowed.")

        return pipeline

    @model_validator(mode="before")
    @classmethod
    def ensure_default_value_for_cursor(cls, data: Any) -> Document:
        """Inject a default cursor batch size (25) when input omits `cursor`."""
        if isinstance(data, dict) and "cursor" not in data:
            return assoc(data, "cursor", {"batchSize": 25})
        return data
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class GetMoreCommand(CommandBase):
    # Note: No `collection` field. See `QueryContinuation` for how
    # inter-API-request "sessions" are modeled.
    getMore: str  # Note: runtime uses a `str` id, not an `int` like mongo's native session cursors.
    batchSize: Optional[PositiveInt] = None


class CommandResponse(BaseModel):
    """Base for all command responses; `ok` is 1 on success, 0 on failure."""

    ok: OneOrZero
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class CollStatsCommandResponse(CommandResponse):
    """Response model for the `collStats` command."""

    ns: str  # collection namespace
    size: float  # sizes below are reported in units of `scaleFactor`
    count: float  # number of documents in the collection
    avgObjSize: Optional[float] = None  # optional: absent for empty collections
    storageSize: float
    totalIndexSize: float
    totalSize: float
    scaleFactor: float


class CountCommandResponse(CommandResponse):
    """Response model for the `count` command."""

    n: NonNegativeInt  # number of matching documents
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class CommandResponseCursor(BaseModel):
    # Note: No `ns` field, `id` is a `str`, and `partialResultsReturned` aliased to
    # `queriedShardsUnavailable` to be less confusing to Runtime API clients. See
    # `QueryContinuation` for how inter-API-request "sessions" are modeled.
    batch: List[Document]  # the documents returned in this batch
    partialResultsReturned: Optional[bool] = Field(
        None, alias="queriedShardsUnavailable"
    )
    id: Optional[str] = None  # continuation/cursor id, always a string here

    @field_validator("id", mode="before")
    @classmethod
    def coerce_int_to_str(cls, value: Any) -> Any:
        """Coerce mongo's native integer cursor ids to the `str` form used here."""
        if isinstance(value, int):
            return str(value)
        else:
            return value
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class CursorYieldingCommandResponse(CommandResponse):
    """Response model for commands whose result is delivered via a cursor."""

    cursor: CommandResponseCursor

    @classmethod
    def slimmed(cls, cmd_response) -> Optional["CursorYieldingCommandResponse"]:
        """Create a new response object that retains only the `_id` for each cursor batch document."""
        dump: dict = cmd_response.model_dump(exclude_unset=True)

        # If any dictionary in this batch lacks an `_id` key, log a warning and return `None`.
        id_list = [pick(["_id"], batch_doc) for batch_doc in dump["cursor"]["batch"]]
        if any("_id" not in doc for doc in id_list):
            logging.warning("Some documents in the batch lack an `_id` field.")
            return None

        # Replace the full documents with their `_id`-only projections.
        dump = assoc_in(
            dump,
            ["cursor", "batch"],
            id_list,
        )
        return cls(**dump)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class DeleteStatement(BaseModel):
    """One element of a `delete` command's `deletes` array."""

    q: Document  # query matching the documents to delete
    # `limit` is required: https://www.mongodb.com/docs/manual/reference/command/delete/#std-label-deletes-array-limit
    limit: OneOrZero  # 0 = delete all matches; 1 = delete at most one
    hint: Optional[Dict[str, OneOrMinusOne]] = None  # optional index hint


class DeleteCommand(CommandBase):
    """Model for mongo's `delete` command."""

    delete: str  # name of the target collection
    deletes: List[DeleteStatement]


class DeleteCommandResponse(CommandResponse):
    """Response model for the `delete` command."""

    ok: OneOrZero  # re-declares the inherited field; kept for explicitness
    n: NonNegativeInt  # number of documents deleted
    writeErrors: Optional[List[Document]] = None


# Custom types for the `delete_specs` derived from `DeleteStatement`s.
DeleteSpec = TypedDict("DeleteSpec", {"filter": Document, "limit": OneOrZero})
DeleteSpecs = List[DeleteSpec]
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# If `multi==True` all documents that meet the query criteria will be updated.
# Else only a single document that meets the query criteria will be updated.
class UpdateStatement(BaseModel):
    """One element of an `update` command's `updates` array."""

    q: Document  # query selecting the documents to update
    u: Document  # the update to apply
    upsert: bool = False
    multi: bool = False
    hint: Optional[Dict[str, OneOrMinusOne]] = None  # optional index hint


# Custom types for the `update_specs` derived from `UpdateStatement`s.
# NOTE(review): `UpdateSpec` is identical to `DeleteSpec` ("filter"/"limit");
# the "limit" key looks copy-pasted from the delete case — confirm intended.
UpdateSpec = TypedDict("UpdateSpec", {"filter": Document, "limit": OneOrZero})
UpdateSpecs = List[UpdateSpec]


class UpdateCommand(CommandBase):
    """Model for mongo's `update` command."""

    update: str  # name of the target collection
    updates: List[UpdateStatement]


class DocumentUpserted(BaseModel):
    """One element of an update response's `upserted` array."""

    index: NonNegativeInt  # index into the command's `updates` array
    # NOTE(review): pydantic treats leading-underscore names as private
    # attributes, so `_id` may not be validated/serialized as a field — confirm.
    _id: bson.ObjectId


class UpdateCommandResponse(CommandResponse):
    """Response model for the `update` command."""

    ok: OneOrZero  # re-declares the inherited field; kept for explicitness
    n: NonNegativeInt  # number of documents matched
    nModified: NonNegativeInt  # number of documents actually modified
    upserted: Optional[List[DocumentUpserted]] = None
    writeErrors: Optional[List[Document]] = None
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# Read commands whose results may be continued across requests.
QueryCmd = Union[FindCommand, AggregateCommand]

# Commands whose responses carry a cursor (queries plus `getMore`).
CursorYieldingCommand = Union[
    QueryCmd,
    GetMoreCommand,
]

# Union of all supported command models.
Cmd = Union[
    CursorYieldingCommand,
    CollStatsCommand,
    CountCommand,
    DeleteCommand,
    UpdateCommand,
]

# Union of all command-response models.
CommandResponseOptions = Union[
    CursorYieldingCommandResponse,
    CollStatsCommandResponse,
    CountCommandResponse,
    DeleteCommandResponse,
    UpdateCommandResponse,
]
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def command_response_for(type_: type) -> Optional[type]:
    """Return the response model class for a given command model class.

    Args:
        type_: a command model class (e.g. `FindCommand`, `DeleteCommand`).

    Returns:
        The matching `CommandResponse` subclass, or `None` if `type_` is not
        a recognized command class.
    """
    # All cursor-yielding commands (find/aggregate/getMore) share one response shape.
    if issubclass(type_, CursorYieldingCommand):
        return CursorYieldingCommandResponse

    d = {
        CollStatsCommand: CollStatsCommandResponse,
        CountCommand: CountCommandResponse,
        DeleteCommand: DeleteCommandResponse,
        UpdateCommand: UpdateCommandResponse,
    }
    return d.get(type_)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A *query continuation* is a means to effectively resume a query, i.e. a `find` or `aggregate` MongoDB database command.
|
|
3
|
+
|
|
4
|
+
A *query continuation* document represents a *continuation* (cf. <https://en.wikipedia.org/wiki/Continuation>) for a
|
|
5
|
+
query and uses a stored value ("cursor") for MongoDB's guaranteed unique-valued document field, `_id`,
|
|
6
|
+
such that the documents returned by the command are guaranteed to be sorted in ascending order by `_id`.
|
|
7
|
+
|
|
8
|
+
In this way, an API client may retrieve all documents defined by a `find` or `aggregate` command over multiple HTTP
|
|
9
|
+
requests. One can think of this process as akin to pagination; however, with "cursor-based" pagination, there are no
|
|
10
|
+
guarantees wrt a fixed "page size".
|
|
11
|
+
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import datetime
|
|
15
|
+
import logging
|
|
16
|
+
import json
|
|
17
|
+
|
|
18
|
+
from pydantic import BaseModel, Field
|
|
19
|
+
from pymongo.database import Database as MongoDatabase
|
|
20
|
+
|
|
21
|
+
from nmdc_runtime.api.core.idgen import generate_one_id
|
|
22
|
+
from nmdc_runtime.api.core.util import now
|
|
23
|
+
from nmdc_runtime.api.db.mongo import get_mongo_db
|
|
24
|
+
from nmdc_runtime.api.models.query import (
|
|
25
|
+
CommandResponse,
|
|
26
|
+
QueryCmd,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# Name of the mongo collection holding query-continuation documents.
COLLECTION_NAME_FOR_QUERY_CONTINUATIONS = "_runtime.query_continuations"

# Module-level handles, resolved once at import time.
_mdb: MongoDatabase = get_mongo_db()
_qc_collection = _mdb[COLLECTION_NAME_FOR_QUERY_CONTINUATIONS]

# Ensure one-hour TTL on `_runtime.query_continuations` documents via TTL Index.
# Reference: https://www.mongodb.com/docs/manual/core/index-ttl/
_qc_collection.create_index({"last_modified": 1}, expireAfterSeconds=3600)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def not_empty(lst: list) -> bool:
    """Return True when `lst` contains at least one element."""
    return bool(lst)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class QueryContinuation(BaseModel):
    """A query that has not completed, and that may be resumed, using `cursor` to modify `query_cmd`.

    This model is intended to represent the state of a logical "session" to "page" through a query's results
    over several HTTP requests, and may be discarded after fetching all "batches" of documents.

    Thus, a mongo collection tracking query continuations may be reasonably given e.g. a so-called "TTL Index"
    for the `last_modified` field, assuming that `last_modified` is updated each time `query` is updated.
    """

    id: str = Field(..., alias="_id")  # mongo document `_id`, exposed as `id`
    query_cmd: QueryCmd  # the original `find`/`aggregate` command being continued
    cursor: str  # JSON-encoded `_id` of the last document returned (see `create_qc`)
    last_modified: datetime.datetime  # drives the TTL index created at module load
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class QueryContinuationError(Exception):
    """Raised when a query continuation cannot be found or resumed."""

    def __init__(self, detail: str):
        # Pass `detail` to the base class so `str(exc)` and `exc.args` are useful.
        super().__init__(detail)
        self.detail = detail

    def __repr__(self):
        # Fix: the original f-string ended with a stray unmatched ")".
        return f"{self.__class__.__name__}: {self.detail}"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def dump_qc(m: BaseModel):
    """Dump a model to a dict suitable for mongo insertion (uses field aliases, e.g. `_id`)."""
    return m.model_dump(by_alias=True, exclude_unset=True)


def create_qc(query_cmd: QueryCmd, cmd_response: CommandResponse) -> QueryContinuation:
    """Creates query continuation from command and response, and persists continuation to database."""

    logging.info(f"cmd_response: {cmd_response}")
    # Cursor value = JSON-encoded `_id` of the last document in the current batch.
    # NOTE(review): assumes the batch is non-empty and each doc carries `_id` —
    # confirm callers guarantee this before calling.
    last_id = json.dumps(cmd_response.cursor.batch[-1]["_id"])
    logging.info(f"Last document ID for query continuation: {last_id}")
    cc = QueryContinuation(
        _id=generate_one_id(_mdb, "query_continuation"),
        query_cmd=query_cmd,
        cursor=last_id,
        last_modified=now(),
    )
    _qc_collection.insert_one(dump_qc(cc))
    return cc
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_qc_by__id(_id: str) -> QueryContinuation | None:
    r"""
    Returns the `QueryContinuation` having the specified `_id` value, raising an exception
    if the corresponding document does not exist in the database.

    Raises:
        QueryContinuationError: if no continuation document has the given `_id`.
    """
    doc = _qc_collection.find_one({"_id": _id})
    if doc is None:
        raise QueryContinuationError(f"cannot find cc with id {_id}")
    return QueryContinuation(**doc)


def get_last_doc__id_for_qc(query_continuation: QueryContinuation) -> str:
    """
    Retrieve the last document `_id` for the given `QueryContinuation`.

    NOTE(review): annotated as returning `str`, but `json.loads` yields whatever
    type the `_id` was originally encoded from — confirm the annotation.
    """
    # Assuming `query_continuation` has an attribute `cursor` that stores the last document _id
    logging.info(f"Cursor for last doc query continuation: {query_continuation.cursor}")
    return json.loads(query_continuation.cursor)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_initial_query_for_qc(query_continuation: QueryContinuation) -> QueryCmd:
    """Return the original `find`/`aggregate` command stored on the continuation."""
    initial_query = query_continuation.query_cmd
    return initial_query
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
import os
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
from dagster_graphql import DagsterGraphQLClient
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
from pymongo.database import Database as MongoDatabase
|
|
9
|
+
from toolz import merge
|
|
10
|
+
|
|
11
|
+
from nmdc_runtime.api.core.idgen import generate_one_id
|
|
12
|
+
from nmdc_runtime.api.core.util import now, now_str, raise404_if_none, pick
|
|
13
|
+
from nmdc_runtime.api.models.user import User
|
|
14
|
+
|
|
15
|
+
# Base URLs used for OpenLineage-style `producer` and `schemaURL` fields.
PRODUCER_URL_BASE_DEFAULT = (
    "https://github.com/microbiomedata/nmdc-runtime/tree/main/nmdc_runtime/"
)
SCHEMA_URL_BASE_DEFAULT = (
    "https://github.com/microbiomedata/nmdc-runtime/tree/main/nmdc_runtime/"
)

# Pin to a tagged revision (v0-0-1) rather than the moving `main` branch.
PRODUCER_URL = PRODUCER_URL_BASE_DEFAULT.replace("/main/", "/v0-0-1/") + "producer"
SCHEMA_URL = SCHEMA_URL_BASE_DEFAULT.replace("/main/", "/v0-0-1/") + "schema.json"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class OpenLineageBase(BaseModel):
    """Base for models carrying OpenLineage-style provenance fields."""

    producer: str  # URI (or username) identifying what produced this record
    schemaURL: str  # URI of the schema describing this record


class RunUserSpec(BaseModel):
    """User-supplied specification for requesting a run of a job."""

    job_id: str
    run_config: dict = {}  # pydantic deep-copies mutable defaults, so this is safe
    inputs: List[str] = []


class JobSummary(OpenLineageBase):
    """Minimal description of a job (id + description)."""

    id: str
    description: str


class Run(BaseModel):
    """A single run, optionally annotated with arbitrary `facets` metadata."""

    id: str
    facets: Optional[dict] = None


class RunEventType(str, Enum):
    """Lifecycle states a run event may record."""

    REQUESTED = "REQUESTED"
    STARTED = "STARTED"
    FAIL = "FAIL"
    COMPLETE = "COMPLETE"


class RunSummary(OpenLineageBase):
    """Flattened summary of a run's current state."""

    id: str
    status: RunEventType
    started_at_time: str
    was_started_by: str
    inputs: List[str]
    outputs: List[str]
    job: JobSummary


class RunEvent(OpenLineageBase):
    """A single event in a run's lifecycle (cf. `RunEventType`)."""

    run: Run
    job: JobSummary
    type: RunEventType
    time: str  # timestamp string (producers in this module use `now_str()`)
    inputs: Optional[List[str]] = []
    outputs: Optional[List[str]] = []
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@lru_cache
def get_dagster_graphql_client() -> DagsterGraphQLClient:
    """Build (and cache) a Dagster GraphQL client from the `DAGIT_HOST` env var.

    `DAGIT_HOST` may include a scheme (e.g. "http://host:3000"); the scheme is
    stripped, and host and port are split on the final colon group.

    Raises:
        ValueError: if `DAGIT_HOST` is unset, lacks a `:port` suffix, or the
            port is not an integer.
    """
    dagit_host = os.getenv("DAGIT_HOST")
    if dagit_host is None:
        # Fail with a clear message instead of `AttributeError` on `None.split`.
        raise ValueError("DAGIT_HOST environment variable is not set")
    try:
        hostname, port_str = dagit_host.split("://", 1)[-1].split(":", 1)
    except ValueError as err:
        raise ValueError(
            f"DAGIT_HOST must look like '[scheme://]host:port', got {dagit_host!r}"
        ) from err
    port_number = int(port_str)
    return DagsterGraphQLClient(hostname=hostname, port_number=port_number)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _add_run_requested_event(run_spec: RunUserSpec, mdb: MongoDatabase, user: User):
    """Record a REQUESTED run event for `run_spec` and return the new run id.

    Responds 404 (via `raise404_if_none`) when the referenced job does not exist.
    """
    # XXX what we consider a "job" here, is currently a "workflow" elsewhere...
    job = raise404_if_none(mdb.workflows.find_one({"id": run_spec.job_id}))
    run_id = generate_one_id(mdb, "runs")
    event = RunEvent(
        producer=user.username,  # the requesting user is the event producer
        schemaURL=SCHEMA_URL,
        run=Run(id=run_id, facets={"nmdcRuntime_runConfig": run_spec.run_config}),
        # Job summary = the workflow doc's id/description plus provenance URLs.
        job=merge(
            pick(["id", "description"], job),
            {"producer": PRODUCER_URL, "schemaURL": SCHEMA_URL},
        ),
        type=RunEventType.REQUESTED,
        time=now_str(),
        inputs=run_spec.inputs,
    )
    mdb.run_events.insert_one(event.model_dump())
    return run_id
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _add_run_started_event(run_id: str, mdb: MongoDatabase):
    """Record a STARTED event for `run_id`, copying run/job info from the most
    recent REQUESTED event; returns `run_id`.
    """
    # Latest REQUESTED event for this run (404s via `raise404_if_none` if absent).
    requested: RunEvent = RunEvent(
        **raise404_if_none(
            mdb.run_events.find_one(
                {"run.id": run_id, "type": "REQUESTED"}, sort=[("time", -1)]
            )
        )
    )
    mdb.run_events.insert_one(
        RunEvent(
            producer=PRODUCER_URL,
            schemaURL=SCHEMA_URL,
            run=requested.run,
            job=requested.job,
            type=RunEventType.STARTED,
            time=now_str(),
        ).model_dump()
    )
    return run_id
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _add_run_fail_event(run_id: str, mdb: MongoDatabase):
    """Record a FAIL event for `run_id`, copying run/job info from the most
    recent REQUESTED event; returns `run_id`.
    """
    # Latest REQUESTED event for this run (404s via `raise404_if_none` if absent).
    requested: RunEvent = RunEvent(
        **raise404_if_none(
            mdb.run_events.find_one(
                {"run.id": run_id, "type": "REQUESTED"}, sort=[("time", -1)]
            )
        )
    )
    mdb.run_events.insert_one(
        RunEvent(
            producer=PRODUCER_URL,
            schemaURL=SCHEMA_URL,
            run=requested.run,
            job=requested.job,
            type=RunEventType.FAIL,
            time=now_str(),
        ).model_dump()
    )
    return run_id
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _add_run_complete_event(run_id: str, mdb: MongoDatabase, outputs: List[str]):
    """Record a COMPLETE event for `run_id` with its `outputs`, copying run/job
    info from the most recent STARTED event; returns `run_id`.
    """
    # Latest STARTED event for this run (404s via `raise404_if_none` if absent).
    started: RunEvent = RunEvent(
        **raise404_if_none(
            mdb.run_events.find_one(
                {"run.id": run_id, "type": "STARTED"}, sort=[("time", -1)]
            )
        )
    )
    mdb.run_events.insert_one(
        RunEvent(
            producer=PRODUCER_URL,
            schemaURL=SCHEMA_URL,
            run=started.run,
            job=started.job,
            type=RunEventType.COMPLETE,
            time=now_str(),
            outputs=outputs,
        ).model_dump()
    )
    return run_id
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
import pymongo.database
|
|
4
|
+
from fastapi import Depends
|
|
5
|
+
from jose import JWTError, jwt
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from nmdc_runtime.api.core.auth import (
|
|
9
|
+
verify_password,
|
|
10
|
+
TokenData,
|
|
11
|
+
optional_oauth2_scheme,
|
|
12
|
+
)
|
|
13
|
+
from nmdc_runtime.api.db.mongo import get_mongo_db
|
|
14
|
+
from nmdc_runtime.api.models.user import (
|
|
15
|
+
oauth2_scheme,
|
|
16
|
+
credentials_exception,
|
|
17
|
+
SECRET_KEY,
|
|
18
|
+
ALGORITHM,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Site(BaseModel):
    """A site registered with the Runtime API."""

    id: str
    capability_ids: List[str] = []  # ids of capabilities this site provides


class SiteClientInDB(BaseModel):
    """A client credential registered for a site; the secret is stored hashed."""

    id: str
    hashed_secret: str


class SiteInDB(Site):
    """Database representation of a site, including its registered clients."""

    clients: List[SiteClientInDB] = []
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_site(mdb, client_id: str) -> Optional[SiteInDB]:
    r"""
    Returns the site, if any, for which the specified `client_id` was generated.
    Returns None (implicitly) when no site has a client with that id.
    """
    site_doc = mdb.sites.find_one({"clients.id": client_id})
    return SiteInDB(**site_doc) if site_doc is not None else None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def authenticate_site_client(mdb, client_id: str, client_secret: str):
    """Verify a site client's credentials.

    Returns:
        The `SiteInDB` on success, or `False` when the client id is unknown
        or the secret does not match.
    """
    site = get_site(mdb, client_id)
    if not site:
        return False
    # `get_site` matched on `clients.id`, so a client with this id exists here.
    hashed_secret = next(
        client.hashed_secret for client in site.clients if client.id == client_id
    )
    if not verify_password(client_secret, hashed_secret):
        return False
    return site
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def get_current_client_site(
    token: str = Depends(oauth2_scheme),
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    """FastAPI dependency: resolve the site for the authenticated client token.

    Raises the shared `credentials_exception` when the token is invalidated,
    malformed, not a client token, or names an unknown client.
    """
    # Reject tokens that have been explicitly invalidated.
    if mdb.invalidated_tokens.find_one({"_id": token}):
        raise credentials_exception
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        subject: str = payload.get("sub")
        if subject is None:
            raise credentials_exception
        # Only subjects of the form "client:<id>" identify site clients.
        if not subject.startswith("client:"):
            raise credentials_exception
        client_id = subject.split("client:", 1)[1]
        token_data = TokenData(subject=client_id)
    except JWTError:
        raise credentials_exception
    site = get_site(mdb, client_id=token_data.subject)
    if site is None:
        raise credentials_exception
    return site
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def maybe_get_current_client_site(
    token: str = Depends(optional_oauth2_scheme),
    mdb: pymongo.database.Database = Depends(get_mongo_db),
):
    """Like `get_current_client_site`, but yields None when no token was supplied."""
    return None if token is None else await get_current_client_site(token, mdb)
|