nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from datetime import datetime, timedelta, timezone
|
|
3
|
+
from typing import Optional, Dict
|
|
4
|
+
|
|
5
|
+
from fastapi import Depends
|
|
6
|
+
from fastapi.exceptions import HTTPException
|
|
7
|
+
from fastapi.openapi.models import OAuthFlows as OAuthFlowsModel
|
|
8
|
+
from fastapi.param_functions import Form
|
|
9
|
+
from fastapi.security import (
|
|
10
|
+
OAuth2,
|
|
11
|
+
HTTPBasic,
|
|
12
|
+
HTTPBasicCredentials,
|
|
13
|
+
HTTPBearer,
|
|
14
|
+
HTTPAuthorizationCredentials,
|
|
15
|
+
)
|
|
16
|
+
from fastapi.security.utils import get_authorization_scheme_param
|
|
17
|
+
from jose import JWTError, jwt
|
|
18
|
+
from passlib.context import CryptContext
|
|
19
|
+
from pydantic import BaseModel
|
|
20
|
+
from starlette.requests import Request
|
|
21
|
+
from starlette.status import HTTP_400_BAD_REQUEST, HTTP_401_UNAUTHORIZED
|
|
22
|
+
|
|
23
|
+
# Base URL of ORCID's production environment. It is the default for
# `ORCID_BASE_URL` and the reference value used below to detect a
# non-production ORCID setup.
ORCID_PRODUCTION_BASE_URL = "https://orcid.org"

# Signing key (read from the environment) and algorithm for the JWTs that
# this module encodes and decodes.
SECRET_KEY = os.environ.get("JWT_SECRET_KEY")
ALGORITHM = "HS256"

# ORCID client settings, read from the environment at import time.
ORCID_NMDC_CLIENT_ID = os.environ.get("ORCID_NMDC_CLIENT_ID")
ORCID_NMDC_CLIENT_SECRET = os.environ.get("ORCID_NMDC_CLIENT_SECRET")
ORCID_BASE_URL = os.environ.get("ORCID_BASE_URL", ORCID_PRODUCTION_BASE_URL)

# JSON Web Key (JWK) published by ORCID.
#
# Note: This dictionary was copied from: https://orcid.org/oauth/jwks
#       That URL is listed in: https://orcid.org/.well-known/openid-configuration
#
# TODO: Consider _live-loading_ this dictionary from the Internet.
#
ORCID_JWK = dict(
    e="AQAB",
    kid="production-orcid-org-7hdmdswarosg3gjujo8agwtazgkp1ojs",
    kty="RSA",
    n="jxTIntA7YvdfnYkLSN4wk__E2zf_wbb0SV_HLHFvh6a9ENVRD1_rHK0EijlBzikb-1rgDQihJETcgBLsMoZVQqGj8fDUUuxnVHsuGav_bf41PA7E_58HXKPrB2C0cON41f7K3o9TStKpVJOSXBrRWURmNQ64qnSSryn1nCxMzXpaw7VUo409ohybbvN6ngxVy4QR2NCC7Fr0QVdtapxD7zdlwx6lEwGemuqs_oG5oDtrRuRgeOHmRps2R6gG5oc-JqVMrVRv6F9h4ja3UgxCDBQjOVT1BFPWmMHnHCsVYLqbbXkZUfvP2sO1dJiYd_zrQhi-FtNth9qrLLv3gkgtwQ",
    use="sig",
)

# When the application points at a _non-production_ ORCID environment, swap in
# the "kid" and "n" values of the sandbox ORCID environment.
#
# Source: https://sandbox.orcid.org/oauth/jwks
#
if ORCID_BASE_URL != ORCID_PRODUCTION_BASE_URL:
    ORCID_JWK.update(
        kid="sandbox-orcid-org-3hpgosl3b6lapenh1ewsgdob3fawepoj",
        n="pl-jp-kTAGf6BZUrWIYUJTvqqMVd4iAnoLS6vve-KNV0q8TxKvMre7oi9IulDcqTuJ1alHrZAIVlgrgFn88MKirZuTqHG6LCtEsr7qGD9XyVcz64oXrb9vx4FO9tLNQxvdnIWCIwyPAYWtPMHMSSD5oEVUtVL_5IaxfCJvU-FchdHiwfxvXMWmA-i3mcEEe9zggag2vUPPIqUwbPVUFNj2hE7UsZbasuIToEMFRZqSB6juc9zv6PEUueQ5hAJCEylTkzMwyBMibrt04TmtZk2w9DfKJR91555s2ZMstX4G_su1_FqQ6p9vgcuLQ6tCtrW77tta-Rw7McF_tyPmvnhQ",
    )

# Signature algorithm name paired with `ORCID_JWK`.
ORCID_JWS_VERITY_ALGORITHM = "RS256"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ClientCredentials(BaseModel):
    # Model with two required string fields: a client identifier and its secret.
    # (Documented with comments rather than a docstring so the generated
    # schema description stays as it was.)
    client_id: str
    client_secret: str
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class TokenExpires(BaseModel):
    # Duration expressed as day/hour/minute components.
    # When no values are supplied, the duration is one day.
    days: Optional[int] = 1
    hours: Optional[int] = 0
    minutes: Optional[int] = 0


# Module-wide expiry setting: exactly one day.
ACCESS_TOKEN_EXPIRES = TokenExpires(days=1, hours=0, minutes=0)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class Token(BaseModel):
    # An access token string, its token type, and (optionally) the
    # `TokenExpires` duration associated with it.
    access_token: str
    token_type: str
    expires: Optional[TokenExpires] = None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class TokenData(BaseModel):
    # Holds an optional `subject` string; defaults to None when not supplied.
    subject: Optional[str] = None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Password hashing context (bcrypt) shared by `verify_password` and
# `get_password_hash`.
pwd_context = CryptContext(deprecated="auto", schemes=["bcrypt"])

# Shared 401 response raised when supplied credentials cannot be validated.
credentials_exception = HTTPException(
    headers={"WWW-Authenticate": "Bearer"},
    detail="Could not validate credentials",
    status_code=HTTP_401_UNAUTHORIZED,
)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def verify_password(plain_password, hashed_password):
    """Check `plain_password` against `hashed_password` using `pwd_context`.

    :param plain_password: the candidate password
    :param hashed_password: the stored hash to compare against
    :returns: whatever `pwd_context.verify` returns for the pair
    """
    is_match = pwd_context.verify(plain_password, hashed_password)
    return is_match
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def get_password_hash(password):
    """Hash `password` with `pwd_context` and return the resulting hash.

    :param password: the password to hash
    :returns: whatever `pwd_context.hash` returns for the password
    """
    hashed = pwd_context.hash(password)
    return hashed
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
    """Encode `data` as a signed JWT that carries an "exp" (expiration) claim.

    The input dictionary is copied, so the caller's `data` is not modified.

    :param data: claims to include in the token
    :param expires_delta: lifetime of the token, measured from the current UTC
                          time; when omitted (None), the token expires after
                          15 minutes
    :returns: the encoded token, signed with `SECRET_KEY` using `ALGORITHM`
    """
    # Compare against None rather than relying on truthiness: an explicit
    # zero-length timedelta is falsy, and must not be silently replaced by
    # the 15-minute default.
    lifetime = timedelta(minutes=15) if expires_delta is None else expires_delta

    claims = data.copy()
    claims["exp"] = datetime.now(timezone.utc) + lifetime
    return jwt.encode(claims, SECRET_KEY, algorithm=ALGORITHM)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def get_access_token_expiration(token) -> Optional[int]:
    """Decode `token` and return the value of its "exp" claim.

    The value is returned exactly as it appears in the decoded payload; it is
    not converted to a `datetime`.
    NOTE(review): presumably a numeric (seconds-since-epoch) timestamp, as is
    conventional for the JWT "exp" claim -- confirm against the JWT library.

    :param token: encoded JWT, signed with `SECRET_KEY` using `ALGORITHM`
    :returns: the payload's "exp" value, or None if the payload has no "exp" key
    :raises HTTPException: the shared `credentials_exception` (HTTP 401), when
                           the token cannot be decoded
    """
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise credentials_exception
    return payload.get("exp")
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class OAuth2PasswordOrClientCredentialsBearer(OAuth2):
    """
    OAuth2 security scheme that declares both the "password" flow and the
    "clientCredentials" flow, each pointing at the same token URL.

    When an instance is called with a request, it extracts the bearer token
    from the request's "Authorization" header.
    """

    def __init__(
        self,
        tokenUrl: str,
        scheme_name: Optional[str] = None,
        scopes: Optional[Dict[str, str]] = None,
        auto_error: bool = True,
    ):
        """
        :param tokenUrl: URL of the token endpoint, used for both declared flows
        :param scheme_name: forwarded to the `OAuth2` base class
        :param scopes: scopes declared for the "password" flow; an empty
                       dictionary is used when omitted
        :param auto_error: whether calling the instance raises an HTTP 401
                           error (True) or returns None (False) when the
                           request carries no bearer credentials
        """
        flows = OAuthFlowsModel(
            password={"tokenUrl": tokenUrl, "scopes": scopes or {}},
            clientCredentials={"tokenUrl": tokenUrl},
        )
        super().__init__(flows=flows, scheme_name=scheme_name, auto_error=auto_error)

    async def __call__(self, request: Request) -> Optional[str]:
        """
        Return the token from the request's "Authorization: Bearer <token>" header.

        :param request: the incoming HTTP request
        :returns: the bearer token; or None when the header is missing or uses
                  a different scheme and `auto_error` is False
        :raises HTTPException: HTTP 401, when the header is missing or uses a
                               different scheme and `auto_error` is True
        """
        authorization: str = request.headers.get("Authorization")
        scheme, param = get_authorization_scheme_param(authorization)
        if authorization and scheme.lower() == "bearer":
            return param

        if self.auto_error:
            raise HTTPException(
                status_code=HTTP_401_UNAUTHORIZED,
                detail="Not authenticated",
                headers={"WWW-Authenticate": "Bearer"},
            )

        # Record the URL at debug level instead of printing it to stdout on
        # every request that lacks bearer credentials.
        import logging

        logging.getLogger(__name__).debug(
            "No bearer credentials in request to %s", request.url
        )
        return None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Security scheme instances. Note that `oauth2_scheme` and
# `optional_oauth2_scheme` are configured identically: with `auto_error=False`,
# each one yields None (instead of raising an HTTP 401 error) when a request
# carries no bearer token.
oauth2_scheme = OAuth2PasswordOrClientCredentialsBearer(
    auto_error=False,
    tokenUrl="token",
)
optional_oauth2_scheme = OAuth2PasswordOrClientCredentialsBearer(
    auto_error=False,
    tokenUrl="token",
)

# Plain HTTP bearer scheme, named "bearerAuth"; also created with `auto_error=False`.
bearer_scheme = HTTPBearer(auto_error=False, scheme_name="bearerAuth")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
async def basic_credentials(req: Request):
    """Apply a non-raising (`auto_error=False`) `HTTPBasic` scheme to `req`.

    :param req: the incoming HTTP request
    :returns: whatever the `HTTPBasic` scheme returns for the request
    """
    scheme = HTTPBasic(auto_error=False)
    return await scheme(req)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
async def bearer_credentials(req: Request):
    """Apply a non-raising (`auto_error=False`) `HTTPBearer` scheme to `req`.

    :param req: the incoming HTTP request
    :returns: whatever the `HTTPBearer` scheme returns for the request
    """
    scheme = HTTPBearer(auto_error=False, scheme_name="bearerAuth")
    return await scheme(req)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class OAuth2PasswordOrClientCredentialsRequestForm:
    """
    Request form for a token endpoint that accepts either the "password" grant
    or the "client_credentials" grant.

    Credentials are resolved in this order:

    1. If the request carries bearer credentials, the grant type is
       "client_credentials", the bearer token is used as the `client_id`,
       and `username`, `password` and `client_secret` are None.
    2. Otherwise, a "password" grant must include both `username` and `password`.
    3. Otherwise, a "client_credentials" grant takes `client_id` (and
       `client_secret`) from the form fields or, when the form has no
       `client_id`, from the request's HTTP Basic credentials.

    The resolved values are exposed as the attributes `grant_type`, `username`,
    `password`, `scopes` (the `scope` field split on whitespace), `client_id`
    and `client_secret`.
    """

    def __init__(
        self,
        basic_creds: Optional[HTTPBasicCredentials] = Depends(basic_credentials),
        bearer_creds: Optional[HTTPAuthorizationCredentials] = Depends(
            bearer_credentials
        ),
        grant_type: str = Form(None, pattern="^password$|^client_credentials$"),
        username: Optional[str] = Form(None),
        password: Optional[str] = Form(None),
        scope: str = Form(""),
        client_id: Optional[str] = Form(None),
        client_secret: Optional[str] = Form(None),
    ):
        """
        :raises HTTPException: HTTP 400, when a "password" grant lacks a
                               username or password, or when a
                               "client_credentials" grant has neither a
                               `client_id` form field nor HTTP Basic credentials
        """
        if bearer_creds:
            # Bearer credentials take precedence over the form fields.
            # Each branch adjusts the local variables only; the attributes are
            # assigned exactly once, below, so that the values chosen here are
            # not overwritten by the raw form values.
            grant_type = "client_credentials"
            username, password = None, None
            client_id = bearer_creds.credentials
            client_secret = None
        elif grant_type == "password" and (username is None or password is None):
            raise HTTPException(
                status_code=HTTP_400_BAD_REQUEST,
                detail="grant_type password requires username and password",
            )
        elif grant_type == "client_credentials" and client_id is None:
            if not basic_creds:
                raise HTTPException(
                    status_code=HTTP_400_BAD_REQUEST,
                    detail="grant_type client_credentials requires client_id and client_secret",
                )
            # Fall back to the HTTP Basic credentials.
            client_id = basic_creds.username
            client_secret = basic_creds.password

        self.grant_type = grant_type
        self.username = username
        self.password = password
        self.scopes = scope.split()
        self.client_id = client_id
        self.client_secret = client_secret
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
from datetime import datetime, timezone
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
import base32_lib as base32
|
|
5
|
+
from pymongo.database import Database as MongoDatabase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def generate_id(length=10, split_every=4, checksum=True) -> str:
    """Generate random base32 string: a user-shareable ID for a database entity.

    Uses Douglas Crockford Base32 encoding: <https://www.crockford.com/base32.html>

    With the default arguments, the identifier consists of 10 non-hyphen
    characters: 8 random characters (5 bits each) plus 2 digit characters for
    an ISO 7064 checksum. That gives 2**40 ~ 1 trillion possible values, *much*
    larger than the number of statements feasibly storable by the database.

    Hyphen splits are for human readability. With the default `split_every=4`,
    a hyphen is requested after every 4 characters.
    NOTE(review): the exact hyphen placement (and the meaning of
    `split_every=0`, which other code in this module passes to obtain an
    unhyphenated identifier) is determined by `base32_lib.generate` -- confirm
    against that library's documentation.

    :param length: non-hyphen identifier length *including* checksum
    :param split_every: hyphenates every that many characters
    :param checksum: computes and appends ISO-7064 checksum
    :returns: identifier as a string
    """
    return base32.generate(length=length, split_every=split_every, checksum=checksum)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def decode_id(encoded: str, checksum=True) -> int:
    """Turn a string ID (as produced by `generate_id`) back into its number.

    Before decoding, the string is normalized: it is lowercased, its hyphens
    are removed, and common user typos are corrected ({I,i,l,L} become 1 and
    {O,o} become 0).

    When `checksum` is enabled, a checksum mismatch raises a ValueError.

    :param encoded: string to decode
    :param checksum: extract checksum and validate
    :returns: original number.
    """
    decoded_number = base32.decode(encoded=encoded, checksum=checksum)
    return decoded_number
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def encode_id(number: int, split_every=4, min_length=10, checksum=True) -> str:
    """Encodes `number` to URI-friendly Douglas Crockford base32 string.

    The result is determined by the arguments (nothing here is random), so
    this is the counterpart of `decode_id`.

    :param number: number to encode
    :param split_every: if provided, insert '-' every `split_every` characters
                        going from left to right
    :param min_length: 0-pad beginning of string to obtain minimum desired length
    :param checksum: append modulo 97-10 (ISO 7064) checksum to string
    :returns: A Douglas Crockford base32 encoded string composed only
              of valid URI characters.
    """
    return base32.encode(
        number, split_every=split_every, min_length=min_length, checksum=checksum
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# sping: "semi-opaque string" (https://n2t.net/e/n2t_apidoc.html).
#
# Each tuple pairs a number of base32 characters (5 bits apiece) with half of
# the number of distinct values that many characters can represent.
#
# Note: The result is always the following list of tuples:
# ```
# [
#     ( 2, 512),
#     ( 4, 524288),
#     ( 6, 536870912),
#     ( 8, 549755813888),
#     (10, 562949953421312)
# ]
# ```
SPING_SIZE_THRESHOLDS = [
    (n_chars, 2 ** (5 * n_chars - 1)) for n_chars in (2, 4, 6, 8, 10)
]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def collection_name(naa, shoulder):
    r"""
    Build the string "ids_<naa>_<shoulder>", which is designed to be used as
    a MongoDB collection name.

    TODO: Document the function parameters, including expanding the "naa" acronym.
    """
    return "ids_{}_{}".format(naa, shoulder)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def generate_ids(
    mdb: MongoDatabase,
    owner: str,
    populator: str,
    number: int,
    ns: str = "",
    naa: str = "nmdc",
    shoulder: str = "fk4",
) -> List[str]:
    r"""
    Generate the specified number of identifiers, storing them in a MongoDB collection
    whose name is derived from the specified Name-Assigning Authority (NAA) and Shoulder.

    :param mdb: Handle to a MongoDB database
    :param owner: String that will go in the "__ao" field of the identifier record.
                  Callers will oftentimes set this to the name of a Runtime "site"
                  (as in, a "site client" site, not a "Dagster" site).
    :param populator: String that will go in the "who" field of the identifier record.
                      Indicates "who generated this ID." Callers will oftentimes set
                      this to the name of a Runtime "site" (as in, a "site client" site,
                      not a "Dagster" site).
    :param number: How many identifiers to generate. Must not be negative;
                   when it is 0, an empty list is returned and the database
                   is not accessed.
    :param ns: Namespace (see Minter docs); e.g. "changesheets"
    :param naa: Name-Assigning Authority (see Minter docs); e.g. "nmdc"
    :param shoulder: String that will go in the "how" field (see Minter docs); e.g. "sys0"
    :returns: The "where" value of each stored identifier record; i.e. strings
              of the form "<naa>:<shoulder><generated id>"
    :raises ValueError: if `number` is negative

    This function was written the way it was in an attempt to mirror the ARK spec:
    https://www.ietf.org/archive/id/draft-kunze-ark-41.html (found via: https://arks.org/specs/)

    Deviations from the ARK spec include:
    1. The inclusion of a typecode.
       The inclusion of a typecode came out of discussions with team members,
       who wanted identifiers to include some non-opaque substring that could be used
       to determine what type of resource a given identifier refers to.
    2. Making hyphens mandatory.
       We decided to make the hyphens mandatory, whereas the spec says they are optional.
       > "Hyphens are considered to be insignificant and are always ignored in ARKs."
       > Reference: https://www.ietf.org/archive/id/draft-kunze-ark-41.html#name-character-repertoires
       In our case, we require that users include an identifier's hyphens whenever
       they are using that identifier.
    """
    # Reject a negative count up front. Otherwise, the loop below could never
    # reach its target and would query the database forever.
    if number < 0:
        raise ValueError(f"number must not be negative (got {number})")
    if number == 0:
        return []

    collection = mdb.get_collection(collection_name(naa, shoulder))
    estimated_document_count = collection.estimated_document_count()

    # Use the smallest identifier size whose threshold exceeds the number of
    # identifiers the collection would hold after this call; fall back to 12
    # characters when no threshold is large enough.
    n_chars = next(
        (
            n
            for n, t in SPING_SIZE_THRESHOLDS
            if (number + estimated_document_count) < t
        ),
        12,
    )

    collected = []
    while len(collected) < number:
        # Generate as many distinct candidates as are still needed.
        # The two extra characters hold the checksum.
        n_to_generate = number - len(collected)
        eids = set()
        while len(eids) < n_to_generate:
            eids.add(generate_id(length=(n_chars + 2), split_every=0, checksum=True))
        eids = list(eids)
        deids = [decode_id(eid) for eid in eids]

        # Discard any candidates already present in the collection. The next
        # iteration of the outer loop generates replacements for them.
        taken = {d["_id"] for d in collection.find({"_id": {"$in": deids}}, {"_id": 1})}
        not_taken = [
            (eid, eid_decoded)
            for eid, eid_decoded in zip(eids, deids)
            if eid_decoded not in taken
        ]
        if not not_taken:
            continue

        # Use a single timestamp for the whole batch, so that a record's
        # "when" and "__ac" values always agree.
        now = datetime.now(timezone.utc).isoformat(timespec="seconds")

        # All attribute names beginning with "__a" are reserved...
        # https://github.com/jkunze/n2t-eggnog/blob/0f0f4c490e6dece507dba710d3557e29b8f6627e/egg#L1882
        # The author of this function opted to refrain from using property names beginning with "_.e",
        # because he thought it would complicate MongoDB queries involving those properties, given that
        # the "." is used as a field delimiter in MongoDB syntax (e.g. "foo.bar.baz").
        docs = [
            {
                "@context": "https://n2t.net/e/n2t_apidoc.html#identifier-metadata",
                "_id": eid_decoded,
                "who": populator,
                "what": (f"{ns}/{eid}" if ns else "(:tba) Work in progress"),
                "when": now,
                "how": shoulder,
                "where": f"{naa}:{shoulder}{eid}",
                "__as": "reserved",  # status, public|reserved|unavailable
                "__ao": owner,  # owner
                "__ac": now,  # created
            }
            for eid, eid_decoded in not_taken
        ]
        collection.insert_many(docs)
        collected.extend(docs)

    return [d["where"] for d in collected]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def generate_one_id(
    mdb: MongoDatabase,
    ns: str = "",
    shoulder: str = "sys0",  # "sys0" represents the Runtime
) -> str:
    """Generate a single unique Crockford Base32-encoded ID for the `mdb` repository.

    This delegates to `generate_ids`, requesting one identifier under the
    "nmdc" Name-Assigning Authority, with "_system" as both owner and populator.

    The ID can be associated with namespace `ns` to facilitate ID deletion/recycling.

    """
    system = "_system"  # "_system" represents the Runtime
    minted = generate_ids(
        mdb,
        owner=system,
        populator=system,
        number=1,
        ns=ns,
        naa="nmdc",
        shoulder=shoulder,
    )
    return minted[0]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def local_part(id_):
    """Return everything after the first colon, e.g. nmdc:fk0123 -> fk0123"""
    # Split at the first colon only; any later colons stay in the local part.
    pieces = id_.split(":", maxsplit=1)
    return pieces[1]
|