nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nmdc-runtime might be problematic.
- nmdc_runtime/Dockerfile +167 -0
- nmdc_runtime/api/analytics.py +22 -2
- nmdc_runtime/api/core/idgen.py +36 -6
- nmdc_runtime/api/db/mongo.py +0 -12
- nmdc_runtime/api/endpoints/find.py +65 -225
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
- nmdc_runtime/api/endpoints/objects.py +4 -11
- nmdc_runtime/api/endpoints/operations.py +0 -27
- nmdc_runtime/api/endpoints/queries.py +22 -0
- nmdc_runtime/api/endpoints/sites.py +0 -24
- nmdc_runtime/api/endpoints/util.py +57 -35
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +84 -60
- nmdc_runtime/api/models/util.py +12 -5
- nmdc_runtime/api/openapi.py +116 -180
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/minter/adapters/repository.py +21 -0
- nmdc_runtime/minter/domain/model.py +20 -0
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +632 -11
- nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
- nmdc_runtime/site/graphs.py +7 -0
- nmdc_runtime/site/ops.py +92 -34
- nmdc_runtime/site/repository.py +2 -0
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +87 -1
- nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/RECORD +47 -57
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/api/endpoints/ids.py +0 -192
- nmdc_runtime/client/__init__.py +0 -0
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/__init__.py +0 -0
- nmdc_runtime/core/db/Database.py +0 -13
- nmdc_runtime/core/db/__init__.py +0 -0
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/__init__.py +0 -0
- nmdc_runtime/domain/users/__init__.py +0 -0
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/models/user.py +0 -1
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -33
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -825
- nmdc_runtime/lib/nmdc_etl_class.py +0 -396
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/__init__.py +0 -0
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
- nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/Dockerfile
ADDED
@@ -0,0 +1,167 @@
+# Note: Most of the steps for the `base` image were copied verbatim from either `fastapi.Dockerfile`,
+# `dagster.Dockerfile`, or `test.Dockerfile` (indeed, most of the steps were present in all three files).
+# Reference: https://docs.docker.com/get-started/docker-concepts/building-images/multi-stage-builds/
+#
+# Base this image upon a variant of the official Python 3.10 image that is, in turn,
+# based upon a minimal (slim) variant of the Debian 11 (bullseye) image.
+# Reference: https://hub.docker.com/_/python
+# ────────────────────────────────────────────────────────────────────────────┐
+FROM python:3.10-slim-bullseye AS base
+# ────────────────────────────────────────────────────────────────────────────┘
+
+# Install and upgrade system-level software in a non-interactive way, then delete temporary files.
+# Note: Setting `DEBIAN_FRONTEND=noninteractive` and passing `-y` to `apt-get` makes things non-interactive.
+RUN export DEBIAN_FRONTEND=noninteractive && \
+    apt-get update && \
+    apt-get -y upgrade && \
+    apt-get install -y --no-install-recommends \
+        tini \
+        procps \
+        net-tools \
+        build-essential \
+        git \
+        make \
+        zip \
+        curl \
+        wget \
+        gnupg && \
+    apt-get -y clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Enable Python's "fault handler" feature, so, when low-level errors occur (e.g. segfaults), Python prints lots of info.
+# Reference: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONFAULTHANDLER
+ENV PYTHONFAULTHANDLER=1
+
+# Configure Git to consider the `/code` directory to be "safe", so that, when a Git repository
+# created outside of the container gets mounted at that path within the container, the
+# `uv-dynamic-versioning` tool running within the container does not fail with the error:
+# > "Detected Git repository, but failed because of dubious ownership"
+# Reference: https://git-scm.com/docs/git-config#Documentation/git-config.txt-safedirectory
+RUN git config --global --add safe.directory /code
+
+# Install `uv`.
+# Reference: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
+ADD https://astral.sh/uv/install.sh /uv-installer.sh
+RUN sh /uv-installer.sh && \
+    rm /uv-installer.sh
+ENV PATH="/root/.local/bin/:$PATH"
+
+# Install Python dependencies (production dependencies only).
+#
+# Note: We copy only the files that `uv` needs in order to install dependencies. That way,
+#       we minimize the number of files whose changes would invalidate cached image layers.
+#
+# Note: We use the `VIRTUAL_ENV` environment variable to specify the path to the Python virtual
+#       environment that we want the `uv` program inside the container to create and use.
+#
+#       Q: Why don't we use `./.venv` in the repository file tree?
+#       A: If we were to do that, then, whenever a developer would mount (via our Docker Compose file)
+#          the repository file tree from their host machine (which may include a `.venv/` directory
+#          created by their host machine) into the container, it would overwrite the Python virtual
+#          environment that the `uv` program inside the container is using.
+#
+#       Q: What is special about the `VIRTUAL_ENV` environment variable?
+#       A: When using `uv`'s `--active` option (as we do in later stages of this Dockerfile),
+#          `uv` determines which virtual environment is active by looking at `VIRTUAL_ENV`. This
+#          is the case, even though the documentation of the `venv` module (in Python's standard
+#          library) specifically says: "`VIRTUAL_ENV` cannot be relied upon to determine whether
+#          a virtual environment is being used."
+#
+#       References:
+#       - https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments (RE: `VIRTUAL_ENV`)
+#       - https://docs.astral.sh/uv/reference/environment/#virtual_env (RE: `VIRTUAL_ENV`, from uv's perspective)
+#       - https://docs.python.org/3/library/venv.html#how-venvs-work (RE: `VIRTUAL_ENV`, from venv's perspective)
+#       - https://docs.astral.sh/uv/concepts/projects/sync/#partial-installations (RE: `--no-install-project`)
+#
+# Note: In the `RUN` command, we use a "cache mount" (a feature of Docker) to cache production dependencies
+#       across builds. This is a performance optimization technique shown in the `uv` docs.
+#       Reference:
+#       - https://docs.astral.sh/uv/guides/integration/docker/#caching (RE: the technique)
+#       - https://docs.docker.com/build/cache/optimize/#use-cache-mounts (RE: the feature)
+#       - https://docs.astral.sh/uv/reference/settings/#link-mode (RE: `UV_LINK_MODE`)
+#       - https://docs.astral.sh/uv/reference/cli/#uv-sync--no-install-project (RE: `--no-install-project`)
+#
+# Note: We use `--compile-bytecode` so that Python compiles `.py` files to `.pyc` files now,
+#       instead of when the container is running. By default, `uv` defers this compilation
+#       to "import time," whereas `pip` (by default) performs it at "install time" (like this).
+#
+ENV VIRTUAL_ENV="/venv"
+RUN mkdir -p "${VIRTUAL_ENV}"
+COPY ./pyproject.toml /code/pyproject.toml
+COPY ./uv.lock /code/uv.lock
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd /code && \
+    UV_LINK_MODE=copy uv sync --active --no-dev --no-install-project --compile-bytecode
+
+# ────────────────────────────────────────────────────────────────────────────┐
+FROM base AS fastapi
+# ────────────────────────────────────────────────────────────────────────────┘
+
+# Copy repository contents into image.
+COPY . /code
+
+# Install the project in editable mode.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd /code && \
+    uv sync --active --no-dev
+
+# Use Uvicorn to serve the FastAPI app on port 8000.
+EXPOSE 8000
+WORKDIR /code
+CMD ["uv", "run", "--active", "uvicorn", "nmdc_runtime.api.main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "8000"]
+
+# ────────────────────────────────────────────────────────────────────────────┐
+FROM base AS dagster
+# ────────────────────────────────────────────────────────────────────────────┘
+
+# Copy repository contents into image.
+#
+# Note: This path (i.e. "/opt/dagster/lib/") is hard-coded in a few places in `nmdc_runtime/site/ops.py`. That's why
+#       this image does not store the repository contents in `/code`, unlike the other images in this Dockerfile.
+#
+COPY . /opt/dagster/lib
+
+# Install the project in editable mode.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd /opt/dagster/lib && \
+    uv sync --active --no-dev
+
+# Move Dagster configuration files to the place Dagster expects.
+ENV DAGSTER_HOME="/opt/dagster/dagster_home/"
+RUN mkdir -p "${DAGSTER_HOME}" && \
+    cp /opt/dagster/lib/nmdc_runtime/site/dagster.yaml "${DAGSTER_HOME}" && \
+    cp /opt/dagster/lib/nmdc_runtime/site/workspace.yaml "${DAGSTER_HOME}"
+
+# Use Tini to run Dagit.
+#
+# Notes:
+# - The port number (i.e. "3000") is hard-coded in `nmdc_runtime/site/entrypoint-dagit.sh`.
+# - Dagster daemon (versus Dagit) can be launched by overriding the `ENTRYPOINT` defined here.
+#
+# Reference: https://github.com/krallin/tini
+#
+EXPOSE 3000
+WORKDIR /opt/dagster/dagster_home/
+ENTRYPOINT ["tini", "--", "../lib/nmdc_runtime/site/entrypoint-dagit.sh"]
+
+# ────────────────────────────────────────────────────────────────────────────┐
+FROM base AS test
+# ────────────────────────────────────────────────────────────────────────────┘
+
+# Copy all repository contents into image.
+COPY . /code
+
+# Install the project in editable mode, and install development dependencies.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd /code && \
+    uv sync --active
+
+# Make `wait-for-it.sh` executable.
+RUN chmod +x /code/.docker/wait-for-it.sh
+
+WORKDIR /code
+
+# Ensure started container does not exit, so that a subsequent `docker exec` command can run tests.
+# For an example `docker exec` command, see `Makefile`'s `run-test` target.
+# Such a command should use `wait-for-it.sh` to run `pytest` no earlier than when the FastAPI server is accessible.
+ENTRYPOINT ["tail", "-f", "/dev/null"]
nmdc_runtime/api/analytics.py
CHANGED
@@ -16,25 +16,42 @@ from toolz import merge
 
 from nmdc_runtime.api.db.mongo import get_mongo_db
 
+# This is a queue of the "request descriptors" that we will eventually insert into the database.
 _requests = []
 _last_posted = datetime.now()
 
 
 def _post_requests(collection: str, requests_data: List[Dict], source: str):
+    """Inserts the specified request descriptors into the specified MongoDB collection."""
     mdb = get_mongo_db()
     mdb[collection].insert_many([merge(d, {"source": source}) for d in requests_data])
 
 
 def log_request(collection: str, request_data: Dict, source: str = "FastAPI"):
+    """Flushes the queue of request descriptors to the database if enough time has passed since the previous time."""
     global _requests, _last_posted
     _requests.append(request_data)
     now = datetime.now()
     # flush queue every minute at most
     if (now - _last_posted).total_seconds() > 60.0:
+        # Note: This use of threading is an attempt to avoid blocking the current thread
+        #       while performing the insertion(s).
+        #
+        # TODO: Is there is a race condition here? If multiple requests arrive at approximately
+        #       the same time, is it possible that each one causes a different thread to be
+        #       started, each with a different (and possibly overlapping) set of requests to
+        #       insert?
+        #
+        # TODO: If the insertion fails, will the requests be lost?
+        #
+        # Note: The author of this function said it may have been a "standard" solution copied
+        #       from some documentation. Indeed, the comment at the top of this module contains
+        #       a link to code on which it was based.
+        #
         threading.Thread(
             target=_post_requests, args=(collection, _requests, source)
         ).start()
-        _requests = []
+        _requests = []  # empties the queue
        _last_posted = now
 
 
@@ -49,6 +66,9 @@ class Analytics(BaseHTTPMiddleware):
         start = time()
         response = await call_next(request)
 
+        # Use a fallback IP address value (currently an empty string) if we can't derive one from the request.
+        ip_address: str = "" if request.client is None else request.client.host
+
         # Build a dictionary that describes the incoming request.
         #
         # Note: `request.headers` is an instance of `MultiDict`. References:
@@ -57,7 +77,7 @@ class Analytics(BaseHTTPMiddleware):
         #
         request_data = {
             "hostname": request.url.hostname,
-            "ip_address":
+            "ip_address": ip_address,
             "path": request.url.path,
             "user_agent": request.headers.get("user-agent"),
             "method": request.method,
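For readers skimming the diff, the buffering behavior described in the comments above can be summarized in isolation. The following is a minimal, self-contained sketch of the same pattern (append request descriptors to an in-memory queue, then flush them on a background thread at most once per interval) using only the standard library; the 60-second default mirrors the diff, while the sink function here is a stand-in for the module's MongoDB `insert_many` call, not the actual implementation.

# Sketch only: illustrates the queue-and-periodic-flush pattern used by analytics.py.
import threading
from datetime import datetime
from typing import Dict, List

_queue: List[Dict] = []
_last_posted = datetime.now()


def _sink(items: List[Dict]) -> None:
    # Stand-in for the MongoDB `insert_many` call made by `_post_requests` in analytics.py.
    print(f"flushing {len(items)} request descriptor(s)")


def log_request(request_data: Dict, interval_seconds: float = 60.0) -> None:
    """Queues one descriptor; flushes the queue on a background thread at most once per interval."""
    global _queue, _last_posted
    _queue.append(request_data)
    now = datetime.now()
    if (now - _last_posted).total_seconds() > interval_seconds:
        threading.Thread(target=_sink, args=(_queue,)).start()
        _queue = []  # start a fresh queue; the thread keeps a reference to the old list
        _last_posted = now


if __name__ == "__main__":
    log_request({"path": "/health"}, interval_seconds=0.0)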
nmdc_runtime/api/core/idgen.py
CHANGED
@@ -89,7 +89,35 @@ def generate_ids(
     shoulder: str = "fk4",
 ) -> List[str]:
     r"""
-
+    Generate the specified number of identifiers, storing them in a MongoDB collection
+    whose name is derived from the specified Name-Assigning Authority (NAA) and Shoulder.
+
+    :param mdb: Handle to a MongoDB database
+    :param owner: String that will go in the "__ao" field of the identifier record.
+                  Callers will oftentimes set this to the name of a Runtime "site"
+                  (as in, a "site client" site, not a "Dagster" site).
+    :param populator: String that will go in the "who" field of the identifier record.
+                      Indicates "who generated this ID." Callers will oftentimes set
+                      this to the name of a Runtime "site" (as in, a "site client" site,
+                      not a "Dagster" site).
+    :param ns: Namespace (see Minter docs); e.g. "changesheets"
+    :param naa: Name-Assigning Authority (see Minter docs); e.g. "nmdc"
+    :param shoulder: String that will go in the "how" field (see Minter docs); e.g. "sys0"
+
+    This function was written the way it was in an attempt to mirror the ARK spec:
+    https://www.ietf.org/archive/id/draft-kunze-ark-41.html (found via: https://arks.org/specs/)
+
+    Deviations from the ARK spec include:
+    1. The inclusion of a typecode.
+       The inclusion of a typecode came out of discussions with team members,
+       who wanted identifiers to include some non-opaque substring that could be used
+       to determine what type of resource a given identifier refers to.
+    2. Making hyphens mandatory.
+       We decided to make the hyphens mandatory, whereas the spec says they are optional.
+       > "Hyphens are considered to be insignificant and are always ignored in ARKs."
+       > Reference: https://www.ietf.org/archive/id/draft-kunze-ark-41.html#name-character-repertoires
+       In our case, we require that users include an identifier's hyphens whenever
+       they are using that identifier.
     """
     collection = mdb.get_collection(collection_name(naa, shoulder))
     estimated_document_count = collection.estimated_document_count()
@@ -119,7 +147,9 @@ def generate_ids(
     if not_taken:
         # All attribute names beginning with "__a" are reserved...
         # https://github.com/jkunze/n2t-eggnog/blob/0f0f4c490e6dece507dba710d3557e29b8f6627e/egg#L1882
-        #
+        # The author of this function opted to refrain from using property names beginning with "_.e",
+        # because he thought it would complicate MongoDB queries involving those properties, given that
+        # the "." is used as a field delimiter in MongoDB syntax (e.g. "foo.bar.baz").
         docs = [
             {
                 "@context": "https://n2t.net/e/n2t_apidoc.html#identifier-metadata",
@@ -145,9 +175,9 @@ def generate_ids(
 
 
 def generate_one_id(
-    mdb: MongoDatabase
+    mdb: MongoDatabase,
     ns: str = "",
-    shoulder: str = "sys0",
+    shoulder: str = "sys0",  # "sys0" represents the Runtime
 ) -> str:
     """Generate unique Crockford Base32-encoded ID for mdb repository.
 
@@ -156,8 +186,8 @@ def generate_one_id(
     """
     return generate_ids(
         mdb,
-        owner="_system",
-        populator="_system",
+        owner="_system",  # "_system" represents the Runtime
+        populator="_system",  # "_system" represents the Runtime
         number=1,
         ns=ns,
         naa="nmdc",
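A hedged usage sketch of the minting helpers whose signatures appear above. It assumes `generate_one_id` and `generate_ids` are importable from `nmdc_runtime.api.core.idgen` (the module shown here) and that a MongoDB handle is available via pymongo; the connection string, database name, and site names are placeholders, and the keyword arguments follow the docstring and call shown in the diff.

# Sketch only; connection details and site names below are placeholders.
from pymongo import MongoClient

from nmdc_runtime.api.core.idgen import generate_ids, generate_one_id

mdb = MongoClient("mongodb://localhost:27017")["nmdc"]  # placeholder connection/database

# Mint a single Runtime-owned ID (per the diff, owner/populator default to "_system" and shoulder to "sys0").
one_id = generate_one_id(mdb, ns="changesheets")

# Mint a batch, spelling out the fields documented in the new docstring.
many_ids = generate_ids(
    mdb,
    owner="my_site",      # placeholder "site client" site name
    populator="my_site",  # placeholder "site client" site name
    number=3,
    ns="changesheets",    # namespace example taken from the docstring
    naa="nmdc",
    shoulder="fk4",       # default shoulder per the function signature
)
print(one_id, many_ids)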
nmdc_runtime/api/db/mongo.py
CHANGED
@@ -10,7 +10,6 @@ import bson
 from jsonschema import Draft7Validator
 from nmdc_schema.nmdc import Database as NMDCDatabase
 from pymongo.errors import AutoReconnect, OperationFailure
-from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
 from refscan.lib.Finder import Finder
 from refscan.scanner import scan_outgoing_references
 from tenacity import wait_random_exponential, retry, retry_if_exception_type
@@ -83,17 +82,6 @@ def get_session_bound_mongo_db(session=None) -> MongoDatabase:
     return SessionBoundDatabase(mdb, session) if session is not None else mdb
 
 
-@lru_cache
-def get_async_mongo_db() -> AsyncIOMotorDatabase:
-    _client = AsyncIOMotorClient(
-        host=os.getenv("MONGO_HOST"),
-        username=os.getenv("MONGO_USERNAME"),
-        password=os.getenv("MONGO_PASSWORD"),
-        directConnection=True,
-    )
-    return _client[os.getenv("MONGO_DBNAME")]
-
-
 def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
     """
     Returns the names of the collections that (a) exist in the database,
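For reference, removing `get_async_mongo_db` leaves only the synchronous helpers in this module. Below is a minimal, hedged sketch of how a caller might use the remaining `get_mongo_db` helper (whose import path appears in the analytics.py diff above); the collection name and query are illustrative placeholders, and the sketch assumes the helper returns a pymongo-style `Database` handle, as its use in `analytics.py` suggests.

# Sketch only; assumes `get_mongo_db` returns a pymongo-style Database handle,
# and uses a placeholder collection/query for illustration.
from nmdc_runtime.api.db.mongo import get_mongo_db


def count_studies() -> int:
    """Counts the documents in the `study_set` collection (a collection named in the find.py diff below)."""
    mdb = get_mongo_db()
    return mdb["study_set"].count_documents({})


if __name__ == "__main__":
    print(count_studies())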
nmdc_runtime/api/endpoints/find.py
CHANGED
@@ -1,21 +1,17 @@
-
-from typing import
+import logging
+from typing import Annotated
 
 from fastapi import APIRouter, Depends, Path, Query
-from jinja2 import Environment, PackageLoader, select_autoescape
-from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from pymongo.database import Database as MongoDatabase
-from starlette.responses import HTMLResponse
-from toolz import merge, assoc_in
 
 from nmdc_schema.get_nmdc_view import ViewGetter
 from nmdc_runtime.api.core.util import raise404_if_none
 from nmdc_runtime.api.db.mongo import (
     get_mongo_db,
-    activity_collection_names,
     get_planned_process_collection_names,
     get_nonempty_nmdc_schema_collection_names,
 )
+from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances
 from nmdc_runtime.api.endpoints.util import (
     find_resources,
     strip_oid,
@@ -25,9 +21,8 @@ from nmdc_runtime.api.models.metadata import Doc
 from nmdc_runtime.api.models.util import (
     FindResponse,
     FindRequest,
-    entity_attributes_to_index,
 )
-
+
 
 router = APIRouter()
 
@@ -178,133 +173,71 @@ def find_data_objects_for_study(
     is a list of the `DataObject`s associated with that `Biosample`.
     """
     biosample_data_objects = []
-    study = raise404_if_none(
-        mdb.study_set.find_one({"id": study_id}, ["id"]), detail="Study not found"
-    )
-
-    # Note: With nmdc-schema v10 (legacy schema), we used the field named `part_of` here.
-    # With nmdc-schema v11 (Berkeley schema), we use the field named `associated_studies` here.
-    biosamples = mdb.biosample_set.find({"associated_studies": study["id"]}, ["id"])
-    biosample_ids = [biosample["id"] for biosample in biosamples]
-
-    # SchemaView interface to NMDC Schema
-    nmdc_view = ViewGetter()
-    nmdc_sv = nmdc_view.get_view()
-    dg_descendants = [
-        (f"nmdc:{t}" if ":" not in t else t)
-        for t in nmdc_sv.class_descendants("DataGeneration")
-    ]
-
-    def collect_data_objects(doc_ids, collected_objects, unique_ids):
-        """Helper function to collect data objects from `has_input` and `has_output` references."""
-        for doc_id in doc_ids:
-            # Check if this is a DataObject by looking at the document's type directly
-            doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
-            if (
-                doc
-                and doc.get("type") == "nmdc:DataObject"
-                and doc_id not in unique_ids
-            ):
-                data_obj = mdb.data_object_set.find_one({"id": doc_id})
-                if data_obj:
-                    collected_objects.append(strip_oid(data_obj))
-                    unique_ids.add(doc_id)
-
-    # Another way in which DataObjects can be related to Biosamples is through the
-    # `was_informed_by` key/slot. We need to link records from the `workflow_execution_set`
-    # collection that are "informed" by the same DataGeneration records that created
-    # the outputs above. Then we need to get additional DataObject records that are
-    # created by this linkage.
-    def process_informed_by_docs(doc, collected_objects, unique_ids):
-        """Process documents linked by `was_informed_by` and collect relevant data objects."""
-        # Note: As of nmdc-schema 11.9.0, the `was_informed_by` field, if defined,
-        #       will contain a list of strings. In MongoDB, the `{k: v}` filter
-        #       can be used to check whether either (a) the value of field `f` is
-        #       an array containing `v` as one of its elements, or (b) the value
-        #       of field `f` is exactly equal to `v`. We rely on behavior (a) here.
-        informed_by_docs = mdb.workflow_execution_set.find(
-            {"was_informed_by": doc["id"]}
-        )
-        for informed_doc in informed_by_docs:
-            collect_data_objects(
-                informed_doc.get("has_input", []), collected_objects, unique_ids
-            )
-            collect_data_objects(
-                informed_doc.get("has_output", []), collected_objects, unique_ids
-            )
 
-
+    # Respond with an error if the specified `Study` does not exist.
+    # Note: We project only the `_id` field, to minimize data transfer.
+    raise404_if_none(
+        mdb["study_set"].find_one({"id": study_id}, projection={"_id": 1}),
+        detail="Study not found",
+    )
 
-    … (removed lines 238-290: content not captured in this diff rendering) …
-                        has_output, collected_data_objects, unique_ids
-                    )
-                    # Add non-DataObject outputs to continue the chain
-                    for op in has_output:
-                        doc_check = mdb.alldocs.find_one({"id": op}, {"type": 1})
-                        if doc_check and doc_check.get("type") != "nmdc:DataObject":
-                            new_current_ids.append(op)
-
-            current_ids = new_current_ids
-
-        if collected_data_objects:
-            result = {
+    # Use the `get_linked_instances` function—which is the function that
+    # underlies the `/nmdcschema/linked_instances` API endpoint—to get all
+    # the `Biosample`s that are downstream of the specified `Study`.
+    #
+    # Note: The `get_linked_instances` function requires that a `max_page_size`
+    #       integer argument be passed in. In our case, we want to get _all_ of
+    #       the instances. Python has no "infinity" integer; and, even if it did,
+    #       if we were to specify too large of an integer, we'd get this error:
+    #       > "OverflowError: MongoDB can only handle up to 8-byte ints"
+    #       So, as a workaround, we pass in a number that is large enough that we
+    #       think it will account for all cases in practice (e.g., a study having
+    #       a trillion biosamples or a trillion data objects).
+    #
+    # TODO: Update the `get_linked_instances` function to optionally impose _no_ limit.
+    #
+    large_max_page_size: int = 1_000_000_000_000
+    linked_biosamples_result: dict = get_linked_instances(
+        ids=[study_id],
+        types=["nmdc:Biosample"],
+        hydrate=False,  # we'll only use their `id` values
+        page_token=None,
+        max_page_size=large_max_page_size,
+        mdb=mdb,
+    )
+    biosample_ids = [d["id"] for d in linked_biosamples_result.get("resources", [])]
+    logging.debug(f"Found {len(biosample_ids)} Biosamples for Study {study_id}")
+
+    # Get all the `DataObject`s that are downstream from any of those `Biosample`s.
+    data_objects_by_biosample_id = {}
+    linked_data_objects_result: dict = get_linked_instances(
+        ids=biosample_ids,
+        types=["nmdc:DataObject"],
+        hydrate=True,  # we want the full `DataObject` documents
+        page_token=None,
+        max_page_size=large_max_page_size,
+        mdb=mdb,
+    )
+    for data_object in linked_data_objects_result.get("resources", []):
+        upstream_biosample_id = data_object["_downstream_of"][0]
+        if upstream_biosample_id not in data_objects_by_biosample_id.keys():
+            data_objects_by_biosample_id[upstream_biosample_id] = []
+
+        # Strip away the metadata fields injected by `get_linked_instances()`.
+        data_object.pop("_upstream_of", None)
+        data_object.pop("_downstream_of", None)
+        data_objects_by_biosample_id[upstream_biosample_id].append(data_object)
+
+    # Convert the `data_objects_by_biosample_id` dictionary into a list of dicts;
+    # i.e., into the format returned by the initial version of this API endpoint,
+    # which did not use the `get_linked_instances` function under the hood.
+    for biosample_id, data_objects in data_objects_by_biosample_id.items():
+        biosample_data_objects.append(
+            {
                 "biosample_id": biosample_id,
-                "data_objects":
+                "data_objects": data_objects,
             }
-
-
+        )
     return biosample_data_objects
 
 
@@ -699,96 +632,3 @@ def find_related_objects_for_workflow_execution(
     }
 
     return response
-
-
-jinja_env = Environment(
-    loader=PackageLoader("nmdc_runtime"), autoescape=select_autoescape()
-)
-
-
-def attr_index_sort_key(attr):
-    return "_" if attr == "id" else attr
-
-
-def documentation_links(jsonschema_dict, collection_names) -> dict:
-    """This function constructs a hierarchical catalog of (links to) schema classes and their slots.
-
-    The returned dictionary `doc_links` is used as input to the Jinja template `nmdc_runtime/templates/search.html`
-    in order to support user experience for `GET /search`.
-    """
-
-    # Note: All documentation URLs generated within this function will begin with this.
-    base_url = r"https://w3id.org/nmdc"
-
-    # Initialize dictionary in which to associate key/value pairs via the following for loop.
-    doc_links = {}
-
-    for collection_name in collection_names:
-        # Since a given collection can be associated with multiple classes, the `doc_links` dictionary
-        # will have a _list_ of values for each collection.
-        class_descriptors = []
-
-        # If the collection name is one that the `search.html` page has a dedicated section for,
-        # give it a top-level key; otherwise, nest it under `activity_set`.
-        key_hierarchy: List[str] = ["activity_set", collection_name]
-        if collection_name in ("biosample_set", "study_set", "data_object_set"):
-            key_hierarchy = [collection_name]
-
-        # Process the name of each class that the schema associates with this collection.
-        collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][
-            collection_name
-        ]
-        class_names = get_class_names_from_collection_spec(collection_spec)
-        for idx, class_name in enumerate(class_names):
-            # Make a list of dictionaries, each of which describes one attribute of this class.
-            entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"])
-            entity_attr_descriptors = [
-                {"url": f"{base_url}/{attr_name}", "attr_name": attr_name}
-                for attr_name in entity_attrs
-            ]
-
-            # Make a dictionary describing this class.
-            class_descriptor = {
-                "collection_name": collection_name,
-                "entity_url": f"{base_url}/{class_name}",
-                "entity_name": class_name,
-                "entity_attrs": sorted(
-                    entity_attr_descriptors, key=itemgetter("attr_name")
-                ),
-            }
-
-            # Add that descriptor to this collection's list of class descriptors.
-            class_descriptors.append(class_descriptor)
-
-        # Add a key/value pair describing this collection to the `doc_links` dictionary.
-        # Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in
-        doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors)
-
-    return doc_links
-
-
-@router.get("/search", response_class=HTMLResponse, include_in_schema=False)
-def search_page(
-    mdb: MongoDatabase = Depends(get_mongo_db),
-):
-    template = jinja_env.get_template("search.html")
-    indexed_entity_attributes = merge(
-        {n: {"id"} for n in activity_collection_names(mdb)},
-        {
-            coll: sorted(attrs | {"id"}, key=attr_index_sort_key)
-            for coll, attrs in entity_attributes_to_index.items()
-        },
-    )
-    doc_links = documentation_links(
-        get_nmdc_jsonschema_dict(),
-        (
-            list(activity_collection_names(mdb))
-            + ["biosample_set", "study_set", "data_object_set"]
-        ),
-    )
-    html_content = template.render(
-        activity_collection_names=sorted(activity_collection_names(mdb)),
-        indexed_entity_attributes=indexed_entity_attributes,
-        doc_links=doc_links,
-    )
-    return HTMLResponse(content=html_content, status_code=200)