nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nmdc-runtime might be problematic.
Files changed (77)
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0

nmdc_runtime/Dockerfile
@@ -0,0 +1,167 @@
+ # Note: Most of the steps for the `base` image were copied verbatim from either `fastapi.Dockerfile`,
+ # `dagster.Dockerfile`, or `test.Dockerfile` (indeed, most of the steps were present in all three files).
+ # Reference: https://docs.docker.com/get-started/docker-concepts/building-images/multi-stage-builds/
+ #
+ # Base this image upon a variant of the official Python 3.10 image that is, in turn,
+ # based upon a minimal (slim) variant of the Debian 11 (bullseye) image.
+ # Reference: https://hub.docker.com/_/python
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM python:3.10-slim-bullseye AS base
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Install and upgrade system-level software in a non-interactive way, then delete temporary files.
+ # Note: Setting `DEBIAN_FRONTEND=noninteractive` and passing `-y` to `apt-get` makes things non-interactive.
+ RUN export DEBIAN_FRONTEND=noninteractive && \
+     apt-get update && \
+     apt-get -y upgrade && \
+     apt-get install -y --no-install-recommends \
+         tini \
+         procps \
+         net-tools \
+         build-essential \
+         git \
+         make \
+         zip \
+         curl \
+         wget \
+         gnupg && \
+     apt-get -y clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Enable Python's "fault handler" feature, so, when low-level errors occur (e.g. segfaults), Python prints lots of info.
+ # Reference: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONFAULTHANDLER
+ ENV PYTHONFAULTHANDLER=1
+
+ # Configure Git to consider the `/code` directory to be "safe", so that, when a Git repository
+ # created outside of the container gets mounted at that path within the container, the
+ # `uv-dynamic-versioning` tool running within the container does not fail with the error:
+ # > "Detected Git repository, but failed because of dubious ownership"
+ # Reference: https://git-scm.com/docs/git-config#Documentation/git-config.txt-safedirectory
+ RUN git config --global --add safe.directory /code
+
+ # Install `uv`.
+ # Reference: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
+ ADD https://astral.sh/uv/install.sh /uv-installer.sh
+ RUN sh /uv-installer.sh && \
+     rm /uv-installer.sh
+ ENV PATH="/root/.local/bin/:$PATH"
+
+ # Install Python dependencies (production dependencies only).
+ #
+ # Note: We copy only the files that `uv` needs in order to install dependencies. That way,
+ #       we minimize the number of files whose changes would invalidate cached image layers.
+ #
+ # Note: We use the `VIRTUAL_ENV` environment variable to specify the path to the Python virtual
+ #       environment that we want the `uv` program inside the container to create and use.
+ #
+ # Q: Why don't we use `./.venv` in the repository file tree?
+ # A: If we were to do that, then, whenever a developer would mount (via our Docker Compose file)
+ #    the repository file tree from their host machine (which may include a `.venv/` directory
+ #    created by their host machine) into the container, it would overwrite the Python virtual
+ #    environment that the `uv` program inside the container is using.
+ #
+ # Q: What is special about the `VIRTUAL_ENV` environment variable?
+ # A: When using `uv`'s `--active` option (as we do in later stages of this Dockerfile),
+ #    `uv` determines which virtual environment is active by looking at `VIRTUAL_ENV`. This
+ #    is the case, even though the documentation of the `venv` module (in Python's standard
+ #    library) specifically says: "`VIRTUAL_ENV` cannot be relied upon to determine whether
+ #    a virtual environment is being used."
+ #
+ # References:
+ # - https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments (RE: `VIRTUAL_ENV`)
+ # - https://docs.astral.sh/uv/reference/environment/#virtual_env (RE: `VIRTUAL_ENV`, from uv's perspective)
+ # - https://docs.python.org/3/library/venv.html#how-venvs-work (RE: `VIRTUAL_ENV`, from venv's perspective)
+ # - https://docs.astral.sh/uv/concepts/projects/sync/#partial-installations (RE: `--no-install-project`)
+ #
+ # Note: In the `RUN` command, we use a "cache mount" (a feature of Docker) to cache production dependencies
+ #       across builds. This is a performance optimization technique shown in the `uv` docs.
+ # Reference:
+ # - https://docs.astral.sh/uv/guides/integration/docker/#caching (RE: the technique)
+ # - https://docs.docker.com/build/cache/optimize/#use-cache-mounts (RE: the feature)
+ # - https://docs.astral.sh/uv/reference/settings/#link-mode (RE: `UV_LINK_MODE`)
+ # - https://docs.astral.sh/uv/reference/cli/#uv-sync--no-install-project (RE: `--no-install-project`)
+ #
+ # Note: We use `--compile-bytecode` so that Python compiles `.py` files to `.pyc` files now,
+ #       instead of when the container is running. By default, `uv` defers this compilation
+ #       to "import time," whereas `pip` (by default) performs it at "install time" (like this).
+ #
+ ENV VIRTUAL_ENV="/venv"
+ RUN mkdir -p "${VIRTUAL_ENV}"
+ COPY ./pyproject.toml /code/pyproject.toml
+ COPY ./uv.lock /code/uv.lock
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /code && \
+     UV_LINK_MODE=copy uv sync --active --no-dev --no-install-project --compile-bytecode
+
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM base AS fastapi
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Copy repository contents into image.
+ COPY . /code
+
+ # Install the project in editable mode.
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /code && \
+     uv sync --active --no-dev
+
+ # Use Uvicorn to serve the FastAPI app on port 8000.
+ EXPOSE 8000
+ WORKDIR /code
+ CMD ["uv", "run", "--active", "uvicorn", "nmdc_runtime.api.main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "8000"]
+
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM base AS dagster
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Copy repository contents into image.
+ #
+ # Note: This path (i.e. "/opt/dagster/lib/") is hard-coded in a few places in `nmdc_runtime/site/ops.py`. That's why
+ #       this image does not store the repository contents in `/code`, unlike the other images in this Dockerfile.
+ #
+ COPY . /opt/dagster/lib
+
+ # Install the project in editable mode.
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /opt/dagster/lib && \
+     uv sync --active --no-dev
+
+ # Move Dagster configuration files to the place Dagster expects.
+ ENV DAGSTER_HOME="/opt/dagster/dagster_home/"
+ RUN mkdir -p "${DAGSTER_HOME}" && \
+     cp /opt/dagster/lib/nmdc_runtime/site/dagster.yaml "${DAGSTER_HOME}" && \
+     cp /opt/dagster/lib/nmdc_runtime/site/workspace.yaml "${DAGSTER_HOME}"
+
+ # Use Tini to run Dagit.
+ #
+ # Notes:
+ # - The port number (i.e. "3000") is hard-coded in `nmdc_runtime/site/entrypoint-dagit.sh`.
+ # - Dagster daemon (versus Dagit) can be launched by overriding the `ENTRYPOINT` defined here.
+ #
+ # Reference: https://github.com/krallin/tini
+ #
+ EXPOSE 3000
+ WORKDIR /opt/dagster/dagster_home/
+ ENTRYPOINT ["tini", "--", "../lib/nmdc_runtime/site/entrypoint-dagit.sh"]
+
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM base AS test
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Copy all repository contents into image.
+ COPY . /code
+
+ # Install the project in editable mode, and install development dependencies.
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /code && \
+     uv sync --active
+
+ # Make `wait-for-it.sh` executable.
+ RUN chmod +x /code/.docker/wait-for-it.sh
+
+ WORKDIR /code
+
+ # Ensure started container does not exit, so that a subsequent `docker exec` command can run tests.
+ # For an example `docker exec` command, see `Makefile`'s `run-test` target.
+ # Such a command should use `wait-for-it.sh` to run `pytest` no earlier than when the FastAPI server is accessible.
+ ENTRYPOINT ["tail", "-f", "/dev/null"]

nmdc_runtime/api/analytics.py
@@ -16,25 +16,42 @@ from toolz import merge
 
  from nmdc_runtime.api.db.mongo import get_mongo_db
 
+ # This is a queue of the "request descriptors" that we will eventually insert into the database.
  _requests = []
  _last_posted = datetime.now()
 
 
  def _post_requests(collection: str, requests_data: List[Dict], source: str):
+     """Inserts the specified request descriptors into the specified MongoDB collection."""
      mdb = get_mongo_db()
      mdb[collection].insert_many([merge(d, {"source": source}) for d in requests_data])
 
 
  def log_request(collection: str, request_data: Dict, source: str = "FastAPI"):
+     """Flushes the queue of request descriptors to the database if enough time has passed since the previous time."""
      global _requests, _last_posted
      _requests.append(request_data)
      now = datetime.now()
      # flush queue every minute at most
      if (now - _last_posted).total_seconds() > 60.0:
+         # Note: This use of threading is an attempt to avoid blocking the current thread
+         #       while performing the insertion(s).
+         #
+         # TODO: Is there a race condition here? If multiple requests arrive at approximately
+         #       the same time, is it possible that each one causes a different thread to be
+         #       started, each with a different (and possibly overlapping) set of requests to
+         #       insert?
+         #
+         # TODO: If the insertion fails, will the requests be lost?
+         #
+         # Note: The author of this function said it may have been a "standard" solution copied
+         #       from some documentation. Indeed, the comment at the top of this module contains
+         #       a link to code on which it was based.
+         #
          threading.Thread(
              target=_post_requests, args=(collection, _requests, source)
          ).start()
-         _requests = []
+         _requests = []  # empties the queue
          _last_posted = now
 
 
@@ -49,6 +66,9 @@ class Analytics(BaseHTTPMiddleware):
          start = time()
          response = await call_next(request)
 
+         # Use a fallback IP address value (currently an empty string) if we can't derive one from the request.
+         ip_address: str = "" if request.client is None else request.client.host
+
          # Build a dictionary that describes the incoming request.
          #
          # Note: `request.headers` is an instance of `MultiDict`. References:
@@ -57,7 +77,7 @@
          #
          request_data = {
              "hostname": request.url.hostname,
-             "ip_address": request.client.host,
+             "ip_address": ip_address,
              "path": request.url.path,
              "user_agent": request.headers.get("user-agent"),
              "method": request.method,

nmdc_runtime/api/core/idgen.py
@@ -89,7 +89,35 @@ def generate_ids(
      shoulder: str = "fk4",
  ) -> List[str]:
      r"""
-     TODO: Document this function.
+     Generate the specified number of identifiers, storing them in a MongoDB collection
+     whose name is derived from the specified Name-Assigning Authority (NAA) and Shoulder.
+
+     :param mdb: Handle to a MongoDB database
+     :param owner: String that will go in the "__ao" field of the identifier record.
+                   Callers will oftentimes set this to the name of a Runtime "site"
+                   (as in, a "site client" site, not a "Dagster" site).
+     :param populator: String that will go in the "who" field of the identifier record.
+                       Indicates "who generated this ID." Callers will oftentimes set
+                       this to the name of a Runtime "site" (as in, a "site client" site,
+                       not a "Dagster" site).
+     :param ns: Namespace (see Minter docs); e.g. "changesheets"
+     :param naa: Name-Assigning Authority (see Minter docs); e.g. "nmdc"
+     :param shoulder: String that will go in the "how" field (see Minter docs); e.g. "sys0"
+
+     This function was written the way it was in an attempt to mirror the ARK spec:
+     https://www.ietf.org/archive/id/draft-kunze-ark-41.html (found via: https://arks.org/specs/)
+
+     Deviations from the ARK spec include:
+     1. The inclusion of a typecode.
+        The inclusion of a typecode came out of discussions with team members,
+        who wanted identifiers to include some non-opaque substring that could be used
+        to determine what type of resource a given identifier refers to.
+     2. Making hyphens mandatory.
+        We decided to make the hyphens mandatory, whereas the spec says they are optional.
+        > "Hyphens are considered to be insignificant and are always ignored in ARKs."
+        > Reference: https://www.ietf.org/archive/id/draft-kunze-ark-41.html#name-character-repertoires
+        In our case, we require that users include an identifier's hyphens whenever
+        they are using that identifier.
      """
      collection = mdb.get_collection(collection_name(naa, shoulder))
      estimated_document_count = collection.estimated_document_count()
@@ -119,7 +147,9 @@
      if not_taken:
          # All attribute names beginning with "__a" are reserved...
          # https://github.com/jkunze/n2t-eggnog/blob/0f0f4c490e6dece507dba710d3557e29b8f6627e/egg#L1882
-         # XXX mongo is a pain with '.'s in field names, so not using e.g. "_.e" names.
+         # The author of this function opted to refrain from using property names beginning with "_.e",
+         # because he thought it would complicate MongoDB queries involving those properties, given that
+         # the "." is used as a field delimiter in MongoDB syntax (e.g. "foo.bar.baz").
          docs = [
              {
                  "@context": "https://n2t.net/e/n2t_apidoc.html#identifier-metadata",
@@ -145,9 +175,9 @@
 
 
  def generate_one_id(
-     mdb: MongoDatabase = None,
+     mdb: MongoDatabase,
      ns: str = "",
-     shoulder: str = "sys0",
+     shoulder: str = "sys0",  # "sys0" represents the Runtime
  ) -> str:
      """Generate unique Crockford Base32-encoded ID for mdb repository.
 
@@ -156,8 +186,8 @@
      """
      return generate_ids(
          mdb,
-         owner="_system",
-         populator="_system",
+         owner="_system",  # "_system" represents the Runtime
+         populator="_system",  # "_system" represents the Runtime
          number=1,
          ns=ns,
          naa="nmdc",

nmdc_runtime/api/db/mongo.py
@@ -10,7 +10,6 @@ import bson
  from jsonschema import Draft7Validator
  from nmdc_schema.nmdc import Database as NMDCDatabase
  from pymongo.errors import AutoReconnect, OperationFailure
- from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
  from refscan.lib.Finder import Finder
  from refscan.scanner import scan_outgoing_references
  from tenacity import wait_random_exponential, retry, retry_if_exception_type
@@ -83,17 +82,6 @@ def get_session_bound_mongo_db(session=None) -> MongoDatabase:
      return SessionBoundDatabase(mdb, session) if session is not None else mdb
 
 
- @lru_cache
- def get_async_mongo_db() -> AsyncIOMotorDatabase:
-     _client = AsyncIOMotorClient(
-         host=os.getenv("MONGO_HOST"),
-         username=os.getenv("MONGO_USERNAME"),
-         password=os.getenv("MONGO_PASSWORD"),
-         directConnection=True,
-     )
-     return _client[os.getenv("MONGO_DBNAME")]
-
-
  def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
      """
      Returns the names of the collections that (a) exist in the database,

nmdc_runtime/api/endpoints/find.py
@@ -1,21 +1,17 @@
- from operator import itemgetter
- from typing import List, Annotated
+ import logging
+ from typing import Annotated
 
  from fastapi import APIRouter, Depends, Path, Query
- from jinja2 import Environment, PackageLoader, select_autoescape
- from nmdc_runtime.util import get_nmdc_jsonschema_dict
  from pymongo.database import Database as MongoDatabase
- from starlette.responses import HTMLResponse
- from toolz import merge, assoc_in
 
  from nmdc_schema.get_nmdc_view import ViewGetter
  from nmdc_runtime.api.core.util import raise404_if_none
  from nmdc_runtime.api.db.mongo import (
      get_mongo_db,
-     activity_collection_names,
      get_planned_process_collection_names,
      get_nonempty_nmdc_schema_collection_names,
  )
+ from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances
  from nmdc_runtime.api.endpoints.util import (
      find_resources,
      strip_oid,
@@ -25,9 +21,8 @@ from nmdc_runtime.api.models.metadata import Doc
  from nmdc_runtime.api.models.util import (
      FindResponse,
      FindRequest,
-     entity_attributes_to_index,
  )
- from nmdc_runtime.util import get_class_names_from_collection_spec
+
 
  router = APIRouter()
 
@@ -178,133 +173,71 @@ def find_data_objects_for_study(
      is a list of the `DataObject`s associated with that `Biosample`.
      """
      biosample_data_objects = []
-     study = raise404_if_none(
-         mdb.study_set.find_one({"id": study_id}, ["id"]), detail="Study not found"
-     )
-
-     # Note: With nmdc-schema v10 (legacy schema), we used the field named `part_of` here.
-     # With nmdc-schema v11 (Berkeley schema), we use the field named `associated_studies` here.
-     biosamples = mdb.biosample_set.find({"associated_studies": study["id"]}, ["id"])
-     biosample_ids = [biosample["id"] for biosample in biosamples]
-
-     # SchemaView interface to NMDC Schema
-     nmdc_view = ViewGetter()
-     nmdc_sv = nmdc_view.get_view()
-     dg_descendants = [
-         (f"nmdc:{t}" if ":" not in t else t)
-         for t in nmdc_sv.class_descendants("DataGeneration")
-     ]
-
-     def collect_data_objects(doc_ids, collected_objects, unique_ids):
-         """Helper function to collect data objects from `has_input` and `has_output` references."""
-         for doc_id in doc_ids:
-             # Check if this is a DataObject by looking at the document's type directly
-             doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
-             if (
-                 doc
-                 and doc.get("type") == "nmdc:DataObject"
-                 and doc_id not in unique_ids
-             ):
-                 data_obj = mdb.data_object_set.find_one({"id": doc_id})
-                 if data_obj:
-                     collected_objects.append(strip_oid(data_obj))
-                     unique_ids.add(doc_id)
-
-     # Another way in which DataObjects can be related to Biosamples is through the
-     # `was_informed_by` key/slot. We need to link records from the `workflow_execution_set`
-     # collection that are "informed" by the same DataGeneration records that created
-     # the outputs above. Then we need to get additional DataObject records that are
-     # created by this linkage.
-     def process_informed_by_docs(doc, collected_objects, unique_ids):
-         """Process documents linked by `was_informed_by` and collect relevant data objects."""
-         # Note: As of nmdc-schema 11.9.0, the `was_informed_by` field, if defined,
-         #       will contain a list of strings. In MongoDB, the `{k: v}` filter
-         #       can be used to check whether either (a) the value of field `f` is
-         #       an array containing `v` as one of its elements, or (b) the value
-         #       of field `f` is exactly equal to `v`. We rely on behavior (a) here.
-         informed_by_docs = mdb.workflow_execution_set.find(
-             {"was_informed_by": doc["id"]}
-         )
-         for informed_doc in informed_by_docs:
-             collect_data_objects(
-                 informed_doc.get("has_input", []), collected_objects, unique_ids
-             )
-             collect_data_objects(
-                 informed_doc.get("has_output", []), collected_objects, unique_ids
-             )
 
-     biosample_data_objects = []
+     # Respond with an error if the specified `Study` does not exist.
+     # Note: We project only the `_id` field, to minimize data transfer.
+     raise404_if_none(
+         mdb["study_set"].find_one({"id": study_id}, projection={"_id": 1}),
+         detail="Study not found",
+     )
 
-     for biosample_id in biosample_ids:
-         current_ids = [biosample_id]
-         collected_data_objects = []
-         unique_ids = set()
-
-         # Iterate over records in the `alldocs` collection. Look for
-         # records that have the given biosample_id as value on the
-         # `has_input` key/slot. The retrieved documents might also have a
-         # `has_output` key/slot associated with them. Get the value of the
-         # `has_output` key and check if it's type is `nmdc:DataObject`. If
-         # it's not, repeat the process till it is.
-         while current_ids:
-             new_current_ids = []
-             for current_id in current_ids:
-                 # Query to find all documents with current_id as the value on
-                 # `has_input` slot
-                 for doc in mdb.alldocs.find({"has_input": current_id}):
-                     has_output = doc.get("has_output", [])
-
-                     # Process `DataGeneration` type documents linked by `was_informed_by`
-                     if not has_output and any(
-                         t in dg_descendants for t in doc.get("_type_and_ancestors", [])
-                     ):
-                         process_informed_by_docs(
-                             doc, collected_data_objects, unique_ids
-                         )
-                         continue
-
-                     collect_data_objects(has_output, collected_data_objects, unique_ids)
-                     # Add non-DataObject outputs to continue the chain
-                     for op in has_output:
-                         doc_check = mdb.alldocs.find_one({"id": op}, {"type": 1})
-                         if doc_check and doc_check.get("type") != "nmdc:DataObject":
-                             new_current_ids.append(op)
-
-                     if any(
-                         t in dg_descendants for t in doc.get("_type_and_ancestors", [])
-                     ):
-                         process_informed_by_docs(
-                             doc, collected_data_objects, unique_ids
-                         )
-
-                 # Also check if current_id is a DataObject that serves as input to other processes
-                 current_doc_type = mdb.alldocs.find_one({"id": current_id}, {"type": 1})
-                 if (
-                     current_doc_type
-                     and current_doc_type.get("type") == "nmdc:DataObject"
-                 ):
-                     # Find all documents in alldocs that have this DataObject as input
-                     for doc in mdb.alldocs.find({"has_input": current_id}):
-                         has_output = doc.get("has_output", [])
-                         # Process outputs from these documents
-                         collect_data_objects(
-                             has_output, collected_data_objects, unique_ids
-                         )
-                         # Add non-DataObject outputs to continue the chain
-                         for op in has_output:
-                             doc_check = mdb.alldocs.find_one({"id": op}, {"type": 1})
-                             if doc_check and doc_check.get("type") != "nmdc:DataObject":
-                                 new_current_ids.append(op)
-
-             current_ids = new_current_ids
-
-         if collected_data_objects:
-             result = {
+     # Use the `get_linked_instances` function—which is the function that
+     # underlies the `/nmdcschema/linked_instances` API endpoint—to get all
+     # the `Biosample`s that are downstream of the specified `Study`.
+     #
+     # Note: The `get_linked_instances` function requires that a `max_page_size`
+     #       integer argument be passed in. In our case, we want to get _all_ of
+     #       the instances. Python has no "infinity" integer; and, even if it did,
+     #       if we were to specify too large of an integer, we'd get this error:
+     #       > "OverflowError: MongoDB can only handle up to 8-byte ints"
+     #       So, as a workaround, we pass in a number that is large enough that we
+     #       think it will account for all cases in practice (e.g., a study having
+     #       a trillion biosamples or a trillion data objects).
+     #
+     # TODO: Update the `get_linked_instances` function to optionally impose _no_ limit.
+     #
+     large_max_page_size: int = 1_000_000_000_000
+     linked_biosamples_result: dict = get_linked_instances(
+         ids=[study_id],
+         types=["nmdc:Biosample"],
+         hydrate=False,  # we'll only use their `id` values
+         page_token=None,
+         max_page_size=large_max_page_size,
+         mdb=mdb,
+     )
+     biosample_ids = [d["id"] for d in linked_biosamples_result.get("resources", [])]
+     logging.debug(f"Found {len(biosample_ids)} Biosamples for Study {study_id}")
+
+     # Get all the `DataObject`s that are downstream from any of those `Biosample`s.
+     data_objects_by_biosample_id = {}
+     linked_data_objects_result: dict = get_linked_instances(
+         ids=biosample_ids,
+         types=["nmdc:DataObject"],
+         hydrate=True,  # we want the full `DataObject` documents
+         page_token=None,
+         max_page_size=large_max_page_size,
+         mdb=mdb,
+     )
+     for data_object in linked_data_objects_result.get("resources", []):
+         upstream_biosample_id = data_object["_downstream_of"][0]
+         if upstream_biosample_id not in data_objects_by_biosample_id.keys():
+             data_objects_by_biosample_id[upstream_biosample_id] = []
+
+         # Strip away the metadata fields injected by `get_linked_instances()`.
+         data_object.pop("_upstream_of", None)
+         data_object.pop("_downstream_of", None)
+         data_objects_by_biosample_id[upstream_biosample_id].append(data_object)
+
+     # Convert the `data_objects_by_biosample_id` dictionary into a list of dicts;
+     # i.e., into the format returned by the initial version of this API endpoint,
+     # which did not use the `get_linked_instances` function under the hood.
+     for biosample_id, data_objects in data_objects_by_biosample_id.items():
+         biosample_data_objects.append(
+             {
                  "biosample_id": biosample_id,
-                 "data_objects": collected_data_objects,
+                 "data_objects": data_objects,
              }
-         biosample_data_objects.append(result)
-
+         )
      return biosample_data_objects
 
 
@@ -699,96 +632,3 @@ def find_related_objects_for_workflow_execution(
      }
 
      return response
-
-
- jinja_env = Environment(
-     loader=PackageLoader("nmdc_runtime"), autoescape=select_autoescape()
- )
-
-
- def attr_index_sort_key(attr):
-     return "_" if attr == "id" else attr
-
-
- def documentation_links(jsonschema_dict, collection_names) -> dict:
-     """This function constructs a hierarchical catalog of (links to) schema classes and their slots.
-
-     The returned dictionary `doc_links` is used as input to the Jinja template `nmdc_runtime/templates/search.html`
-     in order to support user experience for `GET /search`.
-     """
-
-     # Note: All documentation URLs generated within this function will begin with this.
-     base_url = r"https://w3id.org/nmdc"
-
-     # Initialize dictionary in which to associate key/value pairs via the following for loop.
-     doc_links = {}
-
-     for collection_name in collection_names:
-         # Since a given collection can be associated with multiple classes, the `doc_links` dictionary
-         # will have a _list_ of values for each collection.
-         class_descriptors = []
-
-         # If the collection name is one that the `search.html` page has a dedicated section for,
-         # give it a top-level key; otherwise, nest it under `activity_set`.
-         key_hierarchy: List[str] = ["activity_set", collection_name]
-         if collection_name in ("biosample_set", "study_set", "data_object_set"):
-             key_hierarchy = [collection_name]
-
-         # Process the name of each class that the schema associates with this collection.
-         collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][
-             collection_name
-         ]
-         class_names = get_class_names_from_collection_spec(collection_spec)
-         for idx, class_name in enumerate(class_names):
-             # Make a list of dictionaries, each of which describes one attribute of this class.
-             entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"])
-             entity_attr_descriptors = [
-                 {"url": f"{base_url}/{attr_name}", "attr_name": attr_name}
-                 for attr_name in entity_attrs
-             ]
-
-             # Make a dictionary describing this class.
-             class_descriptor = {
-                 "collection_name": collection_name,
-                 "entity_url": f"{base_url}/{class_name}",
-                 "entity_name": class_name,
-                 "entity_attrs": sorted(
-                     entity_attr_descriptors, key=itemgetter("attr_name")
-                 ),
-             }
-
-             # Add that descriptor to this collection's list of class descriptors.
-             class_descriptors.append(class_descriptor)
-
-         # Add a key/value pair describing this collection to the `doc_links` dictionary.
-         # Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in
-         doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors)
-
-     return doc_links
-
-
- @router.get("/search", response_class=HTMLResponse, include_in_schema=False)
- def search_page(
-     mdb: MongoDatabase = Depends(get_mongo_db),
- ):
-     template = jinja_env.get_template("search.html")
-     indexed_entity_attributes = merge(
-         {n: {"id"} for n in activity_collection_names(mdb)},
-         {
-             coll: sorted(attrs | {"id"}, key=attr_index_sort_key)
-             for coll, attrs in entity_attributes_to_index.items()
-         },
-     )
-     doc_links = documentation_links(
-         get_nmdc_jsonschema_dict(),
-         (
-             list(activity_collection_names(mdb))
-             + ["biosample_set", "study_set", "data_object_set"]
-         )
-     )
-     html_content = template.render(
-         activity_collection_names=sorted(activity_collection_names(mdb)),
-         indexed_entity_attributes=indexed_entity_attributes,
-         doc_links=doc_links,
-     )
-     return HTMLResponse(content=html_content, status_code=200)
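
A minimal sketch of exercising the `get_linked_instances` helper that the rewritten `find_data_objects_for_study` handler above now relies on, assuming a pymongo `Database` handle and a hypothetical study ID; the keyword arguments mirror the call shown in that hunk.

    from nmdc_runtime.api.db.mongo import get_mongo_db
    from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances

    mdb = get_mongo_db()

    # Find Biosamples linked (downstream) to a given Study; with `hydrate=False`,
    # only lightweight records come back, from which we read the `id` field.
    result = get_linked_instances(
        ids=["nmdc:sty-00-000001"],  # hypothetical study ID
        types=["nmdc:Biosample"],
        hydrate=False,
        page_token=None,
        max_page_size=1_000,
        mdb=mdb,
    )
    biosample_ids = [doc["id"] for doc in result.get("resources", [])]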