nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (77) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,177 @@
1
+ # Note: Most of the steps for the `base` image were copied verbatim from either `fastapi.Dockerfile`,
2
+ # `dagster.Dockerfile`, or `test.Dockerfile` (indeed, most of the steps were present in all three files).
3
+ # Reference: https://docs.docker.com/get-started/docker-concepts/building-images/multi-stage-builds/
4
+ #
5
+ # Base this image upon a variant of the official Python 3.10 image that is, in turn,
6
+ # based upon a minimal (slim) variant of the Debian 11 (bullseye) image.
7
+ # Reference: https://hub.docker.com/_/python
8
+ # ────────────────────────────────────────────────────────────────────────────┐
9
+ FROM python:3.10-slim-bullseye AS base
10
+ # ────────────────────────────────────────────────────────────────────────────┘
11
+
12
+ # Install and upgrade system-level software in a non-interactive way, then delete temporary files.
13
+ # Note: Setting `DEBIAN_FRONTEND=noninteractive` and passing `-y` to `apt-get` makes things non-interactive.
14
+ RUN export DEBIAN_FRONTEND=noninteractive && \
15
+ apt-get update && \
16
+ apt-get -y upgrade && \
17
+ apt-get install -y --no-install-recommends \
18
+ tini \
19
+ procps \
20
+ net-tools \
21
+ build-essential \
22
+ git \
23
+ make \
24
+ zip \
25
+ curl \
26
+ wget \
27
+ gnupg && \
28
+ apt-get -y clean && \
29
+ rm -rf /var/lib/apt/lists/*
30
+
31
+ # Enable Python's "fault handler" feature, so, when low-level errors occur (e.g. segfaults), Python prints lots of info.
32
+ # Reference: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONFAULTHANDLER
33
+ ENV PYTHONFAULTHANDLER=1
34
+
35
+ # Configure Git to consider the `/code` directory to be "safe", so that, when a Git repository
36
+ # created outside of the container gets mounted at that path within the container, the
37
+ # `uv-dynamic-versioning` tool running within the container does not fail with the error:
38
+ # > "Detected Git repository, but failed because of dubious ownership"
39
+ # Reference: https://git-scm.com/docs/git-config#Documentation/git-config.txt-safedirectory
40
+ RUN git config --global --add safe.directory /code
41
+
42
+ # Install `uv`.
43
+ # Reference: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
44
+ ADD https://astral.sh/uv/install.sh /uv-installer.sh
45
+ RUN sh /uv-installer.sh && \
46
+ rm /uv-installer.sh
47
+ ENV PATH="/root/.local/bin/:$PATH"
48
+
49
+ # Install Python dependencies (production dependencies only).
50
+ #
51
+ # Note: We copy only the files that `uv` needs in order to install dependencies. That way,
52
+ # we minimize the number of files whose changes would invalidate cached image layers.
53
+ #
54
+ # Note: We use the `VIRTUAL_ENV` environment variable to specify the path to the Python virtual
55
+ # environment that we want the `uv` program inside the container to create and use.
56
+ #
57
+ # Q: Why don't we use `./.venv` in the repository file tree?
58
+ # A: If we were to do that, then, whenever a developer would mount (via our Docker Compose file)
59
+ # the repository file tree from their host machine (which may include a `.venv/` directory
60
+ # created by their host machine) into the container, it would overwrite the Python virtual
61
+ # environment that the `uv` program inside the container is using.
62
+ #
63
+ # Q: What is special about the `VIRTUAL_ENV` environment variable?
64
+ # A: When using `uv`'s `--active` option (as we do in later stages of this Dockerfile),
65
+ # `uv` determines which virtual environment is active by looking at `VIRTUAL_ENV`. This
66
+ # is the case, even though the documentation of the `venv` module (in Python's standard
67
+ # library) specifically says: "`VIRTUAL_ENV` cannot be relied upon to determine whether
68
+ # a virtual environment is being used."
69
+ #
70
+ # References:
71
+ # - https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments (RE: `VIRTUAL_ENV`)
72
+ # - https://docs.astral.sh/uv/reference/environment/#virtual_env (RE: `VIRTUAL_ENV`, from uv's perspective)
73
+ # - https://docs.python.org/3/library/venv.html#how-venvs-work (RE: `VIRTUAL_ENV`, from venv's perspective)
74
+ # - https://docs.astral.sh/uv/concepts/projects/sync/#partial-installations (RE: `--no-install-project`)
75
+ #
76
+ # Note: In the `RUN` command, we use a "cache mount" (a feature of Docker) to cache production dependencies
77
+ # across builds. This is a performance optimization technique shown in the `uv` docs.
78
+ # Reference:
79
+ # - https://docs.astral.sh/uv/guides/integration/docker/#caching (RE: the technique)
80
+ # - https://docs.docker.com/build/cache/optimize/#use-cache-mounts (RE: the feature)
81
+ # - https://docs.astral.sh/uv/reference/settings/#link-mode (RE: `UV_LINK_MODE`)
82
+ # - https://docs.astral.sh/uv/reference/cli/#uv-sync--no-install-project (RE: `--no-install-project`)
83
+ #
84
+ # Note: We use `--compile-bytecode` so that Python compiles `.py` files to `.pyc` files now,
85
+ # instead of when the container is running. By default, `uv` defers this compilation
86
+ # to "import time," whereas `pip` (by default) performs it at "install time" (like this).
87
+ #
88
+ # Note: We use `--locked` so that `uv sync` exits with an error if the `uv.lock` file isn't _already_
89
+ # up to date. By default, `uv sync` would automatically update the lock file if necessary.
90
+ # Reference: https://docs.astral.sh/uv/reference/cli/#uv-sync--locked
91
+ #
92
+ ENV VIRTUAL_ENV="/venv"
93
+ RUN mkdir -p "${VIRTUAL_ENV}"
94
+ COPY ./pyproject.toml /code/pyproject.toml
95
+ COPY ./uv.lock /code/uv.lock
96
+ RUN --mount=type=cache,target=/root/.cache/uv \
97
+ cd /code && \
98
+ UV_LINK_MODE=copy uv sync --active --no-dev --no-install-project --compile-bytecode --locked
99
+
100
+ # ────────────────────────────────────────────────────────────────────────────┐
101
+ FROM base AS fastapi
102
+ # ────────────────────────────────────────────────────────────────────────────┘
103
+
104
+ # Copy repository contents into image.
105
+ COPY . /code
106
+
107
+ # Install the project in editable mode.
108
+ RUN --mount=type=cache,target=/root/.cache/uv \
109
+ cd /code && \
110
+ uv sync --active --no-dev --compile-bytecode --locked
111
+
112
+ # Use Uvicorn to serve the FastAPI app on port 8000.
113
+ #
114
+ # Note: We include the `--no-sync` option to prevent `uv run` from automatically syncing dependencies.
115
+ # If it were to sync dependencies at this point, it would install development dependencies:
116
+ # although we excluded them above (via `--no-dev`), uv's `default-groups` setting includes them by default.
117
+ # This is explained at: https://github.com/astral-sh/uv/issues/12558#issuecomment-2764611918
118
+ #
119
+ EXPOSE 8000
120
+ WORKDIR /code
121
+ CMD ["uv", "run", "--active", "--no-sync", "uvicorn", "nmdc_runtime.api.main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "8000"]
122
+
123
+ # ────────────────────────────────────────────────────────────────────────────┐
124
+ FROM base AS dagster
125
+ # ────────────────────────────────────────────────────────────────────────────┘
126
+
127
+ # Copy repository contents into image.
128
+ #
129
+ # Note: This path (i.e. "/opt/dagster/lib/") is hard-coded in a few places in `nmdc_runtime/site/ops.py`. That's why
130
+ # this image does not store the repository contents in `/code`, unlike the other images in this Dockerfile.
131
+ #
132
+ COPY . /opt/dagster/lib
133
+
134
+ # Install the project in editable mode.
135
+ RUN --mount=type=cache,target=/root/.cache/uv \
136
+ cd /opt/dagster/lib && \
137
+ uv sync --active --no-dev --compile-bytecode --locked
138
+
139
+ # Move Dagster configuration files to the place Dagster expects.
140
+ ENV DAGSTER_HOME="/opt/dagster/dagster_home/"
141
+ RUN mkdir -p "${DAGSTER_HOME}" && \
142
+ cp /opt/dagster/lib/nmdc_runtime/site/dagster.yaml "${DAGSTER_HOME}" && \
143
+ cp /opt/dagster/lib/nmdc_runtime/site/workspace.yaml "${DAGSTER_HOME}"
144
+
145
+ # Use Tini to run Dagit.
146
+ #
147
+ # Notes:
148
+ # - The port number (i.e. "3000") is hard-coded in `nmdc_runtime/site/entrypoint-dagit.sh`.
149
+ # - Dagster daemon (versus Dagit) can be launched by overriding the `ENTRYPOINT` defined here.
150
+ #
151
+ # Reference: https://github.com/krallin/tini
152
+ #
153
+ EXPOSE 3000
154
+ WORKDIR /opt/dagster/dagster_home/
155
+ ENTRYPOINT ["tini", "--", "../lib/nmdc_runtime/site/entrypoint-dagit.sh"]
156
+
157
+ # ────────────────────────────────────────────────────────────────────────────┐
158
+ FROM base AS test
159
+ # ────────────────────────────────────────────────────────────────────────────┘
160
+
161
+ # Copy all repository contents into image.
162
+ COPY . /code
163
+
164
+ # Install the project in editable mode, and install development dependencies.
165
+ RUN --mount=type=cache,target=/root/.cache/uv \
166
+ cd /code && \
167
+ uv sync --active --compile-bytecode --locked
168
+
169
+ # Make `wait-for-it.sh` executable.
170
+ RUN chmod +x /code/.docker/wait-for-it.sh
171
+
172
+ WORKDIR /code
173
+
174
+ # Ensure started container does not exit, so that a subsequent `docker exec` command can run tests.
175
+ # For an example `docker exec` command, see `Makefile`'s `run-test` target.
176
+ # Such a command should use `wait-for-it.sh` to run `pytest` no earlier than when the FastAPI server is accessible.
177
+ ENTRYPOINT ["tail", "-f", "/dev/null"]
@@ -16,25 +16,42 @@ from toolz import merge
16
16
 
17
17
  from nmdc_runtime.api.db.mongo import get_mongo_db
18
18
 
19
+ # This is a queue of the "request descriptors" that we will eventually insert into the database.
19
20
  _requests = []
20
21
  _last_posted = datetime.now()
21
22
 
22
23
 
23
24
  def _post_requests(collection: str, requests_data: List[Dict], source: str):
25
+ """Inserts the specified request descriptors into the specified MongoDB collection."""
24
26
  mdb = get_mongo_db()
25
27
  mdb[collection].insert_many([merge(d, {"source": source}) for d in requests_data])
26
28
 
27
29
 
28
30
  def log_request(collection: str, request_data: Dict, source: str = "FastAPI"):
31
+ """Adds the request descriptor to the queue, then flushes the queue to the database if more than 60 seconds have passed since the previous flush."""
29
32
  global _requests, _last_posted
30
33
  _requests.append(request_data)
31
34
  now = datetime.now()
32
35
  # flush queue every minute at most
33
36
  if (now - _last_posted).total_seconds() > 60.0:
37
+ # Note: This use of threading is an attempt to avoid blocking the current thread
38
+ # while performing the insertion(s).
39
+ #
40
+ # TODO: Is there a race condition here? If multiple requests arrive at approximately
41
+ # the same time, is it possible that each one causes a different thread to be
42
+ # started, each with a different (and possibly overlapping) set of requests to
43
+ # insert?
44
+ #
45
+ # TODO: If the insertion fails, will the requests be lost?
46
+ #
47
+ # Note: The author of this function said it may have been a "standard" solution copied
48
+ # from some documentation. Indeed, the comment at the top of this module contains
49
+ # a link to code on which it was based.
50
+ #
34
51
  threading.Thread(
35
52
  target=_post_requests, args=(collection, _requests, source)
36
53
  ).start()
37
- _requests = []
54
+ _requests = [] # empties the queue
38
55
  _last_posted = now
39
56
 
40
57
 
@@ -49,6 +66,9 @@ class Analytics(BaseHTTPMiddleware):
49
66
  start = time()
50
67
  response = await call_next(request)
51
68
 
69
+ # Use a fallback IP address value (currently an empty string) if we can't derive one from the request.
70
+ ip_address: str = "" if request.client is None else request.client.host
71
+
52
72
  # Build a dictionary that describes the incoming request.
53
73
  #
54
74
  # Note: `request.headers` is an instance of `MultiDict`. References:
@@ -57,7 +77,7 @@ class Analytics(BaseHTTPMiddleware):
57
77
  #
58
78
  request_data = {
59
79
  "hostname": request.url.hostname,
60
- "ip_address": request.client.host,
80
+ "ip_address": ip_address,
61
81
  "path": request.url.path,
62
82
  "user_agent": request.headers.get("user-agent"),
63
83
  "method": request.method,
@@ -89,7 +89,35 @@ def generate_ids(
89
89
  shoulder: str = "fk4",
90
90
  ) -> List[str]:
91
91
  r"""
92
- TODO: Document this function.
92
+ Generate the specified number of identifiers, storing them in a MongoDB collection
93
+ whose name is derived from the specified Name-Assigning Authority (NAA) and Shoulder.
94
+
95
+ :param mdb: Handle to a MongoDB database
96
+ :param owner: String that will go in the "__ao" field of the identifier record.
97
+ Callers will oftentimes set this to the name of a Runtime "site"
98
+ (as in, a "site client" site, not a "Dagster" site).
99
+ :param populator: String that will go in the "who" field of the identifier record.
100
+ Indicates "who generated this ID." Callers will oftentimes set
101
+ this to the name of a Runtime "site" (as in, a "site client" site,
102
+ not a "Dagster" site).
103
+ :param ns: Namespace (see Minter docs); e.g. "changesheets"
104
+ :param naa: Name-Assigning Authority (see Minter docs); e.g. "nmdc"
105
+ :param shoulder: String that will go in the "how" field (see Minter docs); e.g. "sys0"
106
+
107
+ This function was written the way it was in an attempt to mirror the ARK spec:
108
+ https://www.ietf.org/archive/id/draft-kunze-ark-41.html (found via: https://arks.org/specs/)
109
+
110
+ Deviations from the ARK spec include:
111
+ 1. The inclusion of a typecode.
112
+ The inclusion of a typecode came out of discussions with team members,
113
+ who wanted identifiers to include some non-opaque substring that could be used
114
+ to determine what type of resource a given identifier refers to.
115
+ 2. Making hyphens mandatory.
116
+ We decided to make the hyphens mandatory, whereas the spec says they are optional.
117
+ > "Hyphens are considered to be insignificant and are always ignored in ARKs."
118
+ > Reference: https://www.ietf.org/archive/id/draft-kunze-ark-41.html#name-character-repertoires
119
+ In our case, we require that users include an identifier's hyphens whenever
120
+ they are using that identifier.
93
121
  """
94
122
  collection = mdb.get_collection(collection_name(naa, shoulder))
95
123
  estimated_document_count = collection.estimated_document_count()
@@ -119,7 +147,9 @@ def generate_ids(
119
147
  if not_taken:
120
148
  # All attribute names beginning with "__a" are reserved...
121
149
  # https://github.com/jkunze/n2t-eggnog/blob/0f0f4c490e6dece507dba710d3557e29b8f6627e/egg#L1882
122
- # XXX mongo is a pain with '.'s in field names, so not using e.g. "_.e" names.
150
+ # The author of this function opted to refrain from using property names beginning with "_.e",
151
+ # because he thought it would complicate MongoDB queries involving those properties, given that
152
+ # the "." is used as a field delimiter in MongoDB syntax (e.g. "foo.bar.baz").
123
153
  docs = [
124
154
  {
125
155
  "@context": "https://n2t.net/e/n2t_apidoc.html#identifier-metadata",
@@ -145,9 +175,9 @@ def generate_ids(
145
175
 
146
176
 
147
177
  def generate_one_id(
148
- mdb: MongoDatabase = None,
178
+ mdb: MongoDatabase,
149
179
  ns: str = "",
150
- shoulder: str = "sys0",
180
+ shoulder: str = "sys0", # "sys0" represents the Runtime
151
181
  ) -> str:
152
182
  """Generate unique Crockford Base32-encoded ID for mdb repository.
153
183
 
@@ -156,8 +186,8 @@ def generate_one_id(
156
186
  """
157
187
  return generate_ids(
158
188
  mdb,
159
- owner="_system",
160
- populator="_system",
189
+ owner="_system", # "_system" represents the Runtime
190
+ populator="_system", # "_system" represents the Runtime
161
191
  number=1,
162
192
  ns=ns,
163
193
  naa="nmdc",
@@ -10,7 +10,6 @@ import bson
10
10
  from jsonschema import Draft7Validator
11
11
  from nmdc_schema.nmdc import Database as NMDCDatabase
12
12
  from pymongo.errors import AutoReconnect, OperationFailure
13
- from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
14
13
  from refscan.lib.Finder import Finder
15
14
  from refscan.scanner import scan_outgoing_references
16
15
  from tenacity import wait_random_exponential, retry, retry_if_exception_type
@@ -83,17 +82,6 @@ def get_session_bound_mongo_db(session=None) -> MongoDatabase:
83
82
  return SessionBoundDatabase(mdb, session) if session is not None else mdb
84
83
 
85
84
 
86
- @lru_cache
87
- def get_async_mongo_db() -> AsyncIOMotorDatabase:
88
- _client = AsyncIOMotorClient(
89
- host=os.getenv("MONGO_HOST"),
90
- username=os.getenv("MONGO_USERNAME"),
91
- password=os.getenv("MONGO_PASSWORD"),
92
- directConnection=True,
93
- )
94
- return _client[os.getenv("MONGO_DBNAME")]
95
-
96
-
97
85
  def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
98
86
  """
99
87
  Returns the names of the collections that (a) exist in the database,