nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/Dockerfile +167 -0
- nmdc_runtime/api/analytics.py +22 -2
- nmdc_runtime/api/core/idgen.py +36 -6
- nmdc_runtime/api/db/mongo.py +0 -12
- nmdc_runtime/api/endpoints/find.py +65 -225
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
- nmdc_runtime/api/endpoints/objects.py +4 -11
- nmdc_runtime/api/endpoints/operations.py +0 -27
- nmdc_runtime/api/endpoints/queries.py +22 -0
- nmdc_runtime/api/endpoints/sites.py +0 -24
- nmdc_runtime/api/endpoints/util.py +57 -35
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +84 -60
- nmdc_runtime/api/models/util.py +12 -5
- nmdc_runtime/api/openapi.py +116 -180
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/minter/adapters/repository.py +21 -0
- nmdc_runtime/minter/domain/model.py +20 -0
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +632 -11
- nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
- nmdc_runtime/site/graphs.py +7 -0
- nmdc_runtime/site/ops.py +92 -34
- nmdc_runtime/site/repository.py +2 -0
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +87 -1
- nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/RECORD +47 -57
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/api/endpoints/ids.py +0 -192
- nmdc_runtime/client/__init__.py +0 -0
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/__init__.py +0 -0
- nmdc_runtime/core/db/Database.py +0 -13
- nmdc_runtime/core/db/__init__.py +0 -0
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/__init__.py +0 -0
- nmdc_runtime/domain/users/__init__.py +0 -0
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/models/user.py +0 -1
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -33
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -825
- nmdc_runtime/lib/nmdc_etl_class.py +0 -396
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/__init__.py +0 -0
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
- nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
|
File without changes
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from datetime import datetime, timezone
|
|
3
|
-
|
|
4
|
-
from toolz import dissoc
|
|
5
|
-
|
|
6
|
-
from nmdc_runtime.api.models.job import JobOperationMetadata
|
|
7
|
-
from nmdc_runtime.api.models.operation import Operation
|
|
8
|
-
from nmdc_runtime.api.models.operation import UpdateOperationRequest
|
|
9
|
-
from nmdc_runtime.api.models.util import ListRequest
|
|
10
|
-
from nmdc_runtime.api.models.util import ResultT
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def load_local_json(url, prefixes_url_to_local=None):
|
|
14
|
-
"""Useful for large files cached on local filesystem.
|
|
15
|
-
|
|
16
|
-
You may, for example, `cp --parents ` many files on a remote filesystem to a staging
|
|
17
|
-
folder on that remote filesystem, gzip that folder, scp it to your local machine, and then
|
|
18
|
-
extract to your local machine.
|
|
19
|
-
|
|
20
|
-
Example:
|
|
21
|
-
prefixes_url_to_local = {
|
|
22
|
-
"https://data.microbiomedata.org/data/": "/Users/dwinston/nmdc_files/2021-09-scanon-meta/ficus/pipeline_products/",
|
|
23
|
-
"https://portal.nersc.gov/project/m3408/": "/Users/dwinston/nmdc_files/2021-09-scanon-meta/www/",
|
|
24
|
-
}
|
|
25
|
-
"""
|
|
26
|
-
path = url
|
|
27
|
-
for before, after in prefixes_url_to_local.items():
|
|
28
|
-
path = path.replace(before, after)
|
|
29
|
-
with open(path) as f:
|
|
30
|
-
return json.load(f)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def claim_metadata_ingest_jobs(
|
|
34
|
-
client, drs_object_ids_to_ingest, wf_id, max_page_size=1000
|
|
35
|
-
):
|
|
36
|
-
lr = ListRequest(
|
|
37
|
-
filter=json.dumps(
|
|
38
|
-
{
|
|
39
|
-
"workflow.id": wf_id,
|
|
40
|
-
"config.object_id": {"$in": drs_object_ids_to_ingest},
|
|
41
|
-
}
|
|
42
|
-
),
|
|
43
|
-
max_page_size=max_page_size,
|
|
44
|
-
)
|
|
45
|
-
jobs = []
|
|
46
|
-
while True:
|
|
47
|
-
rv = client.list_jobs(lr.model_dump()).json()
|
|
48
|
-
jobs.extend(rv["resources"])
|
|
49
|
-
if "next_page_token" not in rv:
|
|
50
|
-
break
|
|
51
|
-
else:
|
|
52
|
-
lr.page_token = rv["next_page_token"]
|
|
53
|
-
|
|
54
|
-
# safety escape
|
|
55
|
-
if len(jobs) == len(drs_object_ids_to_ingest):
|
|
56
|
-
break
|
|
57
|
-
|
|
58
|
-
job_claim_responses = [client.claim_job(j["id"]) for j in jobs]
|
|
59
|
-
|
|
60
|
-
return job_claim_responses
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def mongo_add_docs_result_as_dict(rv):
|
|
64
|
-
return {
|
|
65
|
-
collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
|
|
66
|
-
for collection_name, bulk_write_result in rv.items()
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def get_metadata_ingest_job_ops(mongo, wf_id, drs_object_ids_to_ingest):
|
|
71
|
-
return list(
|
|
72
|
-
mongo.db.operations.find(
|
|
73
|
-
{
|
|
74
|
-
"metadata.job.workflow.id": wf_id,
|
|
75
|
-
"metadata.job.config.object_id": {"$in": drs_object_ids_to_ingest},
|
|
76
|
-
"done": False,
|
|
77
|
-
}
|
|
78
|
-
)
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def do_metadata_ingest_job(client, mongo, job_op_doc):
|
|
83
|
-
op = Operation[ResultT, JobOperationMetadata](**job_op_doc)
|
|
84
|
-
object_info = client.get_object_info(op.metadata.job.config["object_id"]).json()
|
|
85
|
-
url = object_info["access_methods"][0]["access_url"]["url"]
|
|
86
|
-
docs = load_local_json(url)
|
|
87
|
-
op_result = mongo.add_docs(docs, validate=False, replace=False)
|
|
88
|
-
op_patch = UpdateOperationRequest(
|
|
89
|
-
done=True,
|
|
90
|
-
result=mongo_add_docs_result_as_dict(op_result),
|
|
91
|
-
metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
|
|
92
|
-
)
|
|
93
|
-
return client.update_operation(op.id, op_patch)
|
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import re
|
|
4
|
-
from datetime import datetime, timezone, timedelta
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from tempfile import TemporaryDirectory
|
|
7
|
-
|
|
8
|
-
import requests
|
|
9
|
-
from bs4 import BeautifulSoup
|
|
10
|
-
|
|
11
|
-
from nmdc_runtime.api.models.object import DrsObjectIn
|
|
12
|
-
from nmdc_runtime.util import (
|
|
13
|
-
drs_metadata_for,
|
|
14
|
-
nmdc_jsonschema_validator,
|
|
15
|
-
specialize_activity_set_docs,
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
pattern = re.compile(r"https?://(?P<domain>[^/]+)/(?P<path>.+)")
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def url_to_name(url):
|
|
22
|
-
m = pattern.match(url)
|
|
23
|
-
return (
|
|
24
|
-
f"{'.'.join(reversed(m.group('domain').split('.')))}"
|
|
25
|
-
f"__{m.group('path').replace('/', '.')}"
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def fetch_url(url, timeout=30):
|
|
30
|
-
return requests.get(url, timeout=timeout)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class HttpResponseNotOk(Exception):
|
|
34
|
-
pass
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class HttpResponseNotJson(Exception):
|
|
38
|
-
pass
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def response_to_json(response):
|
|
42
|
-
if response.status_code != 200:
|
|
43
|
-
raise HttpResponseNotOk()
|
|
44
|
-
try:
|
|
45
|
-
json_data = response.json()
|
|
46
|
-
except ValueError:
|
|
47
|
-
raise HttpResponseNotJson()
|
|
48
|
-
return json_data
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def json_data_from_url_to_file(json_data, url, save_dir):
|
|
52
|
-
filepath = os.path.join(save_dir, url_to_name(url))
|
|
53
|
-
with open(filepath, "w") as f:
|
|
54
|
-
json.dump(json_data, f)
|
|
55
|
-
return filepath
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def json_clean(d, model, exclude_unset=False):
|
|
59
|
-
return json.loads(model(**d).json(exclude_unset=exclude_unset))
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def drs_object_in_for(url):
|
|
63
|
-
with TemporaryDirectory() as save_dir:
|
|
64
|
-
response = fetch_url(url)
|
|
65
|
-
try:
|
|
66
|
-
json_data = response_to_json(response)
|
|
67
|
-
except HttpResponseNotOk:
|
|
68
|
-
return {"error": "HttpResponseNotOk"}
|
|
69
|
-
|
|
70
|
-
except HttpResponseNotJson:
|
|
71
|
-
return {"error": "HttpResponseNotJson"}
|
|
72
|
-
|
|
73
|
-
filepath = json_data_from_url_to_file(json_data, url, save_dir)
|
|
74
|
-
drs_object_in = DrsObjectIn(
|
|
75
|
-
**drs_metadata_for(
|
|
76
|
-
filepath,
|
|
77
|
-
{
|
|
78
|
-
"access_methods": [{"access_url": {"url": url}}],
|
|
79
|
-
"name": Path(filepath).name.replace(":", "-"),
|
|
80
|
-
},
|
|
81
|
-
)
|
|
82
|
-
)
|
|
83
|
-
return {"result": drs_object_in}
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def create_drs_object_for(url, drs_object_in, client):
|
|
87
|
-
rv = client.create_object(json.loads(drs_object_in.json(exclude_unset=True)))
|
|
88
|
-
return {"url": url, "response": rv}
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def validate_as_metadata_and_ensure_tags_for(
|
|
92
|
-
drs_id, client, tags=("schema#/definitions/Database", "metadata-in")
|
|
93
|
-
):
|
|
94
|
-
docs = client.get_object_bytes(drs_id).json()
|
|
95
|
-
docs, _ = specialize_activity_set_docs(docs)
|
|
96
|
-
_ = nmdc_jsonschema_validator(docs)
|
|
97
|
-
return {tag: client.ensure_object_tag(drs_id, tag) for tag in tags}
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def recent_metadata_urls(
|
|
101
|
-
urlpath="https://portal.nersc.gov/project/m3408/meta/anno2/",
|
|
102
|
-
urlpath_extra="?C=M;O=D",
|
|
103
|
-
since="2021-09",
|
|
104
|
-
):
|
|
105
|
-
"""Scrapes recent URLs from Apache/2.4.38 (Debian) Server listing.
|
|
106
|
-
|
|
107
|
-
Designed with urlpath.startwsith("https://portal.nersc.gov/project/m3408/") in mind.
|
|
108
|
-
"""
|
|
109
|
-
if since is None:
|
|
110
|
-
now = datetime.now(timezone.utc)
|
|
111
|
-
recent_enuf = now - timedelta(days=30)
|
|
112
|
-
since = f"{recent_enuf.year}-{recent_enuf.month}"
|
|
113
|
-
|
|
114
|
-
rv = requests.get(f"{urlpath}{urlpath_extra}")
|
|
115
|
-
|
|
116
|
-
soup = BeautifulSoup(rv.text, "html.parser")
|
|
117
|
-
|
|
118
|
-
urls = []
|
|
119
|
-
|
|
120
|
-
for tr in soup.find_all("tr"):
|
|
121
|
-
tds = tr.find_all("td")
|
|
122
|
-
if len(tds) != 5:
|
|
123
|
-
continue
|
|
124
|
-
|
|
125
|
-
_, td_name, td_last_modified, td_size, _ = tds
|
|
126
|
-
if td_last_modified.text.startswith(since):
|
|
127
|
-
name = td_name.a.text
|
|
128
|
-
if name.endswith(".json"):
|
|
129
|
-
urls.append(f"{urlpath}{name}")
|
|
130
|
-
|
|
131
|
-
return urls
|
|
@@ -1,265 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: nmdc_runtime
|
|
3
|
-
Version: 2.10.0
|
|
4
|
-
Summary: A runtime system for NMDC data management and orchestration
|
|
5
|
-
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
|
-
Author: Donny Winston
|
|
7
|
-
Author-email: donny@polyneme.xyz
|
|
8
|
-
Classifier: Development Status :: 3 - Alpha
|
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
|
10
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
-
Requires-Python: >=3.10
|
|
12
|
-
Description-Content-Type: text/markdown
|
|
13
|
-
License-File: LICENSE
|
|
14
|
-
Dynamic: author
|
|
15
|
-
Dynamic: author-email
|
|
16
|
-
Dynamic: classifier
|
|
17
|
-
Dynamic: description
|
|
18
|
-
Dynamic: description-content-type
|
|
19
|
-
Dynamic: home-page
|
|
20
|
-
Dynamic: license-file
|
|
21
|
-
Dynamic: requires-python
|
|
22
|
-
Dynamic: summary
|
|
23
|
-
|
|
24
|
-
A runtime system for NMDC data management and orchestration.
|
|
25
|
-
|
|
26
|
-
## Service Status
|
|
27
|
-
|
|
28
|
-
http://nmdcstatus.polyneme.xyz/
|
|
29
|
-
|
|
30
|
-
## How It Fits In
|
|
31
|
-
|
|
32
|
-
* [issues](https://github.com/microbiomedata/issues)
|
|
33
|
-
tracks issues related to NMDC, which may necessitate work across multiple repos.
|
|
34
|
-
|
|
35
|
-
* [nmdc-schema](https://github.com/microbiomedata/nmdc-schema/)
|
|
36
|
-
houses the LinkML schema specification, as well as generated artifacts (e.g. JSON Schema).
|
|
37
|
-
|
|
38
|
-
* [nmdc-server](https://github.com/microbiomedata/nmdc-server)
|
|
39
|
-
houses code specific to the data portal -- its database, back-end API, and front-end application.
|
|
40
|
-
|
|
41
|
-
* Workflows — documented in the [workflows](https://docs.microbiomedata.org/workflows/) section of the NMDC documentation website — take source data and produce computed data.
|
|
42
|
-
|
|
43
|
-
* This repo (nmdc-runtime)
|
|
44
|
-
* houses code that takes source data and computed data, and transforms it
|
|
45
|
-
to broadly accommodate downstream applications such as the data portal
|
|
46
|
-
* manages execution of the above (i.e., lightweight data transformations) and also
|
|
47
|
-
of computationally- and data-intensive workflows performed at other sites,
|
|
48
|
-
ensuring that claimed jobs have access to needed configuration and data resources.
|
|
49
|
-
|
|
50
|
-
## Data exports
|
|
51
|
-
|
|
52
|
-
The NMDC metadata as of 2021-10 is available here:
|
|
53
|
-
|
|
54
|
-
https://drs.microbiomedata.org/ga4gh/drs/v1/objects/sys086d541
|
|
55
|
-
|
|
56
|
-
The link returns a [GA4GH DRS API bundle object record](https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.0.0/docs/#_drs_datatypes), with the NMDC metadata collections (study_set, biosample_set, etc.) as contents, each a DRS API blob object.
|
|
57
|
-
|
|
58
|
-
For example, the blob for the study_set collection export, named "study_set.jsonl.gz", is listed with DRS API ID "sys0xsry70". Thus, it is retrievable via
|
|
59
|
-
|
|
60
|
-
https://drs.microbiomedata.org/ga4gh/drs/v1/objects/sys0xsry70
|
|
61
|
-
|
|
62
|
-
The returned blob object record lists https://nmdc-runtime.files.polyneme.xyz/nmdcdb-mongoexport/2021-10-14/study_set.jsonl.gz as the url for an access method.
|
|
63
|
-
|
|
64
|
-
The 2021-10 exports are currently all accessible at `https://nmdc-runtime.files.polyneme.xyz/nmdcdb-mongoexport/2021-10-14/${COLLECTION_NAME}.jsonl.gz`, but the DRS API indirection allows these links to change in the future, for mirroring via other URLs, etc. So, the DRS API links should be the links you share.
|
|
65
|
-
|
|
66
|
-
## Overview
|
|
67
|
-
|
|
68
|
-
The runtime features:
|
|
69
|
-
|
|
70
|
-
1. [Dagster](https://docs.dagster.io/concepts) orchestration:
|
|
71
|
-
- dagit - a web UI to monitor and manage the running system.
|
|
72
|
-
- dagster-daemon - a service that triggers pipeline runs based on time or external state.
|
|
73
|
-
- PostgreSQL database - for storing run history, event logs, and scheduler state.
|
|
74
|
-
- workspace code
|
|
75
|
-
- Code to run is loaded into a Dagster `workspace`. This code is loaded from
|
|
76
|
-
one or more dagster `repositories`. Each Dagster `repository` may be run with a different
|
|
77
|
-
Python virtual environment if need be, and may be loaded from a local Python file or
|
|
78
|
-
`pip install`ed from an external source. In our case, each Dagster `repository` is simply
|
|
79
|
-
loaded from a Python file local to the nmdc-runtime GitHub repository, and all code is
|
|
80
|
-
run in the same Python environment.
|
|
81
|
-
- A Dagster repository consists of `solids` and `pipelines`,
|
|
82
|
-
and optionally `schedules` and `sensors`.
|
|
83
|
-
- `solids` represent individual units of computation
|
|
84
|
-
- `pipelines` are built up from solids
|
|
85
|
-
- `schedules` trigger recurring pipeline runs based on time
|
|
86
|
-
- `sensors` trigger pipeline runs based on external state
|
|
87
|
-
- Each `pipeline` can declare dependencies on any runtime `resources` or additional
|
|
88
|
-
configuration. There are MongoDB `resources` defined, as well as `preset`
|
|
89
|
-
configuration definitions for both "dev" and "prod" `modes`. The `preset`s tell Dagster to
|
|
90
|
-
look to a set of known environment variables to load resources configurations, depending on
|
|
91
|
-
the `mode`.
|
|
92
|
-
|
|
93
|
-
2. A MongoDB database supporting write-once, high-throughput internal
|
|
94
|
-
data storage by the nmdc-runtime FastAPI instance.
|
|
95
|
-
|
|
96
|
-
3. A [FastAPI](https://fastapi.tiangolo.com/) service to interface with the orchestrator and
|
|
97
|
-
database, as a hub for data management and workflow automation.
|
|
98
|
-
|
|
99
|
-
## Local Development
|
|
100
|
-
|
|
101
|
-
Ensure Docker (and Docker Compose) are installed, and that the Docker engine is running.
|
|
102
|
-
|
|
103
|
-
```shell
|
|
104
|
-
docker --version
|
|
105
|
-
docker compose version
|
|
106
|
-
docker info
|
|
107
|
-
```
|
|
108
|
-
|
|
109
|
-
Ensure the permissions of `./.docker/mongoKeyFile` are such that only the file's owner can read or write the file.
|
|
110
|
-
|
|
111
|
-
```shell
|
|
112
|
-
chmod 600 ./.docker/mongoKeyFile
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
Ensure you have a `.env` file for the Docker services to source from. You may copy `.env.example` to
|
|
116
|
-
`.env` (which is gitignore'd) to get started.
|
|
117
|
-
|
|
118
|
-
```shell
|
|
119
|
-
cp .env.example .env
|
|
120
|
-
```
|
|
121
|
-
|
|
122
|
-
Create environment variables in your shell session, based upon the contents of the `.env` file.
|
|
123
|
-
|
|
124
|
-
```shell
|
|
125
|
-
set -a # automatically export all variables
|
|
126
|
-
source .env
|
|
127
|
-
set +a
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
If you are connecting to resources that require an SSH tunnel—for example, a MongoDB server that is only accessible on
|
|
131
|
-
the NERSC network—set up the SSH tunnel.
|
|
132
|
-
|
|
133
|
-
The following command could be useful to you, either directly or as a template (see `Makefile`).
|
|
134
|
-
|
|
135
|
-
```shell
|
|
136
|
-
make nersc-mongo-tunnels
|
|
137
|
-
```
|
|
138
|
-
|
|
139
|
-
Finally, spin up the Docker Compose stack.
|
|
140
|
-
|
|
141
|
-
```bash
|
|
142
|
-
make up-dev
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
Docker Compose is used to start local MongoDB and PostgreSQL (used by Dagster) instances, as well
|
|
146
|
-
as a Dagster web server (dagit) and daemon (dagster-daemon).
|
|
147
|
-
|
|
148
|
-
The Dagit web server is viewable at http://127.0.0.1:3000/.
|
|
149
|
-
|
|
150
|
-
The FastAPI service is viewable at http://127.0.0.1:8000/ -- e.g., rendered documentation at
|
|
151
|
-
http://127.0.0.1:8000/redoc/.
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
* NOTE: Any time you add or change requirements in requirements/main.in or requirements/dev.in, you must run:
|
|
155
|
-
```bash
|
|
156
|
-
pip-compile --build-isolation --allow-unsafe --resolver=backtracking --strip-extras --output-file requirements/[main|dev].txt requirements/[main|dev].in
|
|
157
|
-
```
|
|
158
|
-
to generate main.txt and dev.txt files respectively. main.in is kind of like a poetry dependency stanza, dev.in is kind
|
|
159
|
-
of like poetry dev.dependencies stanza. main.txt and dev.txt are kind of like poetry.lock files to specify the exact
|
|
160
|
-
versions of dependencies to use. main.txt and dev.txt are combined in the docker compose build process to create the
|
|
161
|
-
final requirements.txt file and import the dependencies into the Docker image.
|
|
162
|
-
|
|
163
|
-
## Local Testing
|
|
164
|
-
|
|
165
|
-
Tests can be found in `tests` and are run with the following commands:
|
|
166
|
-
|
|
167
|
-
```bash
|
|
168
|
-
make up-test
|
|
169
|
-
make test
|
|
170
|
-
|
|
171
|
-
# Run a specific test file, e.g., tests/test_api/test_endpoints.py
|
|
172
|
-
make test ARGS="tests/test_api/test_endpoints.py"
|
|
173
|
-
|
|
174
|
-
docker compose --file docker-compose.test.yml run test
|
|
175
|
-
```
|
|
176
|
-
|
|
177
|
-
As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
|
|
178
|
-
desired and does not break over time.
|
|
179
|
-
|
|
180
|
-
[For hints on how to write tests for solids and pipelines in Dagster, see their documentation
|
|
181
|
-
tutorial on Testing](https://docs.dagster.io/guides/test/unit-testing-assets-and-ops).
|
|
182
|
-
|
|
183
|
-
### Performance profiling
|
|
184
|
-
|
|
185
|
-
We use a tool called [Pyinstrument](https://pyinstrument.readthedocs.io) to profile the performance of the Runtime API while processing an individual HTTP request.
|
|
186
|
-
|
|
187
|
-
Here's how you can do that:
|
|
188
|
-
|
|
189
|
-
1. In your `.env` file, set `IS_PROFILING_ENABLED` to `true`
|
|
190
|
-
2. Start/restart your development stack: `$ make up-dev`
|
|
191
|
-
3. Ensure the endpoint function whose performance you want to profile is defined using `async def` (as opposed to just `def`) ([reference](https://github.com/joerick/pyinstrument/issues/257))
|
|
192
|
-
|
|
193
|
-
Then—with all of that done—submit an HTTP request that includes the URL query parameter: `profile=true`. Instructions for doing that are in the sections below.
|
|
194
|
-
|
|
195
|
-
<details>
|
|
196
|
-
<summary>Show/hide instructions for <code>GET</code> requests only (involves web browser)</summary>
|
|
197
|
-
|
|
198
|
-
1. In your web browser, visit the endpoint's URL, but add the `profile=true` query parameter to the URL. Examples:
|
|
199
|
-
```diff
|
|
200
|
-
A. If the URL doesn't already have query parameters, append `?profile=true`.
|
|
201
|
-
- http://127.0.0.1:8000/nmdcschema/biosample_set
|
|
202
|
-
+ http://127.0.0.1:8000/nmdcschema/biosample_set?profile=true
|
|
203
|
-
|
|
204
|
-
B. If the URL already has query parameters, append `&profile=true`.
|
|
205
|
-
- http://127.0.0.1:8000/nmdcschema/biosample_set?filter={}
|
|
206
|
-
+ http://127.0.0.1:8000/nmdcschema/biosample_set?filter={}&profile=true
|
|
207
|
-
```
|
|
208
|
-
2. Your web browser will display a performance profiling report.
|
|
209
|
-
> Note: The Runtime API will have responded with a performance profiling report web page, instead of its normal response (which the Runtime discards).
|
|
210
|
-
|
|
211
|
-
That'll only work for `GET` requests, though, since you're limited to specifying the request via the address bar.
|
|
212
|
-
|
|
213
|
-
</details>
|
|
214
|
-
|
|
215
|
-
<details>
|
|
216
|
-
<summary>Show/hide instructions for <strong>all</strong> kinds of requests (involves <code>curl</code> + web browser)</summary>
|
|
217
|
-
|
|
218
|
-
1. At your terminal, type or paste the `curl` command you want to run (you can copy/paste one from Swagger UI).
|
|
219
|
-
2. Append the `profile=true` query parameter to the URL in the command, and use the `-o` option to save the response to a file whose name ends with `.html`. For example:
|
|
220
|
-
```diff
|
|
221
|
-
curl -X 'POST' \
|
|
222
|
-
- 'http://127.0.0.1:8000/metadata/json:validate' \
|
|
223
|
-
+ 'http://127.0.0.1:8000/metadata/json:validate?profile=true' \
|
|
224
|
-
+ -o /tmp/profile.html
|
|
225
|
-
-H 'accept: application/json' \
|
|
226
|
-
-H 'Content-Type: application/json' \
|
|
227
|
-
-d '{"biosample_set": []}'
|
|
228
|
-
```
|
|
229
|
-
3. Run the command.
|
|
230
|
-
> Note: The Runtime API will respond with a performance profiling report web page, instead of its normal response (which the Runtime discards). The performance profiling report web page will be saved to the `.html` file to which you redirected the command output.
|
|
231
|
-
4. Double-click on the `.html` file to view it in your web browser.
|
|
232
|
-
1. Alternatively, open your web browser and navigate to the `.html` file; e.g., enter `file:///tmp/profile.html` into the address bar.
|
|
233
|
-
|
|
234
|
-
</details>
|
|
235
|
-
|
|
236
|
-
### RAM usage
|
|
237
|
-
|
|
238
|
-
The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
|
|
239
|
-
the `test` container shows "Error 137," here is something you can try as a workaround: In Docker Desktop, go to
|
|
240
|
-
"Settings > Resources > Advanced," and increase the memory limit. One of our team members has
|
|
241
|
-
found **12 GB** to be sufficient for running the tests.
|
|
242
|
-
|
|
243
|
-
> Dedicating 12 GB of RAM to Docker may be prohibitive for some prospective developers.
|
|
244
|
-
> There is an open [issue](https://github.com/microbiomedata/nmdc-runtime/issues/928) about the memory requirement.
|
|
245
|
-
|
|
246
|
-
## Publish to PyPI
|
|
247
|
-
|
|
248
|
-
This repository contains a GitHub Actions workflow that publishes a Python package to [PyPI](https://pypi.org/project/nmdc-runtime/).
|
|
249
|
-
|
|
250
|
-
You can also _manually_ publish the Python package to PyPI by issuing the following commands in the root directory of the repository:
|
|
251
|
-
|
|
252
|
-
```
|
|
253
|
-
rm -rf dist
|
|
254
|
-
python -m build
|
|
255
|
-
twine upload dist/*
|
|
256
|
-
```
|
|
257
|
-
|
|
258
|
-
## Links
|
|
259
|
-
|
|
260
|
-
Here are links related to this repository:
|
|
261
|
-
|
|
262
|
-
- Production API server: https://api.microbiomedata.org
|
|
263
|
-
- PyPI package: https://pypi.org/project/nmdc-runtime
|
|
264
|
-
- DockerHub image (API server): https://hub.docker.com/r/microbiomedata/nmdc-runtime-fastapi
|
|
265
|
-
- DockerHub image (Dagster): https://hub.docker.com/r/microbiomedata/nmdc-runtime-dagster
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
nmdc_runtime
|
|
File without changes
|
|
File without changes
|