nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/site/ops.py
CHANGED
|
@@ -1,16 +1,21 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import json
|
|
3
|
-
import
|
|
3
|
+
import logging
|
|
4
4
|
import os
|
|
5
5
|
import subprocess
|
|
6
|
-
import tempfile
|
|
7
6
|
from collections import defaultdict
|
|
8
7
|
from datetime import datetime, timezone
|
|
9
|
-
from io import BytesIO
|
|
10
|
-
from
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from pprint import pformat
|
|
10
|
+
from toolz.dicttoolz import keyfilter
|
|
11
|
+
from typing import Tuple, Set
|
|
11
12
|
from zipfile import ZipFile
|
|
13
|
+
from itertools import chain
|
|
14
|
+
from ontology_loader.ontology_load_controller import OntologyLoaderController
|
|
12
15
|
import pandas as pd
|
|
13
16
|
import requests
|
|
17
|
+
from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
|
|
18
|
+
from toolz import dissoc
|
|
14
19
|
|
|
15
20
|
from bson import ObjectId, json_util
|
|
16
21
|
from dagster import (
|
|
@@ -21,6 +26,7 @@ from dagster import (
|
|
|
21
26
|
Failure,
|
|
22
27
|
List,
|
|
23
28
|
MetadataValue,
|
|
29
|
+
Noneable,
|
|
24
30
|
OpExecutionContext,
|
|
25
31
|
Out,
|
|
26
32
|
Output,
|
|
@@ -29,10 +35,15 @@ from dagster import (
|
|
|
29
35
|
String,
|
|
30
36
|
op,
|
|
31
37
|
Optional,
|
|
38
|
+
Field,
|
|
39
|
+
Permissive,
|
|
40
|
+
In,
|
|
41
|
+
Nothing,
|
|
32
42
|
)
|
|
33
43
|
from gridfs import GridFS
|
|
34
|
-
from linkml_runtime.
|
|
44
|
+
from linkml_runtime.utils.dictutils import as_simple_dict
|
|
35
45
|
from linkml_runtime.utils.yamlutils import YAMLRoot
|
|
46
|
+
from nmdc_runtime.api.db.mongo import validate_json
|
|
36
47
|
from nmdc_runtime.api.core.idgen import generate_one_id
|
|
37
48
|
from nmdc_runtime.api.core.metadata import (
|
|
38
49
|
_validate_changesheet,
|
|
@@ -42,6 +53,7 @@ from nmdc_runtime.api.core.metadata import (
|
|
|
42
53
|
)
|
|
43
54
|
from nmdc_runtime.api.core.util import dotted_path_for, hash_from_str, json_clean, now
|
|
44
55
|
from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object
|
|
56
|
+
from nmdc_runtime.api.endpoints.find import find_study_by_id
|
|
45
57
|
from nmdc_runtime.api.models.job import Job, JobOperationMetadata
|
|
46
58
|
from nmdc_runtime.api.models.metadata import ChangesheetIn
|
|
47
59
|
from nmdc_runtime.api.models.operation import (
|
|
@@ -55,36 +67,53 @@ from nmdc_runtime.api.models.run import (
|
|
|
55
67
|
_add_run_complete_event,
|
|
56
68
|
)
|
|
57
69
|
from nmdc_runtime.api.models.util import ResultT
|
|
58
|
-
from nmdc_runtime.site.
|
|
70
|
+
from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
|
|
71
|
+
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
72
|
+
fetch_data_objects_from_biosamples,
|
|
73
|
+
fetch_nucleotide_sequencing_from_biosamples,
|
|
74
|
+
fetch_library_preparation_from_biosamples,
|
|
75
|
+
)
|
|
59
76
|
from nmdc_runtime.site.resources import (
|
|
60
77
|
NmdcPortalApiClient,
|
|
61
78
|
GoldApiClient,
|
|
62
79
|
RuntimeApiSiteClient,
|
|
63
80
|
RuntimeApiUserClient,
|
|
64
81
|
NeonApiClient,
|
|
82
|
+
MongoDB as MongoDBResource,
|
|
65
83
|
)
|
|
66
84
|
from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
|
|
67
85
|
from nmdc_runtime.site.translation.neon_soil_translator import NeonSoilDataTranslator
|
|
68
86
|
from nmdc_runtime.site.translation.neon_benthic_translator import (
|
|
69
87
|
NeonBenthicDataTranslator,
|
|
70
88
|
)
|
|
89
|
+
from nmdc_runtime.site.translation.neon_surface_water_translator import (
|
|
90
|
+
NeonSurfaceWaterDataTranslator,
|
|
91
|
+
)
|
|
71
92
|
from nmdc_runtime.site.translation.submission_portal_translator import (
|
|
72
93
|
SubmissionPortalTranslator,
|
|
73
94
|
)
|
|
74
|
-
from nmdc_runtime.site.
|
|
95
|
+
from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
|
|
96
|
+
from nmdc_runtime.site.util import (
|
|
97
|
+
schema_collection_has_index_on_id,
|
|
98
|
+
nmdc_study_id_to_filename,
|
|
99
|
+
get_instruments_by_id,
|
|
100
|
+
)
|
|
75
101
|
from nmdc_runtime.util import (
|
|
76
|
-
drs_object_in_for,
|
|
77
102
|
pluralize,
|
|
78
|
-
put_object,
|
|
79
|
-
validate_json,
|
|
80
103
|
specialize_activity_set_docs,
|
|
104
|
+
collection_name_to_class_names,
|
|
105
|
+
nmdc_schema_view,
|
|
106
|
+
populated_schema_collection_names_with_id_field,
|
|
81
107
|
)
|
|
82
108
|
from nmdc_schema import nmdc
|
|
83
|
-
from
|
|
109
|
+
from pymongo import InsertOne, UpdateOne
|
|
84
110
|
from pymongo.database import Database as MongoDatabase
|
|
85
|
-
from
|
|
86
|
-
from
|
|
87
|
-
|
|
111
|
+
from pymongo.collection import Collection as MongoCollection
|
|
112
|
+
from toolz import get_in, valfilter, identity
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# batch size for writing documents to alldocs
|
|
116
|
+
BULK_WRITE_BATCH_SIZE = 2000
|
|
88
117
|
|
|
89
118
|
|
|
90
119
|
@op
|
|
@@ -108,14 +137,6 @@ def log_env(context):
|
|
|
108
137
|
context.log.info("\n".join(out))
|
|
109
138
|
|
|
110
139
|
|
|
111
|
-
@op(required_resource_keys={"terminus"})
|
|
112
|
-
def list_databases(context) -> List[String]:
|
|
113
|
-
client = context.resources.terminus.client
|
|
114
|
-
list_ = client.list_databases()
|
|
115
|
-
context.log.info(f"databases: {list_}")
|
|
116
|
-
return list_
|
|
117
|
-
|
|
118
|
-
|
|
119
140
|
@op(required_resource_keys={"mongo"})
|
|
120
141
|
def mongo_stats(context) -> List[str]:
|
|
121
142
|
db = context.resources.mongo.db
|
|
@@ -124,134 +145,6 @@ def mongo_stats(context) -> List[str]:
|
|
|
124
145
|
return collection_names
|
|
125
146
|
|
|
126
147
|
|
|
127
|
-
@op(required_resource_keys={"terminus"})
|
|
128
|
-
def update_schema(context):
|
|
129
|
-
with tempfile.TemporaryDirectory() as tmpdirname:
|
|
130
|
-
try:
|
|
131
|
-
context.log.info("shallow-cloning nmdc-schema repo")
|
|
132
|
-
subprocess.check_output(
|
|
133
|
-
"git clone https://github.com/microbiomedata/nmdc-schema.git"
|
|
134
|
-
f" --branch main --single-branch {tmpdirname}/nmdc-schema",
|
|
135
|
-
shell=True,
|
|
136
|
-
)
|
|
137
|
-
context.log.info("generating TerminusDB JSON-LD from NMDC LinkML")
|
|
138
|
-
subprocess.check_output(
|
|
139
|
-
f"gen-terminusdb {tmpdirname}/nmdc-schema/src/schema/nmdc.yaml"
|
|
140
|
-
f" > {tmpdirname}/nmdc.terminus.json",
|
|
141
|
-
shell=True,
|
|
142
|
-
)
|
|
143
|
-
except subprocess.CalledProcessError as e:
|
|
144
|
-
if e.stdout:
|
|
145
|
-
context.log.debug(e.stdout.decode())
|
|
146
|
-
if e.stderr:
|
|
147
|
-
context.log.error(e.stderr.decode())
|
|
148
|
-
context.log.debug(str(e.returncode))
|
|
149
|
-
raise e
|
|
150
|
-
|
|
151
|
-
with open(f"{tmpdirname}/nmdc.terminus.json") as f:
|
|
152
|
-
woql_dict = json.load(f)
|
|
153
|
-
|
|
154
|
-
context.log.info("Updating terminus schema via WOQLQuery")
|
|
155
|
-
rv = WQ(query=woql_dict).execute(
|
|
156
|
-
context.resources.terminus.client, "update schema via WOQL"
|
|
157
|
-
)
|
|
158
|
-
context.log.info(str(rv))
|
|
159
|
-
return rv
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
@op(
|
|
163
|
-
required_resource_keys={"mongo", "runtime_api_site_client"},
|
|
164
|
-
retry_policy=RetryPolicy(max_retries=2),
|
|
165
|
-
)
|
|
166
|
-
def local_file_to_api_object(context, file_info):
|
|
167
|
-
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
168
|
-
storage_path: str = file_info["storage_path"]
|
|
169
|
-
mime_type = file_info.get("mime_type")
|
|
170
|
-
if mime_type is None:
|
|
171
|
-
mime_type = mimetypes.guess_type(storage_path)[0]
|
|
172
|
-
rv = client.put_object_in_site(
|
|
173
|
-
{"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
|
|
174
|
-
)
|
|
175
|
-
if not rv.status_code == status.HTTP_200_OK:
|
|
176
|
-
raise Failure(description=f"put_object_in_site failed: {rv.content}")
|
|
177
|
-
op = rv.json()
|
|
178
|
-
context.log.info(f"put_object_in_site: {op}")
|
|
179
|
-
rv = put_object(storage_path, op["metadata"]["url"])
|
|
180
|
-
if not rv.status_code == status.HTTP_200_OK:
|
|
181
|
-
raise Failure(description=f"put_object failed: {rv.content}")
|
|
182
|
-
op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
|
|
183
|
-
rv = client.update_operation(op["id"], op_patch)
|
|
184
|
-
if not rv.status_code == status.HTTP_200_OK:
|
|
185
|
-
raise Failure(description="update_operation failed")
|
|
186
|
-
op = rv.json()
|
|
187
|
-
context.log.info(f"update_operation: {op}")
|
|
188
|
-
rv = client.create_object_from_op(op)
|
|
189
|
-
if rv.status_code != status.HTTP_201_CREATED:
|
|
190
|
-
raise Failure("create_object_from_op failed")
|
|
191
|
-
obj = rv.json()
|
|
192
|
-
context.log.info(f'Created /objects/{obj["id"]}')
|
|
193
|
-
mdb = context.resources.mongo.db
|
|
194
|
-
rv = mdb.operations.delete_one({"id": op["id"]})
|
|
195
|
-
if rv.deleted_count != 1:
|
|
196
|
-
context.log.error("deleting op failed")
|
|
197
|
-
yield AssetMaterialization(
|
|
198
|
-
asset_key=AssetKey(["object", obj["name"]]),
|
|
199
|
-
description="output of metadata-translation run_etl",
|
|
200
|
-
metadata={"object_id": MetadataValue.text(obj["id"])},
|
|
201
|
-
)
|
|
202
|
-
yield Output(obj)
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
@op(
|
|
206
|
-
out={
|
|
207
|
-
"merged_data_path": Out(
|
|
208
|
-
str,
|
|
209
|
-
description="path to TSV merging of source metadata",
|
|
210
|
-
)
|
|
211
|
-
}
|
|
212
|
-
)
|
|
213
|
-
def build_merged_db(context) -> str:
|
|
214
|
-
context.log.info("metadata-translation: running `make build-merged-db`")
|
|
215
|
-
run_and_log(
|
|
216
|
-
"cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
|
|
217
|
-
)
|
|
218
|
-
storage_path = (
|
|
219
|
-
"/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
|
|
220
|
-
)
|
|
221
|
-
yield AssetMaterialization(
|
|
222
|
-
asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
|
|
223
|
-
description="input to metadata-translation run_etl",
|
|
224
|
-
metadata={"path": MetadataValue.path(storage_path)},
|
|
225
|
-
)
|
|
226
|
-
yield Output(storage_path, "merged_data_path")
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
@op(
|
|
230
|
-
required_resource_keys={"runtime_api_site_client"},
|
|
231
|
-
)
|
|
232
|
-
def run_etl(context, merged_data_path: str):
|
|
233
|
-
context.log.info("metadata-translation: running `make run-etl`")
|
|
234
|
-
if not os.path.exists(merged_data_path):
|
|
235
|
-
raise Failure(description=f"merged_db not present at {merged_data_path}")
|
|
236
|
-
run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
|
|
237
|
-
storage_path = (
|
|
238
|
-
"/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
|
|
239
|
-
)
|
|
240
|
-
with ZipFile(storage_path) as zf:
|
|
241
|
-
name = zf.namelist()[0]
|
|
242
|
-
with zf.open(name) as f:
|
|
243
|
-
rv = json.load(f)
|
|
244
|
-
context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
|
|
245
|
-
yield AssetMaterialization(
|
|
246
|
-
asset_key=AssetKey(["gold_translation", "database.json.zip"]),
|
|
247
|
-
description="output of metadata-translation run_etl",
|
|
248
|
-
metadata={
|
|
249
|
-
"path": MetadataValue.path(storage_path),
|
|
250
|
-
},
|
|
251
|
-
)
|
|
252
|
-
yield Output({"storage_path": storage_path})
|
|
253
|
-
|
|
254
|
-
|
|
255
148
|
@op(required_resource_keys={"mongo"})
|
|
256
149
|
def get_operation(context):
|
|
257
150
|
mdb = context.resources.mongo.db
|
|
@@ -476,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
|
|
|
476
369
|
|
|
477
370
|
@op(required_resource_keys={"runtime_api_site_client"})
|
|
478
371
|
def get_json_in(context):
|
|
372
|
+
"""
|
|
373
|
+
TODO: Document this function.
|
|
374
|
+
"""
|
|
479
375
|
object_id = context.op_config.get("object_id")
|
|
480
376
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
481
377
|
rv = client.get_object_bytes(object_id)
|
|
@@ -486,63 +382,17 @@ def get_json_in(context):
|
|
|
486
382
|
return rv.json()
|
|
487
383
|
|
|
488
384
|
|
|
489
|
-
def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
|
|
490
|
-
"""Does not ensure ordering of `docs`."""
|
|
491
|
-
|
|
492
|
-
if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
|
|
493
|
-
return docs, 0
|
|
494
|
-
|
|
495
|
-
do_docs = docs["data_object_set"]
|
|
496
|
-
|
|
497
|
-
class FileTypeEnumBase(BaseModel):
|
|
498
|
-
name: str
|
|
499
|
-
description: str
|
|
500
|
-
filter: str # JSON-encoded data_object_set mongo collection filter document
|
|
501
|
-
|
|
502
|
-
class FileTypeEnum(FileTypeEnumBase):
|
|
503
|
-
id: str
|
|
504
|
-
|
|
505
|
-
temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
|
|
506
|
-
temp_collection = mdb[temp_collection_name]
|
|
507
|
-
temp_collection.insert_many(do_docs)
|
|
508
|
-
temp_collection.create_index("id")
|
|
509
|
-
|
|
510
|
-
def fte_matches(fte_filter: str):
|
|
511
|
-
return [
|
|
512
|
-
dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
|
|
513
|
-
]
|
|
514
|
-
|
|
515
|
-
do_docs_map = {d["id"]: d for d in do_docs}
|
|
516
|
-
|
|
517
|
-
n_docs_with_types_added = 0
|
|
518
|
-
|
|
519
|
-
for fte_doc in mdb.file_type_enum.find():
|
|
520
|
-
fte = FileTypeEnum(**fte_doc)
|
|
521
|
-
docs_matching = fte_matches(fte.filter)
|
|
522
|
-
for doc in docs_matching:
|
|
523
|
-
if "data_object_type" not in doc:
|
|
524
|
-
do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
|
|
525
|
-
n_docs_with_types_added += 1
|
|
526
|
-
|
|
527
|
-
mdb.drop_collection(temp_collection_name)
|
|
528
|
-
return (
|
|
529
|
-
assoc(
|
|
530
|
-
docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
|
|
531
|
-
),
|
|
532
|
-
n_docs_with_types_added,
|
|
533
|
-
)
|
|
534
|
-
|
|
535
|
-
|
|
536
385
|
@op(required_resource_keys={"runtime_api_site_client", "mongo"})
|
|
537
386
|
def perform_mongo_updates(context, json_in):
|
|
387
|
+
"""
|
|
388
|
+
TODO: Document this function.
|
|
389
|
+
"""
|
|
538
390
|
mongo = context.resources.mongo
|
|
539
391
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
540
392
|
op_id = context.op_config.get("operation_id")
|
|
541
393
|
|
|
542
394
|
docs = json_in
|
|
543
395
|
docs, _ = specialize_activity_set_docs(docs)
|
|
544
|
-
docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
|
|
545
|
-
context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
|
|
546
396
|
context.log.debug(f"{docs}")
|
|
547
397
|
|
|
548
398
|
rv = validate_json(
|
|
@@ -551,29 +401,54 @@ def perform_mongo_updates(context, json_in):
|
|
|
551
401
|
if rv["result"] == "errors":
|
|
552
402
|
raise Failure(str(rv["detail"]))
|
|
553
403
|
|
|
554
|
-
|
|
555
|
-
|
|
404
|
+
# TODO containing op `perform_mongo_updates` needs test coverage, as below line had trivial bug.
|
|
405
|
+
# ref: https://github.com/microbiomedata/nmdc-runtime/issues/631
|
|
406
|
+
add_docs_result = _add_schema_docs_with_or_without_replacement(mongo, docs)
|
|
407
|
+
op_patch = UpdateOperationRequest(
|
|
408
|
+
done=True,
|
|
409
|
+
result=add_docs_result,
|
|
410
|
+
metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
|
|
411
|
+
)
|
|
412
|
+
op_doc = client.update_operation(op_id, op_patch).json()
|
|
413
|
+
return ["/operations/" + op_doc["id"]]
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _add_schema_docs_with_or_without_replacement(
|
|
417
|
+
mongo: MongoDBResource, docs: Dict[str, list]
|
|
418
|
+
):
|
|
419
|
+
"""
|
|
420
|
+
TODO: Document this function.
|
|
421
|
+
"""
|
|
422
|
+
coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
|
|
423
|
+
if all(coll_index_on_id_map[coll] for coll in docs.keys()):
|
|
556
424
|
replace = True
|
|
557
|
-
elif all(not
|
|
425
|
+
elif all(not coll_index_on_id_map[coll] for coll in docs.keys()):
|
|
426
|
+
# FIXME: XXX: This is a hack because e.g. <https://w3id.org/nmdc/FunctionalAnnotationAggMember>
|
|
427
|
+
# documents should be unique with compound key (metagenome_annotation_id, gene_function_id)
|
|
428
|
+
# and yet this is not explicit in the schema. One potential solution is to auto-generate an `id`
|
|
429
|
+
# as a deterministic hash of the compound key.
|
|
430
|
+
#
|
|
431
|
+
# For now, decision is to potentially re-insert "duplicate" documents, i.e. to interpret
|
|
432
|
+
# lack of `id` as lack of unique document identity for de-duplication.
|
|
558
433
|
replace = False # wasting time trying to upsert by `id`.
|
|
559
434
|
else:
|
|
560
435
|
colls_not_id_indexed = [
|
|
561
|
-
coll for coll in docs.keys() if not
|
|
436
|
+
coll for coll in docs.keys() if not coll_index_on_id_map[coll]
|
|
562
437
|
]
|
|
563
|
-
colls_id_indexed = [coll for coll in docs.keys() if
|
|
438
|
+
colls_id_indexed = [coll for coll in docs.keys() if coll_index_on_id_map[coll]]
|
|
564
439
|
raise Failure(
|
|
565
440
|
"Simultaneous addition of non-`id`ed collections and `id`-ed collections"
|
|
566
441
|
" is not supported at this time."
|
|
567
442
|
f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
|
|
568
443
|
)
|
|
569
444
|
op_result = mongo.add_docs(docs, validate=False, replace=replace)
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
445
|
+
|
|
446
|
+
# Translate the operation result into a dictionary in which each item's key is a collection name
|
|
447
|
+
# and each item's value is the corresponding bulk API result (excluding the "upserted" field).
|
|
448
|
+
return {
|
|
449
|
+
collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
|
|
450
|
+
for collection_name, bulk_write_result in op_result.items()
|
|
451
|
+
}
|
|
577
452
|
|
|
578
453
|
|
|
579
454
|
@op(required_resource_keys={"mongo"})
|
|
@@ -589,9 +464,32 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
|
|
|
589
464
|
context.log.info(f"No NMDC RunEvent doc for Dagster Run {context.run_id}")
|
|
590
465
|
|
|
591
466
|
|
|
592
|
-
@op(
|
|
593
|
-
|
|
594
|
-
|
|
467
|
+
@op(
|
|
468
|
+
config_schema={
|
|
469
|
+
"study_id": str,
|
|
470
|
+
"study_type": str,
|
|
471
|
+
"gold_nmdc_instrument_mapping_file_url": str,
|
|
472
|
+
"include_field_site_info": bool,
|
|
473
|
+
"enable_biosample_filtering": bool,
|
|
474
|
+
},
|
|
475
|
+
out={
|
|
476
|
+
"study_id": Out(str),
|
|
477
|
+
"study_type": Out(str),
|
|
478
|
+
"gold_nmdc_instrument_mapping_file_url": Out(str),
|
|
479
|
+
"include_field_site_info": Out(bool),
|
|
480
|
+
"enable_biosample_filtering": Out(bool),
|
|
481
|
+
},
|
|
482
|
+
)
|
|
483
|
+
def get_gold_study_pipeline_inputs(
|
|
484
|
+
context: OpExecutionContext,
|
|
485
|
+
) -> Tuple[str, str, str, bool, bool]:
|
|
486
|
+
return (
|
|
487
|
+
context.op_config["study_id"],
|
|
488
|
+
context.op_config["study_type"],
|
|
489
|
+
context.op_config["gold_nmdc_instrument_mapping_file_url"],
|
|
490
|
+
context.op_config["include_field_site_info"],
|
|
491
|
+
context.op_config["enable_biosample_filtering"],
|
|
492
|
+
)
|
|
595
493
|
|
|
596
494
|
|
|
597
495
|
@op(required_resource_keys={"gold_api_client"})
|
|
@@ -628,9 +526,13 @@ def gold_study(context: OpExecutionContext, study_id: str) -> Dict[str, Any]:
|
|
|
628
526
|
def nmdc_schema_database_from_gold_study(
|
|
629
527
|
context: OpExecutionContext,
|
|
630
528
|
study: Dict[str, Any],
|
|
529
|
+
study_type: str,
|
|
631
530
|
projects: List[Dict[str, Any]],
|
|
632
531
|
biosamples: List[Dict[str, Any]],
|
|
633
532
|
analysis_projects: List[Dict[str, Any]],
|
|
533
|
+
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
534
|
+
include_field_site_info: bool,
|
|
535
|
+
enable_biosample_filtering: bool,
|
|
634
536
|
) -> nmdc.Database:
|
|
635
537
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
636
538
|
|
|
@@ -639,34 +541,54 @@ def nmdc_schema_database_from_gold_study(
|
|
|
639
541
|
return response.json()
|
|
640
542
|
|
|
641
543
|
translator = GoldStudyTranslator(
|
|
642
|
-
study,
|
|
544
|
+
study,
|
|
545
|
+
study_type,
|
|
546
|
+
biosamples,
|
|
547
|
+
projects,
|
|
548
|
+
analysis_projects,
|
|
549
|
+
gold_nmdc_instrument_map_df,
|
|
550
|
+
include_field_site_info,
|
|
551
|
+
enable_biosample_filtering,
|
|
552
|
+
id_minter=id_minter,
|
|
643
553
|
)
|
|
644
554
|
database = translator.get_database()
|
|
645
555
|
return database
|
|
646
556
|
|
|
647
557
|
|
|
648
558
|
@op(
|
|
559
|
+
required_resource_keys={"mongo"},
|
|
649
560
|
out={
|
|
650
561
|
"submission_id": Out(),
|
|
651
|
-
"
|
|
562
|
+
"nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
|
|
652
563
|
"data_object_mapping_file_url": Out(Optional[str]),
|
|
653
564
|
"biosample_extras_file_url": Out(Optional[str]),
|
|
654
565
|
"biosample_extras_slot_mapping_file_url": Out(Optional[str]),
|
|
566
|
+
"study_id": Out(Optional[str]),
|
|
655
567
|
},
|
|
656
568
|
)
|
|
657
569
|
def get_submission_portal_pipeline_inputs(
|
|
570
|
+
context: OpExecutionContext,
|
|
658
571
|
submission_id: str,
|
|
659
|
-
|
|
572
|
+
nucleotide_sequencing_mapping_file_url: Optional[str],
|
|
660
573
|
data_object_mapping_file_url: Optional[str],
|
|
661
574
|
biosample_extras_file_url: Optional[str],
|
|
662
575
|
biosample_extras_slot_mapping_file_url: Optional[str],
|
|
663
|
-
|
|
576
|
+
study_id: Optional[str],
|
|
577
|
+
) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
|
|
578
|
+
# query for studies matching the ID to see if it eists
|
|
579
|
+
if study_id:
|
|
580
|
+
mdb = context.resources.mongo.db
|
|
581
|
+
result = mdb.study_set.find_one({"id": study_id})
|
|
582
|
+
if not result:
|
|
583
|
+
raise Exception(f"Study id: {study_id} does not exist in Mongo.")
|
|
584
|
+
|
|
664
585
|
return (
|
|
665
586
|
submission_id,
|
|
666
|
-
|
|
587
|
+
nucleotide_sequencing_mapping_file_url,
|
|
667
588
|
data_object_mapping_file_url,
|
|
668
589
|
biosample_extras_file_url,
|
|
669
590
|
biosample_extras_slot_mapping_file_url,
|
|
591
|
+
study_id,
|
|
670
592
|
)
|
|
671
593
|
|
|
672
594
|
|
|
@@ -684,15 +606,14 @@ def fetch_nmdc_portal_submission_by_id(
|
|
|
684
606
|
def translate_portal_submission_to_nmdc_schema_database(
|
|
685
607
|
context: OpExecutionContext,
|
|
686
608
|
metadata_submission: Dict[str, Any],
|
|
687
|
-
|
|
609
|
+
nucleotide_sequencing_mapping: List,
|
|
688
610
|
data_object_mapping: List,
|
|
611
|
+
instrument_mapping: Dict[str, str],
|
|
689
612
|
study_category: Optional[str],
|
|
690
|
-
study_doi_category: Optional[str],
|
|
691
|
-
study_doi_provider: Optional[str],
|
|
692
|
-
study_funding_sources: Optional[List[str]],
|
|
693
613
|
study_pi_image_url: Optional[str],
|
|
694
614
|
biosample_extras: Optional[list[dict]],
|
|
695
615
|
biosample_extras_slot_mapping: Optional[list[dict]],
|
|
616
|
+
study_id: Optional[str],
|
|
696
617
|
) -> nmdc.Database:
|
|
697
618
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
698
619
|
|
|
@@ -702,21 +623,45 @@ def translate_portal_submission_to_nmdc_schema_database(
|
|
|
702
623
|
|
|
703
624
|
translator = SubmissionPortalTranslator(
|
|
704
625
|
metadata_submission,
|
|
705
|
-
|
|
706
|
-
data_object_mapping,
|
|
626
|
+
nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
|
|
627
|
+
data_object_mapping=data_object_mapping,
|
|
707
628
|
id_minter=id_minter,
|
|
708
629
|
study_category=study_category,
|
|
709
|
-
study_doi_category=study_doi_category,
|
|
710
|
-
study_doi_provider=study_doi_provider,
|
|
711
|
-
study_funding_sources=study_funding_sources,
|
|
712
630
|
study_pi_image_url=study_pi_image_url,
|
|
713
631
|
biosample_extras=biosample_extras,
|
|
714
632
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
633
|
+
illumina_instrument_mapping=instrument_mapping,
|
|
634
|
+
study_id=study_id,
|
|
715
635
|
)
|
|
716
636
|
database = translator.get_database()
|
|
717
637
|
return database
|
|
718
638
|
|
|
719
639
|
|
|
640
|
+
@op(required_resource_keys={"nmdc_portal_api_client"})
|
|
641
|
+
def add_public_image_urls(
|
|
642
|
+
context: OpExecutionContext, database: nmdc.Database, submission_id: str
|
|
643
|
+
) -> nmdc.Database:
|
|
644
|
+
client: NmdcPortalApiClient = context.resources.nmdc_portal_api_client
|
|
645
|
+
|
|
646
|
+
if len(database.study_set) != 1:
|
|
647
|
+
raise Failure(
|
|
648
|
+
description="Expected exactly one study in the database to add public image URLs."
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
study_id = database.study_set[0].id
|
|
652
|
+
public_images = client.make_submission_images_public(
|
|
653
|
+
submission_id, study_id=study_id
|
|
654
|
+
)
|
|
655
|
+
SubmissionPortalTranslator.set_study_images(
|
|
656
|
+
database.study_set[0],
|
|
657
|
+
public_images.get("pi_image_url"),
|
|
658
|
+
public_images.get("primary_study_image_url"),
|
|
659
|
+
public_images.get("study_image_urls"),
|
|
660
|
+
)
|
|
661
|
+
|
|
662
|
+
return database
|
|
663
|
+
|
|
664
|
+
|
|
720
665
|
@op
|
|
721
666
|
def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
|
|
722
667
|
source_id = None
|
|
@@ -729,7 +674,7 @@ def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
|
|
|
729
674
|
|
|
730
675
|
@op
|
|
731
676
|
def nmdc_schema_object_to_dict(object: YAMLRoot) -> Dict[str, Any]:
|
|
732
|
-
return
|
|
677
|
+
return as_simple_dict(object)
|
|
733
678
|
|
|
734
679
|
|
|
735
680
|
@op(required_resource_keys={"mongo"}, config_schema={"username": str})
|
|
@@ -765,6 +710,33 @@ def export_json_to_drs(
|
|
|
765
710
|
return ["/objects/" + drs_object["id"]]
|
|
766
711
|
|
|
767
712
|
|
|
713
|
+
@op(
|
|
714
|
+
description="NCBI Submission XML file rendered in a Dagster Asset",
|
|
715
|
+
out=Out(description="XML content rendered through Dagit UI"),
|
|
716
|
+
)
|
|
717
|
+
def ncbi_submission_xml_asset(context: OpExecutionContext, data: str):
|
|
718
|
+
filename = "ncbi_submission.xml"
|
|
719
|
+
file_path = os.path.join(context.instance.storage_directory(), filename)
|
|
720
|
+
|
|
721
|
+
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
|
722
|
+
|
|
723
|
+
with open(file_path, "w") as f:
|
|
724
|
+
f.write(data)
|
|
725
|
+
|
|
726
|
+
context.log_event(
|
|
727
|
+
AssetMaterialization(
|
|
728
|
+
asset_key="ncbi_submission_xml",
|
|
729
|
+
description="NCBI Submission XML Data",
|
|
730
|
+
metadata={
|
|
731
|
+
"file_path": MetadataValue.path(file_path),
|
|
732
|
+
"xml": MetadataValue.text(data),
|
|
733
|
+
},
|
|
734
|
+
)
|
|
735
|
+
)
|
|
736
|
+
|
|
737
|
+
return Output(data)
|
|
738
|
+
|
|
739
|
+
|
|
768
740
|
def unique_field_values(docs: List[Dict[str, Any]], field: str):
|
|
769
741
|
return {doc[field] for doc in docs if field in doc}
|
|
770
742
|
|
|
@@ -784,6 +756,11 @@ def get_neon_pipeline_benthic_data_product(context: OpExecutionContext) -> dict:
|
|
|
784
756
|
return context.op_config["benthic_data_product"]
|
|
785
757
|
|
|
786
758
|
|
|
759
|
+
@op(config_schema={"surface_water_data_product": dict})
|
|
760
|
+
def get_neon_pipeline_surface_water_data_product(context: OpExecutionContext) -> dict:
|
|
761
|
+
return context.op_config["surface_water_data_product"]
|
|
762
|
+
|
|
763
|
+
|
|
787
764
|
@op(required_resource_keys={"neon_api_client"})
|
|
788
765
|
def neon_data_by_product(
|
|
789
766
|
context: OpExecutionContext, data_product: dict
|
|
@@ -817,6 +794,7 @@ def nmdc_schema_database_from_neon_soil_data(
|
|
|
817
794
|
sls_data: Dict[str, pd.DataFrame],
|
|
818
795
|
neon_envo_mappings_file: pd.DataFrame,
|
|
819
796
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
797
|
+
neon_nmdc_instrument_mapping_file: pd.DataFrame,
|
|
820
798
|
) -> nmdc.Database:
|
|
821
799
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
822
800
|
|
|
@@ -829,6 +807,7 @@ def nmdc_schema_database_from_neon_soil_data(
|
|
|
829
807
|
sls_data,
|
|
830
808
|
neon_envo_mappings_file,
|
|
831
809
|
neon_raw_data_file_mappings_file,
|
|
810
|
+
neon_nmdc_instrument_mapping_file,
|
|
832
811
|
id_minter=id_minter,
|
|
833
812
|
)
|
|
834
813
|
|
|
@@ -843,6 +822,7 @@ def nmdc_schema_database_from_neon_benthic_data(
|
|
|
843
822
|
site_code_mapping: Dict[str, str],
|
|
844
823
|
neon_envo_mappings_file: pd.DataFrame,
|
|
845
824
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
825
|
+
neon_nmdc_instrument_mapping_file: pd.DataFrame,
|
|
846
826
|
) -> nmdc.Database:
|
|
847
827
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
848
828
|
|
|
@@ -855,6 +835,35 @@ def nmdc_schema_database_from_neon_benthic_data(
|
|
|
855
835
|
site_code_mapping,
|
|
856
836
|
neon_envo_mappings_file,
|
|
857
837
|
neon_raw_data_file_mappings_file,
|
|
838
|
+
neon_nmdc_instrument_mapping_file,
|
|
839
|
+
id_minter=id_minter,
|
|
840
|
+
)
|
|
841
|
+
|
|
842
|
+
database = translator.get_database()
|
|
843
|
+
return database
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
@op(required_resource_keys={"runtime_api_site_client"})
|
|
847
|
+
def nmdc_schema_database_from_neon_surface_water_data(
|
|
848
|
+
context: OpExecutionContext,
|
|
849
|
+
surface_water_data: Dict[str, pd.DataFrame],
|
|
850
|
+
site_code_mapping: Dict[str, str],
|
|
851
|
+
neon_envo_mappings_file: pd.DataFrame,
|
|
852
|
+
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
853
|
+
neon_nmdc_instrument_mapping_file: pd.DataFrame,
|
|
854
|
+
) -> nmdc.Database:
|
|
855
|
+
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
856
|
+
|
|
857
|
+
def id_minter(*args, **kwargs):
|
|
858
|
+
response = client.mint_id(*args, **kwargs)
|
|
859
|
+
return response.json()
|
|
860
|
+
|
|
861
|
+
translator = NeonSurfaceWaterDataTranslator(
|
|
862
|
+
surface_water_data,
|
|
863
|
+
site_code_mapping,
|
|
864
|
+
neon_envo_mappings_file,
|
|
865
|
+
neon_raw_data_file_mappings_file,
|
|
866
|
+
neon_nmdc_instrument_mapping_file,
|
|
858
867
|
id_minter=id_minter,
|
|
859
868
|
)
|
|
860
869
|
|
|
@@ -866,15 +875,18 @@ def nmdc_schema_database_from_neon_benthic_data(
|
|
|
866
875
|
out={
|
|
867
876
|
"neon_envo_mappings_file_url": Out(),
|
|
868
877
|
"neon_raw_data_file_mappings_file_url": Out(),
|
|
878
|
+
"neon_nmdc_instrument_mapping_file_url": Out(),
|
|
869
879
|
}
|
|
870
880
|
)
|
|
871
881
|
def get_neon_pipeline_inputs(
|
|
872
882
|
neon_envo_mappings_file_url: str,
|
|
873
883
|
neon_raw_data_file_mappings_file_url: str,
|
|
874
|
-
|
|
884
|
+
neon_nmdc_instrument_mapping_file_url: str,
|
|
885
|
+
) -> Tuple[str, str, str]:
|
|
875
886
|
return (
|
|
876
887
|
neon_envo_mappings_file_url,
|
|
877
888
|
neon_raw_data_file_mappings_file_url,
|
|
889
|
+
neon_nmdc_instrument_mapping_file_url,
|
|
878
890
|
)
|
|
879
891
|
|
|
880
892
|
|
|
@@ -943,3 +955,769 @@ def site_code_mapping() -> dict:
|
|
|
943
955
|
raise Exception(
|
|
944
956
|
f"Failed to fetch site data from {endpoint}. Status code: {response.status_code}, Content: {response.content}"
|
|
945
957
|
)
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
@op(
|
|
961
|
+
required_resource_keys={"mongo"},
|
|
962
|
+
config_schema={
|
|
963
|
+
"source_ontology": str,
|
|
964
|
+
"output_directory": Field(Noneable(str), default_value=None, is_required=False),
|
|
965
|
+
"generate_reports": Field(bool, default_value=True, is_required=False),
|
|
966
|
+
},
|
|
967
|
+
)
|
|
968
|
+
def load_ontology(context: OpExecutionContext):
|
|
969
|
+
cfg = context.op_config
|
|
970
|
+
source_ontology = cfg["source_ontology"]
|
|
971
|
+
output_directory = cfg.get("output_directory")
|
|
972
|
+
generate_reports = cfg.get("generate_reports", True)
|
|
973
|
+
|
|
974
|
+
if output_directory is None:
|
|
975
|
+
output_directory = os.path.join(os.getcwd(), "ontology_reports")
|
|
976
|
+
|
|
977
|
+
# Redirect Python logging to Dagster context
|
|
978
|
+
handler = logging.Handler()
|
|
979
|
+
handler.emit = lambda record: context.log.info(record.getMessage())
|
|
980
|
+
|
|
981
|
+
# Get logger from ontology-loader package
|
|
982
|
+
controller_logger = logging.getLogger("ontology_loader.ontology_load_controller")
|
|
983
|
+
controller_logger.setLevel(logging.INFO)
|
|
984
|
+
controller_logger.addHandler(handler)
|
|
985
|
+
|
|
986
|
+
context.log.info(f"Running Ontology Loader for ontology: {source_ontology}")
|
|
987
|
+
loader = OntologyLoaderController(
|
|
988
|
+
source_ontology=source_ontology,
|
|
989
|
+
output_directory=output_directory,
|
|
990
|
+
generate_reports=generate_reports,
|
|
991
|
+
mongo_client=context.resources.mongo.client,
|
|
992
|
+
db_name=context.resources.mongo.db.name,
|
|
993
|
+
)
|
|
994
|
+
|
|
995
|
+
loader.run_ontology_loader()
|
|
996
|
+
context.log.info(f"Ontology load for {source_ontology} completed successfully!")
|
|
997
|
+
|
|
998
|
+
|
|
999
|
+
def _add_linked_instances_to_alldocs(
|
|
1000
|
+
temp_collection: MongoCollection,
|
|
1001
|
+
context: OpExecutionContext,
|
|
1002
|
+
document_reference_ranged_slots_by_type: dict,
|
|
1003
|
+
) -> None:
|
|
1004
|
+
"""
|
|
1005
|
+
Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
|
|
1006
|
+
|
|
1007
|
+
The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
|
|
1008
|
+
Each subdocument represents a link to another document that either links to or is linked from the document via
|
|
1009
|
+
document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
|
|
1010
|
+
document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
|
|
1011
|
+
considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
|
|
1012
|
+
|
|
1013
|
+
Args:
|
|
1014
|
+
temp_collection: The temporary MongoDB collection to process
|
|
1015
|
+
context: The Dagster execution context for logging
|
|
1016
|
+
document_reference_ranged_slots_by_type: Dictionary mapping document types to their reference-ranged slot names
|
|
1017
|
+
|
|
1018
|
+
Returns:
|
|
1019
|
+
None (modifies the documents in place)
|
|
1020
|
+
"""
|
|
1021
|
+
|
|
1022
|
+
context.log.info(
|
|
1023
|
+
"Building relationships and adding `_upstream` and `_downstream` fields..."
|
|
1024
|
+
)
|
|
1025
|
+
|
|
1026
|
+
# document ID -> type (with "nmdc:" prefix preserved)
|
|
1027
|
+
id_to_type_map: Dict[str, str] = {}
|
|
1028
|
+
|
|
1029
|
+
# set of (<referencing document ID>, <slot>, <referenced document ID>) 3-tuples.
|
|
1030
|
+
relationship_triples: Set[Tuple[str, str, str]] = set()
|
|
1031
|
+
|
|
1032
|
+
# Collect relationship triples.
|
|
1033
|
+
for doc in temp_collection.find():
|
|
1034
|
+
doc_id = doc["id"]
|
|
1035
|
+
# Store the full type with prefix intact
|
|
1036
|
+
doc_type = doc["type"]
|
|
1037
|
+
# For looking up reference slots, we still need the type without prefix
|
|
1038
|
+
doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
|
|
1039
|
+
|
|
1040
|
+
# Record ID to type mapping - preserve the original type with prefix
|
|
1041
|
+
id_to_type_map[doc_id] = doc_type
|
|
1042
|
+
|
|
1043
|
+
# Find all document references from this document
|
|
1044
|
+
reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
|
|
1045
|
+
for slot in reference_slots:
|
|
1046
|
+
if slot in doc:
|
|
1047
|
+
# Handle both single-value and array references
|
|
1048
|
+
refs = doc[slot] if isinstance(doc[slot], list) else [doc[slot]]
|
|
1049
|
+
for ref_doc in temp_collection.find(
|
|
1050
|
+
{"id": {"$in": refs}}, ["id", "type"]
|
|
1051
|
+
):
|
|
1052
|
+
id_to_type_map[ref_doc["id"]] = ref_doc["type"]
|
|
1053
|
+
for ref_id in refs:
|
|
1054
|
+
relationship_triples.add((doc_id, slot, ref_id))
|
|
1055
|
+
|
|
1056
|
+
context.log.info(
|
|
1057
|
+
f"Found {len(id_to_type_map)} documents, with "
|
|
1058
|
+
f"{len({d for (d, _, _) in relationship_triples})} containing references"
|
|
1059
|
+
)
|
|
1060
|
+
|
|
1061
|
+
# The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
|
|
1062
|
+
# in order to perform graph traversal and collect all entities "related" to a given entity without
|
|
1063
|
+
# recursion "exploding".
|
|
1064
|
+
#
|
|
1065
|
+
# Note: We are hard-coding this "direction" information here in the Runtime
|
|
1066
|
+
# because the NMDC schema does not currently contain or expose it.
|
|
1067
|
+
#
|
|
1068
|
+
# An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
|
|
1069
|
+
upstream_document_reference_ranged_slots = [
|
|
1070
|
+
"associated_studies", # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
|
|
1071
|
+
"collected_from", # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
|
|
1072
|
+
"has_chromatography_configuration", # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
|
|
1073
|
+
"has_input", # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
|
|
1074
|
+
"has_mass_spectrometry_configuration", # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
|
|
1075
|
+
"instrument_used", # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
|
|
1076
|
+
"part_of", # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
|
|
1077
|
+
"was_generated_by", # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
|
|
1078
|
+
"was_informed_by", # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
|
|
1079
|
+
]
|
|
1080
|
+
# A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
|
|
1081
|
+
downstream_document_reference_ranged_slots = [
|
|
1082
|
+
"calibration_object", # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
|
|
1083
|
+
"generates_calibration", # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
|
|
1084
|
+
"has_output", # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
|
|
1085
|
+
"in_manifest", # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
|
|
1086
|
+
"uses_calibration", # when a `nmdc:CalibrationInformation`is part of a `nmdc:PlannedProcess`.
|
|
1087
|
+
# Note: I don't think of superseding something as being either upstream or downstream of that thing;
|
|
1088
|
+
# but this function requires every document-reference-ranged slot to be accounted for in one
|
|
1089
|
+
# list or the other, and the superseding thing does arise _later_ than the thing it supersedes,
|
|
1090
|
+
# so I have opted to treat the superseding thing as being downstream.
|
|
1091
|
+
"superseded_by", # when a `nmdc:WorkflowExecution` or `nmdc:DataObject` is superseded by a `nmdc:WorkflowExecution`.
|
|
1092
|
+
]
|
|
1093
|
+
|
|
1094
|
+
unique_document_reference_ranged_slot_names = set()
|
|
1095
|
+
for slot_names in document_reference_ranged_slots_by_type.values():
|
|
1096
|
+
for slot_name in slot_names:
|
|
1097
|
+
unique_document_reference_ranged_slot_names.add(slot_name)
|
|
1098
|
+
context.log.info(f"{unique_document_reference_ranged_slot_names=}")
|
|
1099
|
+
if len(upstream_document_reference_ranged_slots) + len(
|
|
1100
|
+
downstream_document_reference_ranged_slots
|
|
1101
|
+
) != len(unique_document_reference_ranged_slot_names):
|
|
1102
|
+
raise Failure(
|
|
1103
|
+
"Number of detected unique document-reference-ranged slot names does not match "
|
|
1104
|
+
"sum of accounted-for upstream and downstream document-reference-ranged slot names."
|
|
1105
|
+
)
|
|
1106
|
+
|
|
1107
|
+
# Construct, and update documents with, `_upstream` and `_downstream` field values.
|
|
1108
|
+
#
|
|
1109
|
+
# manage batching of MongoDB `bulk_write` operations
|
|
1110
|
+
bulk_operations, update_count = [], 0
|
|
1111
|
+
for doc_id, slot, ref_id in relationship_triples:
|
|
1112
|
+
|
|
1113
|
+
# Determine in which respective fields to push this relationship
|
|
1114
|
+
# for the subject (doc) and object (ref) of this triple.
|
|
1115
|
+
if slot in upstream_document_reference_ranged_slots:
|
|
1116
|
+
field_for_doc, field_for_ref = "_upstream", "_downstream"
|
|
1117
|
+
elif slot in downstream_document_reference_ranged_slots:
|
|
1118
|
+
field_for_doc, field_for_ref = "_downstream", "_upstream"
|
|
1119
|
+
else:
|
|
1120
|
+
raise Failure(f"Unknown slot {slot} for document {doc_id}")
|
|
1121
|
+
|
|
1122
|
+
updates = [
|
|
1123
|
+
{
|
|
1124
|
+
"filter": {"id": doc_id},
|
|
1125
|
+
"update": {
|
|
1126
|
+
"$push": {
|
|
1127
|
+
field_for_doc: {
|
|
1128
|
+
"id": ref_id,
|
|
1129
|
+
# TODO existing tests are failing due to `KeyError`s for `id_to_type_map.get[ref_id]` here,
|
|
1130
|
+
# which acts as an implicit referential integrity checker (!). Using `.get` with
|
|
1131
|
+
# "nmdc:NamedThing" as default in order to (for now) allow such tests to continue to pass.
|
|
1132
|
+
"type": id_to_type_map.get(ref_id, "nmdc:NamedThing"),
|
|
1133
|
+
}
|
|
1134
|
+
}
|
|
1135
|
+
},
|
|
1136
|
+
},
|
|
1137
|
+
{
|
|
1138
|
+
"filter": {"id": ref_id},
|
|
1139
|
+
"update": {
|
|
1140
|
+
"$push": {
|
|
1141
|
+
field_for_ref: {"id": doc_id, "type": id_to_type_map[doc_id]}
|
|
1142
|
+
}
|
|
1143
|
+
},
|
|
1144
|
+
},
|
|
1145
|
+
]
|
|
1146
|
+
for update in updates:
|
|
1147
|
+
bulk_operations.append(UpdateOne(**update))
|
|
1148
|
+
|
|
1149
|
+
# Execute in batches for efficiency
|
|
1150
|
+
if len(bulk_operations) >= BULK_WRITE_BATCH_SIZE:
|
|
1151
|
+
temp_collection.bulk_write(bulk_operations)
|
|
1152
|
+
update_count += len(bulk_operations)
|
|
1153
|
+
context.log.info(
|
|
1154
|
+
f"Pushed {update_count/(2*len(relationship_triples)):.1%} of updates so far..."
|
|
1155
|
+
)
|
|
1156
|
+
bulk_operations = []
|
|
1157
|
+
|
|
1158
|
+
# Execute any remaining operations
|
|
1159
|
+
if bulk_operations:
|
|
1160
|
+
temp_collection.bulk_write(bulk_operations)
|
|
1161
|
+
update_count += len(bulk_operations)
|
|
1162
|
+
|
|
1163
|
+
context.log.info(f"Pushed {update_count} updates in total")
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
# Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
|
|
1167
|
+
# pass an argument to the op (in order to specify the order of the ops in the graph)
|
|
1168
|
+
# while also telling Dagster that this op doesn't need the _value_ of that argument.
|
|
1169
|
+
# This is the approach shown on: https://docs.dagster.io/api/dagster/types#dagster.Nothing
|
|
1170
|
+
# Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
|
|
1171
|
+
#
|
|
1172
|
+
@op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
|
|
1173
|
+
def materialize_alldocs(context: OpExecutionContext) -> int:
|
|
1174
|
+
"""
|
|
1175
|
+
This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
|
|
1176
|
+
|
|
1177
|
+
1. Getting all populated schema collection names with an `id` field.
|
|
1178
|
+
2. Create a temporary collection to build the new alldocs collection.
|
|
1179
|
+
3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
|
|
1180
|
+
4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
|
|
1181
|
+
5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
|
|
1182
|
+
6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
|
|
1183
|
+
7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
|
|
1184
|
+
|
|
1185
|
+
The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
|
|
1186
|
+
`nmdc_runtime.site.repository.ensure_alldocs_daily`. The collection is also updated as part of various workflows,
|
|
1187
|
+
such as when applying a changesheet or metadata updates (see `nmdc_runtime.site.graphs`).
|
|
1188
|
+
|
|
1189
|
+
The `alldocs` collection is used primarily by API endpoints like `/data_objects/study/{study_id}` and
|
|
1190
|
+
`/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
|
|
1191
|
+
related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
|
|
1192
|
+
|
|
1193
|
+
The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
|
|
1194
|
+
that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
|
|
1195
|
+
expansions.
|
|
1196
|
+
"""
|
|
1197
|
+
mdb = context.resources.mongo.db
|
|
1198
|
+
schema_view = nmdc_schema_view()
|
|
1199
|
+
|
|
1200
|
+
# TODO include functional_annotation_agg for "real-time" ref integrity checking.
|
|
1201
|
+
# For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
|
|
1202
|
+
collection_names = populated_schema_collection_names_with_id_field(mdb)
|
|
1203
|
+
context.log.info(f"constructing `alldocs` collection using {collection_names=}")
|
|
1204
|
+
|
|
1205
|
+
document_class_names = set(
|
|
1206
|
+
chain.from_iterable(collection_name_to_class_names.values())
|
|
1207
|
+
)
|
|
1208
|
+
|
|
1209
|
+
cls_slot_map = {
|
|
1210
|
+
cls_name: {
|
|
1211
|
+
slot.name: slot for slot in schema_view.class_induced_slots(cls_name)
|
|
1212
|
+
}
|
|
1213
|
+
for cls_name in document_class_names
|
|
1214
|
+
}
|
|
1215
|
+
|
|
1216
|
+
# Any ancestor of a document class is a document-referencable range,
|
|
1217
|
+
+    # i.e., a valid range of a document-reference-ranged slot.
+    document_referenceable_ranges = set(
+        chain.from_iterable(
+            schema_view.class_ancestors(cls_name) for cls_name in document_class_names
+        )
+    )
+
+    document_reference_ranged_slots_by_type = defaultdict(list)
+    for cls_name, slot_map in cls_slot_map.items():
+        for slot_name, slot in slot_map.items():
+            if (
+                set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
+                & document_referenceable_ranges
+            ):
+                document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
+                    slot_name
+                )
+
+    # Build `alldocs` to a temporary collection for atomic replacement
+    # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
+    temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
+    temp_alldocs_collection = mdb[temp_alldocs_collection_name]
+    context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
+
+    for coll_name in collection_names:
+        context.log.info(f"{coll_name=}")
+        write_operations = []
+        documents_processed_counter = 0
+        for doc in mdb[coll_name].find():
+            try:
+                # Keep the full type with prefix for document
+                doc_type_full = doc["type"]
+                # Remove prefix for slot lookup and ancestor lookup
+                doc_type = doc_type_full.removeprefix("nmdc:")
+            except KeyError:
+                raise Exception(
+                    f"doc {doc['id']} in collection {coll_name} has no 'type'!"
+                )
+            slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
+                doc_type_full
+            ]
+            new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
+
+            # Get ancestors without the prefix, but add prefix to each one in the output
+            new_doc["_type_and_ancestors"] = [
+                f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
+            ]
+            # InsertOne is a pymongo representation of a mongo command.
+            write_operations.append(InsertOne(new_doc))
+            if len(write_operations) == BULK_WRITE_BATCH_SIZE:
+                _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+                write_operations.clear()
+                documents_processed_counter += BULK_WRITE_BATCH_SIZE
+        if len(write_operations) > 0:
+            # here bulk_write is a method on the pymongo db Collection class
+            _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+            documents_processed_counter += len(write_operations)
+        context.log.info(
+            f"Inserted {documents_processed_counter} documents from {coll_name=} "
+        )
+
+    context.log.info(
+        f"produced `{temp_alldocs_collection.name}` collection with"
+        f" {temp_alldocs_collection.estimated_document_count()} docs."
+    )
+
+    context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
+    # Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
+    # so that `temp_alldocs_collection` will be "good to go" on renaming.
+    temp_alldocs_collection.create_index("id", unique=True)
+    # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
+    slots_to_index = {"_type_and_ancestors"} | {
+        slot
+        for slots in document_reference_ranged_slots_by_type.values()
+        for slot in slots
+    }
+    [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
+    context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
+
+    # Add related-ids fields to enable efficient relationship traversal
+    context.log.info("Adding fields for related ids to documents...")
+    _add_linked_instances_to_alldocs(
+        temp_alldocs_collection, context, document_reference_ranged_slots_by_type
+    )
+    context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+    temp_alldocs_collection.create_index("_upstream.id")
+    temp_alldocs_collection.create_index("_downstream.id")
+    # Create compound indexes to ensure index-covered queries
+    temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+    temp_alldocs_collection.create_index(
+        [("_downstream.type", 1), ("_downstream.id", 1)]
+    )
+    context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
+
+    context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
+    temp_alldocs_collection.rename("alldocs", dropTarget=True)
+    n_alldocs_documents = mdb.alldocs.estimated_document_count()
+    context.log.info(
+        f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
+    )
+    return n_alldocs_documents
+
+
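As an aside to the hunk above: the op builds the cache into a uniquely named temporary collection, indexes it, and only then swaps it into place, so readers never see a half-built `alldocs`. A minimal sketch of that build-then-rename pattern with plain pymongo follows; the connection string, database, source collection, and batch size are assumptions for illustration, not values from this package.

```python
# Sketch only: build into a temp collection, index, then atomically rename over the target.
from bson import ObjectId
from pymongo import InsertOne, MongoClient

BATCH_SIZE = 1000  # stand-in for the package's BULK_WRITE_BATCH_SIZE

client = MongoClient("mongodb://localhost:27017")  # assumed local MongoDB
db = client["nmdc_example"]  # hypothetical database name

# 1. Build into a uniquely named temporary collection in batches.
tmp = db[f"tmp.alldocs.{ObjectId()}"]
batch = []
for doc in db["biosample_set"].find():  # hypothetical source collection
    batch.append(InsertOne({"id": doc["id"], "type": doc.get("type")}))
    if len(batch) == BATCH_SIZE:
        tmp.bulk_write(batch, ordered=False)
        batch.clear()
if batch:
    tmp.bulk_write(batch, ordered=False)

# 2. Create indexes while the collection is not yet visible under its final name.
tmp.create_index("id", unique=True)

# 3. Swap into place; dropTarget discards any existing `alldocs` in the same step.
tmp.rename("alldocs", dropTarget=True)
```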
+@op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
+def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
+    nmdc_study = find_study_by_id(
+        context.op_config["nmdc_study_id"], context.resources.mongo.db
+    )
+    return nmdc_study
+
+
+@op(
+    config_schema={
+        "nmdc_ncbi_attribute_mapping_file_url": str,
+        "ncbi_submission_metadata": Field(
+            Permissive(
+                {
+                    "organization": String,
+                }
+            ),
+            is_required=True,
+            description="General metadata about the NCBI submission.",
+        ),
+        "ncbi_biosample_metadata": Field(
+            Permissive(
+                {
+                    "organism_name": String,
+                }
+            ),
+            is_required=True,
+            description="Metadata for one or many NCBI BioSample in the Submission.",
+        ),
+    },
+    out=Out(Dict),
+)
+def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
+    nmdc_ncbi_attribute_mapping_file_url = context.op_config[
+        "nmdc_ncbi_attribute_mapping_file_url"
+    ]
+    ncbi_submission_metadata = context.op_config.get("ncbi_submission_metadata", {})
+    ncbi_biosample_metadata = context.op_config.get("ncbi_biosample_metadata", {})
+
+    return {
+        "nmdc_ncbi_attribute_mapping_file_url": nmdc_ncbi_attribute_mapping_file_url,
+        "ncbi_submission_metadata": ncbi_submission_metadata,
+        "ncbi_biosample_metadata": ncbi_biosample_metadata,
+    }
+
+
+@op(required_resource_keys={"mongo"})
+def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
+    mdb = context.resources.mongo.db
+    alldocs_collection = mdb["alldocs"]
+    data_object_set = mdb["data_object_set"]
+    biosample_data_objects = fetch_data_objects_from_biosamples(
+        alldocs_collection, data_object_set, biosamples
+    )
+    return biosample_data_objects
+
+
+@op(required_resource_keys={"mongo"})
+def get_nucleotide_sequencing_from_biosamples(
+    context: OpExecutionContext, biosamples: list
+):
+    mdb = context.resources.mongo.db
+    alldocs_collection = mdb["alldocs"]
+    data_generation_set = mdb["data_generation_set"]
+    biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
+        alldocs_collection, data_generation_set, biosamples
+    )
+    return biosample_omics_processing
+
+
+@op(required_resource_keys={"mongo"})
+def get_library_preparation_from_biosamples(
+    context: OpExecutionContext, biosamples: list
+):
+    mdb = context.resources.mongo.db
+    alldocs_collection = mdb["alldocs"]
+    material_processing_set = mdb["material_processing_set"]
+    biosample_lib_prep = fetch_library_preparation_from_biosamples(
+        alldocs_collection, material_processing_set, biosamples
+    )
+    return biosample_lib_prep
+
+
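The ops above declare their inputs through Dagster's `config_schema`, so values arrive through run config rather than function arguments. A minimal sketch of how such an op might be configured and launched follows; the job, op, and config values here are illustrative stand-ins and are not part of this package — only the config-key style mirrors the diff.

```python
# Sketch only: supplying config_schema values to an op via run config.
from dagster import Field, OpExecutionContext, Permissive, String, job, op


@op(
    config_schema={
        "nmdc_ncbi_attribute_mapping_file_url": str,
        "ncbi_submission_metadata": Field(Permissive({"organization": String})),
    }
)
def show_inputs(context: OpExecutionContext) -> dict:
    # Values validated against the schema are available via context.op_config.
    return dict(context.op_config)


@job
def example_export_job():
    show_inputs()


if __name__ == "__main__":
    # Dagit or the CLI would supply the same structure as YAML run config.
    result = example_export_job.execute_in_process(
        run_config={
            "ops": {
                "show_inputs": {
                    "config": {
                        "nmdc_ncbi_attribute_mapping_file_url": "https://example.org/mapping.tsv",
                        "ncbi_submission_metadata": {"organization": "Example Org"},
                    }
                }
            }
        }
    )
    print(result.output_for_node("show_inputs"))
```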
+@op(required_resource_keys={"mongo"})
+def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
+    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
+
+    mdb = context.resources.mongo.db
+    material_processing_set = mdb["material_processing_set"]
+    pooled_biosamples_data = check_pooling_for_biosamples(
+        material_processing_set, biosamples
+    )
+
+    # Fetch ProcessedSample names from database
+    processed_sample_ids = set()
+    for biosample_id, pooling_info in pooled_biosamples_data.items():
+        if pooling_info and pooling_info.get("processed_sample_id"):
+            processed_sample_ids.add(pooling_info["processed_sample_id"])
+
+    # Query database for ProcessedSample names
+    if processed_sample_ids:
+        processed_sample_set = mdb["processed_sample_set"]
+        cursor = processed_sample_set.find(
+            {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
+        )
+        processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
+
+        # Update pooled_biosamples_data with ProcessedSample names
+        for biosample_id, pooling_info in pooled_biosamples_data.items():
+            if pooling_info and pooling_info.get("processed_sample_id"):
+                processed_sample_id = pooling_info["processed_sample_id"]
+                if processed_sample_id in processed_samples:
+                    pooling_info["processed_sample_name"] = processed_samples[
+                        processed_sample_id
+                    ]
+
+    return pooled_biosamples_data
+
+
+@op(required_resource_keys={"mongo"})
+def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
+    mdb = context.resources.mongo.db
+    return get_instruments_by_id(mdb)
+
+
+@op(required_resource_keys={"mongo"})
+def get_instrument_ids_by_model(context: OpExecutionContext) -> dict[str, str]:
+    mdb = context.resources.mongo.db
+    instruments_by_id = get_instruments_by_id(mdb)
+    instruments_by_model: dict[str, str] = {}
+    for inst_id, instrument in instruments_by_id.items():
+        model = instrument.get("model")
+        if model is None:
+            context.log.warning(f"Instrument {inst_id} has no model.")
+            continue
+        if model in instruments_by_model:
+            context.log.warning(f"Instrument model {model} is not unique.")
+        instruments_by_model[model] = inst_id
+    context.log.info("Instrument models: %s", pformat(instruments_by_model))
+    return instruments_by_model
+
+
+@op
+def ncbi_submission_xml_from_nmdc_study(
+    context: OpExecutionContext,
+    nmdc_study: Any,
+    ncbi_exporter_metadata: dict,
+    biosamples: list,
+    omics_processing_records: list,
+    data_object_records: list,
+    library_preparation_records: list,
+    all_instruments: dict,
+    pooled_biosamples_data: dict,
+) -> str:
+    ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
+    ncbi_xml = ncbi_exporter.get_submission_xml(
+        biosamples,
+        omics_processing_records,
+        data_object_records,
+        library_preparation_records,
+        all_instruments,
+        pooled_biosamples_data,
+    )
+    return ncbi_xml
+
+
+@op
+def post_submission_portal_biosample_ingest_record_stitching_filename(
+    nmdc_study_id: str,
+) -> str:
+    filename = nmdc_study_id_to_filename(nmdc_study_id)
+    return f"missing_database_records_for_{filename}.json"
+
+
+@op(
+    config_schema={
+        "nmdc_study_id": str,
+        "gold_nmdc_instrument_mapping_file_url": str,
+        "include_field_site_info": bool,
+        "enable_biosample_filtering": bool,
+    },
+    out={
+        "nmdc_study_id": Out(str),
+        "gold_nmdc_instrument_mapping_file_url": Out(str),
+        "include_field_site_info": Out(bool),
+        "enable_biosample_filtering": Out(bool),
+    },
+)
+def get_database_updater_inputs(
+    context: OpExecutionContext,
+) -> Tuple[str, str, bool, bool]:
+    return (
+        context.op_config["nmdc_study_id"],
+        context.op_config["gold_nmdc_instrument_mapping_file_url"],
+        context.op_config["include_field_site_info"],
+        context.op_config["enable_biosample_filtering"],
+    )
+
+
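`get_database_updater_inputs` above declares several named outputs via `out={...}` and returns a plain tuple, which a job can then unpack and fan out to downstream ops. A minimal sketch of that multi-output pattern follows; the op and job names, config keys, and values below are hypothetical and not part of this package.

```python
# Sketch only: an op with multiple named outputs, unpacked inside a job.
from dagster import Out, OpExecutionContext, job, op


@op(
    config_schema={"study_id": str, "dry_run": bool},
    out={"study_id": Out(str), "dry_run": Out(bool)},
)
def split_inputs(context: OpExecutionContext):
    # A tuple is mapped onto the declared outputs in order.
    return context.op_config["study_id"], context.op_config["dry_run"]


@op
def report(study_id: str, dry_run: bool) -> str:
    return f"study={study_id} dry_run={dry_run}"


@job
def example_job():
    study_id, dry_run = split_inputs()
    report(study_id, dry_run)


if __name__ == "__main__":
    example_job.execute_in_process(
        run_config={
            "ops": {
                "split_inputs": {
                    "config": {"study_id": "nmdc:sty-00-000001", "dry_run": True}
                }
            }
        }
    )
```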
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_data_generation_set_post_biosample_ingest(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool,
+    enable_biosample_filtering: bool,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
+    )
+    database = (
+        database_updater.generate_data_generation_set_records_from_gold_api_for_study()
+    )
+
+    return database
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_biosample_set_for_nmdc_study_from_gold(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool = False,
+    enable_biosample_filtering: bool = False,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
+    )
+    database = database_updater.generate_biosample_set_from_gold_api_for_study()
+
+    return database
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    },
+    out=Out(Any),
+)
+def run_script_to_update_insdc_biosample_identifiers(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool,
+    enable_biosample_filtering: bool,
+):
+    """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
+
+    This op uses the DatabaseUpdater to generate a script that can be used to update biosample
+    records with INSDC identifiers obtained from GOLD.
+
+    Args:
+        context: The execution context
+        nmdc_study_id: The NMDC study ID for which to generate the update script
+        gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
+
+    Returns:
+        A dictionary or list of dictionaries containing the MongoDB update script(s)
+    """
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
+    )
+    update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
+
+    if isinstance(update_script, list):
+        total_updates = sum(len(item.get("updates", [])) for item in update_script)
+    else:
+        total_updates = len(update_script.get("updates", []))
+    context.log.info(
+        f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
+    )
+
+    return update_script
+
+
+@op
+def log_database_ids(
+    context: OpExecutionContext,
+    database: nmdc.Database,
+) -> None:
+    """Log the IDs of the database."""
+    database_dict = as_simple_dict(database)
+    message = ""
+    for collection_name, collection in database_dict.items():
+        if not isinstance(collection, list):
+            continue
+        message += f"{collection_name} ({len(collection)}):\n"
+        if len(collection) < 10:
+            message += "\n".join(f" {doc['id']}" for doc in collection)
+        else:
+            message += "\n".join(f" {doc['id']}" for doc in collection[:4])
+            message += f"\n ... {len(collection) - 8} more\n"
+            message += "\n".join(f" {doc['id']}" for doc in collection[-4:])
+        message += "\n"
+    if message:
+        context.log.info(message)
+
+
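`log_database_ids` summarizes each collection by printing the first and last few IDs and eliding the middle (the "- 8" accounts for the four head and four tail entries already shown). A standalone sketch of that head/tail truncation follows; the helper name and the default limits are illustrative, not part of this package.

```python
# Sketch only: head/tail summarization of a long ID list for log output.
def summarize_ids(ids: list[str], head: int = 4, tail: int = 4, threshold: int = 10) -> str:
    if len(ids) < threshold:
        return "\n".join(f" {i}" for i in ids)
    return (
        "\n".join(f" {i}" for i in ids[:head])
        + f"\n ... {len(ids) - head - tail} more\n"
        + "\n".join(f" {i}" for i in ids[-tail:])
    )


if __name__ == "__main__":
    print(summarize_ids([f"nmdc:bsm-00-{n:06d}" for n in range(25)]))
```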
+@op(
+    description="Render free text through the Dagit UI",
+    out=Out(description="Text content rendered through Dagit UI"),
+)
+def render_text(context: OpExecutionContext, text: Any):
+    """
+    Renders content as a Dagster Asset in the Dagit UI.
+
+    This operation creates a Dagster Asset with the provided content, making it
+    visible in the Dagit UI for easy viewing and sharing.
+
+    Args:
+        context: The execution context
+        text: The content to render (can be a string or a dictionary that will be converted to JSON)
+
+    Returns:
+        The same content that was provided as input
+    """
+    # Convert dictionary to formatted JSON string if needed
+    if isinstance(text, dict):
+        import json
+
+        content = json.dumps(text, indent=2)
+        file_extension = "json"
+        hash_text = json.dumps(text, sort_keys=True)[:20]  # For consistent hashing
+    else:
+        content = str(text)  # Convert to string in case it's not already
+        file_extension = "txt"
+        hash_text = content[:20]
+
+    filename = f"rendered_text_{context.run_id}.{file_extension}"
+    file_path = os.path.join(context.instance.storage_directory(), filename)
+
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    with open(file_path, "w") as f:
+        f.write(content)
+
+    context.log_event(
+        AssetMaterialization(
+            asset_key=f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}",
+            description="Rendered Content",
+            metadata={
+                "file_path": MetadataValue.path(file_path),
+                "content": MetadataValue.text(content),
+            },
+        )
+    )
+
+    return Output(text)