nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of nmdc-runtime has been flagged as potentially problematic; see the registry listing for details.
- nmdc_runtime/api/__init__.py +0 -0
- nmdc_runtime/api/analytics.py +70 -0
- nmdc_runtime/api/boot/__init__.py +0 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/__init__.py +0 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +170 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/__init__.py +0 -0
- nmdc_runtime/api/db/mongo.py +447 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/__init__.py +0 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +794 -0
- nmdc_runtime/api/endpoints/ids.py +192 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +105 -0
- nmdc_runtime/api/endpoints/queries.py +679 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +229 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +774 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/main.py +401 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/__init__.py +0 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/__init__.py +0 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/minter.py +0 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +253 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +242 -0
- nmdc_runtime/config.py +55 -4
- nmdc_runtime/core/db/Database.py +1 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -9
- nmdc_runtime/lib/extract_nmdc_data.py +0 -8
- nmdc_runtime/lib/nmdc_dataframes.py +3 -7
- nmdc_runtime/lib/nmdc_etl_class.py +1 -7
- nmdc_runtime/minter/adapters/repository.py +1 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +35 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/export/ncbi_xml.py +1 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
- nmdc_runtime/site/graphs.py +33 -28
- nmdc_runtime/site/ops.py +97 -237
- nmdc_runtime/site/repair/database_updater.py +8 -0
- nmdc_runtime/site/repository.py +7 -117
- nmdc_runtime/site/resources.py +4 -4
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/util.py +9 -321
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
- nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/ops.py CHANGED

@@ -4,10 +4,9 @@ import logging
 import mimetypes
 import os
 import subprocess
-import tempfile
 from collections import defaultdict
 from datetime import datetime, timezone
-from io import BytesIO
+from io import BytesIO
 from pprint import pformat
 from toolz.dicttoolz import keyfilter
 from typing import Tuple, Set
@@ -16,7 +15,7 @@ from itertools import chain
 from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests
-
+from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
 
 from bson import ObjectId, json_util
 from dagster import (
@@ -44,7 +43,7 @@ from dagster import (
 from gridfs import GridFS
 from linkml_runtime.utils.dictutils import as_simple_dict
 from linkml_runtime.utils.yamlutils import YAMLRoot
-from nmdc_runtime.api.db.mongo import
+from nmdc_runtime.api.db.mongo import validate_json
 from nmdc_runtime.api.core.idgen import generate_one_id
 from nmdc_runtime.api.core.metadata import (
     _validate_changesheet,
@@ -103,23 +102,19 @@ from nmdc_runtime.site.util import (
 )
 from nmdc_runtime.util import (
     drs_object_in_for,
-    get_names_of_classes_in_effective_range_of_slot,
     pluralize,
     put_object,
-    validate_json,
     specialize_activity_set_docs,
     collection_name_to_class_names,
-    class_hierarchy_as_list,
     nmdc_schema_view,
     populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
-from nmdc_schema.nmdc import Database as NMDCDatabase
-from pydantic import BaseModel
 from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
 from starlette import status
-from toolz import
+from toolz import get_in, valfilter, identity
+
 
 # batch size for writing documents to alldocs
 BULK_WRITE_BATCH_SIZE = 2000
@@ -154,99 +149,6 @@ def mongo_stats(context) -> List[str]:
     return collection_names
 
 
-@op(
-    required_resource_keys={"mongo", "runtime_api_site_client"},
-    retry_policy=RetryPolicy(max_retries=2),
-)
-def local_file_to_api_object(context, file_info):
-    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-    storage_path: str = file_info["storage_path"]
-    mime_type = file_info.get("mime_type")
-    if mime_type is None:
-        mime_type = mimetypes.guess_type(storage_path)[0]
-    rv = client.put_object_in_site(
-        {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
-    )
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object_in_site failed: {rv.content}")
-    op = rv.json()
-    context.log.info(f"put_object_in_site: {op}")
-    rv = put_object(storage_path, op["metadata"]["url"])
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object failed: {rv.content}")
-    op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
-    rv = client.update_operation(op["id"], op_patch)
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description="update_operation failed")
-    op = rv.json()
-    context.log.info(f"update_operation: {op}")
-    rv = client.create_object_from_op(op)
-    if rv.status_code != status.HTTP_201_CREATED:
-        raise Failure("create_object_from_op failed")
-    obj = rv.json()
-    context.log.info(f'Created /objects/{obj["id"]}')
-    mdb = context.resources.mongo.db
-    rv = mdb.operations.delete_one({"id": op["id"]})
-    if rv.deleted_count != 1:
-        context.log.error("deleting op failed")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["object", obj["name"]]),
-        description="output of metadata-translation run_etl",
-        metadata={"object_id": MetadataValue.text(obj["id"])},
-    )
-    yield Output(obj)
-
-
-@op(
-    out={
-        "merged_data_path": Out(
-            str,
-            description="path to TSV merging of source metadata",
-        )
-    }
-)
-def build_merged_db(context) -> str:
-    context.log.info("metadata-translation: running `make build-merged-db`")
-    run_and_log(
-        "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
-    )
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
-    )
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
-        description="input to metadata-translation run_etl",
-        metadata={"path": MetadataValue.path(storage_path)},
-    )
-    yield Output(storage_path, "merged_data_path")
-
-
-@op(
-    required_resource_keys={"runtime_api_site_client"},
-)
-def run_etl(context, merged_data_path: str):
-    context.log.info("metadata-translation: running `make run-etl`")
-    if not os.path.exists(merged_data_path):
-        raise Failure(description=f"merged_db not present at {merged_data_path}")
-    run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
-    )
-    with ZipFile(storage_path) as zf:
-        name = zf.namelist()[0]
-        with zf.open(name) as f:
-            rv = json.load(f)
-    context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "database.json.zip"]),
-        description="output of metadata-translation run_etl",
-        metadata={
-            "path": MetadataValue.path(storage_path),
-        },
-    )
-    yield Output({"storage_path": storage_path})
-
-
 @op(required_resource_keys={"mongo"})
 def get_operation(context):
     mdb = context.resources.mongo.db
@@ -481,83 +383,6 @@ def get_json_in(context):
     return rv.json()
 
 
-def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
-    """
-    Does not ensure ordering of `docs`.
-
-    TODO: Document this function. What _does_ it do (or what was it designed to do)?
-    What, conceptually, did the author design it to receive (as `docs`); a dict
-    having a `data_object_set` item whose value is a list of documents.
-    What, conceptually, did the author design it to return?
-    """
-
-    if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
-        return docs, 0
-
-    do_docs = docs["data_object_set"]
-
-    class FileTypeEnumBase(BaseModel):
-        name: str
-        description: str
-        filter: str  # JSON-encoded data_object_set mongo collection filter document
-
-    class FileTypeEnum(FileTypeEnumBase):
-        id: str
-
-    # Make a temporary collection (which will be dropped below) and insert the
-    # specified `data_object_set` documents into it.
-    temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
-    temp_collection = mdb[temp_collection_name]
-    temp_collection.insert_many(do_docs)
-    temp_collection.create_index("id")
-
-    def fte_matches(fte_filter: str) -> List[dict]:
-        r"""
-        Returns a list of documents—without their `_id` field—that match the specified filter,
-        which is encoded as a JSON string.
-        """
-        return [
-            dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
-        ]
-
-    # Create a mapping from each document's `id` to the document, itself.
-    do_docs_map = {d["id"]: d for d in do_docs}
-
-    n_docs_with_types_added = 0
-
-    # For each `file_type_enum` document in the database, find all the documents (among the
-    # `data_object_set` documents provided by the caller) that match that `file_type_enum`
-    # document's filter.
-    #
-    # If any of those documents lacks a `data_object_type` field, update the original
-    # `data_object_set` document so that its `data_object_type` field is set to
-    # the `file_type_enum` document's `id` (why not its `name`?).
-    #
-    # TODO: I don't know why this sets `data_object_type` to `file_type_enum.id`,
-    #       as opposed to `file_type_enum.name`.
-    #
-    for fte_doc in mdb.file_type_enum.find():
-        fte = FileTypeEnum(**fte_doc)
-        docs_matching = fte_matches(fte.filter)
-        for doc in docs_matching:
-            if "data_object_type" not in doc:
-                do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
-                n_docs_with_types_added += 1
-
-    mdb.drop_collection(temp_collection_name)
-
-    # Returns a tuple. The first item is the original `docs` dictionary, but with the
-    # `data_object_set` list replaced by the list of the documents that are in the
-    # `do_docs_map` dictionary (with their `_id` fields omitted). The second item is
-    # the number of documents to which this function added a `data_object_type` field.
-    return (
-        assoc(
-            docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
-        ),
-        n_docs_with_types_added,
-    )
-
-
 @op(required_resource_keys={"runtime_api_site_client", "mongo"})
 def perform_mongo_updates(context, json_in):
     mongo = context.resources.mongo
@@ -566,8 +391,6 @@ def perform_mongo_updates(context, json_in):
 
     docs = json_in
     docs, _ = specialize_activity_set_docs(docs)
-    docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
-    context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
     context.log.debug(f"{docs}")
 
     rv = validate_json(
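
Note: `validate_json` is now imported from `nmdc_runtime.api.db.mongo` rather than `nmdc_runtime.util` (see the import hunks above). A minimal sketch of a call site, assuming it accepts the docs dict plus a database handle and returns a dict whose "result" key is "All Okay!" on success; the exact signature and return shape are not shown in this diff:

    # Assumed calling convention; illustrative only.
    rv = validate_json(docs, mongo.db)
    if rv.get("result") != "All Okay!":
        raise Failure(description=f"JSON validation failed: {rv}")
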
@@ -636,22 +459,25 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
         "study_type": str,
         "gold_nmdc_instrument_mapping_file_url": str,
         "include_field_site_info": bool,
+        "enable_biosample_filtering": bool,
     },
     out={
         "study_id": Out(str),
         "study_type": Out(str),
         "gold_nmdc_instrument_mapping_file_url": Out(str),
         "include_field_site_info": Out(bool),
+        "enable_biosample_filtering": Out(bool),
     },
 )
 def get_gold_study_pipeline_inputs(
     context: OpExecutionContext,
-) -> Tuple[str, str, str, bool]:
+) -> Tuple[str, str, str, bool, bool]:
     return (
         context.op_config["study_id"],
         context.op_config["study_type"],
         context.op_config["gold_nmdc_instrument_mapping_file_url"],
         context.op_config["include_field_site_info"],
+        context.op_config["enable_biosample_filtering"],
     )
 
 
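
Note: with the new `enable_biosample_filtering` entry, a Dagster run config for this op now supplies five keys. A hypothetical run-config fragment (all values illustrative, not taken from this diff):

    run_config = {
        "ops": {
            "get_gold_study_pipeline_inputs": {
                "config": {
                    "study_id": "Gs0114663",
                    "study_type": "research_study",
                    "gold_nmdc_instrument_mapping_file_url": "https://example.org/gold_instrument_map.tsv",
                    "include_field_site_info": False,
                    "enable_biosample_filtering": True,
                }
            }
        }
    }
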
@@ -695,6 +521,7 @@ def nmdc_schema_database_from_gold_study(
     analysis_projects: List[Dict[str, Any]],
     gold_nmdc_instrument_map_df: pd.DataFrame,
     include_field_site_info: bool,
+    enable_biosample_filtering: bool,
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
 
@@ -710,6 +537,7 @@ def nmdc_schema_database_from_gold_study(
         analysis_projects,
         gold_nmdc_instrument_map_df,
         include_field_site_info,
+        enable_biosample_filtering,
         id_minter=id_minter,
     )
     database = translator.get_database()
@@ -1110,21 +938,25 @@ def load_ontology(context: OpExecutionContext):
         source_ontology=source_ontology,
         output_directory=output_directory,
         generate_reports=generate_reports,
+        mongo_client=context.resources.mongo.client,
+        db_name=context.resources.mongo.db.name,
     )
 
     loader.run_ontology_loader()
     context.log.info(f"Ontology load for {source_ontology} completed successfully!")
 
 
-def _add_related_ids_to_alldocs(
+def _add_linked_instances_to_alldocs(
     temp_collection, context, document_reference_ranged_slots_by_type
 ) -> None:
     """
-    Adds {`
+    Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
 
-    The {`
-    Each subdocument represents a link to
-
+    The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+    Each subdocument represents a link to another document that either links to or is linked from the document via
+    document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+    document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+    considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
 
     Args:
         temp_collection: The temporary MongoDB collection to process
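
Note: per the revised docstring, a Biosample's entry in the rebuilt collection might look like the following (ids invented for illustration; `associated_studies` points from the Biosample to the Study, yet the Study lands in `_upstream`, while a DataGeneration that took the Biosample as `has_input` lands in `_downstream`):

    {
        "id": "nmdc:bsm-11-xxxxxx",
        "type": "nmdc:Biosample",
        "_upstream": [{"id": "nmdc:sty-11-yyyyyy", "type": "nmdc:Study"}],
        "_downstream": [{"id": "nmdc:omprc-11-zzzzzz", "type": "nmdc:NucleotideSequencing"}],
    }
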
@@ -1136,7 +968,7 @@ def _add_related_ids_to_alldocs(
     """
 
     context.log.info(
-        "Building relationships and adding `
+        "Building relationships and adding `_upstream` and `_downstream` fields..."
     )
 
     # document ID -> type (with "nmdc:" prefix preserved)
@@ -1151,6 +983,7 @@ def _add_related_ids_to_alldocs(
         # Store the full type with prefix intact
         doc_type = doc["type"]
         # For looking up reference slots, we still need the type without prefix
+        # FIXME `document_reference_ranged_slots_by_type` should key on `doc_type`
         doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
 
         # Record ID to type mapping - preserve the original type with prefix
@@ -1176,34 +1009,32 @@ def _add_related_ids_to_alldocs(
         f"{len({d for (d, _, _) in relationship_triples})} containing references"
     )
 
-    # The bifurcation of document-reference-ranged slots as "
+    # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
     # in order to perform graph traversal and collect all entities "related" to a given entity without
     # recursion "exploding".
     #
     # Note: We are hard-coding this "direction" information here in the Runtime
     # because the NMDC schema does not currently contain or expose it.
     #
-    # An "
-
-
-        "collected_from",  # a `nmdc:
-        "has_chromatography_configuration",  # a `nmdc:
-        "has_input",  # a `nmdc:
-        "has_mass_spectrometry_configuration",  # a `nmdc:
-        "instrument_used",  # a `nmdc:
-        "
-        "was_generated_by",  #
-        "was_informed_by",  #
+    # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+    upstream_document_reference_ranged_slots = [
+        "associated_studies",  # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+        "collected_from",  # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+        "has_chromatography_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "has_input",  # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+        "has_mass_spectrometry_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "instrument_used",  # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+        "part_of",  # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+        "was_generated_by",  # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+        "was_informed_by",  # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
     ]
-    #
-
-
-        "
-        "
-        "
-        "
-        "in_manifest",  # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
-        "part_of",  # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
+    # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+    downstream_document_reference_ranged_slots = [
+        "calibration_object",  # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+        "generates_calibration",  # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+        "has_output",  # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+        "in_manifest",  # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+        "uses_calibration",  # when a `nmdc:CalibrationInformation` is part of a `nmdc:PlannedProcess`.
     ]
 
     unique_document_reference_ranged_slot_names = set()
@@ -1211,15 +1042,15 @@ def _add_related_ids_to_alldocs(
         for slot_name in slot_names:
             unique_document_reference_ranged_slot_names.add(slot_name)
     context.log.info(f"{unique_document_reference_ranged_slot_names=}")
-    if len(
-
+    if len(upstream_document_reference_ranged_slots) + len(
+        downstream_document_reference_ranged_slots
     ) != len(unique_document_reference_ranged_slot_names):
         raise Failure(
             "Number of detected unique document-reference-ranged slot names does not match "
-            "sum of accounted-for
+            "sum of accounted-for upstream and downstream document-reference-ranged slot names."
         )
 
-    # Construct, and update documents with, `
+    # Construct, and update documents with, `_upstream` and `_downstream` field values.
     #
     # manage batching of MongoDB `bulk_write` operations
     bulk_operations, update_count = [], 0
@@ -1227,10 +1058,10 @@ def _add_related_ids_to_alldocs(
 
         # Determine in which respective fields to push this relationship
         # for the subject (doc) and object (ref) of this triple.
-        if slot in
-            field_for_doc, field_for_ref = "
-        elif slot in
-            field_for_doc, field_for_ref = "
+        if slot in upstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_upstream", "_downstream"
+        elif slot in downstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_downstream", "_upstream"
         else:
             raise Failure(f"Unknown slot {slot} for document {doc_id}")
 
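
Note: a minimal sketch (not code from this file) of how one (doc, slot, ref) relationship triple could become a pair of `$push` updates under the field assignment above, so that each link is recorded on both endpoints. The slot lists referenced are the ones defined in the hunk before this one:

    from pymongo import UpdateOne

    def linked_instance_updates(doc_id, doc_type, slot, ref_id, ref_type):
        # Choose fields as in the if/elif above: an upstream slot puts the
        # referenced entity in the subject's `_upstream`, and vice versa.
        field_for_doc, field_for_ref = (
            ("_upstream", "_downstream")
            if slot in upstream_document_reference_ranged_slots
            else ("_downstream", "_upstream")
        )
        return [
            UpdateOne({"id": doc_id}, {"$push": {field_for_doc: {"id": ref_id, "type": ref_type}}}),
            UpdateOne({"id": ref_id}, {"$push": {field_for_ref: {"id": doc_id, "type": doc_type}}}),
        ]
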
@@ -1277,14 +1108,6 @@ def _add_related_ids_to_alldocs(
 
     context.log.info(f"Pushed {update_count} updates in total")
 
-    context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
-    temp_collection.create_index("_inbound.id")
-    temp_collection.create_index("_outbound.id")
-    # Create compound indexes to ensure index-covered queries
-    temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
-    temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
-    context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
-
 
 # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
 # pass an argument to the op (in order to specify the order of the ops in the graph)
@@ -1301,8 +1124,8 @@ def materialize_alldocs(context) -> int:
     2. Create a temporary collection to build the new alldocs collection.
     3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
     4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
-    5. Add special `
-    6. Add indexes for `id`, relationship fields, and `{
+    5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+    6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
     7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
 
     The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
@@ -1313,7 +1136,7 @@ def materialize_alldocs(context) -> int:
     `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
     related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
 
-    The {`
+    The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
     that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
     expansions.
     """
@@ -1344,6 +1167,9 @@ def materialize_alldocs(context) -> int:
         )
     )
 
+    # FIXME rename to `document_reference_ranged_slots_by_type`
+    # FIXME key on CURIE, e.g. `nmdc:Study`
+    # (here, not upstream in `cls_slot_map`/`document_referenceable_ranges`, b/c `schema_view` used directly in those)
     document_reference_ranged_slots = defaultdict(list)
     for cls_name, slot_map in cls_slot_map.items():
         for slot_name, slot in slot_map.items():
@@ -1383,12 +1209,12 @@ def materialize_alldocs(context) -> int:
         new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
 
         new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-        # InsertOne is a method on the py-mongo Client class.
         # Get ancestors without the prefix, but add prefix to each one in the output
         ancestors = schema_view.class_ancestors(doc_type)
         new_doc["_type_and_ancestors"] = [
             "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
         ]
+        # InsertOne is a pymongo representation of a mongo command.
         write_operations.append(InsertOne(new_doc))
         if len(write_operations) == BULK_WRITE_BATCH_SIZE:
             _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
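
Note: the prefixing comprehension above turns unprefixed class names from `schema_view.class_ancestors` into CURIEs, e.g. (values illustrative):

    ancestors = ["Biosample", "MaterialEntity", "NamedThing"]
    ["nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors]
    # -> ["nmdc:Biosample", "nmdc:MaterialEntity", "nmdc:NamedThing"]
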
@@ -1412,19 +1238,28 @@ def materialize_alldocs(context) -> int:
     # so that `temp_alldocs_collection` will be "good to go" on renaming.
     temp_alldocs_collection.create_index("id", unique=True)
     # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
+    # TODO add indexes on each of `set(document_reference_ranged_slots.values())`.
     slots_to_index = ["has_input", "has_output", "was_informed_by"]
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")
 
     # Add related-ids fields to enable efficient relationship traversal
     context.log.info("Adding fields for related ids to documents...")
-    _add_related_ids_to_alldocs(
+    _add_linked_instances_to_alldocs(
         temp_alldocs_collection, context, document_reference_ranged_slots
     )
+    context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+    temp_alldocs_collection.create_index("_upstream.id")
+    temp_alldocs_collection.create_index("_downstream.id")
+    # Create compound indexes to ensure index-covered queries
+    temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+    temp_alldocs_collection.create_index(
+        [("_downstream.type", 1), ("_downstream.id", 1)]
+    )
+    context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
 
     context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
     temp_alldocs_collection.rename("alldocs", dropTarget=True)
-
     n_alldocs_documents = mdb.alldocs.estimated_document_count()
     context.log.info(
         f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
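
Note: the new compound indexes serve queries that filter on `{_upstream,_downstream}.type` together with the corresponding `.id`. A sketch of the kind of lookup this supports (collection handle and values illustrative):

    # Match documents whose `_downstream` array references a given DataObject,
    # i.e. the entities upstream of that DataObject.
    mdb.alldocs.find(
        {"_downstream.type": "nmdc:DataObject", "_downstream.id": "nmdc:dobj-11-abc123"},
        {"_id": 0, "id": 1, "type": 1},
    )
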
@@ -1572,16 +1407,24 @@ def post_submission_portal_biosample_ingest_record_stitching_filename(
     config_schema={
         "nmdc_study_id": str,
         "gold_nmdc_instrument_mapping_file_url": str,
+        "include_field_site_info": bool,
+        "enable_biosample_filtering": bool,
     },
     out={
         "nmdc_study_id": Out(str),
         "gold_nmdc_instrument_mapping_file_url": Out(str),
+        "include_field_site_info": Out(bool),
+        "enable_biosample_filtering": Out(bool),
     },
 )
-def get_database_updater_inputs(
+def get_database_updater_inputs(
+    context: OpExecutionContext,
+) -> Tuple[str, str, bool, bool]:
     return (
         context.op_config["nmdc_study_id"],
         context.op_config["gold_nmdc_instrument_mapping_file_url"],
+        context.op_config["include_field_site_info"],
+        context.op_config["enable_biosample_filtering"],
     )
 
 
@@ -1596,6 +1439,8 @@ def generate_data_generation_set_post_biosample_ingest(
     context: OpExecutionContext,
     nmdc_study_id: str,
     gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool,
+    enable_biosample_filtering: bool,
 ) -> nmdc.Database:
     runtime_api_user_client: RuntimeApiUserClient = (
         context.resources.runtime_api_user_client
@@ -1611,6 +1456,8 @@ def generate_data_generation_set_post_biosample_ingest(
         gold_api_client,
         nmdc_study_id,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
     )
     database = (
         database_updater.generate_data_generation_set_records_from_gold_api_for_study()
@@ -1630,6 +1477,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
     context: OpExecutionContext,
     nmdc_study_id: str,
     gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool = False,
+    enable_biosample_filtering: bool = False,
 ) -> nmdc.Database:
     runtime_api_user_client: RuntimeApiUserClient = (
         context.resources.runtime_api_user_client
@@ -1645,6 +1494,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
         gold_api_client,
         nmdc_study_id,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
     )
     database = database_updater.generate_biosample_set_from_gold_api_for_study()
 
@@ -1656,13 +1507,16 @@ def generate_biosample_set_for_nmdc_study_from_gold(
         "runtime_api_user_client",
         "runtime_api_site_client",
         "gold_api_client",
-    }
+    },
+    out=Out(Any),
 )
 def run_script_to_update_insdc_biosample_identifiers(
     context: OpExecutionContext,
     nmdc_study_id: str,
     gold_nmdc_instrument_map_df: pd.DataFrame,
-):
+    include_field_site_info: bool,
+    enable_biosample_filtering: bool,
+):
     """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
 
     This op uses the DatabaseUpdater to generate a script that can be used to update biosample
@@ -1674,7 +1528,7 @@ def run_script_to_update_insdc_biosample_identifiers(
         gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
 
     Returns:
-        A dictionary containing the MongoDB update script
+        A dictionary or list of dictionaries containing the MongoDB update script(s)
     """
     runtime_api_user_client: RuntimeApiUserClient = (
         context.resources.runtime_api_user_client
@@ -1690,11 +1544,17 @@ def run_script_to_update_insdc_biosample_identifiers(
         gold_api_client,
         nmdc_study_id,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
     )
     update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
 
+    if isinstance(update_script, list):
+        total_updates = sum(len(item.get("updates", [])) for item in update_script)
+    else:
+        total_updates = len(update_script.get("updates", []))
     context.log.info(
-        f"Generated update script for study {nmdc_study_id} with {
+        f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
     )
 
     return update_script
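
Note: the counting logic above implies `queries_run_script_to_update_insdc_identifiers` may return either one script document or a list of them. An assumed shape (field names and values invented for illustration) under which the count works:

    update_script = {
        "update": "biosample_set",
        "updates": [
            {
                "q": {"id": "nmdc:bsm-11-abc123"},
                "u": {"$addToSet": {"insdc_biosample_identifiers": "biosample:SAMN00000001"}},
            },
        ],
    }
    total_updates = len(update_script.get("updates", []))  # -> 1
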
nmdc_runtime/site/repair/database_updater.py CHANGED

@@ -18,6 +18,8 @@ class DatabaseUpdater:
         gold_api_client: GoldApiClient,
         study_id: str,
         gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+        include_field_site_info: bool = False,
+        enable_biosample_filtering: bool = True,
     ):
         """This class serves as an API for repairing connections in the database by
         adding records that are essentially missing "links"/"connections". As we identify
@@ -39,6 +41,8 @@ class DatabaseUpdater:
         self.gold_api_client = gold_api_client
         self.study_id = study_id
         self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
+        self.include_field_site_info = include_field_site_info
+        self.enable_biosample_filtering = enable_biosample_filtering
 
     @lru_cache
     def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
@@ -95,6 +99,8 @@ class DatabaseUpdater:
             biosamples=all_gold_biosamples,
             projects=all_gold_projects,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+            include_field_site_info=self.include_field_site_info,
+            enable_biosample_filtering=self.enable_biosample_filtering,
         )
 
         # The GoldStudyTranslator class has some pre-processing logic which filters out
@@ -214,6 +220,8 @@ class DatabaseUpdater:
             projects=gold_sequencing_projects_for_study,
             analysis_projects=gold_analysis_projects_for_study,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+            include_field_site_info=self.include_field_site_info,
+            enable_biosample_filtering=self.enable_biosample_filtering,
         )
 
         translated_biosamples = gold_study_translator.biosamples
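
Note: callers constructing `DatabaseUpdater` can now thread the two new flags through. A hedged usage sketch (client objects assumed to be configured elsewhere; study id invented):

    import pandas as pd

    # Keyword values are illustrative; defaults shown in the diff are
    # include_field_site_info=False and enable_biosample_filtering=True.
    database_updater = DatabaseUpdater(
        runtime_api_user_client,
        runtime_api_site_client,
        gold_api_client,
        study_id="nmdc:sty-11-abc123",
        gold_nmdc_instrument_map_df=pd.DataFrame(),
        include_field_site_info=False,
        enable_biosample_filtering=True,
    )
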
|