nmdc-runtime 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/minter/config.py +18 -50
- nmdc_runtime/site/graphs.py +39 -0
- nmdc_runtime/site/ops.py +125 -28
- nmdc_runtime/site/repair/__init__.py +0 -0
- nmdc_runtime/site/repair/database_updater.py +230 -0
- nmdc_runtime/site/repository.py +101 -1
- nmdc_runtime/site/resources.py +36 -5
- nmdc_runtime/site/translation/gold_translator.py +26 -4
- nmdc_runtime/site/util.py +7 -2
- nmdc_runtime/util.py +143 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.3.0.dist-info}/METADATA +10 -2
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.3.0.dist-info}/RECORD +16 -14
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.3.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.3.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.3.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.3.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/minter/config.py
CHANGED
|
@@ -2,8 +2,9 @@ import os
|
|
|
2
2
|
from functools import lru_cache
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from nmdc_schema.id_helpers import get_typecode_for_future_ids
|
|
6
6
|
|
|
7
|
+
from nmdc_runtime.util import get_nmdc_jsonschema_dict
|
|
7
8
|
from nmdc_runtime.api.db.mongo import get_mongo_db
|
|
8
9
|
|
|
9
10
|
|
|
@@ -12,55 +13,24 @@ def minting_service_id() -> str | None:
|
|
|
12
13
|
return os.getenv("MINTING_SERVICE_ID")
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
def extract_typecode_from_pattern(pattern: str) -> str:
|
|
16
|
-
r"""
|
|
17
|
-
Returns the typecode portion of the specified string.
|
|
18
|
-
|
|
19
|
-
>>> extract_typecode_from_pattern("foo-123-456$") # original behavior
|
|
20
|
-
'foo'
|
|
21
|
-
>>> extract_typecode_from_pattern("(foo)-123-456$") # returns first and only typecode
|
|
22
|
-
'foo'
|
|
23
|
-
>>> extract_typecode_from_pattern("(foo|bar)-123-456$") # returns first of 2 typecodes
|
|
24
|
-
'foo'
|
|
25
|
-
>>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$") # returns first of > 2 typecodes
|
|
26
|
-
'foo'
|
|
27
|
-
"""
|
|
28
|
-
|
|
29
|
-
# Get the portion of the pattern preceding the first hyphen.
|
|
30
|
-
# e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
|
|
31
|
-
typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
|
|
32
|
-
|
|
33
|
-
# If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
|
|
34
|
-
# e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
|
|
35
|
-
if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
|
|
36
|
-
inner_pattern = typecode_sub_pattern[1:-1]
|
|
37
|
-
|
|
38
|
-
# Finally, get everything before the first `|`, if any.
|
|
39
|
-
# e.g. "apple|banana|carrot" → "apple"
|
|
40
|
-
# e.g. "apple" → "apple"
|
|
41
|
-
typecode = inner_pattern.split("|", maxsplit=1)[0]
|
|
42
|
-
else:
|
|
43
|
-
# Note: This is the original behavior, before we added support for multi-typecode patterns.
|
|
44
|
-
# e.g. "apple" → "apple"
|
|
45
|
-
typecode = typecode_sub_pattern
|
|
46
|
-
|
|
47
|
-
return typecode
|
|
48
|
-
|
|
49
|
-
|
|
50
16
|
@lru_cache()
|
|
51
17
|
def typecodes() -> List[dict]:
|
|
52
18
|
r"""
|
|
53
19
|
Returns a list of dictionaries containing typecodes and associated information derived from the schema.
|
|
54
20
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
21
|
+
Note: In this function, we rely on a helper function provided by the `nmdc-schema` package to extract—from a given
|
|
22
|
+
class's `id` slot's pattern—the typecode that the minter would use when generating an ID for an instance of
|
|
23
|
+
that class _today_, regardless of what it may have used in the past.
|
|
24
|
+
|
|
25
|
+
>>> typecode_descriptors = typecodes()
|
|
26
|
+
>>> # Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
|
|
27
|
+
>>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
|
|
28
|
+
True
|
|
29
|
+
>>> # Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
|
|
30
|
+
>>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
|
|
31
|
+
True
|
|
32
|
+
>>> any((td["name"] == "omprc" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
|
|
33
|
+
False
|
|
64
34
|
"""
|
|
65
35
|
id_pattern_prefix = r"^(nmdc):"
|
|
66
36
|
|
|
@@ -69,16 +39,14 @@ def typecodes() -> List[dict]:
|
|
|
69
39
|
for cls_name, defn in schema_dict["$defs"].items():
|
|
70
40
|
match defn.get("properties"):
|
|
71
41
|
case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
|
|
72
|
-
#
|
|
73
|
-
|
|
74
|
-
index_of_first_character_following_prefix = len(id_pattern_prefix)
|
|
75
|
-
pattern_without_prefix = p[index_of_first_character_following_prefix:]
|
|
42
|
+
# Extract the typecode from the pattern.
|
|
43
|
+
typecode_for_future_ids = get_typecode_for_future_ids(slot_pattern=p)
|
|
76
44
|
|
|
77
45
|
rv.append(
|
|
78
46
|
{
|
|
79
47
|
"id": "nmdc:" + cls_name + "_" + "typecode",
|
|
80
48
|
"schema_class": "nmdc:" + cls_name,
|
|
81
|
-
"name":
|
|
49
|
+
"name": typecode_for_future_ids,
|
|
82
50
|
}
|
|
83
51
|
)
|
|
84
52
|
case _:
|
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -2,6 +2,7 @@ from dagster import graph
|
|
|
2
2
|
|
|
3
3
|
from nmdc_runtime.site.ops import (
|
|
4
4
|
build_merged_db,
|
|
5
|
+
generate_biosample_set_for_nmdc_study_from_gold,
|
|
5
6
|
nmdc_schema_database_export_filename,
|
|
6
7
|
nmdc_schema_database_from_gold_study,
|
|
7
8
|
nmdc_schema_object_to_dict,
|
|
@@ -57,6 +58,9 @@ from nmdc_runtime.site.ops import (
|
|
|
57
58
|
get_ncbi_export_pipeline_inputs,
|
|
58
59
|
ncbi_submission_xml_from_nmdc_study,
|
|
59
60
|
ncbi_submission_xml_asset,
|
|
61
|
+
get_database_updater_inputs,
|
|
62
|
+
post_submission_portal_biosample_ingest_record_stitching_filename,
|
|
63
|
+
generate_data_generation_set_post_biosample_ingest,
|
|
60
64
|
)
|
|
61
65
|
from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
|
|
62
66
|
|
|
@@ -117,12 +121,14 @@ def apply_changesheet():
|
|
|
117
121
|
sheet_in = get_changesheet_in()
|
|
118
122
|
outputs = perform_changesheet_updates(sheet_in)
|
|
119
123
|
add_output_run_event(outputs)
|
|
124
|
+
materialize_alldocs()
|
|
120
125
|
|
|
121
126
|
|
|
122
127
|
@graph
|
|
123
128
|
def apply_metadata_in():
|
|
124
129
|
outputs = perform_mongo_updates(get_json_in())
|
|
125
130
|
add_output_run_event(outputs)
|
|
131
|
+
materialize_alldocs()
|
|
126
132
|
|
|
127
133
|
|
|
128
134
|
@graph
|
|
@@ -465,3 +471,36 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
465
471
|
all_instruments,
|
|
466
472
|
)
|
|
467
473
|
ncbi_submission_xml_asset(xml_data)
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
@graph
|
|
477
|
+
def generate_data_generation_set_for_biosamples_in_nmdc_study():
|
|
478
|
+
(study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
|
|
479
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
480
|
+
|
|
481
|
+
database = generate_data_generation_set_post_biosample_ingest(
|
|
482
|
+
study_id, gold_nmdc_instrument_map_df
|
|
483
|
+
)
|
|
484
|
+
|
|
485
|
+
database_dict = nmdc_schema_object_to_dict(database)
|
|
486
|
+
filename = post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
487
|
+
study_id
|
|
488
|
+
)
|
|
489
|
+
outputs = export_json_to_drs(database_dict, filename)
|
|
490
|
+
add_output_run_event(outputs)
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
@graph
|
|
494
|
+
def generate_biosample_set_from_samples_in_gold():
|
|
495
|
+
(study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
|
|
496
|
+
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
497
|
+
|
|
498
|
+
database = generate_biosample_set_for_nmdc_study_from_gold(
|
|
499
|
+
study_id, gold_nmdc_instrument_map_df
|
|
500
|
+
)
|
|
501
|
+
database_dict = nmdc_schema_object_to_dict(database)
|
|
502
|
+
filename = post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
503
|
+
study_id
|
|
504
|
+
)
|
|
505
|
+
outputs = export_json_to_drs(database_dict, filename)
|
|
506
|
+
add_output_run_event(outputs)
|
nmdc_runtime/site/ops.py
CHANGED
|
@@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
|
|
|
91
91
|
from nmdc_runtime.site.translation.submission_portal_translator import (
|
|
92
92
|
SubmissionPortalTranslator,
|
|
93
93
|
)
|
|
94
|
-
from nmdc_runtime.site.
|
|
94
|
+
from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
|
|
95
|
+
from nmdc_runtime.site.util import (
|
|
96
|
+
run_and_log,
|
|
97
|
+
schema_collection_has_index_on_id,
|
|
98
|
+
nmdc_study_id_to_filename,
|
|
99
|
+
)
|
|
95
100
|
from nmdc_runtime.util import (
|
|
96
101
|
drs_object_in_for,
|
|
97
102
|
get_names_of_classes_in_effective_range_of_slot,
|
|
@@ -1054,10 +1059,7 @@ def materialize_alldocs(context) -> int:
|
|
|
1054
1059
|
# TODO include functional_annotation_agg for "real-time" ref integrity checking.
|
|
1055
1060
|
# For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
|
|
1056
1061
|
collection_names = populated_schema_collection_names_with_id_field(mdb)
|
|
1057
|
-
context.log.info(f"{collection_names=}")
|
|
1058
|
-
|
|
1059
|
-
# Build alldocs
|
|
1060
|
-
context.log.info("constructing `alldocs` collection")
|
|
1062
|
+
context.log.info(f"constructing `alldocs` collection using {collection_names=}")
|
|
1061
1063
|
|
|
1062
1064
|
document_class_names = set(
|
|
1063
1065
|
chain.from_iterable(collection_name_to_class_names.values())
|
|
@@ -1070,7 +1072,8 @@ def materialize_alldocs(context) -> int:
|
|
|
1070
1072
|
for cls_name in document_class_names
|
|
1071
1073
|
}
|
|
1072
1074
|
|
|
1073
|
-
# Any ancestor of a document class is a document-
|
|
1075
|
+
# Any ancestor of a document class is a document-referenceable range,
|
|
1076
|
+
# i.e., a valid range of a document-reference-ranged slot.
|
|
1074
1077
|
document_referenceable_ranges = set(
|
|
1075
1078
|
chain.from_iterable(
|
|
1076
1079
|
schema_view.class_ancestors(cls_name) for cls_name in document_class_names
|
|
@@ -1086,17 +1089,15 @@ def materialize_alldocs(context) -> int:
|
|
|
1086
1089
|
):
|
|
1087
1090
|
document_reference_ranged_slots[cls_name].append(slot_name)
|
|
1088
1091
|
|
|
1089
|
-
#
|
|
1090
|
-
#
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
#
|
|
1095
|
-
mdb.alldocs.drop()
|
|
1092
|
+
# Build `alldocs` to a temporary collection for atomic replacement
|
|
1093
|
+
# https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
|
|
1094
|
+
temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
|
|
1095
|
+
temp_alldocs_collection = mdb[temp_alldocs_collection_name]
|
|
1096
|
+
context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
|
|
1096
1097
|
|
|
1097
1098
|
for coll_name in collection_names:
|
|
1098
1099
|
context.log.info(f"{coll_name=}")
|
|
1099
|
-
|
|
1100
|
+
write_operations = []
|
|
1100
1101
|
documents_processed_counter = 0
|
|
1101
1102
|
for doc in mdb[coll_name].find():
|
|
1102
1103
|
doc_type = doc["type"][5:] # lop off "nmdc:" prefix
|
|
@@ -1105,30 +1106,35 @@ def materialize_alldocs(context) -> int:
|
|
|
1105
1106
|
]
|
|
1106
1107
|
new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
|
|
1107
1108
|
new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
|
|
1108
|
-
|
|
1109
|
-
if len(
|
|
1110
|
-
_ =
|
|
1111
|
-
|
|
1109
|
+
write_operations.append(InsertOne(new_doc))
|
|
1110
|
+
if len(write_operations) == BULK_WRITE_BATCH_SIZE:
|
|
1111
|
+
_ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
|
|
1112
|
+
write_operations.clear()
|
|
1112
1113
|
documents_processed_counter += BULK_WRITE_BATCH_SIZE
|
|
1113
|
-
if len(
|
|
1114
|
-
_ =
|
|
1115
|
-
documents_processed_counter += len(
|
|
1114
|
+
if len(write_operations) > 0:
|
|
1115
|
+
_ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
|
|
1116
|
+
documents_processed_counter += len(write_operations)
|
|
1116
1117
|
context.log.info(
|
|
1117
1118
|
f"Inserted {documents_processed_counter} documents from {coll_name=} "
|
|
1118
1119
|
)
|
|
1119
1120
|
|
|
1120
1121
|
context.log.info(
|
|
1121
|
-
f"
|
|
1122
|
+
f"produced `{temp_alldocs_collection.name}` collection with"
|
|
1123
|
+
f" {temp_alldocs_collection.estimated_document_count()} docs."
|
|
1122
1124
|
)
|
|
1123
1125
|
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
#
|
|
1127
|
-
|
|
1126
|
+
context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
|
|
1127
|
+
# Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
|
|
1128
|
+
# so that `temp_alldocs_collection` will be "good to go" on renaming.
|
|
1129
|
+
temp_alldocs_collection.create_index("id", unique=True)
|
|
1130
|
+
# Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
|
|
1128
1131
|
slots_to_index = ["has_input", "has_output", "was_informed_by"]
|
|
1129
|
-
[
|
|
1130
|
-
|
|
1132
|
+
[temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
|
|
1131
1133
|
context.log.info(f"created indexes on id, {slots_to_index}.")
|
|
1134
|
+
|
|
1135
|
+
context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
|
|
1136
|
+
temp_alldocs_collection.rename("alldocs", dropTarget=True)
|
|
1137
|
+
|
|
1132
1138
|
return mdb.alldocs.estimated_document_count()
|
|
1133
1139
|
|
|
1134
1140
|
|
|
@@ -1240,3 +1246,94 @@ def ncbi_submission_xml_from_nmdc_study(
|
|
|
1240
1246
|
all_instruments,
|
|
1241
1247
|
)
|
|
1242
1248
|
return ncbi_xml
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
@op
|
|
1252
|
+
def post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
1253
|
+
nmdc_study_id: str,
|
|
1254
|
+
) -> str:
|
|
1255
|
+
filename = nmdc_study_id_to_filename(nmdc_study_id)
|
|
1256
|
+
return f"missing_database_records_for_{filename}.json"
|
|
1257
|
+
|
|
1258
|
+
|
|
1259
|
+
@op(
|
|
1260
|
+
config_schema={
|
|
1261
|
+
"nmdc_study_id": str,
|
|
1262
|
+
"gold_nmdc_instrument_mapping_file_url": str,
|
|
1263
|
+
},
|
|
1264
|
+
out={
|
|
1265
|
+
"nmdc_study_id": Out(str),
|
|
1266
|
+
"gold_nmdc_instrument_mapping_file_url": Out(str),
|
|
1267
|
+
},
|
|
1268
|
+
)
|
|
1269
|
+
def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
|
|
1270
|
+
return (
|
|
1271
|
+
context.op_config["nmdc_study_id"],
|
|
1272
|
+
context.op_config["gold_nmdc_instrument_mapping_file_url"],
|
|
1273
|
+
)
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
@op(
|
|
1277
|
+
required_resource_keys={
|
|
1278
|
+
"runtime_api_user_client",
|
|
1279
|
+
"runtime_api_site_client",
|
|
1280
|
+
"gold_api_client",
|
|
1281
|
+
}
|
|
1282
|
+
)
|
|
1283
|
+
def generate_data_generation_set_post_biosample_ingest(
|
|
1284
|
+
context: OpExecutionContext,
|
|
1285
|
+
nmdc_study_id: str,
|
|
1286
|
+
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1287
|
+
) -> nmdc.Database:
|
|
1288
|
+
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1289
|
+
context.resources.runtime_api_user_client
|
|
1290
|
+
)
|
|
1291
|
+
runtime_api_site_client: RuntimeApiSiteClient = (
|
|
1292
|
+
context.resources.runtime_api_site_client
|
|
1293
|
+
)
|
|
1294
|
+
gold_api_client: GoldApiClient = context.resources.gold_api_client
|
|
1295
|
+
|
|
1296
|
+
database_updater = DatabaseUpdater(
|
|
1297
|
+
runtime_api_user_client,
|
|
1298
|
+
runtime_api_site_client,
|
|
1299
|
+
gold_api_client,
|
|
1300
|
+
nmdc_study_id,
|
|
1301
|
+
gold_nmdc_instrument_map_df,
|
|
1302
|
+
)
|
|
1303
|
+
database = (
|
|
1304
|
+
database_updater.generate_data_generation_set_records_from_gold_api_for_study()
|
|
1305
|
+
)
|
|
1306
|
+
|
|
1307
|
+
return database
|
|
1308
|
+
|
|
1309
|
+
|
|
1310
|
+
@op(
|
|
1311
|
+
required_resource_keys={
|
|
1312
|
+
"runtime_api_user_client",
|
|
1313
|
+
"runtime_api_site_client",
|
|
1314
|
+
"gold_api_client",
|
|
1315
|
+
}
|
|
1316
|
+
)
|
|
1317
|
+
def generate_biosample_set_for_nmdc_study_from_gold(
|
|
1318
|
+
context: OpExecutionContext,
|
|
1319
|
+
nmdc_study_id: str,
|
|
1320
|
+
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1321
|
+
) -> nmdc.Database:
|
|
1322
|
+
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1323
|
+
context.resources.runtime_api_user_client
|
|
1324
|
+
)
|
|
1325
|
+
runtime_api_site_client: RuntimeApiSiteClient = (
|
|
1326
|
+
context.resources.runtime_api_site_client
|
|
1327
|
+
)
|
|
1328
|
+
gold_api_client: GoldApiClient = context.resources.gold_api_client
|
|
1329
|
+
|
|
1330
|
+
database_updater = DatabaseUpdater(
|
|
1331
|
+
runtime_api_user_client,
|
|
1332
|
+
runtime_api_site_client,
|
|
1333
|
+
gold_api_client,
|
|
1334
|
+
nmdc_study_id,
|
|
1335
|
+
gold_nmdc_instrument_map_df,
|
|
1336
|
+
)
|
|
1337
|
+
database = database_updater.generate_biosample_set_from_gold_api_for_study()
|
|
1338
|
+
|
|
1339
|
+
return database
|
|
File without changes
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
from functools import lru_cache
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from nmdc_runtime.site.resources import (
|
|
5
|
+
RuntimeApiUserClient,
|
|
6
|
+
RuntimeApiSiteClient,
|
|
7
|
+
GoldApiClient,
|
|
8
|
+
)
|
|
9
|
+
from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
|
|
10
|
+
from nmdc_schema import nmdc
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DatabaseUpdater:
|
|
14
|
+
def __init__(
|
|
15
|
+
self,
|
|
16
|
+
runtime_api_user_client: RuntimeApiUserClient,
|
|
17
|
+
runtime_api_site_client: RuntimeApiSiteClient,
|
|
18
|
+
gold_api_client: GoldApiClient,
|
|
19
|
+
study_id: str,
|
|
20
|
+
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
21
|
+
):
|
|
22
|
+
"""This class serves as an API for repairing connections in the database by
|
|
23
|
+
adding records that are essentially missing "links"/"connections". As we identify
|
|
24
|
+
common use cases for adding missing records to the database, we can
|
|
25
|
+
add helper methods to this class.
|
|
26
|
+
|
|
27
|
+
:param runtime_api_user_client: An object of RuntimeApiUserClient which can be
|
|
28
|
+
used to retrieve instance records from the NMDC database.
|
|
29
|
+
:param runtime_api_site_client: An object of RuntimeApiSiteClient which can be
|
|
30
|
+
used to mint new IDs for the repaired records that need to be added into the NMDC database.
|
|
31
|
+
:param gold_api_client: An object of GoldApiClient which can be used to retrieve
|
|
32
|
+
records from GOLD via the GOLD API.
|
|
33
|
+
:param study_id: NMDC study ID for which the missing records need to be added.
|
|
34
|
+
:param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping file in the
|
|
35
|
+
NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC instrument_set records.
|
|
36
|
+
"""
|
|
37
|
+
self.runtime_api_user_client = runtime_api_user_client
|
|
38
|
+
self.runtime_api_site_client = runtime_api_site_client
|
|
39
|
+
self.gold_api_client = gold_api_client
|
|
40
|
+
self.study_id = study_id
|
|
41
|
+
self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
|
|
42
|
+
|
|
43
|
+
@lru_cache
|
|
44
|
+
def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
|
|
45
|
+
"""Fetch response from GOLD /biosamples API for a given biosample id.
|
|
46
|
+
|
|
47
|
+
:param gold_biosample_id: GOLD biosample ID.
|
|
48
|
+
:return: List of dictionaries containing the response from the GOLD /biosamples API.
|
|
49
|
+
"""
|
|
50
|
+
return self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id)
|
|
51
|
+
|
|
52
|
+
@lru_cache
|
|
53
|
+
def _fetch_gold_projects(self, gold_biosample_id: str):
|
|
54
|
+
"""Fetch response from GOLD /projects API for a given biosample id.
|
|
55
|
+
|
|
56
|
+
:param gold_biosample_id: GOLD biosample ID
|
|
57
|
+
:return: List of dictionaries containing the response from the GOLD /projects API.
|
|
58
|
+
"""
|
|
59
|
+
return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id)
|
|
60
|
+
|
|
61
|
+
def generate_data_generation_set_records_from_gold_api_for_study(
|
|
62
|
+
self,
|
|
63
|
+
) -> nmdc.Database:
|
|
64
|
+
"""This method creates missing data generation records for a given study in the NMDC database using
|
|
65
|
+
metadata from GOLD. It first fetches all the biosamples associated
|
|
66
|
+
with the study from the NMDC database. Then, it fetches all the biosample and project data
|
|
67
|
+
associated with the individual biosamples from the GOLD API using the NMDC-GOLD biosample id
|
|
68
|
+
mappings on the "gold_biosample_identifiers" key/slot. We use the GoldStudyTranslator class
|
|
69
|
+
to mint the required number of `nmdc:DataGeneration` (`nmdc:NucleotideSequencing`) records based
|
|
70
|
+
on the number of GOLD sequencing projects, and then reimplement only the part of logic from that
|
|
71
|
+
class which is responsible for making data_generation_set records.
|
|
72
|
+
|
|
73
|
+
:return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
|
|
74
|
+
"""
|
|
75
|
+
database = nmdc.Database()
|
|
76
|
+
|
|
77
|
+
biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
|
|
78
|
+
self.study_id
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
all_gold_biosamples = []
|
|
82
|
+
all_gold_projects = []
|
|
83
|
+
for biosample in biosample_set:
|
|
84
|
+
gold_biosample_identifiers = biosample.get("gold_biosample_identifiers")
|
|
85
|
+
if gold_biosample_identifiers:
|
|
86
|
+
for gold_biosample_id in gold_biosample_identifiers:
|
|
87
|
+
gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0]
|
|
88
|
+
gold_projects = self._fetch_gold_projects(gold_biosample_id)
|
|
89
|
+
gold_biosample["projects"] = gold_projects
|
|
90
|
+
|
|
91
|
+
all_gold_biosamples.append(gold_biosample)
|
|
92
|
+
all_gold_projects.extend(gold_projects)
|
|
93
|
+
|
|
94
|
+
gold_study_translator = GoldStudyTranslator(
|
|
95
|
+
biosamples=all_gold_biosamples,
|
|
96
|
+
projects=all_gold_projects,
|
|
97
|
+
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
# The GoldStudyTranslator class has some pre-processing logic which filters out
|
|
101
|
+
# invalid biosamples and projects (based on `sequencingStrategy`, `projectStatus`, etc.)
|
|
102
|
+
filtered_biosamples = gold_study_translator.biosamples
|
|
103
|
+
filtered_projects = gold_study_translator.projects
|
|
104
|
+
|
|
105
|
+
gold_project_ids = [project["projectGoldId"] for project in filtered_projects]
|
|
106
|
+
nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id(
|
|
107
|
+
"nmdc:NucleotideSequencing", len(gold_project_ids)
|
|
108
|
+
).json()
|
|
109
|
+
gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
|
|
110
|
+
zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
gold_to_nmdc_biosample_ids = {}
|
|
114
|
+
|
|
115
|
+
for biosample in biosample_set:
|
|
116
|
+
gold_ids = biosample.get("gold_biosample_identifiers", [])
|
|
117
|
+
for gold_id in gold_ids:
|
|
118
|
+
gold_id_stripped = gold_id.replace("gold:", "")
|
|
119
|
+
gold_to_nmdc_biosample_ids[gold_id_stripped] = biosample["id"]
|
|
120
|
+
|
|
121
|
+
database.data_generation_set = []
|
|
122
|
+
# Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing records
|
|
123
|
+
# created is based on the number of GOLD sequencing projects
|
|
124
|
+
for project in filtered_projects:
|
|
125
|
+
# map the projectGoldId to the NMDC biosample ID
|
|
126
|
+
biosample_gold_id = next(
|
|
127
|
+
(
|
|
128
|
+
biosample["biosampleGoldId"]
|
|
129
|
+
for biosample in filtered_biosamples
|
|
130
|
+
if any(
|
|
131
|
+
p["projectGoldId"] == project["projectGoldId"]
|
|
132
|
+
for p in biosample.get("projects", [])
|
|
133
|
+
)
|
|
134
|
+
),
|
|
135
|
+
None,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
if biosample_gold_id:
|
|
139
|
+
nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id)
|
|
140
|
+
if nmdc_biosample_id:
|
|
141
|
+
database.data_generation_set.append(
|
|
142
|
+
gold_study_translator._translate_nucleotide_sequencing(
|
|
143
|
+
project,
|
|
144
|
+
nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
|
|
145
|
+
project["projectGoldId"]
|
|
146
|
+
],
|
|
147
|
+
nmdc_biosample_id=nmdc_biosample_id,
|
|
148
|
+
nmdc_study_id=self.study_id,
|
|
149
|
+
)
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
return database
|
|
153
|
+
|
|
154
|
+
def generate_biosample_set_from_gold_api_for_study(self) -> nmdc.Database:
|
|
155
|
+
"""This method creates biosample_set records for a given study in the NMDC database using
|
|
156
|
+
metadata from GOLD. The logic works by first fetching the biosampleGoldId values of all
|
|
157
|
+
biosamples associated with the study. Then, it fetches the list of all biosamples associated
|
|
158
|
+
with the GOLD study using the GOLD API. There's pre-processing logic in the GoldStudyTranslator
|
|
159
|
+
to filter out biosamples based on `sequencingStrategy` and `projectStatus`. On this list of
|
|
160
|
+
filtered biosamples, we compute a "set difference" (conceptually) between the list of
|
|
161
|
+
filtered samples and ones that are already in the NMDC database, i.e., we ignore biosamples
|
|
162
|
+
that are already present in the database, and continue on to create biosample_set records for
|
|
163
|
+
those that do not have records in the database already.
|
|
164
|
+
|
|
165
|
+
:return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
|
|
166
|
+
"""
|
|
167
|
+
database = nmdc.Database()
|
|
168
|
+
|
|
169
|
+
# get a list of all biosamples associated with a given NMDC study id
|
|
170
|
+
biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
|
|
171
|
+
self.study_id
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
# get a list of GOLD biosample ids (`biosampleGoldId` values) by iterating
|
|
175
|
+
# over all the biosample_set records retrieved using the above logic
|
|
176
|
+
nmdc_gold_ids = set()
|
|
177
|
+
for biosample in biosample_set:
|
|
178
|
+
gold_ids = biosample.get("gold_biosample_identifiers", [])
|
|
179
|
+
for gold_id in gold_ids:
|
|
180
|
+
nmdc_gold_ids.add(gold_id.replace("gold:", ""))
|
|
181
|
+
|
|
182
|
+
# retrieve GOLD study id by looking at the `gold_study_identifiers` key/slot
|
|
183
|
+
# on the NMDC study record
|
|
184
|
+
nmdc_study = self.runtime_api_user_client.get_study(self.study_id)[0]
|
|
185
|
+
gold_study_id = nmdc_study.get("gold_study_identifiers", [])[0].replace(
|
|
186
|
+
"gold:", ""
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
# use the GOLD study id to fetch all biosample records associated with the study
|
|
190
|
+
gold_biosamples_for_study = self.gold_api_client.fetch_biosamples_by_study(
|
|
191
|
+
gold_study_id
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
# part of the code where we are (conceptually) computing a set difference between
|
|
195
|
+
# the list of filtered samples and ones that are already in the NMDC database
|
|
196
|
+
missing_gold_biosamples = [
|
|
197
|
+
gbs
|
|
198
|
+
for gbs in gold_biosamples_for_study
|
|
199
|
+
if gbs.get("biosampleGoldId") not in nmdc_gold_ids
|
|
200
|
+
]
|
|
201
|
+
|
|
202
|
+
gold_study_translator = GoldStudyTranslator(
|
|
203
|
+
biosamples=missing_gold_biosamples,
|
|
204
|
+
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
translated_biosamples = gold_study_translator.biosamples
|
|
208
|
+
|
|
209
|
+
# mint new NMDC biosample IDs for the "missing" biosamples
|
|
210
|
+
gold_biosample_ids = [
|
|
211
|
+
biosample["biosampleGoldId"] for biosample in translated_biosamples
|
|
212
|
+
]
|
|
213
|
+
nmdc_biosample_ids = self.runtime_api_site_client.mint_id(
|
|
214
|
+
"nmdc:Biosample", len(translated_biosamples)
|
|
215
|
+
).json()
|
|
216
|
+
gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
|
|
217
|
+
|
|
218
|
+
database.biosample_set = [
|
|
219
|
+
gold_study_translator._translate_biosample(
|
|
220
|
+
biosample,
|
|
221
|
+
nmdc_biosample_id=gold_to_nmdc_biosample_ids[
|
|
222
|
+
biosample["biosampleGoldId"]
|
|
223
|
+
],
|
|
224
|
+
nmdc_study_id=self.study_id,
|
|
225
|
+
nmdc_field_site_id=None,
|
|
226
|
+
)
|
|
227
|
+
for biosample in translated_biosamples
|
|
228
|
+
]
|
|
229
|
+
|
|
230
|
+
return database
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -25,6 +25,7 @@ from nmdc_runtime.api.models.run import _add_run_fail_event
|
|
|
25
25
|
from nmdc_runtime.api.models.trigger import Trigger
|
|
26
26
|
from nmdc_runtime.site.export.study_metadata import export_study_biosamples_metadata
|
|
27
27
|
from nmdc_runtime.site.graphs import (
|
|
28
|
+
generate_biosample_set_from_samples_in_gold,
|
|
28
29
|
translate_metadata_submission_to_nmdc_schema_database,
|
|
29
30
|
ingest_metadata_submission,
|
|
30
31
|
gold_study_to_database,
|
|
@@ -44,6 +45,7 @@ from nmdc_runtime.site.graphs import (
|
|
|
44
45
|
ingest_neon_surface_water_metadata,
|
|
45
46
|
ensure_alldocs,
|
|
46
47
|
nmdc_study_to_ncbi_submission_export,
|
|
48
|
+
generate_data_generation_set_for_biosamples_in_nmdc_study,
|
|
47
49
|
)
|
|
48
50
|
from nmdc_runtime.site.resources import (
|
|
49
51
|
get_mongo,
|
|
@@ -113,6 +115,13 @@ housekeeping_weekly = ScheduleDefinition(
|
|
|
113
115
|
job=housekeeping.to_job(**preset_normal),
|
|
114
116
|
)
|
|
115
117
|
|
|
118
|
+
ensure_alldocs_daily = ScheduleDefinition(
|
|
119
|
+
name="daily_ensure_alldocs",
|
|
120
|
+
cron_schedule="0 3 * * *",
|
|
121
|
+
execution_timezone="America/New_York",
|
|
122
|
+
job=ensure_alldocs.to_job(**preset_normal),
|
|
123
|
+
)
|
|
124
|
+
|
|
116
125
|
|
|
117
126
|
def asset_materialization_metadata(asset_event, key):
|
|
118
127
|
"""Get metadata from an asset materialization event.
|
|
@@ -453,7 +462,7 @@ def repo():
|
|
|
453
462
|
export_study_biosamples_metadata.to_job(**preset_normal),
|
|
454
463
|
ensure_alldocs.to_job(**preset_normal),
|
|
455
464
|
]
|
|
456
|
-
schedules = [housekeeping_weekly]
|
|
465
|
+
schedules = [housekeeping_weekly, ensure_alldocs_daily]
|
|
457
466
|
sensors = [
|
|
458
467
|
done_object_put_ops,
|
|
459
468
|
ensure_gold_translation_job,
|
|
@@ -915,6 +924,97 @@ def biosample_export():
|
|
|
915
924
|
]
|
|
916
925
|
|
|
917
926
|
|
|
927
|
+
@repository
|
|
928
|
+
def database_records_stitching():
|
|
929
|
+
normal_resources = run_config_frozen__normal_env["resources"]
|
|
930
|
+
return [
|
|
931
|
+
generate_data_generation_set_for_biosamples_in_nmdc_study.to_job(
|
|
932
|
+
description="This job can be used to create a data_generation_set JSON for biosamples that are already present in the NMDC database.",
|
|
933
|
+
resource_defs=resource_defs,
|
|
934
|
+
config={
|
|
935
|
+
"resources": merge(
|
|
936
|
+
unfreeze(normal_resources),
|
|
937
|
+
{
|
|
938
|
+
"runtime_api_user_client": {
|
|
939
|
+
"config": {
|
|
940
|
+
"base_url": {"env": "API_HOST"},
|
|
941
|
+
"username": {"env": "API_ADMIN_USER"},
|
|
942
|
+
"password": {"env": "API_ADMIN_PASS"},
|
|
943
|
+
},
|
|
944
|
+
},
|
|
945
|
+
"runtime_api_site_client": {
|
|
946
|
+
"config": {
|
|
947
|
+
"base_url": {"env": "API_HOST"},
|
|
948
|
+
"client_id": {"env": "API_SITE_CLIENT_ID"},
|
|
949
|
+
"client_secret": {"env": "API_SITE_CLIENT_SECRET"},
|
|
950
|
+
"site_id": {"env": "API_SITE_ID"},
|
|
951
|
+
},
|
|
952
|
+
},
|
|
953
|
+
"gold_api_client": {
|
|
954
|
+
"config": {
|
|
955
|
+
"base_url": {"env": "GOLD_API_BASE_URL"},
|
|
956
|
+
"username": {"env": "GOLD_API_USERNAME"},
|
|
957
|
+
"password": {"env": "GOLD_API_PASSWORD"},
|
|
958
|
+
},
|
|
959
|
+
},
|
|
960
|
+
},
|
|
961
|
+
),
|
|
962
|
+
"ops": {
|
|
963
|
+
"get_database_updater_inputs": {
|
|
964
|
+
"config": {
|
|
965
|
+
"nmdc_study_id": "",
|
|
966
|
+
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
967
|
+
}
|
|
968
|
+
},
|
|
969
|
+
"export_json_to_drs": {"config": {"username": ""}},
|
|
970
|
+
},
|
|
971
|
+
},
|
|
972
|
+
),
|
|
973
|
+
generate_biosample_set_from_samples_in_gold.to_job(
|
|
974
|
+
description="This job can be used to create a biosample_set JSON from samples in GOLD for a given study in NMDC.",
|
|
975
|
+
resource_defs=resource_defs,
|
|
976
|
+
config={
|
|
977
|
+
"resources": merge(
|
|
978
|
+
unfreeze(normal_resources),
|
|
979
|
+
{
|
|
980
|
+
"runtime_api_user_client": {
|
|
981
|
+
"config": {
|
|
982
|
+
"base_url": {"env": "API_HOST"},
|
|
983
|
+
"username": {"env": "API_ADMIN_USER"},
|
|
984
|
+
"password": {"env": "API_ADMIN_PASS"},
|
|
985
|
+
},
|
|
986
|
+
},
|
|
987
|
+
"runtime_api_site_client": {
|
|
988
|
+
"config": {
|
|
989
|
+
"base_url": {"env": "API_HOST"},
|
|
990
|
+
"client_id": {"env": "API_SITE_CLIENT_ID"},
|
|
991
|
+
"client_secret": {"env": "API_SITE_CLIENT_SECRET"},
|
|
992
|
+
"site_id": {"env": "API_SITE_ID"},
|
|
993
|
+
},
|
|
994
|
+
},
|
|
995
|
+
"gold_api_client": {
|
|
996
|
+
"config": {
|
|
997
|
+
"base_url": {"env": "GOLD_API_BASE_URL"},
|
|
998
|
+
"username": {"env": "GOLD_API_USERNAME"},
|
|
999
|
+
"password": {"env": "GOLD_API_PASSWORD"},
|
|
1000
|
+
},
|
|
1001
|
+
},
|
|
1002
|
+
},
|
|
1003
|
+
),
|
|
1004
|
+
"ops": {
|
|
1005
|
+
"get_database_updater_inputs": {
|
|
1006
|
+
"config": {
|
|
1007
|
+
"nmdc_study_id": "",
|
|
1008
|
+
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1009
|
+
}
|
|
1010
|
+
},
|
|
1011
|
+
"export_json_to_drs": {"config": {"username": ""}},
|
|
1012
|
+
},
|
|
1013
|
+
},
|
|
1014
|
+
),
|
|
1015
|
+
]
|
|
1016
|
+
|
|
1017
|
+
|
|
918
1018
|
# @repository
|
|
919
1019
|
# def validation():
|
|
920
1020
|
# graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
|
nmdc_runtime/site/resources.py
CHANGED
|
@@ -129,16 +129,23 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
129
129
|
return response.json()["cursor"]["firstBatch"]
|
|
130
130
|
|
|
131
131
|
def get_biosamples_for_study(self, study_id: str):
|
|
132
|
+
# TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
|
|
133
|
+
# The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
|
|
134
|
+
# but the tradeoff there is that we would need to make multiple requests to step through the
|
|
135
|
+
# each of the pages. By picking a large number for max_page_size, we can get all the results
|
|
136
|
+
# in a single request.
|
|
137
|
+
# This method previously used the /queries:run endpoint but the problem with that was that
|
|
138
|
+
# it used to truncate the number of results returned to 100.
|
|
132
139
|
response = self.request(
|
|
133
|
-
"
|
|
134
|
-
f"/
|
|
140
|
+
"GET",
|
|
141
|
+
f"/nmdcschema/biosample_set",
|
|
135
142
|
{
|
|
136
|
-
"
|
|
137
|
-
"
|
|
143
|
+
"filter": json.dumps({"associated_studies": study_id}),
|
|
144
|
+
"max_page_size": 10000,
|
|
138
145
|
},
|
|
139
146
|
)
|
|
140
147
|
response.raise_for_status()
|
|
141
|
-
return response.json()["
|
|
148
|
+
return response.json()["resources"]
|
|
142
149
|
|
|
143
150
|
def get_omics_processing_by_name(self, name: str):
|
|
144
151
|
response = self.request(
|
|
@@ -152,6 +159,18 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
152
159
|
response.raise_for_status()
|
|
153
160
|
return response.json()["cursor"]["firstBatch"]
|
|
154
161
|
|
|
162
|
+
def get_study(self, study_id: str):
|
|
163
|
+
response = self.request(
|
|
164
|
+
"POST",
|
|
165
|
+
f"/queries:run",
|
|
166
|
+
{
|
|
167
|
+
"find": "study_set",
|
|
168
|
+
"filter": {"id": study_id},
|
|
169
|
+
},
|
|
170
|
+
)
|
|
171
|
+
response.raise_for_status()
|
|
172
|
+
return response.json()["cursor"]["firstBatch"]
|
|
173
|
+
|
|
155
174
|
|
|
156
175
|
class RuntimeApiSiteClient(RuntimeApiClient):
|
|
157
176
|
def __init__(
|
|
@@ -370,6 +389,18 @@ class GoldApiClient(BasicAuthClient):
|
|
|
370
389
|
return None
|
|
371
390
|
return results[0]
|
|
372
391
|
|
|
392
|
+
def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]:
|
|
393
|
+
id = self._normalize_id(biosample_id)
|
|
394
|
+
results = self.request("/projects", params={"biosampleGoldId": id})
|
|
395
|
+
return results
|
|
396
|
+
|
|
397
|
+
def fetch_biosample_by_biosample_id(
|
|
398
|
+
self, biosample_id: str
|
|
399
|
+
) -> List[Dict[str, Any]]:
|
|
400
|
+
id = self._normalize_id(biosample_id)
|
|
401
|
+
results = self.request("/biosamples", params={"biosampleGoldId": id})
|
|
402
|
+
return results
|
|
403
|
+
|
|
373
404
|
|
|
374
405
|
@resource(
|
|
375
406
|
config_schema={
|
|
@@ -12,6 +12,29 @@ from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
|
|
|
12
12
|
SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def _is_valid_project(project: dict) -> bool:
|
|
16
|
+
"""A project is considered valid if:
|
|
17
|
+
1. `sequencingStrategy` is in {"Metagenome", "Metatranscriptome"}
|
|
18
|
+
2. if `sequencingCenters` == 'DOE Joint Genome Institute (JGI)' then
|
|
19
|
+
`projectStatus` must be in ("Permanent Draft", "Complete and Published")
|
|
20
|
+
3. otherwise, no `projectStatus` filter is applied
|
|
21
|
+
|
|
22
|
+
:param project: GOLD project object (structurally similar to response
|
|
23
|
+
from `/projects` endpoint)
|
|
24
|
+
:return: True if the project is valid, False otherwise
|
|
25
|
+
"""
|
|
26
|
+
if project.get("sequencingStrategy") not in SEQUENCING_STRATEGIES:
|
|
27
|
+
return False
|
|
28
|
+
|
|
29
|
+
if project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)":
|
|
30
|
+
return project.get("projectStatus") in (
|
|
31
|
+
"Permanent Draft",
|
|
32
|
+
"Complete and Published",
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
return True
|
|
36
|
+
|
|
37
|
+
|
|
15
38
|
class GoldStudyTranslator(Translator):
|
|
16
39
|
def __init__(
|
|
17
40
|
self,
|
|
@@ -36,16 +59,15 @@ class GoldStudyTranslator(Translator):
|
|
|
36
59
|
biosample
|
|
37
60
|
for biosample in biosamples
|
|
38
61
|
if any(
|
|
39
|
-
project.get("
|
|
40
|
-
for project in biosample.get("projects", [])
|
|
62
|
+
_is_valid_project(project) for project in biosample.get("projects", [])
|
|
41
63
|
)
|
|
42
64
|
]
|
|
43
65
|
# Fetch the valid projectGoldIds that are associated with filtered
|
|
44
66
|
# biosamples on their `projects` field
|
|
45
67
|
valid_project_ids = {
|
|
46
68
|
project.get("projectGoldId")
|
|
47
|
-
for
|
|
48
|
-
|
|
69
|
+
for project in projects
|
|
70
|
+
if _is_valid_project(project)
|
|
49
71
|
}
|
|
50
72
|
# Filter projects to only those with `projectGoldId` in valid_project_ids
|
|
51
73
|
self.projects = [
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from functools import lru_cache
|
|
3
|
-
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
4
2
|
|
|
3
|
+
from dagster import op
|
|
4
|
+
from functools import lru_cache
|
|
5
5
|
from pymongo.database import Database as MongoDatabase
|
|
6
|
+
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
7
|
|
|
7
8
|
from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
|
|
8
9
|
from nmdc_runtime.site.resources import mongo_resource
|
|
@@ -47,3 +48,7 @@ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
|
|
|
47
48
|
|
|
48
49
|
def get_basename(filename: str) -> str:
|
|
49
50
|
return os.path.basename(filename)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
|
|
54
|
+
return nmdc_study_id.replace(":", "_").replace("-", "_")
|
nmdc_runtime/util.py
CHANGED
|
@@ -24,6 +24,10 @@ from nmdc_schema.get_nmdc_view import ViewGetter
|
|
|
24
24
|
from pydantic import Field, BaseModel
|
|
25
25
|
from pymongo.database import Database as MongoDatabase
|
|
26
26
|
from pymongo.errors import OperationFailure
|
|
27
|
+
from refscan.lib.helpers import identify_references
|
|
28
|
+
from refscan.lib.Finder import Finder
|
|
29
|
+
from refscan.lib.ReferenceList import ReferenceList
|
|
30
|
+
from refscan.scanner import scan_outgoing_references
|
|
27
31
|
from toolz import merge, unique
|
|
28
32
|
|
|
29
33
|
from nmdc_runtime.api.core.util import sha256hash_from_file
|
|
@@ -120,6 +124,23 @@ def get_class_names_from_collection_spec(
|
|
|
120
124
|
return class_names
|
|
121
125
|
|
|
122
126
|
|
|
127
|
+
@lru_cache
|
|
128
|
+
def get_allowed_references() -> ReferenceList:
|
|
129
|
+
r"""
|
|
130
|
+
Returns a `ReferenceList` of all the inter-document references that
|
|
131
|
+
the NMDC Schema allows a schema-compliant MongoDB database to contain.
|
|
132
|
+
"""
|
|
133
|
+
|
|
134
|
+
# Identify the inter-document references that the schema allows a database to contain.
|
|
135
|
+
print("Identifying schema-allowed references.")
|
|
136
|
+
references = identify_references(
|
|
137
|
+
schema_view=nmdc_schema_view(),
|
|
138
|
+
collection_name_to_class_names=collection_name_to_class_names,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
return references
|
|
142
|
+
|
|
143
|
+
|
|
123
144
|
@lru_cache
|
|
124
145
|
def get_type_collections() -> dict:
|
|
125
146
|
"""Returns a dictionary mapping class names to Mongo collection names."""
|
|
@@ -353,6 +374,14 @@ def nmdc_database_collection_instance_class_names():
|
|
|
353
374
|
|
|
354
375
|
@lru_cache
|
|
355
376
|
def nmdc_database_collection_names():
|
|
377
|
+
r"""
|
|
378
|
+
TODO: Document this function.
|
|
379
|
+
|
|
380
|
+
TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
|
|
381
|
+
collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py`
|
|
382
|
+
instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
|
|
383
|
+
maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
|
|
384
|
+
"""
|
|
356
385
|
names = []
|
|
357
386
|
view = nmdc_schema_view()
|
|
358
387
|
all_classes = set(view.all_classes())
|
|
@@ -513,6 +542,13 @@ class OverlayDB(AbstractContextManager):
|
|
|
513
542
|
overlay collection, that id is marked as "seen" and will not also be returned when
|
|
514
543
|
subsequently scanning the (unmodified) base-database collection.
|
|
515
544
|
|
|
545
|
+
Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
|
|
546
|
+
database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
|
|
547
|
+
`overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
|
|
548
|
+
the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
|
|
549
|
+
"merging" just-in-time to process the method invocation. You can see an example of this in the implementation
|
|
550
|
+
of the `merge_find` method, which internally accesses both the real database and the overlaying database.
|
|
551
|
+
|
|
516
552
|
Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
|
|
517
553
|
documents from a base collection to the overlay, and then applying the updates to the overlay,
|
|
518
554
|
so that again, base collections are unmodified, and a "merge_find" call will produce a result
|
|
@@ -591,7 +627,33 @@ class OverlayDB(AbstractContextManager):
|
|
|
591
627
|
yield doc
|
|
592
628
|
|
|
593
629
|
|
|
594
|
-
def validate_json(
|
|
630
|
+
def validate_json(
|
|
631
|
+
in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
|
|
632
|
+
):
|
|
633
|
+
r"""
|
|
634
|
+
Checks whether the specified dictionary represents a valid instance of the `Database` class
|
|
635
|
+
defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
|
|
636
|
+
|
|
637
|
+
Example dictionary:
|
|
638
|
+
{
|
|
639
|
+
"biosample_set": [
|
|
640
|
+
{"id": "nmdc:bsm-00-000001", ...},
|
|
641
|
+
{"id": "nmdc:bsm-00-000002", ...}
|
|
642
|
+
],
|
|
643
|
+
"study_set": [
|
|
644
|
+
{"id": "nmdc:sty-00-000001", ...},
|
|
645
|
+
{"id": "nmdc:sty-00-000002", ...}
|
|
646
|
+
]
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
:param in_docs: The dictionary you want to validate
|
|
650
|
+
:param mdb: A reference to a MongoDB database
|
|
651
|
+
:param check_inter_document_references: Whether you want this function to check whether every document that
|
|
652
|
+
is referenced by any of the documents passed in would, indeed, exist
|
|
653
|
+
in the database, if the documents passed in were to be inserted into
|
|
654
|
+
the database. In other words, set this to `True` if you want this
|
|
655
|
+
function to perform referential integrity checks.
|
|
656
|
+
"""
|
|
595
657
|
validator = Draft7Validator(get_nmdc_jsonschema_dict())
|
|
596
658
|
docs = deepcopy(in_docs)
|
|
597
659
|
validation_errors = {}
|
|
@@ -599,6 +661,8 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
|
|
|
599
661
|
known_coll_names = set(nmdc_database_collection_names())
|
|
600
662
|
for coll_name, coll_docs in docs.items():
|
|
601
663
|
if coll_name not in known_coll_names:
|
|
664
|
+
# FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name).
|
|
665
|
+
# See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
|
|
602
666
|
if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
|
|
603
667
|
continue
|
|
604
668
|
else:
|
|
@@ -631,6 +695,84 @@ def validate_json(in_docs: dict, mdb: MongoDatabase):
|
|
|
631
695
|
except Exception as e:
|
|
632
696
|
return {"result": "errors", "detail": str(e)}
|
|
633
697
|
|
|
698
|
+
# Third pass (if enabled): Check inter-document references.
|
|
699
|
+
if check_inter_document_references is True:
|
|
700
|
+
# Prepare to use `refscan`.
|
|
701
|
+
#
|
|
702
|
+
# Note: We check the inter-document references in two stages, which are:
|
|
703
|
+
# 1. For each document in the JSON payload, check whether each document it references already exists
|
|
704
|
+
# (in the collections the schema says it can exist in) in the database. We use the
|
|
705
|
+
# `refscan` package to do this, which returns violation details we'll use in the second stage.
|
|
706
|
+
# 2. For each violation found in the first stage (i.e. each reference to a not-found document), we
|
|
707
|
+
# check whether that document exists (in the collections the schema says it can exist in) in the
|
|
708
|
+
# JSON payload. If it does, then we "waive" (i.e. discard) that violation.
|
|
709
|
+
# The violations that remain after those two stages are the ones we return to the caller.
|
|
710
|
+
#
|
|
711
|
+
# Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
|
|
712
|
+
# does not provide a means to perform arbitrary queries against its virtual "merged" database. It
|
|
713
|
+
# is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
|
|
714
|
+
# `refscan`'s `Finder` class accepts.
|
|
715
|
+
#
|
|
716
|
+
finder = Finder(database=mdb)
|
|
717
|
+
references = get_allowed_references()
|
|
718
|
+
reference_field_names_by_source_class_name = (
|
|
719
|
+
references.get_reference_field_names_by_source_class_name()
|
|
720
|
+
)
|
|
721
|
+
|
|
722
|
+
# Iterate over the collections in the JSON payload.
|
|
723
|
+
for source_collection_name, documents in in_docs.items():
|
|
724
|
+
for document in documents:
|
|
725
|
+
# Add an `_id` field to the document, since `refscan` requires the document to have one.
|
|
726
|
+
source_document = dict(document, _id=None)
|
|
727
|
+
violations = scan_outgoing_references(
|
|
728
|
+
document=source_document,
|
|
729
|
+
schema_view=nmdc_schema_view(),
|
|
730
|
+
reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
|
|
731
|
+
references=references,
|
|
732
|
+
finder=finder,
|
|
733
|
+
collection_names=nmdc_database_collection_names(),
|
|
734
|
+
source_collection_name=source_collection_name,
|
|
735
|
+
user_wants_to_locate_misplaced_documents=False,
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
# For each violation, check whether the misplaced document is in the JSON payload, itself.
|
|
739
|
+
for violation in violations:
|
|
740
|
+
can_waive_violation = False
|
|
741
|
+
# Determine which collections can contain the referenced document, based upon
|
|
742
|
+
# the schema class of which this source document is an instance.
|
|
743
|
+
target_collection_names = (
|
|
744
|
+
references.get_target_collection_names(
|
|
745
|
+
source_class_name=violation.source_class_name,
|
|
746
|
+
source_field_name=violation.source_field_name,
|
|
747
|
+
)
|
|
748
|
+
)
|
|
749
|
+
# Check whether the referenced document exists in any of those collections in the JSON payload.
|
|
750
|
+
for json_coll_name, json_coll_docs in in_docs.items():
|
|
751
|
+
if json_coll_name in target_collection_names:
|
|
752
|
+
for json_coll_doc in json_coll_docs:
|
|
753
|
+
if json_coll_doc["id"] == violation.target_id:
|
|
754
|
+
can_waive_violation = True
|
|
755
|
+
break # stop checking
|
|
756
|
+
if can_waive_violation:
|
|
757
|
+
break # stop checking
|
|
758
|
+
if not can_waive_violation:
|
|
759
|
+
violation_as_str = (
|
|
760
|
+
f"Document '{violation.source_document_id}' "
|
|
761
|
+
f"in collection '{violation.source_collection_name}' "
|
|
762
|
+
f"has a field '{violation.source_field_name}' that "
|
|
763
|
+
f"references a document having id "
|
|
764
|
+
f"'{violation.target_id}', but the latter document "
|
|
765
|
+
f"does not exist in any of the collections the "
|
|
766
|
+
f"NMDC Schema says it can exist in."
|
|
767
|
+
)
|
|
768
|
+
validation_errors[source_collection_name].append(
|
|
769
|
+
violation_as_str
|
|
770
|
+
)
|
|
771
|
+
|
|
772
|
+
# If any collection's error list is not empty, return an error response.
|
|
773
|
+
if any(len(v) > 0 for v in validation_errors.values()):
|
|
774
|
+
return {"result": "errors", "detail": validation_errors}
|
|
775
|
+
|
|
634
776
|
return {"result": "All Okay!"}
|
|
635
777
|
else:
|
|
636
778
|
return {"result": "errors", "detail": validation_errors}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: nmdc_runtime
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Summary: A runtime system for NMDC data management and orchestration
|
|
5
5
|
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
6
|
Author: Donny Winston
|
|
@@ -11,6 +11,14 @@ Classifier: License :: OSI Approved :: Apache Software License
|
|
|
11
11
|
Requires-Python: >=3.10
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
|
+
Dynamic: author
|
|
15
|
+
Dynamic: author-email
|
|
16
|
+
Dynamic: classifier
|
|
17
|
+
Dynamic: description
|
|
18
|
+
Dynamic: description-content-type
|
|
19
|
+
Dynamic: home-page
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
14
22
|
|
|
15
23
|
A runtime system for NMDC data management and orchestration.
|
|
16
24
|
|
|
@@ -2,7 +2,7 @@ nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
2
2
|
nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
|
|
3
3
|
nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
|
|
4
4
|
nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
nmdc_runtime/util.py,sha256=
|
|
5
|
+
nmdc_runtime/util.py,sha256=HzQsNMYG6Pb-IuBEE9HBzX_lNkII7jiNe65UFk34ZYA,31414
|
|
6
6
|
nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
|
|
@@ -28,7 +28,7 @@ nmdc_runtime/lib/nmdc_etl_class.py,sha256=tVh3rKVMkBHQE65_LhKeIjCsaCZQk_HJzbc9K4
|
|
|
28
28
|
nmdc_runtime/lib/transform_nmdc_data.py,sha256=hij4lR3IMQRJQdL-rsP_I-m_WyFPsBMchV2MNFUkh0M,39906
|
|
29
29
|
nmdc_runtime/minter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
30
|
nmdc_runtime/minter/bootstrap.py,sha256=5Ej6pJVBRryRIi0ZwEloY78Zky7iE2okF6tPwRI2axM,822
|
|
31
|
-
nmdc_runtime/minter/config.py,sha256=
|
|
31
|
+
nmdc_runtime/minter/config.py,sha256=gsXZropDeeTO5tmLAtRuoocwqL3HgfgqVAENyCbX-Gc,2739
|
|
32
32
|
nmdc_runtime/minter/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
33
|
nmdc_runtime/minter/adapters/repository.py,sha256=I-jmGP38-9kPhkogrwUht_Ir0CfHA9_5ZImw5I_wbcw,8323
|
|
34
34
|
nmdc_runtime/minter/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
|
|
|
36
36
|
nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
37
|
nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
|
|
38
38
|
nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
-
nmdc_runtime/site/graphs.py,sha256=
|
|
40
|
-
nmdc_runtime/site/ops.py,sha256=
|
|
41
|
-
nmdc_runtime/site/repository.py,sha256=
|
|
42
|
-
nmdc_runtime/site/resources.py,sha256=
|
|
43
|
-
nmdc_runtime/site/util.py,sha256
|
|
39
|
+
nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
|
|
40
|
+
nmdc_runtime/site/ops.py,sha256=OrTQLSBToih-kI754RtVqjUMRojgYJQmb7B_VRjZWtg,46223
|
|
41
|
+
nmdc_runtime/site/repository.py,sha256=b3UVQznelU8wDOfuc9_vE_eqFGOoFRiHtQJJH7or73E,43875
|
|
42
|
+
nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
|
|
43
|
+
nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
|
|
44
44
|
nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
|
|
46
46
|
nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
|
|
@@ -56,10 +56,12 @@ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZ
|
|
|
56
56
|
nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
|
|
57
57
|
nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
|
|
59
|
+
nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
|
+
nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
|
|
59
61
|
nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
62
|
nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
|
|
61
63
|
nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
|
|
62
|
-
nmdc_runtime/site/translation/gold_translator.py,sha256=
|
|
64
|
+
nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
|
|
63
65
|
nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
|
|
64
66
|
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
|
|
65
67
|
nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
|
|
@@ -73,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
|
|
|
73
75
|
nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
|
|
74
76
|
nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
|
|
75
77
|
nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
|
|
76
|
-
nmdc_runtime-2.
|
|
77
|
-
nmdc_runtime-2.
|
|
78
|
-
nmdc_runtime-2.
|
|
79
|
-
nmdc_runtime-2.
|
|
80
|
-
nmdc_runtime-2.
|
|
81
|
-
nmdc_runtime-2.
|
|
78
|
+
nmdc_runtime-2.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
|
|
79
|
+
nmdc_runtime-2.3.0.dist-info/METADATA,sha256=BEManThNKOEkfS9woYYiwm1ya6BENBC6vXE6b7L_z2E,7430
|
|
80
|
+
nmdc_runtime-2.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
81
|
+
nmdc_runtime-2.3.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
|
|
82
|
+
nmdc_runtime-2.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
|
|
83
|
+
nmdc_runtime-2.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|