nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/api/__init__.py +0 -0
- nmdc_runtime/api/analytics.py +70 -0
- nmdc_runtime/api/boot/__init__.py +0 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/__init__.py +0 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +170 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/__init__.py +0 -0
- nmdc_runtime/api/db/mongo.py +447 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/__init__.py +0 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +794 -0
- nmdc_runtime/api/endpoints/ids.py +192 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +105 -0
- nmdc_runtime/api/endpoints/queries.py +679 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +229 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +774 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/main.py +401 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/__init__.py +0 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/__init__.py +0 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/minter.py +0 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +253 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +242 -0
- nmdc_runtime/config.py +55 -4
- nmdc_runtime/core/db/Database.py +1 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -9
- nmdc_runtime/lib/extract_nmdc_data.py +0 -8
- nmdc_runtime/lib/nmdc_dataframes.py +3 -7
- nmdc_runtime/lib/nmdc_etl_class.py +1 -7
- nmdc_runtime/minter/adapters/repository.py +1 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +35 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/export/ncbi_xml.py +1 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
- nmdc_runtime/site/graphs.py +33 -28
- nmdc_runtime/site/ops.py +97 -237
- nmdc_runtime/site/repair/database_updater.py +8 -0
- nmdc_runtime/site/repository.py +7 -117
- nmdc_runtime/site/resources.py +4 -4
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/util.py +9 -321
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
- nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/repository.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
4
3
|
|
|
5
4
|
from dagster import (
|
|
6
5
|
repository,
|
|
@@ -14,7 +13,6 @@ from dagster import (
|
|
|
14
13
|
DagsterRunStatus,
|
|
15
14
|
RunStatusSensorContext,
|
|
16
15
|
DefaultSensorStatus,
|
|
17
|
-
in_process_executor,
|
|
18
16
|
)
|
|
19
17
|
from starlette import status
|
|
20
18
|
from toolz import merge, get_in
|
|
@@ -30,8 +28,6 @@ from nmdc_runtime.site.graphs import (
|
|
|
30
28
|
translate_metadata_submission_to_nmdc_schema_database,
|
|
31
29
|
ingest_metadata_submission,
|
|
32
30
|
gold_study_to_database,
|
|
33
|
-
gold_translation,
|
|
34
|
-
gold_translation_curation,
|
|
35
31
|
create_objects_from_site_object_puts,
|
|
36
32
|
housekeeping,
|
|
37
33
|
ensure_jobs,
|
|
@@ -62,9 +58,6 @@ from nmdc_runtime.site.resources import (
|
|
|
62
58
|
from nmdc_runtime.site.resources import (
|
|
63
59
|
get_runtime_api_site_client,
|
|
64
60
|
)
|
|
65
|
-
from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
|
|
66
|
-
from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
|
|
67
|
-
from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
|
|
68
61
|
from nmdc_runtime.util import freeze
|
|
69
62
|
from nmdc_runtime.util import unfreeze
|
|
70
63
|
|
|
@@ -249,82 +242,6 @@ def process_workflow_job_triggers(_context):
|
|
|
249
242
|
yield SkipReason("No new jobs required")
|
|
250
243
|
|
|
251
244
|
|
|
252
|
-
@asset_sensor(
|
|
253
|
-
asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
|
|
254
|
-
job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
|
|
255
|
-
)
|
|
256
|
-
def ensure_gold_translation_job(_context, asset_event):
|
|
257
|
-
mdb = get_mongo(run_config_frozen__normal_env).db
|
|
258
|
-
gold_etl_latest = mdb.objects.find_one(
|
|
259
|
-
{"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
|
|
260
|
-
)
|
|
261
|
-
sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
|
|
262
|
-
if gold_etl_latest is None:
|
|
263
|
-
yield SkipReason("can't find sensed asset object_id in database")
|
|
264
|
-
return
|
|
265
|
-
elif gold_etl_latest["id"] != sensed_object_id:
|
|
266
|
-
yield SkipReason("later object than sensed materialization")
|
|
267
|
-
return
|
|
268
|
-
|
|
269
|
-
run_config = merge(
|
|
270
|
-
run_config_frozen__normal_env,
|
|
271
|
-
{
|
|
272
|
-
"solids": {
|
|
273
|
-
"construct_jobs": {
|
|
274
|
-
"config": {
|
|
275
|
-
"base_jobs": [
|
|
276
|
-
{
|
|
277
|
-
"workflow": {"id": "gold-translation-1.0.0"},
|
|
278
|
-
"config": {"object_id": gold_etl_latest["id"]},
|
|
279
|
-
}
|
|
280
|
-
]
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
},
|
|
285
|
-
)
|
|
286
|
-
yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
@asset_sensor(
|
|
290
|
-
asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
|
|
291
|
-
job=gold_translation_curation.to_job(**preset_normal),
|
|
292
|
-
)
|
|
293
|
-
def claim_and_run_gold_translation_curation(_context, asset_event):
|
|
294
|
-
client = get_runtime_api_site_client(run_config_frozen__normal_env)
|
|
295
|
-
mdb = get_mongo(run_config_frozen__normal_env).db
|
|
296
|
-
object_id_latest = asset_materialization_metadata(
|
|
297
|
-
asset_event, "object_id_latest"
|
|
298
|
-
).text
|
|
299
|
-
job = mdb.jobs.find_one(
|
|
300
|
-
{
|
|
301
|
-
"workflow.id": "gold-translation-1.0.0",
|
|
302
|
-
"config.object_id_latest": object_id_latest,
|
|
303
|
-
}
|
|
304
|
-
)
|
|
305
|
-
if job is not None:
|
|
306
|
-
rv = client.claim_job(job["id"])
|
|
307
|
-
if rv.status_code == status.HTTP_200_OK:
|
|
308
|
-
operation = rv.json()
|
|
309
|
-
run_config = merge(
|
|
310
|
-
run_config_frozen__normal_env,
|
|
311
|
-
{
|
|
312
|
-
"ops": {
|
|
313
|
-
"get_operation": {
|
|
314
|
-
"config": {
|
|
315
|
-
"operation_id": operation["id"],
|
|
316
|
-
}
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
},
|
|
320
|
-
)
|
|
321
|
-
yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
|
|
322
|
-
else:
|
|
323
|
-
yield SkipReason("Job found, but already claimed by this site")
|
|
324
|
-
else:
|
|
325
|
-
yield SkipReason("No job found")
|
|
326
|
-
|
|
327
|
-
|
|
328
245
|
@sensor(
|
|
329
246
|
job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
|
|
330
247
|
default_status=DefaultSensorStatus.RUNNING,
|
|
@@ -463,11 +380,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
|
|
|
463
380
|
yield SkipReason("; ".join(skip_notes))
|
|
464
381
|
|
|
465
382
|
|
|
466
|
-
# TODO ensure data_object_type values from file_type_enum
|
|
467
|
-
# see /metadata-translation/notebooks/202106_curation_updates.ipynb
|
|
468
|
-
# for details ("Create file_type_enum collection" section).
|
|
469
|
-
|
|
470
|
-
|
|
471
383
|
@sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
|
|
472
384
|
def done_object_put_ops(_context):
|
|
473
385
|
client = get_runtime_api_site_client(run_config_frozen__normal_env)
|
|
@@ -507,7 +419,6 @@ def on_run_fail(context: RunStatusSensorContext):
|
|
|
507
419
|
@repository
|
|
508
420
|
def repo():
|
|
509
421
|
graph_jobs = [
|
|
510
|
-
gold_translation.to_job(**preset_normal),
|
|
511
422
|
hello_graph.to_job(name="hello_job"),
|
|
512
423
|
ensure_jobs.to_job(**preset_normal),
|
|
513
424
|
apply_metadata_in.to_job(**preset_normal),
|
|
@@ -523,8 +434,6 @@ def repo():
|
|
|
523
434
|
]
|
|
524
435
|
sensors = [
|
|
525
436
|
done_object_put_ops,
|
|
526
|
-
ensure_gold_translation_job,
|
|
527
|
-
claim_and_run_gold_translation_curation,
|
|
528
437
|
process_workflow_job_triggers,
|
|
529
438
|
claim_and_run_apply_changesheet_jobs,
|
|
530
439
|
claim_and_run_metadata_in_jobs,
|
|
@@ -534,20 +443,6 @@ def repo():
|
|
|
534
443
|
return graph_jobs + schedules + sensors
|
|
535
444
|
|
|
536
445
|
|
|
537
|
-
@repository
|
|
538
|
-
def translation():
|
|
539
|
-
graph_jobs = [jgi_job, gold_job, emsl_job]
|
|
540
|
-
|
|
541
|
-
return graph_jobs
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
@repository
|
|
545
|
-
def test_translation():
|
|
546
|
-
graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
|
|
547
|
-
|
|
548
|
-
return graph_jobs
|
|
549
|
-
|
|
550
|
-
|
|
551
446
|
@repository
|
|
552
447
|
def biosample_submission_ingest():
|
|
553
448
|
normal_resources = run_config_frozen__normal_env["resources"]
|
|
@@ -574,6 +469,7 @@ def biosample_submission_ingest():
|
|
|
574
469
|
"study_type": "research_study",
|
|
575
470
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
576
471
|
"include_field_site_info": False,
|
|
472
|
+
"enable_biosample_filtering": True,
|
|
577
473
|
},
|
|
578
474
|
},
|
|
579
475
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -1018,6 +914,8 @@ def database_records_stitching():
|
|
|
1018
914
|
"config": {
|
|
1019
915
|
"nmdc_study_id": "",
|
|
1020
916
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
917
|
+
"include_field_site_info": False,
|
|
918
|
+
"enable_biosample_filtering": True,
|
|
1021
919
|
}
|
|
1022
920
|
},
|
|
1023
921
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -1060,6 +958,8 @@ def database_records_stitching():
|
|
|
1060
958
|
"config": {
|
|
1061
959
|
"nmdc_study_id": "",
|
|
1062
960
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
961
|
+
"include_field_site_info": False,
|
|
962
|
+
"enable_biosample_filtering": True,
|
|
1063
963
|
}
|
|
1064
964
|
},
|
|
1065
965
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -1102,21 +1002,11 @@ def database_records_stitching():
|
|
|
1102
1002
|
"config": {
|
|
1103
1003
|
"nmdc_study_id": "",
|
|
1104
1004
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1005
|
+
"include_field_site_info": False,
|
|
1006
|
+
"enable_biosample_filtering": True,
|
|
1105
1007
|
}
|
|
1106
1008
|
},
|
|
1107
1009
|
},
|
|
1108
1010
|
},
|
|
1109
1011
|
),
|
|
1110
1012
|
]
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
# @repository
|
|
1114
|
-
# def validation():
|
|
1115
|
-
# graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
|
|
1116
|
-
# return graph_jobs
|
|
1117
|
-
#
|
|
1118
|
-
#
|
|
1119
|
-
# @repository
|
|
1120
|
-
# def test_validation():
|
|
1121
|
-
# graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
|
|
1122
|
-
# return graph_jobs
|
nmdc_runtime/site/resources.py
CHANGED
|
@@ -109,7 +109,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
109
109
|
},
|
|
110
110
|
)
|
|
111
111
|
response.raise_for_status()
|
|
112
|
-
return response.json()["cursor"]["
|
|
112
|
+
return response.json()["cursor"]["batch"]
|
|
113
113
|
|
|
114
114
|
def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
|
|
115
115
|
gold_project_id = normalize_gold_id(gold_project_id)
|
|
@@ -126,7 +126,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
126
126
|
},
|
|
127
127
|
)
|
|
128
128
|
response.raise_for_status()
|
|
129
|
-
return response.json()["cursor"]["
|
|
129
|
+
return response.json()["cursor"]["batch"]
|
|
130
130
|
|
|
131
131
|
def get_biosamples_for_study(self, study_id: str):
|
|
132
132
|
# TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
|
|
@@ -170,7 +170,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
170
170
|
},
|
|
171
171
|
)
|
|
172
172
|
response.raise_for_status()
|
|
173
|
-
return response.json()["cursor"]["
|
|
173
|
+
return response.json()["cursor"]["batch"]
|
|
174
174
|
|
|
175
175
|
def get_study(self, study_id: str):
|
|
176
176
|
response = self.request(
|
|
@@ -182,7 +182,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
182
182
|
},
|
|
183
183
|
)
|
|
184
184
|
response.raise_for_status()
|
|
185
|
-
return response.json()["cursor"]["
|
|
185
|
+
return response.json()["cursor"]["batch"]
|
|
186
186
|
|
|
187
187
|
|
|
188
188
|
class RuntimeApiSiteClient(RuntimeApiClient):
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import collections
|
|
2
|
-
import csv
|
|
3
2
|
import re
|
|
4
3
|
from typing import List, Tuple, Union
|
|
5
4
|
from nmdc_schema import nmdc
|
|
@@ -45,6 +44,7 @@ class GoldStudyTranslator(Translator):
|
|
|
45
44
|
analysis_projects: List[JSON_OBJECT] = [],
|
|
46
45
|
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
47
46
|
include_field_site_info: bool = False,
|
|
47
|
+
enable_biosample_filtering: bool = True,
|
|
48
48
|
*args,
|
|
49
49
|
**kwargs,
|
|
50
50
|
) -> None:
|
|
@@ -53,15 +53,20 @@ class GoldStudyTranslator(Translator):
|
|
|
53
53
|
self.study = study
|
|
54
54
|
self.study_type = nmdc.StudyCategoryEnum(study_type)
|
|
55
55
|
self.include_field_site_info = include_field_site_info
|
|
56
|
+
self.enable_biosample_filtering = enable_biosample_filtering
|
|
56
57
|
# Filter biosamples to only those with `sequencingStrategy` of
|
|
57
|
-
# "Metagenome" or "Metatranscriptome"
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
58
|
+
# "Metagenome" or "Metatranscriptome" if filtering is enabled
|
|
59
|
+
if enable_biosample_filtering:
|
|
60
|
+
self.biosamples = [
|
|
61
|
+
biosample
|
|
62
|
+
for biosample in biosamples
|
|
63
|
+
if any(
|
|
64
|
+
_is_valid_project(project)
|
|
65
|
+
for project in biosample.get("projects", [])
|
|
66
|
+
)
|
|
67
|
+
]
|
|
68
|
+
else:
|
|
69
|
+
self.biosamples = biosamples
|
|
65
70
|
# Fetch the valid projectGoldIds that are associated with filtered
|
|
66
71
|
# biosamples on their `projects` field
|
|
67
72
|
valid_project_ids = {
|
|
@@ -116,6 +121,9 @@ class GoldStudyTranslator(Translator):
|
|
|
116
121
|
:param gold_entity: GOLD entity object
|
|
117
122
|
:return: PersonValue corresponding to the first PI in the `contacts` field
|
|
118
123
|
"""
|
|
124
|
+
if "contacts" not in gold_entity:
|
|
125
|
+
return None
|
|
126
|
+
|
|
119
127
|
pi_dict = next(
|
|
120
128
|
(
|
|
121
129
|
contact
|
|
@@ -169,7 +177,7 @@ class GoldStudyTranslator(Translator):
|
|
|
169
177
|
project["ncbiBioSampleAccession"], default_prefix="biosample"
|
|
170
178
|
)
|
|
171
179
|
for project in biosample_projects
|
|
172
|
-
if project
|
|
180
|
+
if project.get("ncbiBioSampleAccession")
|
|
173
181
|
]
|
|
174
182
|
|
|
175
183
|
def _get_samp_taxon_id(
|
|
@@ -333,12 +341,7 @@ class GoldStudyTranslator(Translator):
|
|
|
333
341
|
if field_value is None:
|
|
334
342
|
return None
|
|
335
343
|
|
|
336
|
-
return
|
|
337
|
-
has_raw_value=field_value,
|
|
338
|
-
has_numeric_value=nmdc.Double(field_value),
|
|
339
|
-
has_unit=unit,
|
|
340
|
-
type="nmdc:QuantityValue",
|
|
341
|
-
)
|
|
344
|
+
return self._parse_quantity_value(str(field_value), unit)
|
|
342
345
|
|
|
343
346
|
def _get_text_value(
|
|
344
347
|
self, gold_entity: JSON_OBJECT, gold_field: str
|
|
@@ -564,13 +567,11 @@ class GoldStudyTranslator(Translator):
|
|
|
564
567
|
gold_biosample_id = gold_biosample["biosampleGoldId"]
|
|
565
568
|
return nmdc.Biosample(
|
|
566
569
|
add_date=gold_biosample.get("addDate"),
|
|
567
|
-
alt=self._get_quantity_value(
|
|
568
|
-
gold_biosample, "altitudeInMeters", unit="meters"
|
|
569
|
-
),
|
|
570
|
+
alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
|
|
570
571
|
collected_from=nmdc_field_site_id,
|
|
571
572
|
collection_date=self._get_collection_date(gold_biosample),
|
|
572
573
|
depth=self._get_quantity_value(
|
|
573
|
-
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="
|
|
574
|
+
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
|
|
574
575
|
),
|
|
575
576
|
description=gold_biosample.get("description"),
|
|
576
577
|
diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
|
|
@@ -609,7 +610,7 @@ class GoldStudyTranslator(Translator):
|
|
|
609
610
|
),
|
|
610
611
|
specific_ecosystem=gold_biosample.get("specificEcosystem"),
|
|
611
612
|
subsurface_depth=self._get_quantity_value(
|
|
612
|
-
gold_biosample, "subsurfaceDepthInMeters", unit="
|
|
613
|
+
gold_biosample, "subsurfaceDepthInMeters", unit="m"
|
|
613
614
|
),
|
|
614
615
|
temp=self._get_quantity_value(
|
|
615
616
|
gold_biosample, "sampleCollectionTemperature"
|
|
@@ -11,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
11
11
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
12
12
|
_get_value_or_none,
|
|
13
13
|
_create_controlled_identified_term_value,
|
|
14
|
-
_create_controlled_term_value,
|
|
15
14
|
_create_geolocation_value,
|
|
16
15
|
_create_quantity_value,
|
|
17
16
|
_create_timestamp_value,
|
|
@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
10
10
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
11
11
|
_get_value_or_none,
|
|
12
12
|
_create_controlled_identified_term_value,
|
|
13
|
-
_create_controlled_term_value,
|
|
14
13
|
_create_geolocation_value,
|
|
15
14
|
_create_quantity_value,
|
|
16
15
|
_create_timestamp_value,
|
|
@@ -153,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
153
152
|
collection_date=_create_timestamp_value(
|
|
154
153
|
biosample_row["collectDate"].values[0]
|
|
155
154
|
),
|
|
156
|
-
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "
|
|
155
|
+
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
|
|
157
156
|
depth=nmdc.QuantityValue(
|
|
158
157
|
has_minimum_numeric_value=_get_value_or_none(
|
|
159
158
|
biosample_row, "sampleTopDepth"
|
|
@@ -169,13 +168,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
169
168
|
analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
|
|
170
169
|
env_package=_create_text_value(biosample_row["sampleType"].values[0]),
|
|
171
170
|
nitro=_create_quantity_value(
|
|
172
|
-
biosample_row["nitrogenPercent"].values[0], "
|
|
171
|
+
biosample_row["nitrogenPercent"].values[0], "%"
|
|
173
172
|
),
|
|
174
173
|
org_carb=_create_quantity_value(
|
|
175
|
-
biosample_row["organicCPercent"].values[0], "
|
|
174
|
+
biosample_row["organicCPercent"].values[0], "%"
|
|
176
175
|
),
|
|
177
176
|
carb_nitro_ratio=_create_quantity_value(
|
|
178
|
-
biosample_row["CNratio"].values[0],
|
|
177
|
+
biosample_row["CNratio"].values[0], "ratio"
|
|
179
178
|
),
|
|
180
179
|
ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
|
|
181
180
|
water_content=(
|
|
@@ -3,7 +3,6 @@ import sqlite3
|
|
|
3
3
|
from typing import Dict, Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
|
-
import requests
|
|
7
6
|
import requests_cache
|
|
8
7
|
|
|
9
8
|
from nmdc_schema import nmdc
|
|
@@ -12,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
12
11
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
13
12
|
_get_value_or_none,
|
|
14
13
|
_create_controlled_identified_term_value,
|
|
15
|
-
_create_controlled_term_value,
|
|
16
14
|
_create_geolocation_value,
|
|
17
15
|
_create_quantity_value,
|
|
18
16
|
_create_timestamp_value,
|
|
@@ -47,6 +47,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
|
|
|
47
47
|
(INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
+
UNIT_OVERRIDES: dict[str, dict[str, str]] = {
|
|
51
|
+
"Biosample": {
|
|
52
|
+
"depth": "m",
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
50
56
|
|
|
51
57
|
class EnvironmentPackage(Enum):
|
|
52
58
|
r"""
|
|
@@ -272,61 +278,9 @@ class SubmissionPortalTranslator(Translator):
|
|
|
272
278
|
def _get_quantity_value(
|
|
273
279
|
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
274
280
|
) -> Union[nmdc.QuantityValue, None]:
|
|
275
|
-
"""Construct a nmdc:QuantityValue from a raw value string
|
|
276
|
-
|
|
277
|
-
The regex pattern minimally matches on a single numeric value (possibly
|
|
278
|
-
floating point). The pattern can also identify a range represented by
|
|
279
|
-
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
280
|
-
characters at the end of the string which are interpreted as a unit. A unit
|
|
281
|
-
may also be explicitly provided as an argument to this function. If parsing
|
|
282
|
-
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
283
|
-
If the pattern is not matched at all None is returned.
|
|
284
|
-
|
|
285
|
-
TODO: currently the parsed unit string is used as-is. In the future we may want
|
|
286
|
-
to be stricter about what we accept or coerce into a controlled value set
|
|
281
|
+
"""Construct a nmdc:QuantityValue from a raw value string"""
|
|
287
282
|
|
|
288
|
-
|
|
289
|
-
:param unit: optional unit, defaults to None
|
|
290
|
-
:return: nmdc:QuantityValue
|
|
291
|
-
"""
|
|
292
|
-
if raw_value is None:
|
|
293
|
-
return None
|
|
294
|
-
|
|
295
|
-
match = re.fullmatch(
|
|
296
|
-
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
297
|
-
raw_value,
|
|
298
|
-
)
|
|
299
|
-
if not match:
|
|
300
|
-
return None
|
|
301
|
-
|
|
302
|
-
qv = nmdc.QuantityValue(
|
|
303
|
-
has_raw_value=raw_value,
|
|
304
|
-
type="nmdc:QuantityValue",
|
|
305
|
-
)
|
|
306
|
-
if match.group(2):
|
|
307
|
-
# having group 2 means the value is a range like "0 - 1". Either
|
|
308
|
-
# group 1 or group 2 might be the minimum especially when handling
|
|
309
|
-
# negative ranges like "0 - -1"
|
|
310
|
-
num_1 = float(match.group(1))
|
|
311
|
-
num_2 = float(match.group(2))
|
|
312
|
-
qv.has_minimum_numeric_value = min(num_1, num_2)
|
|
313
|
-
qv.has_maximum_numeric_value = max(num_1, num_2)
|
|
314
|
-
else:
|
|
315
|
-
# otherwise we just have a single numeric value
|
|
316
|
-
qv.has_numeric_value = float(match.group(1))
|
|
317
|
-
|
|
318
|
-
if unit:
|
|
319
|
-
# a unit was manually specified
|
|
320
|
-
if match.group(3) and unit != match.group(3):
|
|
321
|
-
# a unit was also found in the raw string; issue a warning
|
|
322
|
-
# if they don't agree, but keep the manually specified one
|
|
323
|
-
logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
324
|
-
qv.has_unit = unit
|
|
325
|
-
elif match.group(3):
|
|
326
|
-
# a unit was found in the raw string
|
|
327
|
-
qv.has_unit = match.group(3)
|
|
328
|
-
|
|
329
|
-
return qv
|
|
283
|
+
return self._parse_quantity_value(raw_value, unit)
|
|
330
284
|
|
|
331
285
|
def _get_ontology_class(
|
|
332
286
|
self, raw_value: Optional[str]
|
|
@@ -475,6 +429,50 @@ class SubmissionPortalTranslator(Translator):
|
|
|
475
429
|
|
|
476
430
|
return value
|
|
477
431
|
|
|
432
|
+
def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
|
|
433
|
+
"""Collect and format DOIs from submission portal schema in nmdc format DOIs
|
|
434
|
+
|
|
435
|
+
If there were no DOIs, None is returned.
|
|
436
|
+
|
|
437
|
+
:param metadata_submission: submission portal entry
|
|
438
|
+
:return: list of nmdc.DOI objects
|
|
439
|
+
"""
|
|
440
|
+
data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
|
|
441
|
+
award_dois = self._get_from(
|
|
442
|
+
metadata_submission, ["multiOmicsForm", "awardDois"]
|
|
443
|
+
)
|
|
444
|
+
if data_dois and len(data_dois) > 0:
|
|
445
|
+
updated_data_dois = [
|
|
446
|
+
nmdc.Doi(
|
|
447
|
+
doi_category="dataset_doi",
|
|
448
|
+
doi_provider=doi["provider"],
|
|
449
|
+
doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
|
|
450
|
+
type="nmdc:Doi",
|
|
451
|
+
)
|
|
452
|
+
for doi in data_dois
|
|
453
|
+
]
|
|
454
|
+
else:
|
|
455
|
+
updated_data_dois = []
|
|
456
|
+
|
|
457
|
+
if award_dois and len(award_dois) > 0:
|
|
458
|
+
updated_award_dois = [
|
|
459
|
+
nmdc.Doi(
|
|
460
|
+
doi_category="award_doi",
|
|
461
|
+
doi_provider=doi["provider"],
|
|
462
|
+
doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
|
|
463
|
+
type="nmdc:Doi",
|
|
464
|
+
)
|
|
465
|
+
for doi in award_dois
|
|
466
|
+
]
|
|
467
|
+
else:
|
|
468
|
+
updated_award_dois = []
|
|
469
|
+
|
|
470
|
+
return_val = updated_data_dois + updated_award_dois
|
|
471
|
+
if len(return_val) == 0:
|
|
472
|
+
return_val = None
|
|
473
|
+
|
|
474
|
+
return return_val
|
|
475
|
+
|
|
478
476
|
def _get_data_objects_from_fields(
|
|
479
477
|
self,
|
|
480
478
|
sample_data: JSON_OBJECT,
|
|
@@ -591,6 +589,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
591
589
|
websites=self._get_from(
|
|
592
590
|
metadata_submission, ["studyForm", "linkOutWebpage"]
|
|
593
591
|
),
|
|
592
|
+
associated_dois=self._get_study_dois(metadata_submission),
|
|
594
593
|
)
|
|
595
594
|
|
|
596
595
|
def _transform_value_for_slot(
|
|
@@ -660,6 +659,17 @@ class SubmissionPortalTranslator(Translator):
|
|
|
660
659
|
logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
|
|
661
660
|
continue
|
|
662
661
|
|
|
662
|
+
# This step handles cases where the submission portal/schema instructs a user to
|
|
663
|
+
# provide a value in a specific unit. The unit cannot be parsed out of the raw value
|
|
664
|
+
# in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
|
|
665
|
+
# go away once units are encoded in the schema itself.
|
|
666
|
+
# See: https://github.com/microbiomedata/nmdc-schema/issues/2517
|
|
667
|
+
if class_name in UNIT_OVERRIDES:
|
|
668
|
+
# If the class has unit overrides, check if the slot is in the overrides
|
|
669
|
+
unit_overrides = UNIT_OVERRIDES[class_name]
|
|
670
|
+
if slot_name in unit_overrides:
|
|
671
|
+
unit = unit_overrides[slot_name]
|
|
672
|
+
|
|
663
673
|
slot_definition = self.schema_view.induced_slot(slot_name, class_name)
|
|
664
674
|
if slot_definition.multivalued:
|
|
665
675
|
value_list = value
|
|
@@ -1,9 +1,13 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
1
3
|
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import Any, Callable, Dict, List, Optional
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
3
5
|
from nmdc_schema import nmdc
|
|
4
6
|
|
|
5
7
|
JSON_OBJECT = Dict[str, Any]
|
|
6
8
|
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
7
11
|
|
|
8
12
|
class Translator(ABC):
|
|
9
13
|
def __init__(
|
|
@@ -27,3 +31,61 @@ class Translator(ABC):
|
|
|
27
31
|
@abstractmethod
|
|
28
32
|
def get_database(self) -> nmdc.Database:
|
|
29
33
|
pass
|
|
34
|
+
|
|
35
|
+
def _parse_quantity_value(
|
|
36
|
+
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
37
|
+
) -> Union[nmdc.QuantityValue, None]:
|
|
38
|
+
"""Construct a nmdc:QuantityValue from a raw value string
|
|
39
|
+
|
|
40
|
+
The regex pattern minimally matches on a single numeric value (possibly
|
|
41
|
+
floating point). The pattern can also identify a range represented by
|
|
42
|
+
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
43
|
+
characters at the end of the string which are interpreted as a unit. A unit
|
|
44
|
+
may also be explicitly provided as an argument to this function. If parsing
|
|
45
|
+
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
46
|
+
If the pattern is not matched at all None is returned.
|
|
47
|
+
|
|
48
|
+
:param raw_value: string to parse
|
|
49
|
+
:param unit: optional unit, defaults to None. If None, the unit is extracted from the
|
|
50
|
+
raw_value. If a unit is provided, it will override the unit extracted from the
|
|
51
|
+
raw_value.
|
|
52
|
+
:return: nmdc:QuantityValue
|
|
53
|
+
"""
|
|
54
|
+
if raw_value is None:
|
|
55
|
+
return None
|
|
56
|
+
|
|
57
|
+
match = re.fullmatch(
|
|
58
|
+
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
59
|
+
raw_value,
|
|
60
|
+
)
|
|
61
|
+
if not match:
|
|
62
|
+
return None
|
|
63
|
+
|
|
64
|
+
quantity_value_kwargs = {
|
|
65
|
+
"has_raw_value": raw_value,
|
|
66
|
+
"type": "nmdc:QuantityValue",
|
|
67
|
+
}
|
|
68
|
+
if match.group(2):
|
|
69
|
+
# having group 2 means the value is a range like "0 - 1". Either
|
|
70
|
+
# group 1 or group 2 might be the minimum especially when handling
|
|
71
|
+
# negative ranges like "0 - -1"
|
|
72
|
+
num_1 = float(match.group(1))
|
|
73
|
+
num_2 = float(match.group(2))
|
|
74
|
+
quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
|
|
75
|
+
quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
|
|
76
|
+
else:
|
|
77
|
+
# otherwise we just have a single numeric value
|
|
78
|
+
quantity_value_kwargs["has_numeric_value"] = float(match.group(1))
|
|
79
|
+
|
|
80
|
+
if unit:
|
|
81
|
+
# a unit was manually specified
|
|
82
|
+
if match.group(3) and unit != match.group(3):
|
|
83
|
+
# a unit was also found in the raw string; issue a warning
|
|
84
|
+
# if they don't agree, but keep the manually specified one
|
|
85
|
+
logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
86
|
+
quantity_value_kwargs["has_unit"] = unit
|
|
87
|
+
elif match.group(3):
|
|
88
|
+
# a unit was found in the raw string
|
|
89
|
+
quantity_value_kwargs["has_unit"] = match.group(3)
|
|
90
|
+
|
|
91
|
+
return nmdc.QuantityValue(**quantity_value_kwargs)
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -3,10 +3,11 @@ import os
|
|
|
3
3
|
from functools import lru_cache
|
|
4
4
|
from pymongo.database import Database as MongoDatabase
|
|
5
5
|
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
|
-
from
|
|
6
|
+
from refscan.lib.helpers import get_collection_names_from_schema
|
|
7
7
|
|
|
8
|
-
from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
|
|
9
8
|
from nmdc_runtime.site.resources import mongo_resource
|
|
9
|
+
from nmdc_runtime.util import nmdc_schema_view
|
|
10
|
+
|
|
10
11
|
|
|
11
12
|
mode_test = {
|
|
12
13
|
"resource_defs": {"mongo": mongo_resource}
|
|
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
|
|
|
37
38
|
|
|
38
39
|
@lru_cache
|
|
39
40
|
def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
|
|
41
|
+
"""
|
|
42
|
+
TODO: Document this function.
|
|
43
|
+
"""
|
|
44
|
+
schema_view = nmdc_schema_view()
|
|
40
45
|
present_collection_names = set(mdb.list_collection_names())
|
|
41
46
|
return {
|
|
42
47
|
name: (
|
|
43
48
|
name in present_collection_names and "id_1" in mdb[name].index_information()
|
|
44
49
|
)
|
|
45
|
-
for name in get_collection_names_from_schema()
|
|
50
|
+
for name in get_collection_names_from_schema(schema_view)
|
|
46
51
|
}
|
|
47
52
|
|
|
48
53
|
|