nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/Dockerfile +167 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/mongo.py +435 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +270 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +796 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +425 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +7 -8
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +633 -13
- nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
- nmdc_runtime/site/graphs.py +8 -22
- nmdc_runtime/site/ops.py +147 -181
- nmdc_runtime/site/repository.py +2 -112
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/gold_translator.py +4 -12
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +90 -48
- nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
- nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
- nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
- nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/repository.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
4
3
|
|
|
5
4
|
from dagster import (
|
|
6
5
|
repository,
|
|
@@ -14,7 +13,6 @@ from dagster import (
|
|
|
14
13
|
DagsterRunStatus,
|
|
15
14
|
RunStatusSensorContext,
|
|
16
15
|
DefaultSensorStatus,
|
|
17
|
-
in_process_executor,
|
|
18
16
|
)
|
|
19
17
|
from starlette import status
|
|
20
18
|
from toolz import merge, get_in
|
|
@@ -30,8 +28,6 @@ from nmdc_runtime.site.graphs import (
|
|
|
30
28
|
translate_metadata_submission_to_nmdc_schema_database,
|
|
31
29
|
ingest_metadata_submission,
|
|
32
30
|
gold_study_to_database,
|
|
33
|
-
gold_translation,
|
|
34
|
-
gold_translation_curation,
|
|
35
31
|
create_objects_from_site_object_puts,
|
|
36
32
|
housekeeping,
|
|
37
33
|
ensure_jobs,
|
|
@@ -62,9 +58,6 @@ from nmdc_runtime.site.resources import (
|
|
|
62
58
|
from nmdc_runtime.site.resources import (
|
|
63
59
|
get_runtime_api_site_client,
|
|
64
60
|
)
|
|
65
|
-
from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
|
|
66
|
-
from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
|
|
67
|
-
from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
|
|
68
61
|
from nmdc_runtime.util import freeze
|
|
69
62
|
from nmdc_runtime.util import unfreeze
|
|
70
63
|
|
|
@@ -249,82 +242,6 @@ def process_workflow_job_triggers(_context):
|
|
|
249
242
|
yield SkipReason("No new jobs required")
|
|
250
243
|
|
|
251
244
|
|
|
252
|
-
@asset_sensor(
|
|
253
|
-
asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
|
|
254
|
-
job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
|
|
255
|
-
)
|
|
256
|
-
def ensure_gold_translation_job(_context, asset_event):
|
|
257
|
-
mdb = get_mongo(run_config_frozen__normal_env).db
|
|
258
|
-
gold_etl_latest = mdb.objects.find_one(
|
|
259
|
-
{"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
|
|
260
|
-
)
|
|
261
|
-
sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
|
|
262
|
-
if gold_etl_latest is None:
|
|
263
|
-
yield SkipReason("can't find sensed asset object_id in database")
|
|
264
|
-
return
|
|
265
|
-
elif gold_etl_latest["id"] != sensed_object_id:
|
|
266
|
-
yield SkipReason("later object than sensed materialization")
|
|
267
|
-
return
|
|
268
|
-
|
|
269
|
-
run_config = merge(
|
|
270
|
-
run_config_frozen__normal_env,
|
|
271
|
-
{
|
|
272
|
-
"solids": {
|
|
273
|
-
"construct_jobs": {
|
|
274
|
-
"config": {
|
|
275
|
-
"base_jobs": [
|
|
276
|
-
{
|
|
277
|
-
"workflow": {"id": "gold-translation-1.0.0"},
|
|
278
|
-
"config": {"object_id": gold_etl_latest["id"]},
|
|
279
|
-
}
|
|
280
|
-
]
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
},
|
|
285
|
-
)
|
|
286
|
-
yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
@asset_sensor(
|
|
290
|
-
asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
|
|
291
|
-
job=gold_translation_curation.to_job(**preset_normal),
|
|
292
|
-
)
|
|
293
|
-
def claim_and_run_gold_translation_curation(_context, asset_event):
|
|
294
|
-
client = get_runtime_api_site_client(run_config_frozen__normal_env)
|
|
295
|
-
mdb = get_mongo(run_config_frozen__normal_env).db
|
|
296
|
-
object_id_latest = asset_materialization_metadata(
|
|
297
|
-
asset_event, "object_id_latest"
|
|
298
|
-
).text
|
|
299
|
-
job = mdb.jobs.find_one(
|
|
300
|
-
{
|
|
301
|
-
"workflow.id": "gold-translation-1.0.0",
|
|
302
|
-
"config.object_id_latest": object_id_latest,
|
|
303
|
-
}
|
|
304
|
-
)
|
|
305
|
-
if job is not None:
|
|
306
|
-
rv = client.claim_job(job["id"])
|
|
307
|
-
if rv.status_code == status.HTTP_200_OK:
|
|
308
|
-
operation = rv.json()
|
|
309
|
-
run_config = merge(
|
|
310
|
-
run_config_frozen__normal_env,
|
|
311
|
-
{
|
|
312
|
-
"ops": {
|
|
313
|
-
"get_operation": {
|
|
314
|
-
"config": {
|
|
315
|
-
"operation_id": operation["id"],
|
|
316
|
-
}
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
},
|
|
320
|
-
)
|
|
321
|
-
yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
|
|
322
|
-
else:
|
|
323
|
-
yield SkipReason("Job found, but already claimed by this site")
|
|
324
|
-
else:
|
|
325
|
-
yield SkipReason("No job found")
|
|
326
|
-
|
|
327
|
-
|
|
328
245
|
@sensor(
|
|
329
246
|
job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
|
|
330
247
|
default_status=DefaultSensorStatus.RUNNING,
|
|
@@ -502,7 +419,6 @@ def on_run_fail(context: RunStatusSensorContext):
|
|
|
502
419
|
@repository
|
|
503
420
|
def repo():
|
|
504
421
|
graph_jobs = [
|
|
505
|
-
gold_translation.to_job(**preset_normal),
|
|
506
422
|
hello_graph.to_job(name="hello_job"),
|
|
507
423
|
ensure_jobs.to_job(**preset_normal),
|
|
508
424
|
apply_metadata_in.to_job(**preset_normal),
|
|
@@ -518,8 +434,6 @@ def repo():
|
|
|
518
434
|
]
|
|
519
435
|
sensors = [
|
|
520
436
|
done_object_put_ops,
|
|
521
|
-
ensure_gold_translation_job,
|
|
522
|
-
claim_and_run_gold_translation_curation,
|
|
523
437
|
process_workflow_job_triggers,
|
|
524
438
|
claim_and_run_apply_changesheet_jobs,
|
|
525
439
|
claim_and_run_metadata_in_jobs,
|
|
@@ -529,20 +443,6 @@ def repo():
|
|
|
529
443
|
return graph_jobs + schedules + sensors
|
|
530
444
|
|
|
531
445
|
|
|
532
|
-
@repository
|
|
533
|
-
def translation():
|
|
534
|
-
graph_jobs = [jgi_job, gold_job, emsl_job]
|
|
535
|
-
|
|
536
|
-
return graph_jobs
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
@repository
|
|
540
|
-
def test_translation():
|
|
541
|
-
graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
|
|
542
|
-
|
|
543
|
-
return graph_jobs
|
|
544
|
-
|
|
545
|
-
|
|
546
446
|
@repository
|
|
547
447
|
def biosample_submission_ingest():
|
|
548
448
|
normal_resources = run_config_frozen__normal_env["resources"]
|
|
@@ -602,6 +502,7 @@ def biosample_submission_ingest():
|
|
|
602
502
|
"data_object_mapping_file_url": None,
|
|
603
503
|
"biosample_extras_file_url": None,
|
|
604
504
|
"biosample_extras_slot_mapping_file_url": None,
|
|
505
|
+
"study_id": None,
|
|
605
506
|
}
|
|
606
507
|
},
|
|
607
508
|
"translate_portal_submission_to_nmdc_schema_database": {
|
|
@@ -638,6 +539,7 @@ def biosample_submission_ingest():
|
|
|
638
539
|
"data_object_mapping_file_url": None,
|
|
639
540
|
"biosample_extras_file_url": None,
|
|
640
541
|
"biosample_extras_slot_mapping_file_url": None,
|
|
542
|
+
"study_id": None,
|
|
641
543
|
}
|
|
642
544
|
},
|
|
643
545
|
"translate_portal_submission_to_nmdc_schema_database": {
|
|
@@ -1110,15 +1012,3 @@ def database_records_stitching():
|
|
|
1110
1012
|
},
|
|
1111
1013
|
),
|
|
1112
1014
|
]
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
# @repository
|
|
1116
|
-
# def validation():
|
|
1117
|
-
# graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
|
|
1118
|
-
# return graph_jobs
|
|
1119
|
-
#
|
|
1120
|
-
#
|
|
1121
|
-
# @repository
|
|
1122
|
-
# def test_validation():
|
|
1123
|
-
# graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
|
|
1124
|
-
# return graph_jobs
|
nmdc_runtime/site/resources.py
CHANGED
|
@@ -520,11 +520,24 @@ class MongoDB:
|
|
|
520
520
|
self.db = self.client[dbname]
|
|
521
521
|
|
|
522
522
|
def add_docs(self, docs, validate=True, replace=True):
|
|
523
|
+
"""
|
|
524
|
+
TODO: Document this function.
|
|
525
|
+
"""
|
|
523
526
|
try:
|
|
524
527
|
if validate:
|
|
525
528
|
nmdc_jsonschema_validator_noidpatterns(docs)
|
|
526
529
|
rv = {}
|
|
527
|
-
for collection_name,
|
|
530
|
+
for collection_name, collection_docs in docs.items():
|
|
531
|
+
# If `collection_docs` is empty, abort this iteration.
|
|
532
|
+
#
|
|
533
|
+
# Note: We do this because the `bulk_write` method called below will raise
|
|
534
|
+
# an `InvalidOperation` exception if it is passed 0 operations.
|
|
535
|
+
#
|
|
536
|
+
# Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
|
|
537
|
+
#
|
|
538
|
+
if len(collection_docs) == 0:
|
|
539
|
+
continue
|
|
540
|
+
|
|
528
541
|
rv[collection_name] = self.db[collection_name].bulk_write(
|
|
529
542
|
[
|
|
530
543
|
(
|
|
@@ -532,7 +545,7 @@ class MongoDB:
|
|
|
532
545
|
if replace
|
|
533
546
|
else InsertOne(d)
|
|
534
547
|
)
|
|
535
|
-
for d in
|
|
548
|
+
for d in collection_docs
|
|
536
549
|
]
|
|
537
550
|
)
|
|
538
551
|
now = datetime.now(timezone.utc)
|
|
@@ -544,7 +557,7 @@ class MongoDB:
|
|
|
544
557
|
"ts": now,
|
|
545
558
|
# "dtl": {},
|
|
546
559
|
}
|
|
547
|
-
for d in
|
|
560
|
+
for d in collection_docs
|
|
548
561
|
]
|
|
549
562
|
)
|
|
550
563
|
return rv
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import collections
|
|
2
|
-
import csv
|
|
3
2
|
import re
|
|
4
3
|
from typing import List, Tuple, Union
|
|
5
4
|
from nmdc_schema import nmdc
|
|
@@ -342,12 +341,7 @@ class GoldStudyTranslator(Translator):
|
|
|
342
341
|
if field_value is None:
|
|
343
342
|
return None
|
|
344
343
|
|
|
345
|
-
return
|
|
346
|
-
has_raw_value=field_value,
|
|
347
|
-
has_numeric_value=nmdc.Double(field_value),
|
|
348
|
-
has_unit=unit,
|
|
349
|
-
type="nmdc:QuantityValue",
|
|
350
|
-
)
|
|
344
|
+
return self._parse_quantity_value(str(field_value), unit)
|
|
351
345
|
|
|
352
346
|
def _get_text_value(
|
|
353
347
|
self, gold_entity: JSON_OBJECT, gold_field: str
|
|
@@ -573,13 +567,11 @@ class GoldStudyTranslator(Translator):
|
|
|
573
567
|
gold_biosample_id = gold_biosample["biosampleGoldId"]
|
|
574
568
|
return nmdc.Biosample(
|
|
575
569
|
add_date=gold_biosample.get("addDate"),
|
|
576
|
-
alt=self._get_quantity_value(
|
|
577
|
-
gold_biosample, "altitudeInMeters", unit="meters"
|
|
578
|
-
),
|
|
570
|
+
alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
|
|
579
571
|
collected_from=nmdc_field_site_id,
|
|
580
572
|
collection_date=self._get_collection_date(gold_biosample),
|
|
581
573
|
depth=self._get_quantity_value(
|
|
582
|
-
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="
|
|
574
|
+
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
|
|
583
575
|
),
|
|
584
576
|
description=gold_biosample.get("description"),
|
|
585
577
|
diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
|
|
@@ -618,7 +610,7 @@ class GoldStudyTranslator(Translator):
|
|
|
618
610
|
),
|
|
619
611
|
specific_ecosystem=gold_biosample.get("specificEcosystem"),
|
|
620
612
|
subsurface_depth=self._get_quantity_value(
|
|
621
|
-
gold_biosample, "subsurfaceDepthInMeters", unit="
|
|
613
|
+
gold_biosample, "subsurfaceDepthInMeters", unit="m"
|
|
622
614
|
),
|
|
623
615
|
temp=self._get_quantity_value(
|
|
624
616
|
gold_biosample, "sampleCollectionTemperature"
|
|
@@ -11,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
11
11
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
12
12
|
_get_value_or_none,
|
|
13
13
|
_create_controlled_identified_term_value,
|
|
14
|
-
_create_controlled_term_value,
|
|
15
14
|
_create_geolocation_value,
|
|
16
15
|
_create_quantity_value,
|
|
17
16
|
_create_timestamp_value,
|
|
@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
10
10
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
11
11
|
_get_value_or_none,
|
|
12
12
|
_create_controlled_identified_term_value,
|
|
13
|
-
_create_controlled_term_value,
|
|
14
13
|
_create_geolocation_value,
|
|
15
14
|
_create_quantity_value,
|
|
16
15
|
_create_timestamp_value,
|
|
@@ -153,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
153
152
|
collection_date=_create_timestamp_value(
|
|
154
153
|
biosample_row["collectDate"].values[0]
|
|
155
154
|
),
|
|
156
|
-
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "
|
|
155
|
+
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
|
|
157
156
|
depth=nmdc.QuantityValue(
|
|
158
157
|
has_minimum_numeric_value=_get_value_or_none(
|
|
159
158
|
biosample_row, "sampleTopDepth"
|
|
@@ -169,13 +168,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
169
168
|
analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
|
|
170
169
|
env_package=_create_text_value(biosample_row["sampleType"].values[0]),
|
|
171
170
|
nitro=_create_quantity_value(
|
|
172
|
-
biosample_row["nitrogenPercent"].values[0], "
|
|
171
|
+
biosample_row["nitrogenPercent"].values[0], "%"
|
|
173
172
|
),
|
|
174
173
|
org_carb=_create_quantity_value(
|
|
175
|
-
biosample_row["organicCPercent"].values[0], "
|
|
174
|
+
biosample_row["organicCPercent"].values[0], "%"
|
|
176
175
|
),
|
|
177
176
|
carb_nitro_ratio=_create_quantity_value(
|
|
178
|
-
biosample_row["CNratio"].values[0],
|
|
177
|
+
biosample_row["CNratio"].values[0], "ratio"
|
|
179
178
|
),
|
|
180
179
|
ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
|
|
181
180
|
water_content=(
|
|
@@ -3,7 +3,6 @@ import sqlite3
|
|
|
3
3
|
from typing import Dict, Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
|
-
import requests
|
|
7
6
|
import requests_cache
|
|
8
7
|
|
|
9
8
|
from nmdc_schema import nmdc
|
|
@@ -12,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
12
11
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
13
12
|
_get_value_or_none,
|
|
14
13
|
_create_controlled_identified_term_value,
|
|
15
|
-
_create_controlled_term_value,
|
|
16
14
|
_create_geolocation_value,
|
|
17
15
|
_create_quantity_value,
|
|
18
16
|
_create_timestamp_value,
|
|
@@ -145,6 +145,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
145
145
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
146
146
|
study_category: Optional[str] = None,
|
|
147
147
|
study_pi_image_url: Optional[str] = None,
|
|
148
|
+
study_id: Optional[str] = None,
|
|
148
149
|
# Additional biosample-level metadata with optional column mapping information not captured
|
|
149
150
|
# by the submission portal currently.
|
|
150
151
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
@@ -165,6 +166,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
165
166
|
nmdc.StudyCategoryEnum(study_category) if study_category else None
|
|
166
167
|
)
|
|
167
168
|
self.study_pi_image_url = study_pi_image_url
|
|
169
|
+
self.study_id = study_id
|
|
168
170
|
|
|
169
171
|
self.biosample_extras = group_dicts_by_key(
|
|
170
172
|
BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
|
|
@@ -174,6 +176,13 @@ class SubmissionPortalTranslator(Translator):
|
|
|
174
176
|
)
|
|
175
177
|
|
|
176
178
|
self.schema_view: SchemaView = _get_schema_view()
|
|
179
|
+
self._material_processing_subclass_names = []
|
|
180
|
+
for class_name in self.schema_view.class_descendants(
|
|
181
|
+
"MaterialProcessing", reflexive=False
|
|
182
|
+
):
|
|
183
|
+
class_def = self.schema_view.get_class(class_name)
|
|
184
|
+
if not class_def.abstract:
|
|
185
|
+
self._material_processing_subclass_names.append(class_name)
|
|
177
186
|
|
|
178
187
|
def _get_pi(
|
|
179
188
|
self, metadata_submission: JSON_OBJECT
|
|
@@ -278,61 +287,9 @@ class SubmissionPortalTranslator(Translator):
|
|
|
278
287
|
def _get_quantity_value(
|
|
279
288
|
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
280
289
|
) -> Union[nmdc.QuantityValue, None]:
|
|
281
|
-
"""Construct a nmdc:QuantityValue from a raw value string
|
|
290
|
+
"""Construct a nmdc:QuantityValue from a raw value string"""
|
|
282
291
|
|
|
283
|
-
|
|
284
|
-
floating point). The pattern can also identify a range represented by
|
|
285
|
-
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
286
|
-
characters at the end of the string which are interpreted as a unit. A unit
|
|
287
|
-
may also be explicitly provided as an argument to this function. If parsing
|
|
288
|
-
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
289
|
-
If the pattern is not matched at all None is returned.
|
|
290
|
-
|
|
291
|
-
TODO: currently the parsed unit string is used as-is. In the future we may want
|
|
292
|
-
to be stricter about what we accept or coerce into a controlled value set
|
|
293
|
-
|
|
294
|
-
:param raw_value: string to parse
|
|
295
|
-
:param unit: optional unit, defaults to None
|
|
296
|
-
:return: nmdc:QuantityValue
|
|
297
|
-
"""
|
|
298
|
-
if raw_value is None:
|
|
299
|
-
return None
|
|
300
|
-
|
|
301
|
-
match = re.fullmatch(
|
|
302
|
-
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
303
|
-
raw_value,
|
|
304
|
-
)
|
|
305
|
-
if not match:
|
|
306
|
-
return None
|
|
307
|
-
|
|
308
|
-
qv = nmdc.QuantityValue(
|
|
309
|
-
has_raw_value=raw_value,
|
|
310
|
-
type="nmdc:QuantityValue",
|
|
311
|
-
)
|
|
312
|
-
if match.group(2):
|
|
313
|
-
# having group 2 means the value is a range like "0 - 1". Either
|
|
314
|
-
# group 1 or group 2 might be the minimum especially when handling
|
|
315
|
-
# negative ranges like "0 - -1"
|
|
316
|
-
num_1 = float(match.group(1))
|
|
317
|
-
num_2 = float(match.group(2))
|
|
318
|
-
qv.has_minimum_numeric_value = min(num_1, num_2)
|
|
319
|
-
qv.has_maximum_numeric_value = max(num_1, num_2)
|
|
320
|
-
else:
|
|
321
|
-
# otherwise we just have a single numeric value
|
|
322
|
-
qv.has_numeric_value = float(match.group(1))
|
|
323
|
-
|
|
324
|
-
if unit:
|
|
325
|
-
# a unit was manually specified
|
|
326
|
-
if match.group(3) and unit != match.group(3):
|
|
327
|
-
# a unit was also found in the raw string; issue a warning
|
|
328
|
-
# if they don't agree, but keep the manually specified one
|
|
329
|
-
logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
330
|
-
qv.has_unit = unit
|
|
331
|
-
elif match.group(3):
|
|
332
|
-
# a unit was found in the raw string
|
|
333
|
-
qv.has_unit = match.group(3)
|
|
334
|
-
|
|
335
|
-
return qv
|
|
292
|
+
return self._parse_quantity_value(raw_value, unit)
|
|
336
293
|
|
|
337
294
|
def _get_ontology_class(
|
|
338
295
|
self, raw_value: Optional[str]
|
|
@@ -594,6 +551,14 @@ class SubmissionPortalTranslator(Translator):
|
|
|
594
551
|
|
|
595
552
|
return data_objects, manifest
|
|
596
553
|
|
|
554
|
+
def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
|
|
555
|
+
"""Parse a sample link in the form of `ProcessingName:SampleName,..."""
|
|
556
|
+
pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
|
|
557
|
+
match = re.match(pattern, sample_link)
|
|
558
|
+
if not match:
|
|
559
|
+
return None
|
|
560
|
+
return match.group(1), split_strip(match.group(2), ",")
|
|
561
|
+
|
|
597
562
|
def _translate_study(
|
|
598
563
|
self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
|
|
599
564
|
) -> nmdc.Study:
|
|
@@ -804,11 +769,14 @@ class SubmissionPortalTranslator(Translator):
|
|
|
804
769
|
"metadata_submission", {}
|
|
805
770
|
)
|
|
806
771
|
|
|
807
|
-
# Generate one Study instance based on the metadata submission
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
772
|
+
# Generate one Study instance based on the metadata submission, if a study_id wasn't provided
|
|
773
|
+
if self.study_id:
|
|
774
|
+
nmdc_study_id = self.study_id
|
|
775
|
+
else:
|
|
776
|
+
nmdc_study_id = self._id_minter("nmdc:Study")[0]
|
|
777
|
+
database.study_set = [
|
|
778
|
+
self._translate_study(metadata_submission_data, nmdc_study_id)
|
|
779
|
+
]
|
|
812
780
|
|
|
813
781
|
# Automatically populate the `env_package` field in the sample data based on which
|
|
814
782
|
# environmental data tab the sample data came from.
|
|
@@ -840,15 +808,63 @@ class SubmissionPortalTranslator(Translator):
|
|
|
840
808
|
)
|
|
841
809
|
|
|
842
810
|
# Translate the sample data into nmdc:Biosample objects
|
|
843
|
-
database.biosample_set = [
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
811
|
+
database.biosample_set = []
|
|
812
|
+
for sample_data_id, sample_data in sample_data_by_id.items():
|
|
813
|
+
# This shouldn't happen, but just in case skip empty sample data
|
|
814
|
+
if not sample_data:
|
|
815
|
+
continue
|
|
816
|
+
|
|
817
|
+
# Find the first tab that has a sample_link value and attempt to parse it
|
|
818
|
+
sample_link = ""
|
|
819
|
+
for tab in sample_data:
|
|
820
|
+
if tab.get("sample_link"):
|
|
821
|
+
sample_link = tab.get("sample_link")
|
|
822
|
+
break
|
|
823
|
+
parsed_sample_link = self._parse_sample_link(sample_link)
|
|
824
|
+
|
|
825
|
+
# If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
|
|
826
|
+
# format, then create a ProcessedSample and MaterialProcessing instance instead of a
|
|
827
|
+
# Biosample instance. The input samples must be present in the submission for this to
|
|
828
|
+
# work. An exception is raised if any of the referenced input samples are missing.
|
|
829
|
+
if parsed_sample_link is not None:
|
|
830
|
+
processing_type, processing_inputs = parsed_sample_link
|
|
831
|
+
if not all(
|
|
832
|
+
input_id in sample_data_to_nmdc_biosample_ids
|
|
833
|
+
for input_id in processing_inputs
|
|
834
|
+
):
|
|
835
|
+
raise ValueError(
|
|
836
|
+
f"Could not find all input samples in sample_link '{sample_link}'"
|
|
837
|
+
)
|
|
838
|
+
processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
|
|
839
|
+
database.processed_sample_set.append(
|
|
840
|
+
nmdc.ProcessedSample(
|
|
841
|
+
id=processed_sample_id,
|
|
842
|
+
type="nmdc:ProcessedSample",
|
|
843
|
+
name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
|
|
844
|
+
)
|
|
845
|
+
)
|
|
846
|
+
|
|
847
|
+
processing_class = getattr(nmdc, processing_type)
|
|
848
|
+
material_processing = processing_class(
|
|
849
|
+
id=self._id_minter(f"nmdc:{processing_type}")[0],
|
|
850
|
+
type=f"nmdc:{processing_type}",
|
|
851
|
+
has_input=[
|
|
852
|
+
sample_data_to_nmdc_biosample_ids[input_id]
|
|
853
|
+
for input_id in processing_inputs
|
|
854
|
+
],
|
|
855
|
+
has_output=[processed_sample_id],
|
|
856
|
+
)
|
|
857
|
+
database.material_processing_set.append(material_processing)
|
|
858
|
+
|
|
859
|
+
# If there was no sample_link or it doesn't follow the expected format, create a
|
|
860
|
+
# Biosample instance as normal.
|
|
861
|
+
else:
|
|
862
|
+
biosample = self._translate_biosample(
|
|
863
|
+
sample_data,
|
|
864
|
+
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
|
|
865
|
+
nmdc_study_id=nmdc_study_id,
|
|
866
|
+
)
|
|
867
|
+
database.biosample_set.append(biosample)
|
|
852
868
|
|
|
853
869
|
# This section handles the translation of information in the external sequencing tabs into
|
|
854
870
|
# various NMDC objects.
|
|
@@ -1,9 +1,13 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
1
3
|
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import Any, Callable, Dict, List, Optional
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
3
5
|
from nmdc_schema import nmdc
|
|
4
6
|
|
|
5
7
|
JSON_OBJECT = Dict[str, Any]
|
|
6
8
|
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
7
11
|
|
|
8
12
|
class Translator(ABC):
|
|
9
13
|
def __init__(
|
|
@@ -27,3 +31,61 @@ class Translator(ABC):
|
|
|
27
31
|
@abstractmethod
|
|
28
32
|
def get_database(self) -> nmdc.Database:
|
|
29
33
|
pass
|
|
34
|
+
|
|
35
|
+
def _parse_quantity_value(
|
|
36
|
+
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
37
|
+
) -> Union[nmdc.QuantityValue, None]:
|
|
38
|
+
"""Construct a nmdc:QuantityValue from a raw value string
|
|
39
|
+
|
|
40
|
+
The regex pattern minimally matches on a single numeric value (possibly
|
|
41
|
+
floating point). The pattern can also identify a range represented by
|
|
42
|
+
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
43
|
+
characters at the end of the string which are interpreted as a unit. A unit
|
|
44
|
+
may also be explicitly provided as an argument to this function. If parsing
|
|
45
|
+
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
46
|
+
If the pattern is not matched at all None is returned.
|
|
47
|
+
|
|
48
|
+
:param raw_value: string to parse
|
|
49
|
+
:param unit: optional unit, defaults to None. If None, the unit is extracted from the
|
|
50
|
+
raw_value. If a unit is provided, it will override the unit extracted from the
|
|
51
|
+
raw_value.
|
|
52
|
+
:return: nmdc:QuantityValue
|
|
53
|
+
"""
|
|
54
|
+
if raw_value is None:
|
|
55
|
+
return None
|
|
56
|
+
|
|
57
|
+
match = re.fullmatch(
|
|
58
|
+
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
59
|
+
raw_value,
|
|
60
|
+
)
|
|
61
|
+
if not match:
|
|
62
|
+
return None
|
|
63
|
+
|
|
64
|
+
quantity_value_kwargs = {
|
|
65
|
+
"has_raw_value": raw_value,
|
|
66
|
+
"type": "nmdc:QuantityValue",
|
|
67
|
+
}
|
|
68
|
+
if match.group(2):
|
|
69
|
+
# having group 2 means the value is a range like "0 - 1". Either
|
|
70
|
+
# group 1 or group 2 might be the minimum especially when handling
|
|
71
|
+
# negative ranges like "0 - -1"
|
|
72
|
+
num_1 = float(match.group(1))
|
|
73
|
+
num_2 = float(match.group(2))
|
|
74
|
+
quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
|
|
75
|
+
quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
|
|
76
|
+
else:
|
|
77
|
+
# otherwise we just have a single numeric value
|
|
78
|
+
quantity_value_kwargs["has_numeric_value"] = float(match.group(1))
|
|
79
|
+
|
|
80
|
+
if unit:
|
|
81
|
+
# a unit was manually specified
|
|
82
|
+
if match.group(3) and unit != match.group(3):
|
|
83
|
+
# a unit was also found in the raw string; issue a warning
|
|
84
|
+
# if they don't agree, but keep the manually specified one
|
|
85
|
+
logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
86
|
+
quantity_value_kwargs["has_unit"] = unit
|
|
87
|
+
elif match.group(3):
|
|
88
|
+
# a unit was found in the raw string
|
|
89
|
+
quantity_value_kwargs["has_unit"] = match.group(3)
|
|
90
|
+
|
|
91
|
+
return nmdc.QuantityValue(**quantity_value_kwargs)
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -3,10 +3,11 @@ import os
|
|
|
3
3
|
from functools import lru_cache
|
|
4
4
|
from pymongo.database import Database as MongoDatabase
|
|
5
5
|
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
|
-
from
|
|
6
|
+
from refscan.lib.helpers import get_collection_names_from_schema
|
|
7
7
|
|
|
8
|
-
from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
|
|
9
8
|
from nmdc_runtime.site.resources import mongo_resource
|
|
9
|
+
from nmdc_runtime.util import nmdc_schema_view
|
|
10
|
+
|
|
10
11
|
|
|
11
12
|
mode_test = {
|
|
12
13
|
"resource_defs": {"mongo": mongo_resource}
|
|
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
|
|
|
37
38
|
|
|
38
39
|
@lru_cache
|
|
39
40
|
def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
|
|
41
|
+
"""
|
|
42
|
+
TODO: Document this function.
|
|
43
|
+
"""
|
|
44
|
+
schema_view = nmdc_schema_view()
|
|
40
45
|
present_collection_names = set(mdb.list_collection_names())
|
|
41
46
|
return {
|
|
42
47
|
name: (
|
|
43
48
|
name in present_collection_names and "id_1" in mdb[name].index_information()
|
|
44
49
|
)
|
|
45
|
-
for name in get_collection_names_from_schema()
|
|
50
|
+
for name in get_collection_names_from_schema(schema_view)
|
|
46
51
|
}
|
|
47
52
|
|
|
48
53
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from dagster import op, AssetMaterialization, AssetKey,
|
|
1
|
+
from dagster import op, AssetMaterialization, AssetKey, MetadataValue
|
|
2
2
|
from jsonschema import Draft7Validator
|
|
3
3
|
from nmdc_runtime.util import get_nmdc_jsonschema_dict
|
|
4
4
|
from toolz import dissoc
|
|
@@ -92,10 +92,15 @@ def announce_validation_report(context, report, api_object):
|
|
|
92
92
|
asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
|
|
93
93
|
description=f"{collection_name} translation validation",
|
|
94
94
|
metadata={
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
95
|
+
# Note: When this code was originally written, it used Dagster's `EventMetadata` class,
|
|
96
|
+
# which has since been replaced by Dagster's `MetadataValue` class.
|
|
97
|
+
#
|
|
98
|
+
# Reference:
|
|
99
|
+
# - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
|
|
100
|
+
# - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
|
|
101
|
+
#
|
|
102
|
+
"n_errors": MetadataValue.int(len(report["errors"])),
|
|
103
|
+
"object_id": MetadataValue.text(api_object["id"]),
|
|
99
104
|
},
|
|
100
105
|
)
|
|
101
106
|
|