nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/api/__init__.py +0 -0
- nmdc_runtime/api/analytics.py +70 -0
- nmdc_runtime/api/boot/__init__.py +0 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/__init__.py +0 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +170 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/__init__.py +0 -0
- nmdc_runtime/api/db/mongo.py +447 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/__init__.py +0 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +794 -0
- nmdc_runtime/api/endpoints/ids.py +192 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +105 -0
- nmdc_runtime/api/endpoints/queries.py +679 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +229 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +774 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/main.py +401 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/__init__.py +0 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/__init__.py +0 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/minter.py +0 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +253 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +242 -0
- nmdc_runtime/config.py +7 -8
- nmdc_runtime/core/db/Database.py +1 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -9
- nmdc_runtime/lib/extract_nmdc_data.py +0 -8
- nmdc_runtime/lib/nmdc_dataframes.py +3 -7
- nmdc_runtime/lib/nmdc_etl_class.py +1 -7
- nmdc_runtime/minter/adapters/repository.py +1 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +35 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/export/ncbi_xml.py +1 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
- nmdc_runtime/site/graphs.py +1 -22
- nmdc_runtime/site/ops.py +60 -152
- nmdc_runtime/site/repository.py +0 -112
- nmdc_runtime/site/translation/gold_translator.py +4 -12
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/util.py +3 -47
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
- nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/repository.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
4
3
|
|
|
5
4
|
from dagster import (
|
|
6
5
|
repository,
|
|
@@ -14,7 +13,6 @@ from dagster import (
|
|
|
14
13
|
DagsterRunStatus,
|
|
15
14
|
RunStatusSensorContext,
|
|
16
15
|
DefaultSensorStatus,
|
|
17
|
-
in_process_executor,
|
|
18
16
|
)
|
|
19
17
|
from starlette import status
|
|
20
18
|
from toolz import merge, get_in
|
|
@@ -30,8 +28,6 @@ from nmdc_runtime.site.graphs import (
|
|
|
30
28
|
translate_metadata_submission_to_nmdc_schema_database,
|
|
31
29
|
ingest_metadata_submission,
|
|
32
30
|
gold_study_to_database,
|
|
33
|
-
gold_translation,
|
|
34
|
-
gold_translation_curation,
|
|
35
31
|
create_objects_from_site_object_puts,
|
|
36
32
|
housekeeping,
|
|
37
33
|
ensure_jobs,
|
|
@@ -62,9 +58,6 @@ from nmdc_runtime.site.resources import (
|
|
|
62
58
|
from nmdc_runtime.site.resources import (
|
|
63
59
|
get_runtime_api_site_client,
|
|
64
60
|
)
|
|
65
|
-
from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
|
|
66
|
-
from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
|
|
67
|
-
from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
|
|
68
61
|
from nmdc_runtime.util import freeze
|
|
69
62
|
from nmdc_runtime.util import unfreeze
|
|
70
63
|
|
|
@@ -249,82 +242,6 @@ def process_workflow_job_triggers(_context):
|
|
|
249
242
|
yield SkipReason("No new jobs required")
|
|
250
243
|
|
|
251
244
|
|
|
252
|
-
@asset_sensor(
|
|
253
|
-
asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
|
|
254
|
-
job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
|
|
255
|
-
)
|
|
256
|
-
def ensure_gold_translation_job(_context, asset_event):
|
|
257
|
-
mdb = get_mongo(run_config_frozen__normal_env).db
|
|
258
|
-
gold_etl_latest = mdb.objects.find_one(
|
|
259
|
-
{"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
|
|
260
|
-
)
|
|
261
|
-
sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
|
|
262
|
-
if gold_etl_latest is None:
|
|
263
|
-
yield SkipReason("can't find sensed asset object_id in database")
|
|
264
|
-
return
|
|
265
|
-
elif gold_etl_latest["id"] != sensed_object_id:
|
|
266
|
-
yield SkipReason("later object than sensed materialization")
|
|
267
|
-
return
|
|
268
|
-
|
|
269
|
-
run_config = merge(
|
|
270
|
-
run_config_frozen__normal_env,
|
|
271
|
-
{
|
|
272
|
-
"solids": {
|
|
273
|
-
"construct_jobs": {
|
|
274
|
-
"config": {
|
|
275
|
-
"base_jobs": [
|
|
276
|
-
{
|
|
277
|
-
"workflow": {"id": "gold-translation-1.0.0"},
|
|
278
|
-
"config": {"object_id": gold_etl_latest["id"]},
|
|
279
|
-
}
|
|
280
|
-
]
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
},
|
|
285
|
-
)
|
|
286
|
-
yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
@asset_sensor(
|
|
290
|
-
asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
|
|
291
|
-
job=gold_translation_curation.to_job(**preset_normal),
|
|
292
|
-
)
|
|
293
|
-
def claim_and_run_gold_translation_curation(_context, asset_event):
|
|
294
|
-
client = get_runtime_api_site_client(run_config_frozen__normal_env)
|
|
295
|
-
mdb = get_mongo(run_config_frozen__normal_env).db
|
|
296
|
-
object_id_latest = asset_materialization_metadata(
|
|
297
|
-
asset_event, "object_id_latest"
|
|
298
|
-
).text
|
|
299
|
-
job = mdb.jobs.find_one(
|
|
300
|
-
{
|
|
301
|
-
"workflow.id": "gold-translation-1.0.0",
|
|
302
|
-
"config.object_id_latest": object_id_latest,
|
|
303
|
-
}
|
|
304
|
-
)
|
|
305
|
-
if job is not None:
|
|
306
|
-
rv = client.claim_job(job["id"])
|
|
307
|
-
if rv.status_code == status.HTTP_200_OK:
|
|
308
|
-
operation = rv.json()
|
|
309
|
-
run_config = merge(
|
|
310
|
-
run_config_frozen__normal_env,
|
|
311
|
-
{
|
|
312
|
-
"ops": {
|
|
313
|
-
"get_operation": {
|
|
314
|
-
"config": {
|
|
315
|
-
"operation_id": operation["id"],
|
|
316
|
-
}
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
},
|
|
320
|
-
)
|
|
321
|
-
yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
|
|
322
|
-
else:
|
|
323
|
-
yield SkipReason("Job found, but already claimed by this site")
|
|
324
|
-
else:
|
|
325
|
-
yield SkipReason("No job found")
|
|
326
|
-
|
|
327
|
-
|
|
328
245
|
@sensor(
|
|
329
246
|
job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
|
|
330
247
|
default_status=DefaultSensorStatus.RUNNING,
|
|
@@ -502,7 +419,6 @@ def on_run_fail(context: RunStatusSensorContext):
|
|
|
502
419
|
@repository
|
|
503
420
|
def repo():
|
|
504
421
|
graph_jobs = [
|
|
505
|
-
gold_translation.to_job(**preset_normal),
|
|
506
422
|
hello_graph.to_job(name="hello_job"),
|
|
507
423
|
ensure_jobs.to_job(**preset_normal),
|
|
508
424
|
apply_metadata_in.to_job(**preset_normal),
|
|
@@ -518,8 +434,6 @@ def repo():
|
|
|
518
434
|
]
|
|
519
435
|
sensors = [
|
|
520
436
|
done_object_put_ops,
|
|
521
|
-
ensure_gold_translation_job,
|
|
522
|
-
claim_and_run_gold_translation_curation,
|
|
523
437
|
process_workflow_job_triggers,
|
|
524
438
|
claim_and_run_apply_changesheet_jobs,
|
|
525
439
|
claim_and_run_metadata_in_jobs,
|
|
@@ -529,20 +443,6 @@ def repo():
|
|
|
529
443
|
return graph_jobs + schedules + sensors
|
|
530
444
|
|
|
531
445
|
|
|
532
|
-
@repository
|
|
533
|
-
def translation():
|
|
534
|
-
graph_jobs = [jgi_job, gold_job, emsl_job]
|
|
535
|
-
|
|
536
|
-
return graph_jobs
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
@repository
|
|
540
|
-
def test_translation():
|
|
541
|
-
graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
|
|
542
|
-
|
|
543
|
-
return graph_jobs
|
|
544
|
-
|
|
545
|
-
|
|
546
446
|
@repository
|
|
547
447
|
def biosample_submission_ingest():
|
|
548
448
|
normal_resources = run_config_frozen__normal_env["resources"]
|
|
@@ -1110,15 +1010,3 @@ def database_records_stitching():
|
|
|
1110
1010
|
},
|
|
1111
1011
|
),
|
|
1112
1012
|
]
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
# @repository
|
|
1116
|
-
# def validation():
|
|
1117
|
-
# graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
|
|
1118
|
-
# return graph_jobs
|
|
1119
|
-
#
|
|
1120
|
-
#
|
|
1121
|
-
# @repository
|
|
1122
|
-
# def test_validation():
|
|
1123
|
-
# graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
|
|
1124
|
-
# return graph_jobs
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import collections
|
|
2
|
-
import csv
|
|
3
2
|
import re
|
|
4
3
|
from typing import List, Tuple, Union
|
|
5
4
|
from nmdc_schema import nmdc
|
|
@@ -342,12 +341,7 @@ class GoldStudyTranslator(Translator):
|
|
|
342
341
|
if field_value is None:
|
|
343
342
|
return None
|
|
344
343
|
|
|
345
|
-
return nmdc.QuantityValue(
|
|
346
|
-
has_raw_value=field_value,
|
|
347
|
-
has_numeric_value=nmdc.Double(field_value),
|
|
348
|
-
has_unit=unit,
|
|
349
|
-
type="nmdc:QuantityValue",
|
|
350
|
-
)
|
|
344
|
+
return self._parse_quantity_value(str(field_value), unit)
|
|
351
345
|
|
|
352
346
|
def _get_text_value(
|
|
353
347
|
self, gold_entity: JSON_OBJECT, gold_field: str
|
|
@@ -573,13 +567,11 @@ class GoldStudyTranslator(Translator):
|
|
|
573
567
|
gold_biosample_id = gold_biosample["biosampleGoldId"]
|
|
574
568
|
return nmdc.Biosample(
|
|
575
569
|
add_date=gold_biosample.get("addDate"),
|
|
576
|
-
alt=self._get_quantity_value(
|
|
577
|
-
gold_biosample, "altitudeInMeters", unit="meters"
|
|
578
|
-
),
|
|
570
|
+
alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
|
|
579
571
|
collected_from=nmdc_field_site_id,
|
|
580
572
|
collection_date=self._get_collection_date(gold_biosample),
|
|
581
573
|
depth=self._get_quantity_value(
|
|
582
|
-
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="meters"
|
|
574
|
+
gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
|
|
583
575
|
),
|
|
584
576
|
description=gold_biosample.get("description"),
|
|
585
577
|
diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
|
|
@@ -618,7 +610,7 @@ class GoldStudyTranslator(Translator):
|
|
|
618
610
|
),
|
|
619
611
|
specific_ecosystem=gold_biosample.get("specificEcosystem"),
|
|
620
612
|
subsurface_depth=self._get_quantity_value(
|
|
621
|
-
gold_biosample, "subsurfaceDepthInMeters", unit="meters"
|
|
613
|
+
gold_biosample, "subsurfaceDepthInMeters", unit="m"
|
|
622
614
|
),
|
|
623
615
|
temp=self._get_quantity_value(
|
|
624
616
|
gold_biosample, "sampleCollectionTemperature"
|
|
@@ -11,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
11
11
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
12
12
|
_get_value_or_none,
|
|
13
13
|
_create_controlled_identified_term_value,
|
|
14
|
-
_create_controlled_term_value,
|
|
15
14
|
_create_geolocation_value,
|
|
16
15
|
_create_quantity_value,
|
|
17
16
|
_create_timestamp_value,
|
|
@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
10
10
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
11
11
|
_get_value_or_none,
|
|
12
12
|
_create_controlled_identified_term_value,
|
|
13
|
-
_create_controlled_term_value,
|
|
14
13
|
_create_geolocation_value,
|
|
15
14
|
_create_quantity_value,
|
|
16
15
|
_create_timestamp_value,
|
|
@@ -153,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
153
152
|
collection_date=_create_timestamp_value(
|
|
154
153
|
biosample_row["collectDate"].values[0]
|
|
155
154
|
),
|
|
156
|
-
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "
|
|
155
|
+
temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
|
|
157
156
|
depth=nmdc.QuantityValue(
|
|
158
157
|
has_minimum_numeric_value=_get_value_or_none(
|
|
159
158
|
biosample_row, "sampleTopDepth"
|
|
@@ -169,13 +168,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
169
168
|
analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
|
|
170
169
|
env_package=_create_text_value(biosample_row["sampleType"].values[0]),
|
|
171
170
|
nitro=_create_quantity_value(
|
|
172
|
-
biosample_row["nitrogenPercent"].values[0], "
|
|
171
|
+
biosample_row["nitrogenPercent"].values[0], "%"
|
|
173
172
|
),
|
|
174
173
|
org_carb=_create_quantity_value(
|
|
175
|
-
biosample_row["organicCPercent"].values[0], "
|
|
174
|
+
biosample_row["organicCPercent"].values[0], "%"
|
|
176
175
|
),
|
|
177
176
|
carb_nitro_ratio=_create_quantity_value(
|
|
178
|
-
biosample_row["CNratio"].values[0],
|
|
177
|
+
biosample_row["CNratio"].values[0], "ratio"
|
|
179
178
|
),
|
|
180
179
|
ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
|
|
181
180
|
water_content=(
|
|
@@ -3,7 +3,6 @@ import sqlite3
|
|
|
3
3
|
from typing import Dict, Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
|
-
import requests
|
|
7
6
|
import requests_cache
|
|
8
7
|
|
|
9
8
|
from nmdc_schema import nmdc
|
|
@@ -12,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
|
|
|
12
11
|
from nmdc_runtime.site.translation.neon_utils import (
|
|
13
12
|
_get_value_or_none,
|
|
14
13
|
_create_controlled_identified_term_value,
|
|
15
|
-
_create_controlled_term_value,
|
|
16
14
|
_create_geolocation_value,
|
|
17
15
|
_create_quantity_value,
|
|
18
16
|
_create_timestamp_value,
|
|
@@ -278,61 +278,9 @@ class SubmissionPortalTranslator(Translator):
|
|
|
278
278
|
def _get_quantity_value(
|
|
279
279
|
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
280
280
|
) -> Union[nmdc.QuantityValue, None]:
|
|
281
|
-
"""Construct a nmdc:QuantityValue from a raw value string
|
|
281
|
+
"""Construct a nmdc:QuantityValue from a raw value string"""
|
|
282
282
|
|
|
283
|
-
The regex pattern minimally matches on a single numeric value (possibly
|
|
284
|
-
floating point). The pattern can also identify a range represented by
|
|
285
|
-
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
286
|
-
characters at the end of the string which are interpreted as a unit. A unit
|
|
287
|
-
may also be explicitly provided as an argument to this function. If parsing
|
|
288
|
-
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
289
|
-
If the pattern is not matched at all None is returned.
|
|
290
|
-
|
|
291
|
-
TODO: currently the parsed unit string is used as-is. In the future we may want
|
|
292
|
-
to be stricter about what we accept or coerce into a controlled value set
|
|
293
|
-
|
|
294
|
-
:param raw_value: string to parse
|
|
295
|
-
:param unit: optional unit, defaults to None
|
|
296
|
-
:return: nmdc:QuantityValue
|
|
297
|
-
"""
|
|
298
|
-
if raw_value is None:
|
|
299
|
-
return None
|
|
300
|
-
|
|
301
|
-
match = re.fullmatch(
|
|
302
|
-
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
303
|
-
raw_value,
|
|
304
|
-
)
|
|
305
|
-
if not match:
|
|
306
|
-
return None
|
|
307
|
-
|
|
308
|
-
qv = nmdc.QuantityValue(
|
|
309
|
-
has_raw_value=raw_value,
|
|
310
|
-
type="nmdc:QuantityValue",
|
|
311
|
-
)
|
|
312
|
-
if match.group(2):
|
|
313
|
-
# having group 2 means the value is a range like "0 - 1". Either
|
|
314
|
-
# group 1 or group 2 might be the minimum especially when handling
|
|
315
|
-
# negative ranges like "0 - -1"
|
|
316
|
-
num_1 = float(match.group(1))
|
|
317
|
-
num_2 = float(match.group(2))
|
|
318
|
-
qv.has_minimum_numeric_value = min(num_1, num_2)
|
|
319
|
-
qv.has_maximum_numeric_value = max(num_1, num_2)
|
|
320
|
-
else:
|
|
321
|
-
# otherwise we just have a single numeric value
|
|
322
|
-
qv.has_numeric_value = float(match.group(1))
|
|
323
|
-
|
|
324
|
-
if unit:
|
|
325
|
-
# a unit was manually specified
|
|
326
|
-
if match.group(3) and unit != match.group(3):
|
|
327
|
-
# a unit was also found in the raw string; issue a warning
|
|
328
|
-
# if they don't agree, but keep the manually specified one
|
|
329
|
-
logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
330
|
-
qv.has_unit = unit
|
|
331
|
-
elif match.group(3):
|
|
332
|
-
# a unit was found in the raw string
|
|
333
|
-
qv.has_unit = match.group(3)
|
|
334
|
-
|
|
335
|
-
return qv
|
|
283
|
+
return self._parse_quantity_value(raw_value, unit)
|
|
336
284
|
|
|
337
285
|
def _get_ontology_class(
|
|
338
286
|
self, raw_value: Optional[str]
|
|
@@ -1,9 +1,13 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
1
3
|
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import Any, Callable, Dict, List, Optional
|
|
4
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
3
5
|
from nmdc_schema import nmdc
|
|
4
6
|
|
|
5
7
|
JSON_OBJECT = Dict[str, Any]
|
|
6
8
|
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
7
11
|
|
|
8
12
|
class Translator(ABC):
|
|
9
13
|
def __init__(
|
|
@@ -27,3 +31,61 @@ class Translator(ABC):
|
|
|
27
31
|
@abstractmethod
|
|
28
32
|
def get_database(self) -> nmdc.Database:
|
|
29
33
|
pass
|
|
34
|
+
|
|
35
|
+
def _parse_quantity_value(
|
|
36
|
+
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
37
|
+
) -> Union[nmdc.QuantityValue, None]:
|
|
38
|
+
"""Construct a nmdc:QuantityValue from a raw value string
|
|
39
|
+
|
|
40
|
+
The regex pattern minimally matches on a single numeric value (possibly
|
|
41
|
+
floating point). The pattern can also identify a range represented by
|
|
42
|
+
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
43
|
+
characters at the end of the string which are interpreted as a unit. A unit
|
|
44
|
+
may also be explicitly provided as an argument to this function. If parsing
|
|
45
|
+
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
46
|
+
If the pattern is not matched at all None is returned.
|
|
47
|
+
|
|
48
|
+
:param raw_value: string to parse
|
|
49
|
+
:param unit: optional unit, defaults to None. If None, the unit is extracted from the
|
|
50
|
+
raw_value. If a unit is provided, it will override the unit extracted from the
|
|
51
|
+
raw_value.
|
|
52
|
+
:return: nmdc:QuantityValue
|
|
53
|
+
"""
|
|
54
|
+
if raw_value is None:
|
|
55
|
+
return None
|
|
56
|
+
|
|
57
|
+
match = re.fullmatch(
|
|
58
|
+
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
59
|
+
raw_value,
|
|
60
|
+
)
|
|
61
|
+
if not match:
|
|
62
|
+
return None
|
|
63
|
+
|
|
64
|
+
quantity_value_kwargs = {
|
|
65
|
+
"has_raw_value": raw_value,
|
|
66
|
+
"type": "nmdc:QuantityValue",
|
|
67
|
+
}
|
|
68
|
+
if match.group(2):
|
|
69
|
+
# having group 2 means the value is a range like "0 - 1". Either
|
|
70
|
+
# group 1 or group 2 might be the minimum especially when handling
|
|
71
|
+
# negative ranges like "0 - -1"
|
|
72
|
+
num_1 = float(match.group(1))
|
|
73
|
+
num_2 = float(match.group(2))
|
|
74
|
+
quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
|
|
75
|
+
quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
|
|
76
|
+
else:
|
|
77
|
+
# otherwise we just have a single numeric value
|
|
78
|
+
quantity_value_kwargs["has_numeric_value"] = float(match.group(1))
|
|
79
|
+
|
|
80
|
+
if unit:
|
|
81
|
+
# a unit was manually specified
|
|
82
|
+
if match.group(3) and unit != match.group(3):
|
|
83
|
+
# a unit was also found in the raw string; issue a warning
|
|
84
|
+
# if they don't agree, but keep the manually specified one
|
|
85
|
+
logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
86
|
+
quantity_value_kwargs["has_unit"] = unit
|
|
87
|
+
elif match.group(3):
|
|
88
|
+
# a unit was found in the raw string
|
|
89
|
+
quantity_value_kwargs["has_unit"] = match.group(3)
|
|
90
|
+
|
|
91
|
+
return nmdc.QuantityValue(**quantity_value_kwargs)
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -3,10 +3,11 @@ import os
|
|
|
3
3
|
from functools import lru_cache
|
|
4
4
|
from pymongo.database import Database as MongoDatabase
|
|
5
5
|
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
|
-
from
|
|
6
|
+
from refscan.lib.helpers import get_collection_names_from_schema
|
|
7
7
|
|
|
8
|
-
from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
|
|
9
8
|
from nmdc_runtime.site.resources import mongo_resource
|
|
9
|
+
from nmdc_runtime.util import nmdc_schema_view
|
|
10
|
+
|
|
10
11
|
|
|
11
12
|
mode_test = {
|
|
12
13
|
"resource_defs": {"mongo": mongo_resource}
|
|
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
|
|
|
37
38
|
|
|
38
39
|
@lru_cache
|
|
39
40
|
def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
|
|
41
|
+
"""
|
|
42
|
+
TODO: Document this function.
|
|
43
|
+
"""
|
|
44
|
+
schema_view = nmdc_schema_view()
|
|
40
45
|
present_collection_names = set(mdb.list_collection_names())
|
|
41
46
|
return {
|
|
42
47
|
name: (
|
|
43
48
|
name in present_collection_names and "id_1" in mdb[name].index_information()
|
|
44
49
|
)
|
|
45
|
-
for name in get_collection_names_from_schema()
|
|
50
|
+
for name in get_collection_names_from_schema(schema_view)
|
|
46
51
|
}
|
|
47
52
|
|
|
48
53
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from dagster import op, AssetMaterialization, AssetKey, EventMetadata
|
|
1
|
+
from dagster import op, AssetMaterialization, AssetKey, MetadataValue
|
|
2
2
|
from jsonschema import Draft7Validator
|
|
3
3
|
from nmdc_runtime.util import get_nmdc_jsonschema_dict
|
|
4
4
|
from toolz import dissoc
|
|
@@ -92,10 +92,15 @@ def announce_validation_report(context, report, api_object):
|
|
|
92
92
|
asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
|
|
93
93
|
description=f"{collection_name} translation validation",
|
|
94
94
|
metadata={
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
95
|
+
# Note: When this code was originally written, it used Dagster's `EventMetadata` class,
|
|
96
|
+
# which has since been replaced by Dagster's `MetadataValue` class.
|
|
97
|
+
#
|
|
98
|
+
# Reference:
|
|
99
|
+
# - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
|
|
100
|
+
# - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
|
|
101
|
+
#
|
|
102
|
+
"n_errors": MetadataValue.int(len(report["errors"])),
|
|
103
|
+
"object_id": MetadataValue.text(api_object["id"]),
|
|
99
104
|
},
|
|
100
105
|
)
|
|
101
106
|
|
nmdc_runtime/util.py
CHANGED
|
@@ -14,8 +14,6 @@ from typing import Callable, List, Optional, Set, Dict
|
|
|
14
14
|
import fastjsonschema
|
|
15
15
|
import requests
|
|
16
16
|
from frozendict import frozendict
|
|
17
|
-
from linkml_runtime import linkml_model
|
|
18
|
-
from linkml_runtime.utils.schemaview import SchemaView
|
|
19
17
|
from nmdc_schema.get_nmdc_view import ViewGetter
|
|
20
18
|
from pymongo.database import Database as MongoDatabase
|
|
21
19
|
from pymongo.errors import OperationFailure
|
|
@@ -27,48 +25,6 @@ from nmdc_runtime.api.core.util import sha256hash_from_file
|
|
|
27
25
|
from nmdc_runtime.api.models.object import DrsObjectIn
|
|
28
26
|
|
|
29
27
|
|
|
30
|
-
def get_names_of_classes_in_effective_range_of_slot(
|
|
31
|
-
schema_view: SchemaView, slot_definition: linkml_model.SlotDefinition
|
|
32
|
-
) -> List[str]:
|
|
33
|
-
r"""
|
|
34
|
-
Determine the slot's "effective" range, by taking into account its `any_of` constraints (if defined).
|
|
35
|
-
|
|
36
|
-
Note: The `any_of` constraints constrain the slot's "effective" range beyond that described by the
|
|
37
|
-
induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result
|
|
38
|
-
of applying those additional constraints, so we do it manually here (if any are defined).
|
|
39
|
-
Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646
|
|
40
|
-
|
|
41
|
-
Reference: https://linkml.io/linkml-model/latest/docs/any_of/
|
|
42
|
-
"""
|
|
43
|
-
|
|
44
|
-
# Initialize the list to be empty.
|
|
45
|
-
names_of_eligible_target_classes = []
|
|
46
|
-
|
|
47
|
-
# If the `any_of` constraint is defined on this slot, use that instead of the `range`.
|
|
48
|
-
if "any_of" in slot_definition and len(slot_definition.any_of) > 0:
|
|
49
|
-
for slot_expression in slot_definition.any_of:
|
|
50
|
-
# Use the slot expression's `range` to get the specified eligible class name
|
|
51
|
-
# and the names of all classes that inherit from that eligible class.
|
|
52
|
-
if slot_expression.range in schema_view.all_classes():
|
|
53
|
-
own_and_descendant_class_names = schema_view.class_descendants(
|
|
54
|
-
slot_expression.range
|
|
55
|
-
)
|
|
56
|
-
names_of_eligible_target_classes.extend(own_and_descendant_class_names)
|
|
57
|
-
else:
|
|
58
|
-
# Use the slot's `range` to get the specified eligible class name
|
|
59
|
-
# and the names of all classes that inherit from that eligible class.
|
|
60
|
-
if slot_definition.range in schema_view.all_classes():
|
|
61
|
-
own_and_descendant_class_names = schema_view.class_descendants(
|
|
62
|
-
slot_definition.range
|
|
63
|
-
)
|
|
64
|
-
names_of_eligible_target_classes.extend(own_and_descendant_class_names)
|
|
65
|
-
|
|
66
|
-
# Remove duplicate class names.
|
|
67
|
-
names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))
|
|
68
|
-
|
|
69
|
-
return names_of_eligible_target_classes
|
|
70
|
-
|
|
71
|
-
|
|
72
28
|
def get_class_names_from_collection_spec(
|
|
73
29
|
spec: dict, prefix: Optional[str] = None
|
|
74
30
|
) -> List[str]:
|
|
@@ -324,9 +280,9 @@ def find_one(k_v: dict, entities: Iterable[dict]):
|
|
|
324
280
|
"""Find the first entity with key-value pair k_v, if any?
|
|
325
281
|
|
|
326
282
|
>>> find_one({"id": "foo"}, [{"id": "foo"}])
|
|
283
|
+
{'id': 'foo'}
|
|
284
|
+
>>> find_one({"id": "foo"}, [{"id": "bar"}]) is None
|
|
327
285
|
True
|
|
328
|
-
>>> find_one({"id": "foo"}, [{"id": "bar"}])
|
|
329
|
-
False
|
|
330
286
|
"""
|
|
331
287
|
if len(k_v) > 1:
|
|
332
288
|
raise Exception("Supports only one key-value pair")
|
|
@@ -370,7 +326,7 @@ def nmdc_database_collection_names():
|
|
|
370
326
|
TODO: Document this function.
|
|
371
327
|
|
|
372
328
|
TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
|
|
373
|
-
collections, use the function named `get_collection_names_from_schema`
|
|
329
|
+
collections, import/use the function named `get_collection_names_from_schema` from `refscan.lib.helpers`
|
|
374
330
|
instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
|
|
375
331
|
maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
|
|
376
332
|
"""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nmdc_runtime
|
|
3
|
-
Version: 2.9.0
|
|
3
|
+
Version: 2.10.0
|
|
4
4
|
Summary: A runtime system for NMDC data management and orchestration
|
|
5
5
|
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
6
|
Author: Donny Winston
|
|
@@ -152,7 +152,7 @@ http://127.0.0.1:8000/redoc/.
|
|
|
152
152
|
|
|
153
153
|
|
|
154
154
|
* NOTE: Any time you add or change requirements in requirements/main.in or requirements/dev.in, you must run:
|
|
155
|
-
```
|
|
155
|
+
```bash
|
|
156
156
|
pip-compile --build-isolation --allow-unsafe --resolver=backtracking --strip-extras --output-file requirements/[main|dev].txt requirements/[main|dev].in
|
|
157
157
|
```
|
|
158
158
|
to generate main.txt and dev.txt files respectively. main.in is kind of like a poetry dependency stanza, dev.in is kind
|
|
@@ -160,9 +160,6 @@ of like poetry dev.dependencies stanza. main.txt and dev.txt are kind of like po
|
|
|
160
160
|
versions of dependencies to use. main.txt and dev.txt are combined in the docker compose build process to create the
|
|
161
161
|
final requirements.txt file and import the dependencies into the Docker image.
|
|
162
162
|
|
|
163
|
-
|
|
164
|
-
```bash
|
|
165
|
-
|
|
166
163
|
## Local Testing
|
|
167
164
|
|
|
168
165
|
Tests can be found in `tests` and are run with the following commands:
|
|
@@ -173,8 +170,9 @@ make test
|
|
|
173
170
|
|
|
174
171
|
# Run a Specific test file eg. tests/test_api/test_endpoints.py
|
|
175
172
|
make test ARGS="tests/test_api/test_endpoints.py"
|
|
176
|
-
|
|
173
|
+
|
|
177
174
|
docker compose --file docker-compose.test.yml run test
|
|
175
|
+
```
|
|
178
176
|
|
|
179
177
|
As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
|
|
180
178
|
desired and does not break over time.
|
|
@@ -182,6 +180,59 @@ desired and does not break over time.
|
|
|
182
180
|
[For hints on how to write tests for solids and pipelines in Dagster, see their documentation
|
|
183
181
|
tutorial on Testing](https://docs.dagster.io/guides/test/unit-testing-assets-and-ops).
|
|
184
182
|
|
|
183
|
+
### Performance profiling
|
|
184
|
+
|
|
185
|
+
We use a tool called [Pyinstrument](https://pyinstrument.readthedocs.io) to profile the performance of the Runtime API while processing an individual HTTP request.
|
|
186
|
+
|
|
187
|
+
Here's how you can do that:
|
|
188
|
+
|
|
189
|
+
1. In your `.env` file, set `IS_PROFILING_ENABLED` to `true`
|
|
190
|
+
2. Start/restart your development stack: `$ make up-dev`
|
|
191
|
+
3. Ensure the endpoint function whose performance you want to profile is defined using `async def` (as opposed to just `def`) ([reference](https://github.com/joerick/pyinstrument/issues/257))
|
|
192
|
+
|
|
193
|
+
Then—with all of that done—submit an HTTP request that includes the URL query parameter: `profile=true`. Instructions for doing that are in the sections below.
|
|
194
|
+
|
|
195
|
+
<details>
|
|
196
|
+
<summary>Show/hide instructions for <code>GET</code> requests only (involves web browser)</summary>
|
|
197
|
+
|
|
198
|
+
1. In your web browser, visit the endpoint's URL, but add the `profile=true` query parameter to the URL. Examples:
|
|
199
|
+
```diff
|
|
200
|
+
A. If the URL doesn't already have query parameters, append `?profile=true`.
|
|
201
|
+
- http://127.0.0.1:8000/nmdcschema/biosample_set
|
|
202
|
+
+ http://127.0.0.1:8000/nmdcschema/biosample_set?profile=true
|
|
203
|
+
|
|
204
|
+
B. If the URL already has query parameters, append `&profile=true`.
|
|
205
|
+
- http://127.0.0.1:8000/nmdcschema/biosample_set?filter={}
|
|
206
|
+
+ http://127.0.0.1:8000/nmdcschema/biosample_set?filter={}&profile=true
|
|
207
|
+
```
|
|
208
|
+
2. Your web browser will display a performance profiling report.
|
|
209
|
+
> Note: The Runtime API will have responded with a performance profiling report web page, instead of its normal response (which the Runtime discards).
|
|
210
|
+
|
|
211
|
+
That'll only work for `GET` requests, though, since you're limited to specifying the request via the address bar.
|
|
212
|
+
|
|
213
|
+
</details>
|
|
214
|
+
|
|
215
|
+
<details>
|
|
216
|
+
<summary>Show/hide instructions for <strong>all</strong> kinds of requests (involves <code>curl</code> + web browser)</summary>
|
|
217
|
+
|
|
218
|
+
1. At your terminal, type or paste the `curl` command you want to run (you can copy/paste one from Swagger UI).
|
|
219
|
+
2. Append the `profile=true` query parameter to the URL in the command, and use the `-o` option to save the response to a file whose name ends with `.html`. For example:
|
|
220
|
+
```diff
|
|
221
|
+
curl -X 'POST' \
|
|
222
|
+
- 'http://127.0.0.1:8000/metadata/json:validate' \
|
|
223
|
+
+ 'http://127.0.0.1:8000/metadata/json:validate?profile=true' \
|
|
224
|
+
+ -o /tmp/profile.html
|
|
225
|
+
-H 'accept: application/json' \
|
|
226
|
+
-H 'Content-Type: application/json' \
|
|
227
|
+
-d '{"biosample_set": []}'
|
|
228
|
+
```
|
|
229
|
+
3. Run the command.
|
|
230
|
+
> Note: The Runtime API will respond with a performance profiling report web page, instead of its normal response (which the Runtime discards). The performance profiling report web page will be saved to the `.html` file to which you redirected the command output.
|
|
231
|
+
4. Double-click on the `.html` file to view it in your web browser.
|
|
232
|
+
1. Alternatively, open your web browser and navigate to the `.html` file; e.g., enter `file:///tmp/profile.html` into the address bar.
|
|
233
|
+
|
|
234
|
+
</details>
|
|
235
|
+
|
|
185
236
|
### RAM usage
|
|
186
237
|
|
|
187
238
|
The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
|