nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (98)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +7 -8
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +1 -22
  76. nmdc_runtime/site/ops.py +60 -152
  77. nmdc_runtime/site/repository.py +0 -112
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/util.py +3 -47
  87. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  88. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  89. nmdc_runtime/site/translation/emsl.py +0 -43
  90. nmdc_runtime/site/translation/gold.py +0 -53
  91. nmdc_runtime/site/translation/jgi.py +0 -32
  92. nmdc_runtime/site/translation/util.py +0 -132
  93. nmdc_runtime/site/validation/jgi.py +0 -43
  94. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  95. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  96. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  97. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  98. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,5 @@
1
1
  import json
2
2
 
3
- from typing import Any
4
3
 
5
4
  from dagster import (
6
5
  repository,
@@ -14,7 +13,6 @@ from dagster import (
14
13
  DagsterRunStatus,
15
14
  RunStatusSensorContext,
16
15
  DefaultSensorStatus,
17
- in_process_executor,
18
16
  )
19
17
  from starlette import status
20
18
  from toolz import merge, get_in
@@ -30,8 +28,6 @@ from nmdc_runtime.site.graphs import (
30
28
  translate_metadata_submission_to_nmdc_schema_database,
31
29
  ingest_metadata_submission,
32
30
  gold_study_to_database,
33
- gold_translation,
34
- gold_translation_curation,
35
31
  create_objects_from_site_object_puts,
36
32
  housekeeping,
37
33
  ensure_jobs,
@@ -62,9 +58,6 @@ from nmdc_runtime.site.resources import (
62
58
  from nmdc_runtime.site.resources import (
63
59
  get_runtime_api_site_client,
64
60
  )
65
- from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
66
- from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
67
- from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
68
61
  from nmdc_runtime.util import freeze
69
62
  from nmdc_runtime.util import unfreeze
70
63
 
@@ -249,82 +242,6 @@ def process_workflow_job_triggers(_context):
249
242
  yield SkipReason("No new jobs required")
250
243
 
251
244
 
252
- @asset_sensor(
253
- asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
254
- job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
255
- )
256
- def ensure_gold_translation_job(_context, asset_event):
257
- mdb = get_mongo(run_config_frozen__normal_env).db
258
- gold_etl_latest = mdb.objects.find_one(
259
- {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
260
- )
261
- sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
262
- if gold_etl_latest is None:
263
- yield SkipReason("can't find sensed asset object_id in database")
264
- return
265
- elif gold_etl_latest["id"] != sensed_object_id:
266
- yield SkipReason("later object than sensed materialization")
267
- return
268
-
269
- run_config = merge(
270
- run_config_frozen__normal_env,
271
- {
272
- "solids": {
273
- "construct_jobs": {
274
- "config": {
275
- "base_jobs": [
276
- {
277
- "workflow": {"id": "gold-translation-1.0.0"},
278
- "config": {"object_id": gold_etl_latest["id"]},
279
- }
280
- ]
281
- }
282
- }
283
- }
284
- },
285
- )
286
- yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
287
-
288
-
289
- @asset_sensor(
290
- asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
291
- job=gold_translation_curation.to_job(**preset_normal),
292
- )
293
- def claim_and_run_gold_translation_curation(_context, asset_event):
294
- client = get_runtime_api_site_client(run_config_frozen__normal_env)
295
- mdb = get_mongo(run_config_frozen__normal_env).db
296
- object_id_latest = asset_materialization_metadata(
297
- asset_event, "object_id_latest"
298
- ).text
299
- job = mdb.jobs.find_one(
300
- {
301
- "workflow.id": "gold-translation-1.0.0",
302
- "config.object_id_latest": object_id_latest,
303
- }
304
- )
305
- if job is not None:
306
- rv = client.claim_job(job["id"])
307
- if rv.status_code == status.HTTP_200_OK:
308
- operation = rv.json()
309
- run_config = merge(
310
- run_config_frozen__normal_env,
311
- {
312
- "ops": {
313
- "get_operation": {
314
- "config": {
315
- "operation_id": operation["id"],
316
- }
317
- }
318
- }
319
- },
320
- )
321
- yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
322
- else:
323
- yield SkipReason("Job found, but already claimed by this site")
324
- else:
325
- yield SkipReason("No job found")
326
-
327
-
328
245
  @sensor(
329
246
  job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
330
247
  default_status=DefaultSensorStatus.RUNNING,
@@ -502,7 +419,6 @@ def on_run_fail(context: RunStatusSensorContext):
502
419
  @repository
503
420
  def repo():
504
421
  graph_jobs = [
505
- gold_translation.to_job(**preset_normal),
506
422
  hello_graph.to_job(name="hello_job"),
507
423
  ensure_jobs.to_job(**preset_normal),
508
424
  apply_metadata_in.to_job(**preset_normal),
@@ -518,8 +434,6 @@ def repo():
518
434
  ]
519
435
  sensors = [
520
436
  done_object_put_ops,
521
- ensure_gold_translation_job,
522
- claim_and_run_gold_translation_curation,
523
437
  process_workflow_job_triggers,
524
438
  claim_and_run_apply_changesheet_jobs,
525
439
  claim_and_run_metadata_in_jobs,
@@ -529,20 +443,6 @@ def repo():
529
443
  return graph_jobs + schedules + sensors
530
444
 
531
445
 
532
- @repository
533
- def translation():
534
- graph_jobs = [jgi_job, gold_job, emsl_job]
535
-
536
- return graph_jobs
537
-
538
-
539
- @repository
540
- def test_translation():
541
- graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
542
-
543
- return graph_jobs
544
-
545
-
546
446
  @repository
547
447
  def biosample_submission_ingest():
548
448
  normal_resources = run_config_frozen__normal_env["resources"]
@@ -1110,15 +1010,3 @@ def database_records_stitching():
1110
1010
  },
1111
1011
  ),
1112
1012
  ]
1113
-
1114
-
1115
- # @repository
1116
- # def validation():
1117
- # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
1118
- # return graph_jobs
1119
- #
1120
- #
1121
- # @repository
1122
- # def test_validation():
1123
- # graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
1124
- # return graph_jobs
@@ -1,5 +1,4 @@
1
1
  import collections
2
- import csv
3
2
  import re
4
3
  from typing import List, Tuple, Union
5
4
  from nmdc_schema import nmdc
@@ -342,12 +341,7 @@ class GoldStudyTranslator(Translator):
342
341
  if field_value is None:
343
342
  return None
344
343
 
345
- return nmdc.QuantityValue(
346
- has_raw_value=field_value,
347
- has_numeric_value=nmdc.Double(field_value),
348
- has_unit=unit,
349
- type="nmdc:QuantityValue",
350
- )
344
+ return self._parse_quantity_value(str(field_value), unit)
351
345
 
352
346
  def _get_text_value(
353
347
  self, gold_entity: JSON_OBJECT, gold_field: str
@@ -573,13 +567,11 @@ class GoldStudyTranslator(Translator):
573
567
  gold_biosample_id = gold_biosample["biosampleGoldId"]
574
568
  return nmdc.Biosample(
575
569
  add_date=gold_biosample.get("addDate"),
576
- alt=self._get_quantity_value(
577
- gold_biosample, "altitudeInMeters", unit="meters"
578
- ),
570
+ alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
579
571
  collected_from=nmdc_field_site_id,
580
572
  collection_date=self._get_collection_date(gold_biosample),
581
573
  depth=self._get_quantity_value(
582
- gold_biosample, ("depthInMeters", "depthInMeters2"), unit="meters"
574
+ gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
583
575
  ),
584
576
  description=gold_biosample.get("description"),
585
577
  diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
@@ -618,7 +610,7 @@ class GoldStudyTranslator(Translator):
618
610
  ),
619
611
  specific_ecosystem=gold_biosample.get("specificEcosystem"),
620
612
  subsurface_depth=self._get_quantity_value(
621
- gold_biosample, "subsurfaceDepthInMeters", unit="meters"
613
+ gold_biosample, "subsurfaceDepthInMeters", unit="m"
622
614
  ),
623
615
  temp=self._get_quantity_value(
624
616
  gold_biosample, "sampleCollectionTemperature"
@@ -11,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
11
11
  from nmdc_runtime.site.translation.neon_utils import (
12
12
  _get_value_or_none,
13
13
  _create_controlled_identified_term_value,
14
- _create_controlled_term_value,
15
14
  _create_geolocation_value,
16
15
  _create_quantity_value,
17
16
  _create_timestamp_value,
@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
10
10
  from nmdc_runtime.site.translation.neon_utils import (
11
11
  _get_value_or_none,
12
12
  _create_controlled_identified_term_value,
13
- _create_controlled_term_value,
14
13
  _create_geolocation_value,
15
14
  _create_quantity_value,
16
15
  _create_timestamp_value,
@@ -153,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
153
152
  collection_date=_create_timestamp_value(
154
153
  biosample_row["collectDate"].values[0]
155
154
  ),
156
- temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Celsius"),
155
+ temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
157
156
  depth=nmdc.QuantityValue(
158
157
  has_minimum_numeric_value=_get_value_or_none(
159
158
  biosample_row, "sampleTopDepth"
@@ -169,13 +168,13 @@ class NeonSoilDataTranslator(Translator):
169
168
  analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
170
169
  env_package=_create_text_value(biosample_row["sampleType"].values[0]),
171
170
  nitro=_create_quantity_value(
172
- biosample_row["nitrogenPercent"].values[0], "percent"
171
+ biosample_row["nitrogenPercent"].values[0], "%"
173
172
  ),
174
173
  org_carb=_create_quantity_value(
175
- biosample_row["organicCPercent"].values[0], "percent"
174
+ biosample_row["organicCPercent"].values[0], "%"
176
175
  ),
177
176
  carb_nitro_ratio=_create_quantity_value(
178
- biosample_row["CNratio"].values[0], None
177
+ biosample_row["CNratio"].values[0], "ratio"
179
178
  ),
180
179
  ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
181
180
  water_content=(
@@ -3,7 +3,6 @@ import sqlite3
3
3
  from typing import Dict, Optional, Union
4
4
 
5
5
  import pandas as pd
6
- import requests
7
6
  import requests_cache
8
7
 
9
8
  from nmdc_schema import nmdc
@@ -12,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
12
11
  from nmdc_runtime.site.translation.neon_utils import (
13
12
  _get_value_or_none,
14
13
  _create_controlled_identified_term_value,
15
- _create_controlled_term_value,
16
14
  _create_geolocation_value,
17
15
  _create_quantity_value,
18
16
  _create_timestamp_value,
@@ -278,61 +278,9 @@ class SubmissionPortalTranslator(Translator):
278
278
  def _get_quantity_value(
279
279
  self, raw_value: Optional[str], unit: Optional[str] = None
280
280
  ) -> Union[nmdc.QuantityValue, None]:
281
- """Construct a nmdc:QuantityValue from a raw value string
281
+ """Construct a nmdc:QuantityValue from a raw value string"""
282
282
 
283
- The regex pattern minimally matches on a single numeric value (possibly
284
- floating point). The pattern can also identify a range represented by
285
- two numeric values separated by a hyphen. It can also identify non-numeric
286
- characters at the end of the string which are interpreted as a unit. A unit
287
- may also be explicitly provided as an argument to this function. If parsing
288
- identifies a unit and a unit argument is provided, the unit argument is used.
289
- If the pattern is not matched at all None is returned.
290
-
291
- TODO: currently the parsed unit string is used as-is. In the future we may want
292
- to be stricter about what we accept or coerce into a controlled value set
293
-
294
- :param raw_value: string to parse
295
- :param unit: optional unit, defaults to None
296
- :return: nmdc:QuantityValue
297
- """
298
- if raw_value is None:
299
- return None
300
-
301
- match = re.fullmatch(
302
- "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
303
- raw_value,
304
- )
305
- if not match:
306
- return None
307
-
308
- qv = nmdc.QuantityValue(
309
- has_raw_value=raw_value,
310
- type="nmdc:QuantityValue",
311
- )
312
- if match.group(2):
313
- # having group 2 means the value is a range like "0 - 1". Either
314
- # group 1 or group 2 might be the minimum especially when handling
315
- # negative ranges like "0 - -1"
316
- num_1 = float(match.group(1))
317
- num_2 = float(match.group(2))
318
- qv.has_minimum_numeric_value = min(num_1, num_2)
319
- qv.has_maximum_numeric_value = max(num_1, num_2)
320
- else:
321
- # otherwise we just have a single numeric value
322
- qv.has_numeric_value = float(match.group(1))
323
-
324
- if unit:
325
- # a unit was manually specified
326
- if match.group(3) and unit != match.group(3):
327
- # a unit was also found in the raw string; issue a warning
328
- # if they don't agree, but keep the manually specified one
329
- logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
330
- qv.has_unit = unit
331
- elif match.group(3):
332
- # a unit was found in the raw string
333
- qv.has_unit = match.group(3)
334
-
335
- return qv
283
+ return self._parse_quantity_value(raw_value, unit)
336
284
 
337
285
  def _get_ontology_class(
338
286
  self, raw_value: Optional[str]
@@ -1,9 +1,13 @@
1
+ import logging
2
+ import re
1
3
  from abc import ABC, abstractmethod
2
- from typing import Any, Callable, Dict, List, Optional
4
+ from typing import Any, Callable, Dict, List, Optional, Union
3
5
  from nmdc_schema import nmdc
4
6
 
5
7
  JSON_OBJECT = Dict[str, Any]
6
8
 
9
+ logger = logging.getLogger(__name__)
10
+
7
11
 
8
12
  class Translator(ABC):
9
13
  def __init__(
@@ -27,3 +31,61 @@ class Translator(ABC):
27
31
  @abstractmethod
28
32
  def get_database(self) -> nmdc.Database:
29
33
  pass
34
+
35
+ def _parse_quantity_value(
36
+ self, raw_value: Optional[str], unit: Optional[str] = None
37
+ ) -> Union[nmdc.QuantityValue, None]:
38
+ """Construct a nmdc:QuantityValue from a raw value string
39
+
40
+ The regex pattern minimally matches on a single numeric value (possibly
41
+ floating point). The pattern can also identify a range represented by
42
+ two numeric values separated by a hyphen. It can also identify non-numeric
43
+ characters at the end of the string which are interpreted as a unit. A unit
44
+ may also be explicitly provided as an argument to this function. If parsing
45
+ identifies a unit and a unit argument is provided, the unit argument is used.
46
+ If the pattern is not matched at all None is returned.
47
+
48
+ :param raw_value: string to parse
49
+ :param unit: optional unit, defaults to None. If None, the unit is extracted from the
50
+ raw_value. If a unit is provided, it will override the unit extracted from the
51
+ raw_value.
52
+ :return: nmdc:QuantityValue
53
+ """
54
+ if raw_value is None:
55
+ return None
56
+
57
+ match = re.fullmatch(
58
+ "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
59
+ raw_value,
60
+ )
61
+ if not match:
62
+ return None
63
+
64
+ quantity_value_kwargs = {
65
+ "has_raw_value": raw_value,
66
+ "type": "nmdc:QuantityValue",
67
+ }
68
+ if match.group(2):
69
+ # having group 2 means the value is a range like "0 - 1". Either
70
+ # group 1 or group 2 might be the minimum especially when handling
71
+ # negative ranges like "0 - -1"
72
+ num_1 = float(match.group(1))
73
+ num_2 = float(match.group(2))
74
+ quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
75
+ quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
76
+ else:
77
+ # otherwise we just have a single numeric value
78
+ quantity_value_kwargs["has_numeric_value"] = float(match.group(1))
79
+
80
+ if unit:
81
+ # a unit was manually specified
82
+ if match.group(3) and unit != match.group(3):
83
+ # a unit was also found in the raw string; issue a warning
84
+ # if they don't agree, but keep the manually specified one
85
+ logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
86
+ quantity_value_kwargs["has_unit"] = unit
87
+ elif match.group(3):
88
+ # a unit was found in the raw string
89
+ quantity_value_kwargs["has_unit"] = match.group(3)
90
+
91
+ return nmdc.QuantityValue(**quantity_value_kwargs)
nmdc_runtime/site/util.py CHANGED
@@ -3,10 +3,11 @@ import os
3
3
  from functools import lru_cache
4
4
  from pymongo.database import Database as MongoDatabase
5
5
  from subprocess import Popen, PIPE, STDOUT, CalledProcessError
6
- from toolz import groupby
6
+ from refscan.lib.helpers import get_collection_names_from_schema
7
7
 
8
- from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
9
8
  from nmdc_runtime.site.resources import mongo_resource
9
+ from nmdc_runtime.util import nmdc_schema_view
10
+
10
11
 
11
12
  mode_test = {
12
13
  "resource_defs": {"mongo": mongo_resource}
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
37
38
 
38
39
  @lru_cache
39
40
  def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
41
+ """
42
+ TODO: Document this function.
43
+ """
44
+ schema_view = nmdc_schema_view()
40
45
  present_collection_names = set(mdb.list_collection_names())
41
46
  return {
42
47
  name: (
43
48
  name in present_collection_names and "id_1" in mdb[name].index_information()
44
49
  )
45
- for name in get_collection_names_from_schema()
50
+ for name in get_collection_names_from_schema(schema_view)
46
51
  }
47
52
 
48
53
 
@@ -1,4 +1,4 @@
1
- from dagster import op, AssetMaterialization, AssetKey, EventMetadata
1
+ from dagster import op, AssetMaterialization, AssetKey, MetadataValue
2
2
  from jsonschema import Draft7Validator
3
3
  from nmdc_runtime.util import get_nmdc_jsonschema_dict
4
4
  from toolz import dissoc
@@ -92,10 +92,15 @@ def announce_validation_report(context, report, api_object):
92
92
  asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
93
93
  description=f"{collection_name} translation validation",
94
94
  metadata={
95
- # https://docs.dagster.io/_apidocs/solids#event-metadata
96
- # also .json, .md, .path, .url, .python_artifact, ...
97
- "n_errors": EventMetadata.int(len(report["errors"])),
98
- "object_id": EventMetadata.text(api_object["id"]),
95
+ # Note: When this code was originally written, it used Dagster's `EventMetadata` class,
96
+ # which has since been replaced by Dagster's `MetadataValue` class.
97
+ #
98
+ # Reference:
99
+ # - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
100
+ # - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
101
+ #
102
+ "n_errors": MetadataValue.int(len(report["errors"])),
103
+ "object_id": MetadataValue.text(api_object["id"]),
99
104
  },
100
105
  )
101
106
 
nmdc_runtime/util.py CHANGED
@@ -14,8 +14,6 @@ from typing import Callable, List, Optional, Set, Dict
14
14
  import fastjsonschema
15
15
  import requests
16
16
  from frozendict import frozendict
17
- from linkml_runtime import linkml_model
18
- from linkml_runtime.utils.schemaview import SchemaView
19
17
  from nmdc_schema.get_nmdc_view import ViewGetter
20
18
  from pymongo.database import Database as MongoDatabase
21
19
  from pymongo.errors import OperationFailure
@@ -27,48 +25,6 @@ from nmdc_runtime.api.core.util import sha256hash_from_file
27
25
  from nmdc_runtime.api.models.object import DrsObjectIn
28
26
 
29
27
 
30
- def get_names_of_classes_in_effective_range_of_slot(
31
- schema_view: SchemaView, slot_definition: linkml_model.SlotDefinition
32
- ) -> List[str]:
33
- r"""
34
- Determine the slot's "effective" range, by taking into account its `any_of` constraints (if defined).
35
-
36
- Note: The `any_of` constraints constrain the slot's "effective" range beyond that described by the
37
- induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result
38
- of applying those additional constraints, so we do it manually here (if any are defined).
39
- Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646
40
-
41
- Reference: https://linkml.io/linkml-model/latest/docs/any_of/
42
- """
43
-
44
- # Initialize the list to be empty.
45
- names_of_eligible_target_classes = []
46
-
47
- # If the `any_of` constraint is defined on this slot, use that instead of the `range`.
48
- if "any_of" in slot_definition and len(slot_definition.any_of) > 0:
49
- for slot_expression in slot_definition.any_of:
50
- # Use the slot expression's `range` to get the specified eligible class name
51
- # and the names of all classes that inherit from that eligible class.
52
- if slot_expression.range in schema_view.all_classes():
53
- own_and_descendant_class_names = schema_view.class_descendants(
54
- slot_expression.range
55
- )
56
- names_of_eligible_target_classes.extend(own_and_descendant_class_names)
57
- else:
58
- # Use the slot's `range` to get the specified eligible class name
59
- # and the names of all classes that inherit from that eligible class.
60
- if slot_definition.range in schema_view.all_classes():
61
- own_and_descendant_class_names = schema_view.class_descendants(
62
- slot_definition.range
63
- )
64
- names_of_eligible_target_classes.extend(own_and_descendant_class_names)
65
-
66
- # Remove duplicate class names.
67
- names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))
68
-
69
- return names_of_eligible_target_classes
70
-
71
-
72
28
  def get_class_names_from_collection_spec(
73
29
  spec: dict, prefix: Optional[str] = None
74
30
  ) -> List[str]:
@@ -324,9 +280,9 @@ def find_one(k_v: dict, entities: Iterable[dict]):
324
280
  """Find the first entity with key-value pair k_v, if any?
325
281
 
326
282
  >>> find_one({"id": "foo"}, [{"id": "foo"}])
283
+ {'id': 'foo'}
284
+ >>> find_one({"id": "foo"}, [{"id": "bar"}]) is None
327
285
  True
328
- >>> find_one({"id": "foo"}, [{"id": "bar"}])
329
- False
330
286
  """
331
287
  if len(k_v) > 1:
332
288
  raise Exception("Supports only one key-value pair")
@@ -370,7 +326,7 @@ def nmdc_database_collection_names():
370
326
  TODO: Document this function.
371
327
 
372
328
  TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
373
- collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py`
329
+ collections, import/use the function named `get_collection_names_from_schema` from `refscan.lib.helpers`
374
330
  instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
375
331
  maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
376
332
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nmdc_runtime
3
- Version: 2.9.0
3
+ Version: 2.10.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -152,7 +152,7 @@ http://127.0.0.1:8000/redoc/.
152
152
 
153
153
 
154
154
  * NOTE: Any time you add or change requirements in requirements/main.in or requirements/dev.in, you must run:
155
- ```
155
+ ```bash
156
156
  pip-compile --build-isolation --allow-unsafe --resolver=backtracking --strip-extras --output-file requirements/[main|dev].txt requirements/[main|dev].in
157
157
  ```
158
158
  to generate main.txt and dev.txt files respectively. main.in is kind of like a poetry dependency stanza, dev.in is kind
@@ -160,9 +160,6 @@ of like poetry dev.dependencies stanza. main.txt and dev.txt are kind of like po
160
160
  versions of dependencies to use. main.txt and dev.txt are combined in the docker compose build process to create the
161
161
  final requirements.txt file and import the dependencies into the Docker image.
162
162
 
163
-
164
- ```bash
165
-
166
163
  ## Local Testing
167
164
 
168
165
  Tests can be found in `tests` and are run with the following commands:
@@ -173,8 +170,9 @@ make test
173
170
 
174
171
  # Run a specific test file, e.g. tests/test_api/test_endpoints.py
175
172
  make test ARGS="tests/test_api/test_endpoints.py"
176
- ```
173
+
177
174
  docker compose --file docker-compose.test.yml run test
175
+ ```
178
176
 
179
177
  As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
180
178
  desired and does not break over time.
@@ -182,6 +180,59 @@ desired and does not break over time.
182
180
  [For hints on how to write tests for solids and pipelines in Dagster, see their documentation
183
181
  tutorial on Testing](https://docs.dagster.io/guides/test/unit-testing-assets-and-ops).
184
182
 
183
+ ### Performance profiling
184
+
185
+ We use a tool called [Pyinstrument](https://pyinstrument.readthedocs.io) to profile the performance of the Runtime API while processing an individual HTTP request.
186
+
187
+ Here's how you can do that:
188
+
189
+ 1. In your `.env` file, set `IS_PROFILING_ENABLED` to `true`
190
+ 2. Start/restart your development stack: `$ make up-dev`
191
+ 3. Ensure the endpoint function whose performance you want to profile is defined using `async def` (as opposed to just `def`) ([reference](https://github.com/joerick/pyinstrument/issues/257))
192
+
193
+ Then—with all of that done—submit an HTTP request that includes the URL query parameter: `profile=true`. Instructions for doing that are in the sections below.
194
+
195
+ <details>
196
+ <summary>Show/hide instructions for <code>GET</code> requests only (involves web browser)</summary>
197
+
198
+ 1. In your web browser, visit the endpoint's URL, but add the `profile=true` query parameter to the URL. Examples:
199
+ ```diff
200
+ A. If the URL doesn't already have query parameters, append `?profile=true`.
201
+ - http://127.0.0.1:8000/nmdcschema/biosample_set
202
+ + http://127.0.0.1:8000/nmdcschema/biosample_set?profile=true
203
+
204
+ B. If the URL already has query parameters, append `&profile=true`.
205
+ - http://127.0.0.1:8000/nmdcschema/biosample_set?filter={}
206
+ + http://127.0.0.1:8000/nmdcschema/biosample_set?filter={}&profile=true
207
+ ```
208
+ 2. Your web browser will display a performance profiling report.
209
+ > Note: The Runtime API will have responded with a performance profiling report web page, instead of its normal response (which the Runtime discards).
210
+
211
+ That'll only work for `GET` requests, though, since you're limited to specifying the request via the address bar.
212
+
213
+ </details>
214
+
215
+ <details>
216
+ <summary>Show/hide instructions for <strong>all</strong> kinds of requests (involves <code>curl</code> + web browser)</summary>
217
+
218
+ 1. At your terminal, type or paste the `curl` command you want to run (you can copy/paste one from Swagger UI).
219
+ 2. Append the `profile=true` query parameter to the URL in the command, and use the `-o` option to save the response to a file whose name ends with `.html`. For example:
220
+ ```diff
221
+ curl -X 'POST' \
222
+ - 'http://127.0.0.1:8000/metadata/json:validate' \
223
+ + 'http://127.0.0.1:8000/metadata/json:validate?profile=true' \
224
+ -o /tmp/profile.html \
225
+ -H 'accept: application/json' \
226
+ -H 'Content-Type: application/json' \
227
+ -d '{"biosample_set": []}'
228
+ ```
229
+ 3. Run the command.
230
+ > Note: The Runtime API will respond with a performance profiling report web page, instead of its normal response (which the Runtime discards). The performance profiling report web page will be saved to the `.html` file to which you redirected the command output.
231
+ 4. Double-click on the `.html` file to view it in your web browser.
232
+ 1. Alternatively, open your web browser and navigate to the `.html` file; e.g., enter `file:///tmp/profile.html` into the address bar.
233
+
234
+ </details>
235
+
185
236
  ### RAM usage
186
237
 
187
238
  The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of