nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic.

Files changed (100)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +55 -4
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +33 -28
  76. nmdc_runtime/site/ops.py +97 -237
  77. nmdc_runtime/site/repair/database_updater.py +8 -0
  78. nmdc_runtime/site/repository.py +7 -117
  79. nmdc_runtime/site/resources.py +4 -4
  80. nmdc_runtime/site/translation/gold_translator.py +22 -21
  81. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  82. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  83. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  84. nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
  85. nmdc_runtime/site/translation/translator.py +63 -1
  86. nmdc_runtime/site/util.py +8 -3
  87. nmdc_runtime/site/validation/util.py +10 -5
  88. nmdc_runtime/util.py +9 -321
  89. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  90. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  91. nmdc_runtime/site/translation/emsl.py +0 -43
  92. nmdc_runtime/site/translation/gold.py +0 -53
  93. nmdc_runtime/site/translation/jgi.py +0 -32
  94. nmdc_runtime/site/translation/util.py +0 -132
  95. nmdc_runtime/site/validation/jgi.py +0 -43
  96. nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
  97. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  98. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  99. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  100. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/repository.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import json
2
2
 
3
- from typing import Any
4
3
 
5
4
  from dagster import (
6
5
  repository,
@@ -14,7 +13,6 @@ from dagster import (
14
13
  DagsterRunStatus,
15
14
  RunStatusSensorContext,
16
15
  DefaultSensorStatus,
17
- in_process_executor,
18
16
  )
19
17
  from starlette import status
20
18
  from toolz import merge, get_in
@@ -30,8 +28,6 @@ from nmdc_runtime.site.graphs import (
30
28
  translate_metadata_submission_to_nmdc_schema_database,
31
29
  ingest_metadata_submission,
32
30
  gold_study_to_database,
33
- gold_translation,
34
- gold_translation_curation,
35
31
  create_objects_from_site_object_puts,
36
32
  housekeeping,
37
33
  ensure_jobs,
@@ -62,9 +58,6 @@ from nmdc_runtime.site.resources import (
62
58
  from nmdc_runtime.site.resources import (
63
59
  get_runtime_api_site_client,
64
60
  )
65
- from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
66
- from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
67
- from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
68
61
  from nmdc_runtime.util import freeze
69
62
  from nmdc_runtime.util import unfreeze
70
63
 
@@ -249,82 +242,6 @@ def process_workflow_job_triggers(_context):
249
242
  yield SkipReason("No new jobs required")
250
243
 
251
244
 
252
- @asset_sensor(
253
- asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
254
- job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
255
- )
256
- def ensure_gold_translation_job(_context, asset_event):
257
- mdb = get_mongo(run_config_frozen__normal_env).db
258
- gold_etl_latest = mdb.objects.find_one(
259
- {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
260
- )
261
- sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
262
- if gold_etl_latest is None:
263
- yield SkipReason("can't find sensed asset object_id in database")
264
- return
265
- elif gold_etl_latest["id"] != sensed_object_id:
266
- yield SkipReason("later object than sensed materialization")
267
- return
268
-
269
- run_config = merge(
270
- run_config_frozen__normal_env,
271
- {
272
- "solids": {
273
- "construct_jobs": {
274
- "config": {
275
- "base_jobs": [
276
- {
277
- "workflow": {"id": "gold-translation-1.0.0"},
278
- "config": {"object_id": gold_etl_latest["id"]},
279
- }
280
- ]
281
- }
282
- }
283
- }
284
- },
285
- )
286
- yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
287
-
288
-
289
- @asset_sensor(
290
- asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
291
- job=gold_translation_curation.to_job(**preset_normal),
292
- )
293
- def claim_and_run_gold_translation_curation(_context, asset_event):
294
- client = get_runtime_api_site_client(run_config_frozen__normal_env)
295
- mdb = get_mongo(run_config_frozen__normal_env).db
296
- object_id_latest = asset_materialization_metadata(
297
- asset_event, "object_id_latest"
298
- ).text
299
- job = mdb.jobs.find_one(
300
- {
301
- "workflow.id": "gold-translation-1.0.0",
302
- "config.object_id_latest": object_id_latest,
303
- }
304
- )
305
- if job is not None:
306
- rv = client.claim_job(job["id"])
307
- if rv.status_code == status.HTTP_200_OK:
308
- operation = rv.json()
309
- run_config = merge(
310
- run_config_frozen__normal_env,
311
- {
312
- "ops": {
313
- "get_operation": {
314
- "config": {
315
- "operation_id": operation["id"],
316
- }
317
- }
318
- }
319
- },
320
- )
321
- yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
322
- else:
323
- yield SkipReason("Job found, but already claimed by this site")
324
- else:
325
- yield SkipReason("No job found")
326
-
327
-
328
245
  @sensor(
329
246
  job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
330
247
  default_status=DefaultSensorStatus.RUNNING,
@@ -463,11 +380,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
463
380
  yield SkipReason("; ".join(skip_notes))
464
381
 
465
382
 
466
- # TODO ensure data_object_type values from file_type_enum
467
- # see /metadata-translation/notebooks/202106_curation_updates.ipynb
468
- # for details ("Create file_type_enum collection" section).
469
-
470
-
471
383
  @sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
472
384
  def done_object_put_ops(_context):
473
385
  client = get_runtime_api_site_client(run_config_frozen__normal_env)
@@ -507,7 +419,6 @@ def on_run_fail(context: RunStatusSensorContext):
507
419
  @repository
508
420
  def repo():
509
421
  graph_jobs = [
510
- gold_translation.to_job(**preset_normal),
511
422
  hello_graph.to_job(name="hello_job"),
512
423
  ensure_jobs.to_job(**preset_normal),
513
424
  apply_metadata_in.to_job(**preset_normal),
@@ -523,8 +434,6 @@ def repo():
523
434
  ]
524
435
  sensors = [
525
436
  done_object_put_ops,
526
- ensure_gold_translation_job,
527
- claim_and_run_gold_translation_curation,
528
437
  process_workflow_job_triggers,
529
438
  claim_and_run_apply_changesheet_jobs,
530
439
  claim_and_run_metadata_in_jobs,
@@ -534,20 +443,6 @@ def repo():
534
443
  return graph_jobs + schedules + sensors
535
444
 
536
445
 
537
- @repository
538
- def translation():
539
- graph_jobs = [jgi_job, gold_job, emsl_job]
540
-
541
- return graph_jobs
542
-
543
-
544
- @repository
545
- def test_translation():
546
- graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
547
-
548
- return graph_jobs
549
-
550
-
551
446
  @repository
552
447
  def biosample_submission_ingest():
553
448
  normal_resources = run_config_frozen__normal_env["resources"]
@@ -574,6 +469,7 @@ def biosample_submission_ingest():
574
469
  "study_type": "research_study",
575
470
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
576
471
  "include_field_site_info": False,
472
+ "enable_biosample_filtering": True,
577
473
  },
578
474
  },
579
475
  "export_json_to_drs": {"config": {"username": ""}},
@@ -1018,6 +914,8 @@ def database_records_stitching():
1018
914
  "config": {
1019
915
  "nmdc_study_id": "",
1020
916
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
917
+ "include_field_site_info": False,
918
+ "enable_biosample_filtering": True,
1021
919
  }
1022
920
  },
1023
921
  "export_json_to_drs": {"config": {"username": ""}},
@@ -1060,6 +958,8 @@ def database_records_stitching():
1060
958
  "config": {
1061
959
  "nmdc_study_id": "",
1062
960
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
961
+ "include_field_site_info": False,
962
+ "enable_biosample_filtering": True,
1063
963
  }
1064
964
  },
1065
965
  "export_json_to_drs": {"config": {"username": ""}},
@@ -1102,21 +1002,11 @@ def database_records_stitching():
1102
1002
  "config": {
1103
1003
  "nmdc_study_id": "",
1104
1004
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1005
+ "include_field_site_info": False,
1006
+ "enable_biosample_filtering": True,
1105
1007
  }
1106
1008
  },
1107
1009
  },
1108
1010
  },
1109
1011
  ),
1110
1012
  ]
1111
-
1112
-
1113
- # @repository
1114
- # def validation():
1115
- # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
1116
- # return graph_jobs
1117
- #
1118
- #
1119
- # @repository
1120
- # def test_validation():
1121
- # graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
1122
- # return graph_jobs
nmdc_runtime/site/resources.py CHANGED
@@ -109,7 +109,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
109
109
  },
110
110
  )
111
111
  response.raise_for_status()
112
- return response.json()["cursor"]["firstBatch"]
112
+ return response.json()["cursor"]["batch"]
113
113
 
114
114
  def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
115
115
  gold_project_id = normalize_gold_id(gold_project_id)
@@ -126,7 +126,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
126
126
  },
127
127
  )
128
128
  response.raise_for_status()
129
- return response.json()["cursor"]["firstBatch"]
129
+ return response.json()["cursor"]["batch"]
130
130
 
131
131
  def get_biosamples_for_study(self, study_id: str):
132
132
  # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
@@ -170,7 +170,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
170
170
  },
171
171
  )
172
172
  response.raise_for_status()
173
- return response.json()["cursor"]["firstBatch"]
173
+ return response.json()["cursor"]["batch"]
174
174
 
175
175
  def get_study(self, study_id: str):
176
176
  response = self.request(
@@ -182,7 +182,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
182
182
  },
183
183
  )
184
184
  response.raise_for_status()
185
- return response.json()["cursor"]["firstBatch"]
185
+ return response.json()["cursor"]["batch"]
186
186
 
187
187
 
188
188
  class RuntimeApiSiteClient(RuntimeApiClient):
nmdc_runtime/site/translation/gold_translator.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import collections
2
- import csv
3
2
  import re
4
3
  from typing import List, Tuple, Union
5
4
  from nmdc_schema import nmdc
@@ -45,6 +44,7 @@ class GoldStudyTranslator(Translator):
45
44
  analysis_projects: List[JSON_OBJECT] = [],
46
45
  gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
47
46
  include_field_site_info: bool = False,
47
+ enable_biosample_filtering: bool = True,
48
48
  *args,
49
49
  **kwargs,
50
50
  ) -> None:
@@ -53,15 +53,20 @@ class GoldStudyTranslator(Translator):
53
53
  self.study = study
54
54
  self.study_type = nmdc.StudyCategoryEnum(study_type)
55
55
  self.include_field_site_info = include_field_site_info
56
+ self.enable_biosample_filtering = enable_biosample_filtering
56
57
  # Filter biosamples to only those with `sequencingStrategy` of
57
- # "Metagenome" or "Metatranscriptome"
58
- self.biosamples = [
59
- biosample
60
- for biosample in biosamples
61
- if any(
62
- _is_valid_project(project) for project in biosample.get("projects", [])
63
- )
64
- ]
58
+ # "Metagenome" or "Metatranscriptome" if filtering is enabled
59
+ if enable_biosample_filtering:
60
+ self.biosamples = [
61
+ biosample
62
+ for biosample in biosamples
63
+ if any(
64
+ _is_valid_project(project)
65
+ for project in biosample.get("projects", [])
66
+ )
67
+ ]
68
+ else:
69
+ self.biosamples = biosamples
65
70
  # Fetch the valid projectGoldIds that are associated with filtered
66
71
  # biosamples on their `projects` field
67
72
  valid_project_ids = {
@@ -116,6 +121,9 @@ class GoldStudyTranslator(Translator):
116
121
  :param gold_entity: GOLD entity object
117
122
  :return: PersonValue corresponding to the first PI in the `contacts` field
118
123
  """
124
+ if "contacts" not in gold_entity:
125
+ return None
126
+
119
127
  pi_dict = next(
120
128
  (
121
129
  contact
@@ -169,7 +177,7 @@ class GoldStudyTranslator(Translator):
169
177
  project["ncbiBioSampleAccession"], default_prefix="biosample"
170
178
  )
171
179
  for project in biosample_projects
172
- if project["ncbiBioSampleAccession"]
180
+ if project.get("ncbiBioSampleAccession")
173
181
  ]
174
182
 
175
183
  def _get_samp_taxon_id(
@@ -333,12 +341,7 @@ class GoldStudyTranslator(Translator):
333
341
  if field_value is None:
334
342
  return None
335
343
 
336
- return nmdc.QuantityValue(
337
- has_raw_value=field_value,
338
- has_numeric_value=nmdc.Double(field_value),
339
- has_unit=unit,
340
- type="nmdc:QuantityValue",
341
- )
344
+ return self._parse_quantity_value(str(field_value), unit)
342
345
 
343
346
  def _get_text_value(
344
347
  self, gold_entity: JSON_OBJECT, gold_field: str
@@ -564,13 +567,11 @@ class GoldStudyTranslator(Translator):
564
567
  gold_biosample_id = gold_biosample["biosampleGoldId"]
565
568
  return nmdc.Biosample(
566
569
  add_date=gold_biosample.get("addDate"),
567
- alt=self._get_quantity_value(
568
- gold_biosample, "altitudeInMeters", unit="meters"
569
- ),
570
+ alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
570
571
  collected_from=nmdc_field_site_id,
571
572
  collection_date=self._get_collection_date(gold_biosample),
572
573
  depth=self._get_quantity_value(
573
- gold_biosample, ("depthInMeters", "depthInMeters2"), unit="meters"
574
+ gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
574
575
  ),
575
576
  description=gold_biosample.get("description"),
576
577
  diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
@@ -609,7 +610,7 @@ class GoldStudyTranslator(Translator):
609
610
  ),
610
611
  specific_ecosystem=gold_biosample.get("specificEcosystem"),
611
612
  subsurface_depth=self._get_quantity_value(
612
- gold_biosample, "subsurfaceDepthInMeters", unit="meters"
613
+ gold_biosample, "subsurfaceDepthInMeters", unit="m"
613
614
  ),
614
615
  temp=self._get_quantity_value(
615
616
  gold_biosample, "sampleCollectionTemperature"
nmdc_runtime/site/translation/neon_benthic_translator.py CHANGED
@@ -11,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
11
11
  from nmdc_runtime.site.translation.neon_utils import (
12
12
  _get_value_or_none,
13
13
  _create_controlled_identified_term_value,
14
- _create_controlled_term_value,
15
14
  _create_geolocation_value,
16
15
  _create_quantity_value,
17
16
  _create_timestamp_value,
nmdc_runtime/site/translation/neon_soil_translator.py CHANGED
@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
10
10
  from nmdc_runtime.site.translation.neon_utils import (
11
11
  _get_value_or_none,
12
12
  _create_controlled_identified_term_value,
13
- _create_controlled_term_value,
14
13
  _create_geolocation_value,
15
14
  _create_quantity_value,
16
15
  _create_timestamp_value,
@@ -153,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
153
152
  collection_date=_create_timestamp_value(
154
153
  biosample_row["collectDate"].values[0]
155
154
  ),
156
- temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Celsius"),
155
+ temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
157
156
  depth=nmdc.QuantityValue(
158
157
  has_minimum_numeric_value=_get_value_or_none(
159
158
  biosample_row, "sampleTopDepth"
@@ -169,13 +168,13 @@ class NeonSoilDataTranslator(Translator):
169
168
  analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
170
169
  env_package=_create_text_value(biosample_row["sampleType"].values[0]),
171
170
  nitro=_create_quantity_value(
172
- biosample_row["nitrogenPercent"].values[0], "percent"
171
+ biosample_row["nitrogenPercent"].values[0], "%"
173
172
  ),
174
173
  org_carb=_create_quantity_value(
175
- biosample_row["organicCPercent"].values[0], "percent"
174
+ biosample_row["organicCPercent"].values[0], "%"
176
175
  ),
177
176
  carb_nitro_ratio=_create_quantity_value(
178
- biosample_row["CNratio"].values[0], None
177
+ biosample_row["CNratio"].values[0], "ratio"
179
178
  ),
180
179
  ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
181
180
  water_content=(
nmdc_runtime/site/translation/neon_surface_water_translator.py CHANGED
@@ -3,7 +3,6 @@ import sqlite3
3
3
  from typing import Dict, Optional, Union
4
4
 
5
5
  import pandas as pd
6
- import requests
7
6
  import requests_cache
8
7
 
9
8
  from nmdc_schema import nmdc
@@ -12,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
12
11
  from nmdc_runtime.site.translation.neon_utils import (
13
12
  _get_value_or_none,
14
13
  _create_controlled_identified_term_value,
15
- _create_controlled_term_value,
16
14
  _create_geolocation_value,
17
15
  _create_quantity_value,
18
16
  _create_timestamp_value,
nmdc_runtime/site/translation/submission_portal_translator.py CHANGED
@@ -47,6 +47,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
47
47
  (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
48
48
  }
49
49
 
50
+ UNIT_OVERRIDES: dict[str, dict[str, str]] = {
51
+ "Biosample": {
52
+ "depth": "m",
53
+ }
54
+ }
55
+
50
56
 
51
57
  class EnvironmentPackage(Enum):
52
58
  r"""
@@ -272,61 +278,9 @@ class SubmissionPortalTranslator(Translator):
272
278
  def _get_quantity_value(
273
279
  self, raw_value: Optional[str], unit: Optional[str] = None
274
280
  ) -> Union[nmdc.QuantityValue, None]:
275
- """Construct a nmdc:QuantityValue from a raw value string
276
-
277
- The regex pattern minimally matches on a single numeric value (possibly
278
- floating point). The pattern can also identify a range represented by
279
- two numeric values separated by a hyphen. It can also identify non-numeric
280
- characters at the end of the string which are interpreted as a unit. A unit
281
- may also be explicitly provided as an argument to this function. If parsing
282
- identifies a unit and a unit argument is provided, the unit argument is used.
283
- If the pattern is not matched at all None is returned.
284
-
285
- TODO: currently the parsed unit string is used as-is. In the future we may want
286
- to be stricter about what we accept or coerce into a controlled value set
281
+ """Construct a nmdc:QuantityValue from a raw value string"""
287
282
 
288
- :param raw_value: string to parse
289
- :param unit: optional unit, defaults to None
290
- :return: nmdc:QuantityValue
291
- """
292
- if raw_value is None:
293
- return None
294
-
295
- match = re.fullmatch(
296
- "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
297
- raw_value,
298
- )
299
- if not match:
300
- return None
301
-
302
- qv = nmdc.QuantityValue(
303
- has_raw_value=raw_value,
304
- type="nmdc:QuantityValue",
305
- )
306
- if match.group(2):
307
- # having group 2 means the value is a range like "0 - 1". Either
308
- # group 1 or group 2 might be the minimum especially when handling
309
- # negative ranges like "0 - -1"
310
- num_1 = float(match.group(1))
311
- num_2 = float(match.group(2))
312
- qv.has_minimum_numeric_value = min(num_1, num_2)
313
- qv.has_maximum_numeric_value = max(num_1, num_2)
314
- else:
315
- # otherwise we just have a single numeric value
316
- qv.has_numeric_value = float(match.group(1))
317
-
318
- if unit:
319
- # a unit was manually specified
320
- if match.group(3) and unit != match.group(3):
321
- # a unit was also found in the raw string; issue a warning
322
- # if they don't agree, but keep the manually specified one
323
- logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
324
- qv.has_unit = unit
325
- elif match.group(3):
326
- # a unit was found in the raw string
327
- qv.has_unit = match.group(3)
328
-
329
- return qv
283
+ return self._parse_quantity_value(raw_value, unit)
330
284
 
331
285
  def _get_ontology_class(
332
286
  self, raw_value: Optional[str]
@@ -475,6 +429,50 @@ class SubmissionPortalTranslator(Translator):
475
429
 
476
430
  return value
477
431
 
432
+ def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
433
+ """Collect and format DOIs from submission portal schema in nmdc format DOIs
434
+
435
+ If there were no DOIs, None is returned.
436
+
437
+ :param metadata_submission: submission portal entry
438
+ :return: list of nmdc.DOI objects
439
+ """
440
+ data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
441
+ award_dois = self._get_from(
442
+ metadata_submission, ["multiOmicsForm", "awardDois"]
443
+ )
444
+ if data_dois and len(data_dois) > 0:
445
+ updated_data_dois = [
446
+ nmdc.Doi(
447
+ doi_category="dataset_doi",
448
+ doi_provider=doi["provider"],
449
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
450
+ type="nmdc:Doi",
451
+ )
452
+ for doi in data_dois
453
+ ]
454
+ else:
455
+ updated_data_dois = []
456
+
457
+ if award_dois and len(award_dois) > 0:
458
+ updated_award_dois = [
459
+ nmdc.Doi(
460
+ doi_category="award_doi",
461
+ doi_provider=doi["provider"],
462
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
463
+ type="nmdc:Doi",
464
+ )
465
+ for doi in award_dois
466
+ ]
467
+ else:
468
+ updated_award_dois = []
469
+
470
+ return_val = updated_data_dois + updated_award_dois
471
+ if len(return_val) == 0:
472
+ return_val = None
473
+
474
+ return return_val
475
+
478
476
  def _get_data_objects_from_fields(
479
477
  self,
480
478
  sample_data: JSON_OBJECT,
@@ -591,6 +589,7 @@ class SubmissionPortalTranslator(Translator):
591
589
  websites=self._get_from(
592
590
  metadata_submission, ["studyForm", "linkOutWebpage"]
593
591
  ),
592
+ associated_dois=self._get_study_dois(metadata_submission),
594
593
  )
595
594
 
596
595
  def _transform_value_for_slot(
@@ -660,6 +659,17 @@ class SubmissionPortalTranslator(Translator):
660
659
  logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
661
660
  continue
662
661
 
662
+ # This step handles cases where the submission portal/schema instructs a user to
663
+ # provide a value in a specific unit. The unit cannot be parsed out of the raw value
664
+ # in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
665
+ # go away once units are encoded in the schema itself.
666
+ # See: https://github.com/microbiomedata/nmdc-schema/issues/2517
667
+ if class_name in UNIT_OVERRIDES:
668
+ # If the class has unit overrides, check if the slot is in the overrides
669
+ unit_overrides = UNIT_OVERRIDES[class_name]
670
+ if slot_name in unit_overrides:
671
+ unit = unit_overrides[slot_name]
672
+
663
673
  slot_definition = self.schema_view.induced_slot(slot_name, class_name)
664
674
  if slot_definition.multivalued:
665
675
  value_list = value
nmdc_runtime/site/translation/translator.py CHANGED
@@ -1,9 +1,13 @@
1
+ import logging
2
+ import re
1
3
  from abc import ABC, abstractmethod
2
- from typing import Any, Callable, Dict, List, Optional
4
+ from typing import Any, Callable, Dict, List, Optional, Union
3
5
  from nmdc_schema import nmdc
4
6
 
5
7
  JSON_OBJECT = Dict[str, Any]
6
8
 
9
+ logger = logging.getLogger(__name__)
10
+
7
11
 
8
12
  class Translator(ABC):
9
13
  def __init__(
@@ -27,3 +31,61 @@ class Translator(ABC):
27
31
  @abstractmethod
28
32
  def get_database(self) -> nmdc.Database:
29
33
  pass
34
+
35
+ def _parse_quantity_value(
36
+ self, raw_value: Optional[str], unit: Optional[str] = None
37
+ ) -> Union[nmdc.QuantityValue, None]:
38
+ """Construct a nmdc:QuantityValue from a raw value string
39
+
40
+ The regex pattern minimally matches on a single numeric value (possibly
41
+ floating point). The pattern can also identify a range represented by
42
+ two numeric values separated by a hyphen. It can also identify non-numeric
43
+ characters at the end of the string which are interpreted as a unit. A unit
44
+ may also be explicitly provided as an argument to this function. If parsing
45
+ identifies a unit and a unit argument is provided, the unit argument is used.
46
+ If the pattern is not matched at all None is returned.
47
+
48
+ :param raw_value: string to parse
49
+ :param unit: optional unit, defaults to None. If None, the unit is extracted from the
50
+ raw_value. If a unit is provided, it will override the unit extracted from the
51
+ raw_value.
52
+ :return: nmdc:QuantityValue
53
+ """
54
+ if raw_value is None:
55
+ return None
56
+
57
+ match = re.fullmatch(
58
+ "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
59
+ raw_value,
60
+ )
61
+ if not match:
62
+ return None
63
+
64
+ quantity_value_kwargs = {
65
+ "has_raw_value": raw_value,
66
+ "type": "nmdc:QuantityValue",
67
+ }
68
+ if match.group(2):
69
+ # having group 2 means the value is a range like "0 - 1". Either
70
+ # group 1 or group 2 might be the minimum especially when handling
71
+ # negative ranges like "0 - -1"
72
+ num_1 = float(match.group(1))
73
+ num_2 = float(match.group(2))
74
+ quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
75
+ quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
76
+ else:
77
+ # otherwise we just have a single numeric value
78
+ quantity_value_kwargs["has_numeric_value"] = float(match.group(1))
79
+
80
+ if unit:
81
+ # a unit was manually specified
82
+ if match.group(3) and unit != match.group(3):
83
+ # a unit was also found in the raw string; issue a warning
84
+ # if they don't agree, but keep the manually specified one
85
+ logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
86
+ quantity_value_kwargs["has_unit"] = unit
87
+ elif match.group(3):
88
+ # a unit was found in the raw string
89
+ quantity_value_kwargs["has_unit"] = match.group(3)
90
+
91
+ return nmdc.QuantityValue(**quantity_value_kwargs)
nmdc_runtime/site/util.py CHANGED
@@ -3,10 +3,11 @@ import os
3
3
  from functools import lru_cache
4
4
  from pymongo.database import Database as MongoDatabase
5
5
  from subprocess import Popen, PIPE, STDOUT, CalledProcessError
6
- from toolz import groupby
6
+ from refscan.lib.helpers import get_collection_names_from_schema
7
7
 
8
- from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
9
8
  from nmdc_runtime.site.resources import mongo_resource
9
+ from nmdc_runtime.util import nmdc_schema_view
10
+
10
11
 
11
12
  mode_test = {
12
13
  "resource_defs": {"mongo": mongo_resource}
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
37
38
 
38
39
  @lru_cache
39
40
  def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
41
+ """
42
+ TODO: Document this function.
43
+ """
44
+ schema_view = nmdc_schema_view()
40
45
  present_collection_names = set(mdb.list_collection_names())
41
46
  return {
42
47
  name: (
43
48
  name in present_collection_names and "id_1" in mdb[name].index_information()
44
49
  )
45
- for name in get_collection_names_from_schema()
50
+ for name in get_collection_names_from_schema(schema_view)
46
51
  }
47
52
 
48
53