nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131)
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/repository.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import json
2
2
 
3
- from typing import Any
4
3
 
5
4
  from dagster import (
6
5
  repository,
@@ -14,7 +13,6 @@ from dagster import (
14
13
  DagsterRunStatus,
15
14
  RunStatusSensorContext,
16
15
  DefaultSensorStatus,
17
- in_process_executor,
18
16
  )
19
17
  from starlette import status
20
18
  from toolz import merge, get_in
@@ -30,8 +28,6 @@ from nmdc_runtime.site.graphs import (
30
28
  translate_metadata_submission_to_nmdc_schema_database,
31
29
  ingest_metadata_submission,
32
30
  gold_study_to_database,
33
- gold_translation,
34
- gold_translation_curation,
35
31
  create_objects_from_site_object_puts,
36
32
  housekeeping,
37
33
  ensure_jobs,
@@ -62,9 +58,6 @@ from nmdc_runtime.site.resources import (
62
58
  from nmdc_runtime.site.resources import (
63
59
  get_runtime_api_site_client,
64
60
  )
65
- from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
66
- from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
67
- from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
68
61
  from nmdc_runtime.util import freeze
69
62
  from nmdc_runtime.util import unfreeze
70
63
 
@@ -249,82 +242,6 @@ def process_workflow_job_triggers(_context):
249
242
  yield SkipReason("No new jobs required")
250
243
 
251
244
 
252
- @asset_sensor(
253
- asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
254
- job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
255
- )
256
- def ensure_gold_translation_job(_context, asset_event):
257
- mdb = get_mongo(run_config_frozen__normal_env).db
258
- gold_etl_latest = mdb.objects.find_one(
259
- {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
260
- )
261
- sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
262
- if gold_etl_latest is None:
263
- yield SkipReason("can't find sensed asset object_id in database")
264
- return
265
- elif gold_etl_latest["id"] != sensed_object_id:
266
- yield SkipReason("later object than sensed materialization")
267
- return
268
-
269
- run_config = merge(
270
- run_config_frozen__normal_env,
271
- {
272
- "solids": {
273
- "construct_jobs": {
274
- "config": {
275
- "base_jobs": [
276
- {
277
- "workflow": {"id": "gold-translation-1.0.0"},
278
- "config": {"object_id": gold_etl_latest["id"]},
279
- }
280
- ]
281
- }
282
- }
283
- }
284
- },
285
- )
286
- yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
287
-
288
-
289
- @asset_sensor(
290
- asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
291
- job=gold_translation_curation.to_job(**preset_normal),
292
- )
293
- def claim_and_run_gold_translation_curation(_context, asset_event):
294
- client = get_runtime_api_site_client(run_config_frozen__normal_env)
295
- mdb = get_mongo(run_config_frozen__normal_env).db
296
- object_id_latest = asset_materialization_metadata(
297
- asset_event, "object_id_latest"
298
- ).text
299
- job = mdb.jobs.find_one(
300
- {
301
- "workflow.id": "gold-translation-1.0.0",
302
- "config.object_id_latest": object_id_latest,
303
- }
304
- )
305
- if job is not None:
306
- rv = client.claim_job(job["id"])
307
- if rv.status_code == status.HTTP_200_OK:
308
- operation = rv.json()
309
- run_config = merge(
310
- run_config_frozen__normal_env,
311
- {
312
- "ops": {
313
- "get_operation": {
314
- "config": {
315
- "operation_id": operation["id"],
316
- }
317
- }
318
- }
319
- },
320
- )
321
- yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
322
- else:
323
- yield SkipReason("Job found, but already claimed by this site")
324
- else:
325
- yield SkipReason("No job found")
326
-
327
-
328
245
  @sensor(
329
246
  job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
330
247
  default_status=DefaultSensorStatus.RUNNING,
@@ -502,7 +419,6 @@ def on_run_fail(context: RunStatusSensorContext):
502
419
  @repository
503
420
  def repo():
504
421
  graph_jobs = [
505
- gold_translation.to_job(**preset_normal),
506
422
  hello_graph.to_job(name="hello_job"),
507
423
  ensure_jobs.to_job(**preset_normal),
508
424
  apply_metadata_in.to_job(**preset_normal),
@@ -518,8 +434,6 @@ def repo():
518
434
  ]
519
435
  sensors = [
520
436
  done_object_put_ops,
521
- ensure_gold_translation_job,
522
- claim_and_run_gold_translation_curation,
523
437
  process_workflow_job_triggers,
524
438
  claim_and_run_apply_changesheet_jobs,
525
439
  claim_and_run_metadata_in_jobs,
@@ -529,20 +443,6 @@ def repo():
529
443
  return graph_jobs + schedules + sensors
530
444
 
531
445
 
532
- @repository
533
- def translation():
534
- graph_jobs = [jgi_job, gold_job, emsl_job]
535
-
536
- return graph_jobs
537
-
538
-
539
- @repository
540
- def test_translation():
541
- graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
542
-
543
- return graph_jobs
544
-
545
-
546
446
  @repository
547
447
  def biosample_submission_ingest():
548
448
  normal_resources = run_config_frozen__normal_env["resources"]
@@ -602,6 +502,7 @@ def biosample_submission_ingest():
602
502
  "data_object_mapping_file_url": None,
603
503
  "biosample_extras_file_url": None,
604
504
  "biosample_extras_slot_mapping_file_url": None,
505
+ "study_id": None,
605
506
  }
606
507
  },
607
508
  "translate_portal_submission_to_nmdc_schema_database": {
@@ -638,6 +539,7 @@ def biosample_submission_ingest():
638
539
  "data_object_mapping_file_url": None,
639
540
  "biosample_extras_file_url": None,
640
541
  "biosample_extras_slot_mapping_file_url": None,
542
+ "study_id": None,
641
543
  }
642
544
  },
643
545
  "translate_portal_submission_to_nmdc_schema_database": {
@@ -1110,15 +1012,3 @@ def database_records_stitching():
1110
1012
  },
1111
1013
  ),
1112
1014
  ]
1113
-
1114
-
1115
- # @repository
1116
- # def validation():
1117
- # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
1118
- # return graph_jobs
1119
- #
1120
- #
1121
- # @repository
1122
- # def test_validation():
1123
- # graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
1124
- # return graph_jobs
nmdc_runtime/site/resources.py CHANGED
@@ -520,11 +520,24 @@ class MongoDB:
520
520
  self.db = self.client[dbname]
521
521
 
522
522
  def add_docs(self, docs, validate=True, replace=True):
523
+ """
524
+ TODO: Document this function.
525
+ """
523
526
  try:
524
527
  if validate:
525
528
  nmdc_jsonschema_validator_noidpatterns(docs)
526
529
  rv = {}
527
- for collection_name, docs in docs.items():
530
+ for collection_name, collection_docs in docs.items():
531
+ # If `collection_docs` is empty, abort this iteration.
532
+ #
533
+ # Note: We do this because the `bulk_write` method called below will raise
534
+ # an `InvalidOperation` exception if it is passed 0 operations.
535
+ #
536
+ # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
537
+ #
538
+ if len(collection_docs) == 0:
539
+ continue
540
+
528
541
  rv[collection_name] = self.db[collection_name].bulk_write(
529
542
  [
530
543
  (
@@ -532,7 +545,7 @@ class MongoDB:
532
545
  if replace
533
546
  else InsertOne(d)
534
547
  )
535
- for d in docs
548
+ for d in collection_docs
536
549
  ]
537
550
  )
538
551
  now = datetime.now(timezone.utc)
@@ -544,7 +557,7 @@ class MongoDB:
544
557
  "ts": now,
545
558
  # "dtl": {},
546
559
  }
547
- for d in docs
560
+ for d in collection_docs
548
561
  ]
549
562
  )
550
563
  return rv
nmdc_runtime/site/translation/gold_translator.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import collections
2
- import csv
3
2
  import re
4
3
  from typing import List, Tuple, Union
5
4
  from nmdc_schema import nmdc
@@ -342,12 +341,7 @@ class GoldStudyTranslator(Translator):
342
341
  if field_value is None:
343
342
  return None
344
343
 
345
- return nmdc.QuantityValue(
346
- has_raw_value=field_value,
347
- has_numeric_value=nmdc.Double(field_value),
348
- has_unit=unit,
349
- type="nmdc:QuantityValue",
350
- )
344
+ return self._parse_quantity_value(str(field_value), unit)
351
345
 
352
346
  def _get_text_value(
353
347
  self, gold_entity: JSON_OBJECT, gold_field: str
@@ -573,13 +567,11 @@ class GoldStudyTranslator(Translator):
573
567
  gold_biosample_id = gold_biosample["biosampleGoldId"]
574
568
  return nmdc.Biosample(
575
569
  add_date=gold_biosample.get("addDate"),
576
- alt=self._get_quantity_value(
577
- gold_biosample, "altitudeInMeters", unit="meters"
578
- ),
570
+ alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
579
571
  collected_from=nmdc_field_site_id,
580
572
  collection_date=self._get_collection_date(gold_biosample),
581
573
  depth=self._get_quantity_value(
582
- gold_biosample, ("depthInMeters", "depthInMeters2"), unit="meters"
574
+ gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
583
575
  ),
584
576
  description=gold_biosample.get("description"),
585
577
  diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
@@ -618,7 +610,7 @@ class GoldStudyTranslator(Translator):
618
610
  ),
619
611
  specific_ecosystem=gold_biosample.get("specificEcosystem"),
620
612
  subsurface_depth=self._get_quantity_value(
621
- gold_biosample, "subsurfaceDepthInMeters", unit="meters"
613
+ gold_biosample, "subsurfaceDepthInMeters", unit="m"
622
614
  ),
623
615
  temp=self._get_quantity_value(
624
616
  gold_biosample, "sampleCollectionTemperature"
nmdc_runtime/site/translation/neon_benthic_translator.py CHANGED
@@ -11,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
11
11
  from nmdc_runtime.site.translation.neon_utils import (
12
12
  _get_value_or_none,
13
13
  _create_controlled_identified_term_value,
14
- _create_controlled_term_value,
15
14
  _create_geolocation_value,
16
15
  _create_quantity_value,
17
16
  _create_timestamp_value,
nmdc_runtime/site/translation/neon_soil_translator.py CHANGED
@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
10
10
  from nmdc_runtime.site.translation.neon_utils import (
11
11
  _get_value_or_none,
12
12
  _create_controlled_identified_term_value,
13
- _create_controlled_term_value,
14
13
  _create_geolocation_value,
15
14
  _create_quantity_value,
16
15
  _create_timestamp_value,
@@ -153,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
153
152
  collection_date=_create_timestamp_value(
154
153
  biosample_row["collectDate"].values[0]
155
154
  ),
156
- temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Celsius"),
155
+ temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
157
156
  depth=nmdc.QuantityValue(
158
157
  has_minimum_numeric_value=_get_value_or_none(
159
158
  biosample_row, "sampleTopDepth"
@@ -169,13 +168,13 @@ class NeonSoilDataTranslator(Translator):
169
168
  analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
170
169
  env_package=_create_text_value(biosample_row["sampleType"].values[0]),
171
170
  nitro=_create_quantity_value(
172
- biosample_row["nitrogenPercent"].values[0], "percent"
171
+ biosample_row["nitrogenPercent"].values[0], "%"
173
172
  ),
174
173
  org_carb=_create_quantity_value(
175
- biosample_row["organicCPercent"].values[0], "percent"
174
+ biosample_row["organicCPercent"].values[0], "%"
176
175
  ),
177
176
  carb_nitro_ratio=_create_quantity_value(
178
- biosample_row["CNratio"].values[0], None
177
+ biosample_row["CNratio"].values[0], "ratio"
179
178
  ),
180
179
  ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
181
180
  water_content=(
nmdc_runtime/site/translation/neon_surface_water_translator.py CHANGED
@@ -3,7 +3,6 @@ import sqlite3
3
3
  from typing import Dict, Optional, Union
4
4
 
5
5
  import pandas as pd
6
- import requests
7
6
  import requests_cache
8
7
 
9
8
  from nmdc_schema import nmdc
@@ -12,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
12
11
  from nmdc_runtime.site.translation.neon_utils import (
13
12
  _get_value_or_none,
14
13
  _create_controlled_identified_term_value,
15
- _create_controlled_term_value,
16
14
  _create_geolocation_value,
17
15
  _create_quantity_value,
18
16
  _create_timestamp_value,
nmdc_runtime/site/translation/submission_portal_translator.py CHANGED
@@ -145,6 +145,7 @@ class SubmissionPortalTranslator(Translator):
145
145
  # See: https://github.com/microbiomedata/submission-schema/issues/162
146
146
  study_category: Optional[str] = None,
147
147
  study_pi_image_url: Optional[str] = None,
148
+ study_id: Optional[str] = None,
148
149
  # Additional biosample-level metadata with optional column mapping information not captured
149
150
  # by the submission portal currently.
150
151
  # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -165,6 +166,7 @@ class SubmissionPortalTranslator(Translator):
165
166
  nmdc.StudyCategoryEnum(study_category) if study_category else None
166
167
  )
167
168
  self.study_pi_image_url = study_pi_image_url
169
+ self.study_id = study_id
168
170
 
169
171
  self.biosample_extras = group_dicts_by_key(
170
172
  BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
@@ -174,6 +176,13 @@ class SubmissionPortalTranslator(Translator):
174
176
  )
175
177
 
176
178
  self.schema_view: SchemaView = _get_schema_view()
179
+ self._material_processing_subclass_names = []
180
+ for class_name in self.schema_view.class_descendants(
181
+ "MaterialProcessing", reflexive=False
182
+ ):
183
+ class_def = self.schema_view.get_class(class_name)
184
+ if not class_def.abstract:
185
+ self._material_processing_subclass_names.append(class_name)
177
186
 
178
187
  def _get_pi(
179
188
  self, metadata_submission: JSON_OBJECT
@@ -278,61 +287,9 @@ class SubmissionPortalTranslator(Translator):
278
287
  def _get_quantity_value(
279
288
  self, raw_value: Optional[str], unit: Optional[str] = None
280
289
  ) -> Union[nmdc.QuantityValue, None]:
281
- """Construct a nmdc:QuantityValue from a raw value string
290
+ """Construct a nmdc:QuantityValue from a raw value string"""
282
291
 
283
- The regex pattern minimally matches on a single numeric value (possibly
284
- floating point). The pattern can also identify a range represented by
285
- two numeric values separated by a hyphen. It can also identify non-numeric
286
- characters at the end of the string which are interpreted as a unit. A unit
287
- may also be explicitly provided as an argument to this function. If parsing
288
- identifies a unit and a unit argument is provided, the unit argument is used.
289
- If the pattern is not matched at all None is returned.
290
-
291
- TODO: currently the parsed unit string is used as-is. In the future we may want
292
- to be stricter about what we accept or coerce into a controlled value set
293
-
294
- :param raw_value: string to parse
295
- :param unit: optional unit, defaults to None
296
- :return: nmdc:QuantityValue
297
- """
298
- if raw_value is None:
299
- return None
300
-
301
- match = re.fullmatch(
302
- "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
303
- raw_value,
304
- )
305
- if not match:
306
- return None
307
-
308
- qv = nmdc.QuantityValue(
309
- has_raw_value=raw_value,
310
- type="nmdc:QuantityValue",
311
- )
312
- if match.group(2):
313
- # having group 2 means the value is a range like "0 - 1". Either
314
- # group 1 or group 2 might be the minimum especially when handling
315
- # negative ranges like "0 - -1"
316
- num_1 = float(match.group(1))
317
- num_2 = float(match.group(2))
318
- qv.has_minimum_numeric_value = min(num_1, num_2)
319
- qv.has_maximum_numeric_value = max(num_1, num_2)
320
- else:
321
- # otherwise we just have a single numeric value
322
- qv.has_numeric_value = float(match.group(1))
323
-
324
- if unit:
325
- # a unit was manually specified
326
- if match.group(3) and unit != match.group(3):
327
- # a unit was also found in the raw string; issue a warning
328
- # if they don't agree, but keep the manually specified one
329
- logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
330
- qv.has_unit = unit
331
- elif match.group(3):
332
- # a unit was found in the raw string
333
- qv.has_unit = match.group(3)
334
-
335
- return qv
292
+ return self._parse_quantity_value(raw_value, unit)
336
293
 
337
294
  def _get_ontology_class(
338
295
  self, raw_value: Optional[str]
@@ -594,6 +551,14 @@ class SubmissionPortalTranslator(Translator):
594
551
 
595
552
  return data_objects, manifest
596
553
 
554
+ def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
555
+ """Parse a sample link in the form of `ProcessingName:SampleName,..."""
556
+ pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
557
+ match = re.match(pattern, sample_link)
558
+ if not match:
559
+ return None
560
+ return match.group(1), split_strip(match.group(2), ",")
561
+
597
562
  def _translate_study(
598
563
  self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
599
564
  ) -> nmdc.Study:
@@ -804,11 +769,14 @@ class SubmissionPortalTranslator(Translator):
804
769
  "metadata_submission", {}
805
770
  )
806
771
 
807
- # Generate one Study instance based on the metadata submission
808
- nmdc_study_id = self._id_minter("nmdc:Study")[0]
809
- database.study_set = [
810
- self._translate_study(metadata_submission_data, nmdc_study_id)
811
- ]
772
+ # Generate one Study instance based on the metadata submission, if a study_id wasn't provided
773
+ if self.study_id:
774
+ nmdc_study_id = self.study_id
775
+ else:
776
+ nmdc_study_id = self._id_minter("nmdc:Study")[0]
777
+ database.study_set = [
778
+ self._translate_study(metadata_submission_data, nmdc_study_id)
779
+ ]
812
780
 
813
781
  # Automatically populate the `env_package` field in the sample data based on which
814
782
  # environmental data tab the sample data came from.
@@ -840,15 +808,63 @@ class SubmissionPortalTranslator(Translator):
840
808
  )
841
809
 
842
810
  # Translate the sample data into nmdc:Biosample objects
843
- database.biosample_set = [
844
- self._translate_biosample(
845
- sample_data,
846
- nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
847
- nmdc_study_id=nmdc_study_id,
848
- )
849
- for sample_data_id, sample_data in sample_data_by_id.items()
850
- if sample_data
851
- ]
811
+ database.biosample_set = []
812
+ for sample_data_id, sample_data in sample_data_by_id.items():
813
+ # This shouldn't happen, but just in case skip empty sample data
814
+ if not sample_data:
815
+ continue
816
+
817
+ # Find the first tab that has a sample_link value and attempt to parse it
818
+ sample_link = ""
819
+ for tab in sample_data:
820
+ if tab.get("sample_link"):
821
+ sample_link = tab.get("sample_link")
822
+ break
823
+ parsed_sample_link = self._parse_sample_link(sample_link)
824
+
825
+ # If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
826
+ # format, then create a ProcessedSample and MaterialProcessing instance instead of a
827
+ # Biosample instance. The input samples must be present in the submission for this to
828
+ # work. An exception is raised if any of the referenced input samples are missing.
829
+ if parsed_sample_link is not None:
830
+ processing_type, processing_inputs = parsed_sample_link
831
+ if not all(
832
+ input_id in sample_data_to_nmdc_biosample_ids
833
+ for input_id in processing_inputs
834
+ ):
835
+ raise ValueError(
836
+ f"Could not find all input samples in sample_link '{sample_link}'"
837
+ )
838
+ processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
839
+ database.processed_sample_set.append(
840
+ nmdc.ProcessedSample(
841
+ id=processed_sample_id,
842
+ type="nmdc:ProcessedSample",
843
+ name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
844
+ )
845
+ )
846
+
847
+ processing_class = getattr(nmdc, processing_type)
848
+ material_processing = processing_class(
849
+ id=self._id_minter(f"nmdc:{processing_type}")[0],
850
+ type=f"nmdc:{processing_type}",
851
+ has_input=[
852
+ sample_data_to_nmdc_biosample_ids[input_id]
853
+ for input_id in processing_inputs
854
+ ],
855
+ has_output=[processed_sample_id],
856
+ )
857
+ database.material_processing_set.append(material_processing)
858
+
859
+ # If there was no sample_link or it doesn't follow the expected format, create a
860
+ # Biosample instance as normal.
861
+ else:
862
+ biosample = self._translate_biosample(
863
+ sample_data,
864
+ nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
865
+ nmdc_study_id=nmdc_study_id,
866
+ )
867
+ database.biosample_set.append(biosample)
852
868
 
853
869
  # This section handles the translation of information in the external sequencing tabs into
854
870
  # various NMDC objects.
nmdc_runtime/site/translation/translator.py CHANGED
@@ -1,9 +1,13 @@
1
+ import logging
2
+ import re
1
3
  from abc import ABC, abstractmethod
2
- from typing import Any, Callable, Dict, List, Optional
4
+ from typing import Any, Callable, Dict, List, Optional, Union
3
5
  from nmdc_schema import nmdc
4
6
 
5
7
  JSON_OBJECT = Dict[str, Any]
6
8
 
9
+ logger = logging.getLogger(__name__)
10
+
7
11
 
8
12
  class Translator(ABC):
9
13
  def __init__(
@@ -27,3 +31,61 @@ class Translator(ABC):
27
31
  @abstractmethod
28
32
  def get_database(self) -> nmdc.Database:
29
33
  pass
34
+
35
+ def _parse_quantity_value(
36
+ self, raw_value: Optional[str], unit: Optional[str] = None
37
+ ) -> Union[nmdc.QuantityValue, None]:
38
+ """Construct a nmdc:QuantityValue from a raw value string
39
+
40
+ The regex pattern minimally matches on a single numeric value (possibly
41
+ floating point). The pattern can also identify a range represented by
42
+ two numeric values separated by a hyphen. It can also identify non-numeric
43
+ characters at the end of the string which are interpreted as a unit. A unit
44
+ may also be explicitly provided as an argument to this function. If parsing
45
+ identifies a unit and a unit argument is provided, the unit argument is used.
46
+ If the pattern is not matched at all None is returned.
47
+
48
+ :param raw_value: string to parse
49
+ :param unit: optional unit, defaults to None. If None, the unit is extracted from the
50
+ raw_value. If a unit is provided, it will override the unit extracted from the
51
+ raw_value.
52
+ :return: nmdc:QuantityValue
53
+ """
54
+ if raw_value is None:
55
+ return None
56
+
57
+ match = re.fullmatch(
58
+ "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
59
+ raw_value,
60
+ )
61
+ if not match:
62
+ return None
63
+
64
+ quantity_value_kwargs = {
65
+ "has_raw_value": raw_value,
66
+ "type": "nmdc:QuantityValue",
67
+ }
68
+ if match.group(2):
69
+ # having group 2 means the value is a range like "0 - 1". Either
70
+ # group 1 or group 2 might be the minimum especially when handling
71
+ # negative ranges like "0 - -1"
72
+ num_1 = float(match.group(1))
73
+ num_2 = float(match.group(2))
74
+ quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
75
+ quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
76
+ else:
77
+ # otherwise we just have a single numeric value
78
+ quantity_value_kwargs["has_numeric_value"] = float(match.group(1))
79
+
80
+ if unit:
81
+ # a unit was manually specified
82
+ if match.group(3) and unit != match.group(3):
83
+ # a unit was also found in the raw string; issue a warning
84
+ # if they don't agree, but keep the manually specified one
85
+ logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
86
+ quantity_value_kwargs["has_unit"] = unit
87
+ elif match.group(3):
88
+ # a unit was found in the raw string
89
+ quantity_value_kwargs["has_unit"] = match.group(3)
90
+
91
+ return nmdc.QuantityValue(**quantity_value_kwargs)
nmdc_runtime/site/util.py CHANGED
@@ -3,10 +3,11 @@ import os
3
3
  from functools import lru_cache
4
4
  from pymongo.database import Database as MongoDatabase
5
5
  from subprocess import Popen, PIPE, STDOUT, CalledProcessError
6
- from toolz import groupby
6
+ from refscan.lib.helpers import get_collection_names_from_schema
7
7
 
8
- from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
9
8
  from nmdc_runtime.site.resources import mongo_resource
9
+ from nmdc_runtime.util import nmdc_schema_view
10
+
10
11
 
11
12
  mode_test = {
12
13
  "resource_defs": {"mongo": mongo_resource}
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
37
38
 
38
39
  @lru_cache
39
40
  def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
41
+ """
42
+ TODO: Document this function.
43
+ """
44
+ schema_view = nmdc_schema_view()
40
45
  present_collection_names = set(mdb.list_collection_names())
41
46
  return {
42
47
  name: (
43
48
  name in present_collection_names and "id_1" in mdb[name].index_information()
44
49
  )
45
- for name in get_collection_names_from_schema()
50
+ for name in get_collection_names_from_schema(schema_view)
46
51
  }
47
52
 
48
53
 
nmdc_runtime/site/validation/util.py CHANGED
@@ -1,4 +1,4 @@
1
- from dagster import op, AssetMaterialization, AssetKey, EventMetadata
1
+ from dagster import op, AssetMaterialization, AssetKey, MetadataValue
2
2
  from jsonschema import Draft7Validator
3
3
  from nmdc_runtime.util import get_nmdc_jsonschema_dict
4
4
  from toolz import dissoc
@@ -92,10 +92,15 @@ def announce_validation_report(context, report, api_object):
92
92
  asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
93
93
  description=f"{collection_name} translation validation",
94
94
  metadata={
95
- # https://docs.dagster.io/_apidocs/solids#event-metadata
96
- # also .json, .md, .path, .url, .python_artifact, ...
97
- "n_errors": EventMetadata.int(len(report["errors"])),
98
- "object_id": EventMetadata.text(api_object["id"]),
95
+ # Note: When this code was originally written, it used Dagster's `EventMetadata` class,
96
+ # which has since been replaced by Dagster's `MetadataValue` class.
97
+ #
98
+ # Reference:
99
+ # - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
100
+ # - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
101
+ #
102
+ "n_errors": MetadataValue.int(len(report["errors"])),
103
+ "object_id": MetadataValue.text(api_object["id"]),
99
104
  },
100
105
  )
101
106