nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,6 +2,7 @@ import logging
2
2
  import re
3
3
  from collections import namedtuple
4
4
  from datetime import datetime
5
+ from decimal import Decimal
5
6
  from enum import Enum
6
7
  from functools import lru_cache
7
8
  from importlib import resources
@@ -47,6 +48,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
47
48
  (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
48
49
  }
49
50
 
51
+ UNIT_OVERRIDES: dict[str, dict[str, str]] = {
52
+ "Biosample": {
53
+ "depth": "m",
54
+ }
55
+ }
56
+
50
57
 
51
58
  class EnvironmentPackage(Enum):
52
59
  r"""
@@ -139,6 +146,7 @@ class SubmissionPortalTranslator(Translator):
139
146
  # See: https://github.com/microbiomedata/submission-schema/issues/162
140
147
  study_category: Optional[str] = None,
141
148
  study_pi_image_url: Optional[str] = None,
149
+ study_id: Optional[str] = None,
142
150
  # Additional biosample-level metadata with optional column mapping information not captured
143
151
  # by the submission portal currently.
144
152
  # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -159,6 +167,7 @@ class SubmissionPortalTranslator(Translator):
159
167
  nmdc.StudyCategoryEnum(study_category) if study_category else None
160
168
  )
161
169
  self.study_pi_image_url = study_pi_image_url
170
+ self.study_id = study_id
162
171
 
163
172
  self.biosample_extras = group_dicts_by_key(
164
173
  BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
@@ -168,6 +177,13 @@ class SubmissionPortalTranslator(Translator):
168
177
  )
169
178
 
170
179
  self.schema_view: SchemaView = _get_schema_view()
180
+ self._material_processing_subclass_names = []
181
+ for class_name in self.schema_view.class_descendants(
182
+ "MaterialProcessing", reflexive=False
183
+ ):
184
+ class_def = self.schema_view.get_class(class_name)
185
+ if not class_def.abstract:
186
+ self._material_processing_subclass_names.append(class_name)
171
187
 
172
188
  def _get_pi(
173
189
  self, metadata_submission: JSON_OBJECT
@@ -270,63 +286,39 @@ class SubmissionPortalTranslator(Translator):
270
286
  return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
271
287
 
272
288
  def _get_quantity_value(
273
- self, raw_value: Optional[str], unit: Optional[str] = None
289
+ self,
290
+ raw_value: Optional[str | int | float],
291
+ slot_definition: SlotDefinition,
292
+ unit: Optional[str] = None,
274
293
  ) -> Union[nmdc.QuantityValue, None]:
275
- """Construct a nmdc:QuantityValue from a raw value string
276
-
277
- The regex pattern minimally matches on a single numeric value (possibly
278
- floating point). The pattern can also identify a range represented by
279
- two numeric values separated by a hyphen. It can also identify non-numeric
280
- characters at the end of the string which are interpreted as a unit. A unit
281
- may also be explicitly provided as an argument to this function. If parsing
282
- identifies a unit and a unit argument is provided, the unit argument is used.
283
- If the pattern is not matched at all None is returned.
284
-
285
- TODO: currently the parsed unit string is used as-is. In the future we may want
286
- to be stricter about what we accept or coerce into a controlled value set
287
-
288
- :param raw_value: string to parse
289
- :param unit: optional unit, defaults to None
290
- :return: nmdc:QuantityValue
291
- """
292
- if raw_value is None:
293
- return None
294
-
295
- match = re.fullmatch(
296
- "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
297
- raw_value,
298
- )
299
- if not match:
300
- return None
294
+ """Construct a nmdc:QuantityValue from a raw value string"""
295
+
296
+ # If the storage_units annotation is present on the slot and it only contains one unit (i.e.
297
+ # not a pipe-separated list of units) then use that unit.
298
+ if "storage_units" in slot_definition.annotations:
299
+ storage_units = slot_definition.annotations["storage_units"].value
300
+ if storage_units and "|" not in storage_units:
301
+ unit = storage_units
302
+
303
+ # If the raw_value is numeric, directly construct a QuantityValue with the inferred unit.
304
+ if isinstance(raw_value, (int, float)):
305
+ if unit is None:
306
+ raise ValueError(
307
+ f"While processing value for slot {slot_definition.name}, a numeric value was provided but no unit could be inferred."
308
+ )
309
+ # Constructing a Decimal directly from a float will maintain the full precision of the
310
+ # float (i.e. numbers like 0.5 cannot be represented exactly). Converting the float to
311
+ # a string first and then constructing the Decimal from that string will give a more
312
+ # expected value.
313
+ value_as_str = str(raw_value)
314
+ return nmdc.QuantityValue(
315
+ has_raw_value=value_as_str,
316
+ has_numeric_value=Decimal(value_as_str),
317
+ has_unit=unit,
318
+ type="nmdc:QuantityValue",
319
+ )
301
320
 
302
- qv = nmdc.QuantityValue(
303
- has_raw_value=raw_value,
304
- type="nmdc:QuantityValue",
305
- )
306
- if match.group(2):
307
- # having group 2 means the value is a range like "0 - 1". Either
308
- # group 1 or group 2 might be the minimum especially when handling
309
- # negative ranges like "0 - -1"
310
- num_1 = float(match.group(1))
311
- num_2 = float(match.group(2))
312
- qv.has_minimum_numeric_value = min(num_1, num_2)
313
- qv.has_maximum_numeric_value = max(num_1, num_2)
314
- else:
315
- # otherwise we just have a single numeric value
316
- qv.has_numeric_value = float(match.group(1))
317
-
318
- if unit:
319
- # a unit was manually specified
320
- if match.group(3) and unit != match.group(3):
321
- # a unit was also found in the raw string; issue a warning
322
- # if they don't agree, but keep the manually specified one
323
- logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
324
- qv.has_unit = unit
325
- elif match.group(3):
326
- # a unit was found in the raw string
327
- qv.has_unit = match.group(3)
328
-
329
- return qv
321
+ return self._parse_quantity_value(raw_value, unit)
330
322
 
331
323
  def _get_ontology_class(
332
324
  self, raw_value: Optional[str]
@@ -475,6 +467,50 @@ class SubmissionPortalTranslator(Translator):
475
467
 
476
468
  return value
477
469
 
470
+ def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
471
+ """Collect and format DOIs from submission portal schema in nmdc format DOIs
472
+
473
+ If there were no DOIs, None is returned.
474
+
475
+ :param metadata_submission: submission portal entry
476
+ :return: list of nmdc.DOI objects
477
+ """
478
+ data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
479
+ award_dois = self._get_from(
480
+ metadata_submission, ["multiOmicsForm", "awardDois"]
481
+ )
482
+ if data_dois and len(data_dois) > 0:
483
+ updated_data_dois = [
484
+ nmdc.Doi(
485
+ doi_category="dataset_doi",
486
+ doi_provider=doi["provider"],
487
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
488
+ type="nmdc:Doi",
489
+ )
490
+ for doi in data_dois
491
+ ]
492
+ else:
493
+ updated_data_dois = []
494
+
495
+ if award_dois and len(award_dois) > 0:
496
+ updated_award_dois = [
497
+ nmdc.Doi(
498
+ doi_category="award_doi",
499
+ doi_provider=doi["provider"],
500
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
501
+ type="nmdc:Doi",
502
+ )
503
+ for doi in award_dois
504
+ ]
505
+ else:
506
+ updated_award_dois = []
507
+
508
+ return_val = updated_data_dois + updated_award_dois
509
+ if len(return_val) == 0:
510
+ return_val = None
511
+
512
+ return return_val
513
+
478
514
  def _get_data_objects_from_fields(
479
515
  self,
480
516
  sample_data: JSON_OBJECT,
@@ -544,6 +580,14 @@ class SubmissionPortalTranslator(Translator):
544
580
 
545
581
  return data_objects, manifest
546
582
 
583
+ def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
584
+ """Parse a sample link in the form of `ProcessingName:SampleName,..."""
585
+ pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
586
+ match = re.match(pattern, sample_link)
587
+ if not match:
588
+ return None
589
+ return match.group(1), split_strip(match.group(2), ",")
590
+
547
591
  def _translate_study(
548
592
  self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
549
593
  ) -> nmdc.Study:
@@ -591,6 +635,7 @@ class SubmissionPortalTranslator(Translator):
591
635
  websites=self._get_from(
592
636
  metadata_submission, ["studyForm", "linkOutWebpage"]
593
637
  ),
638
+ associated_dois=self._get_study_dois(metadata_submission),
594
639
  )
595
640
 
596
641
  def _transform_value_for_slot(
@@ -605,6 +650,7 @@ class SubmissionPortalTranslator(Translator):
605
650
  elif slot.range == "QuantityValue":
606
651
  transformed_value = self._get_quantity_value(
607
652
  value,
653
+ slot,
608
654
  unit=unit,
609
655
  )
610
656
  elif slot.range == "ControlledIdentifiedTermValue":
@@ -660,6 +706,17 @@ class SubmissionPortalTranslator(Translator):
660
706
  logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
661
707
  continue
662
708
 
709
+ # This step handles cases where the submission portal/schema instructs a user to
710
+ # provide a value in a specific unit. The unit cannot be parsed out of the raw value
711
+ # in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
712
+ # go away once units are encoded in the schema itself.
713
+ # See: https://github.com/microbiomedata/nmdc-schema/issues/2517
714
+ if class_name in UNIT_OVERRIDES:
715
+ # If the class has unit overrides, check if the slot is in the overrides
716
+ unit_overrides = UNIT_OVERRIDES[class_name]
717
+ if slot_name in unit_overrides:
718
+ unit = unit_overrides[slot_name]
719
+
663
720
  slot_definition = self.schema_view.induced_slot(slot_name, class_name)
664
721
  if slot_definition.multivalued:
665
722
  value_list = value
@@ -742,11 +799,14 @@ class SubmissionPortalTranslator(Translator):
742
799
  "metadata_submission", {}
743
800
  )
744
801
 
745
- # Generate one Study instance based on the metadata submission
746
- nmdc_study_id = self._id_minter("nmdc:Study")[0]
747
- database.study_set = [
748
- self._translate_study(metadata_submission_data, nmdc_study_id)
749
- ]
802
+ # Generate one Study instance based on the metadata submission, if a study_id wasn't provided
803
+ if self.study_id:
804
+ nmdc_study_id = self.study_id
805
+ else:
806
+ nmdc_study_id = self._id_minter("nmdc:Study")[0]
807
+ database.study_set = [
808
+ self._translate_study(metadata_submission_data, nmdc_study_id)
809
+ ]
750
810
 
751
811
  # Automatically populate the `env_package` field in the sample data based on which
752
812
  # environmental data tab the sample data came from.
@@ -778,15 +838,63 @@ class SubmissionPortalTranslator(Translator):
778
838
  )
779
839
 
780
840
  # Translate the sample data into nmdc:Biosample objects
781
- database.biosample_set = [
782
- self._translate_biosample(
783
- sample_data,
784
- nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
785
- nmdc_study_id=nmdc_study_id,
786
- )
787
- for sample_data_id, sample_data in sample_data_by_id.items()
788
- if sample_data
789
- ]
841
+ database.biosample_set = []
842
+ for sample_data_id, sample_data in sample_data_by_id.items():
843
+ # This shouldn't happen, but just in case skip empty sample data
844
+ if not sample_data:
845
+ continue
846
+
847
+ # Find the first tab that has a sample_link value and attempt to parse it
848
+ sample_link = ""
849
+ for tab in sample_data:
850
+ if tab.get("sample_link"):
851
+ sample_link = tab.get("sample_link")
852
+ break
853
+ parsed_sample_link = self._parse_sample_link(sample_link)
854
+
855
+ # If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
856
+ # format, then create a ProcessedSample and MaterialProcessing instance instead of a
857
+ # Biosample instance. The input samples must be present in the submission for this to
858
+ # work. An exception is raised if any of the referenced input samples are missing.
859
+ if parsed_sample_link is not None:
860
+ processing_type, processing_inputs = parsed_sample_link
861
+ if not all(
862
+ input_id in sample_data_to_nmdc_biosample_ids
863
+ for input_id in processing_inputs
864
+ ):
865
+ raise ValueError(
866
+ f"Could not find all input samples in sample_link '{sample_link}'"
867
+ )
868
+ processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
869
+ database.processed_sample_set.append(
870
+ nmdc.ProcessedSample(
871
+ id=processed_sample_id,
872
+ type="nmdc:ProcessedSample",
873
+ name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
874
+ )
875
+ )
876
+
877
+ processing_class = getattr(nmdc, processing_type)
878
+ material_processing = processing_class(
879
+ id=self._id_minter(f"nmdc:{processing_type}")[0],
880
+ type=f"nmdc:{processing_type}",
881
+ has_input=[
882
+ sample_data_to_nmdc_biosample_ids[input_id]
883
+ for input_id in processing_inputs
884
+ ],
885
+ has_output=[processed_sample_id],
886
+ )
887
+ database.material_processing_set.append(material_processing)
888
+
889
+ # If there was no sample_link or it doesn't follow the expected format, create a
890
+ # Biosample instance as normal.
891
+ else:
892
+ biosample = self._translate_biosample(
893
+ sample_data,
894
+ nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
895
+ nmdc_study_id=nmdc_study_id,
896
+ )
897
+ database.biosample_set.append(biosample)
790
898
 
791
899
  # This section handles the translation of information in the external sequencing tabs into
792
900
  # various NMDC objects.
@@ -972,3 +1080,42 @@ class SubmissionPortalTranslator(Translator):
972
1080
  database.data_generation_set.append(nucleotide_sequencing)
973
1081
 
974
1082
  return database
1083
+
1084
+ @staticmethod
1085
+ def set_study_images(
1086
+ nmdc_study: nmdc.Study,
1087
+ pi_image_url: Optional[str],
1088
+ primary_study_image_url: Optional[str],
1089
+ study_images_url: Optional[list[str]],
1090
+ ) -> None:
1091
+ """Set images for a study based on provided URLs."""
1092
+
1093
+ if pi_image_url:
1094
+ if not nmdc_study.principal_investigator:
1095
+ nmdc_study.principal_investigator = nmdc.PersonValue(
1096
+ type="nmdc:PersonValue"
1097
+ )
1098
+ nmdc_study.principal_investigator.profile_image_url = pi_image_url
1099
+
1100
+ if primary_study_image_url:
1101
+ if not nmdc_study.study_image:
1102
+ nmdc_study.study_image = []
1103
+ nmdc_study.study_image.append(
1104
+ nmdc.ImageValue(
1105
+ type="nmdc:ImageValue",
1106
+ url=primary_study_image_url,
1107
+ display_order=0,
1108
+ )
1109
+ )
1110
+
1111
+ if study_images_url:
1112
+ if not nmdc_study.study_image:
1113
+ nmdc_study.study_image = []
1114
+ for idx, image_url in enumerate(study_images_url, start=1):
1115
+ nmdc_study.study_image.append(
1116
+ nmdc.ImageValue(
1117
+ type="nmdc:ImageValue",
1118
+ url=image_url,
1119
+ display_order=idx,
1120
+ )
1121
+ )
@@ -1,9 +1,14 @@
1
+ import logging
2
+ import re
1
3
  from abc import ABC, abstractmethod
2
- from typing import Any, Callable, Dict, List, Optional
4
+ from decimal import Decimal
5
+ from typing import Any, Callable, Dict, List, Optional, Union
3
6
  from nmdc_schema import nmdc
4
7
 
5
8
  JSON_OBJECT = Dict[str, Any]
6
9
 
10
+ logger = logging.getLogger(__name__)
11
+
7
12
 
8
13
  class Translator(ABC):
9
14
  def __init__(
@@ -27,3 +32,61 @@ class Translator(ABC):
27
32
  @abstractmethod
28
33
  def get_database(self) -> nmdc.Database:
29
34
  pass
35
+
36
+ def _parse_quantity_value(
37
+ self, raw_value: Optional[str], unit: Optional[str] = None
38
+ ) -> Union[nmdc.QuantityValue, None]:
39
+ """Construct a nmdc:QuantityValue from a raw value string
40
+
41
+ The regex pattern minimally matches on a single numeric value (possibly
42
+ floating point). The pattern can also identify a range represented by
43
+ two numeric values separated by a hyphen. It can also identify non-numeric
44
+ characters at the end of the string which are interpreted as a unit. A unit
45
+ may also be explicitly provided as an argument to this function. If parsing
46
+ identifies a unit and a unit argument is provided, the unit argument is used.
47
+ If the pattern is not matched at all None is returned.
48
+
49
+ :param raw_value: string to parse
50
+ :param unit: optional unit, defaults to None. If None, the unit is extracted from the
51
+ raw_value. If a unit is provided, it will override the unit extracted from the
52
+ raw_value.
53
+ :return: nmdc:QuantityValue
54
+ """
55
+ if raw_value is None:
56
+ return None
57
+
58
+ match = re.fullmatch(
59
+ "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
60
+ raw_value,
61
+ )
62
+ if not match:
63
+ return None
64
+
65
+ quantity_value_kwargs = {
66
+ "has_raw_value": raw_value,
67
+ "type": "nmdc:QuantityValue",
68
+ }
69
+ if match.group(2):
70
+ # having group 2 means the value is a range like "0 - 1". Either
71
+ # group 1 or group 2 might be the minimum especially when handling
72
+ # negative ranges like "0 - -1"
73
+ num_1 = Decimal(match.group(1))
74
+ num_2 = Decimal(match.group(2))
75
+ quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
76
+ quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
77
+ else:
78
+ # otherwise we just have a single numeric value
79
+ quantity_value_kwargs["has_numeric_value"] = Decimal(match.group(1))
80
+
81
+ if unit:
82
+ # a unit was manually specified
83
+ if match.group(3) and unit != match.group(3):
84
+ # a unit was also found in the raw string; issue a warning
85
+ # if they don't agree, but keep the manually specified one
86
+ logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
87
+ quantity_value_kwargs["has_unit"] = unit
88
+ elif match.group(3):
89
+ # a unit was found in the raw string
90
+ quantity_value_kwargs["has_unit"] = match.group(3)
91
+
92
+ return nmdc.QuantityValue(**quantity_value_kwargs)
nmdc_runtime/site/util.py CHANGED
@@ -3,10 +3,11 @@ import os
3
3
  from functools import lru_cache
4
4
  from pymongo.database import Database as MongoDatabase
5
5
  from subprocess import Popen, PIPE, STDOUT, CalledProcessError
6
- from toolz import groupby
6
+ from refscan.lib.helpers import get_collection_names_from_schema
7
7
 
8
- from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
9
8
  from nmdc_runtime.site.resources import mongo_resource
9
+ from nmdc_runtime.util import nmdc_schema_view
10
+
10
11
 
11
12
  mode_test = {
12
13
  "resource_defs": {"mongo": mongo_resource}
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
37
38
 
38
39
  @lru_cache
39
40
  def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
41
+ """
42
+ TODO: Document this function.
43
+ """
44
+ schema_view = nmdc_schema_view()
40
45
  present_collection_names = set(mdb.list_collection_names())
41
46
  return {
42
47
  name: (
43
48
  name in present_collection_names and "id_1" in mdb[name].index_information()
44
49
  )
45
- for name in get_collection_names_from_schema()
50
+ for name in get_collection_names_from_schema(schema_view)
46
51
  }
47
52
 
48
53
 
@@ -1,6 +1,5 @@
1
- from dagster import op, AssetMaterialization, AssetKey, EventMetadata
2
- from jsonschema import Draft7Validator
3
- from nmdc_runtime.util import get_nmdc_jsonschema_dict
1
+ from dagster import op, AssetMaterialization, AssetKey, MetadataValue
2
+ from nmdc_runtime.util import get_nmdc_schema_validator
4
3
  from toolz import dissoc
5
4
 
6
5
  from nmdc_runtime.site.resources import mongo_resource
@@ -61,19 +60,19 @@ def validate_mongo_collection(context, collection_name: str):
61
60
  collection = mongo_db[collection_name] # get mongo collection
62
61
  db_set = collection_name.split(".")[0]
63
62
 
64
- validator = Draft7Validator(get_nmdc_jsonschema_dict())
63
+ validator = get_nmdc_schema_validator()
65
64
  validation_errors = []
66
65
 
67
66
  for count, doc in enumerate(collection.find()):
68
67
  # add logging for progress?
69
68
  # e.g.: if count % 1000 == 0: context.log.info(“done X of Y")
70
69
  doc = dissoc(doc, "_id") # dissoc _id
71
- errors = list(validator.iter_errors({f"{db_set}": [doc]}))
72
- if len(errors) > 0:
70
+ report = validator.validate({f"{db_set}": [doc]}, target_class="Database")
71
+ if len(report.results) > 0:
73
72
  if "id" in doc.keys():
74
- errors = {doc["id"]: [e.message for e in errors]}
73
+ errors = {doc["id"]: [r.message for r in report.results]}
75
74
  else:
76
- errors = {f"missing id ({count})": [e.message for e in errors]}
75
+ errors = {f"missing id ({count})": [r.message for r in report.results]}
77
76
  validation_errors.append(errors)
78
77
 
79
78
  return {"collection_name": collection_name, "errors": validation_errors}
@@ -92,10 +91,15 @@ def announce_validation_report(context, report, api_object):
92
91
  asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
93
92
  description=f"{collection_name} translation validation",
94
93
  metadata={
95
- # https://docs.dagster.io/_apidocs/solids#event-metadata
96
- # also .json, .md, .path, .url, .python_artifact, ...
97
- "n_errors": EventMetadata.int(len(report["errors"])),
98
- "object_id": EventMetadata.text(api_object["id"]),
94
+ # Note: When this code was originally written, it used Dagster's `EventMetadata` class,
95
+ # which has since been replaced by Dagster's `MetadataValue` class.
96
+ #
97
+ # Reference:
98
+ # - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
99
+ # - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
100
+ #
101
+ "n_errors": MetadataValue.int(len(report["errors"])),
102
+ "object_id": MetadataValue.text(api_object["id"]),
99
103
  },
100
104
  )
101
105
 
@@ -0,0 +1,13 @@
1
+ load_from:
2
+ - python_package:
3
+ package_name: nmdc_runtime.site.repository
4
+ attribute: repo
5
+ - python_package:
6
+ package_name: nmdc_runtime.site.repository
7
+ attribute: biosample_submission_ingest
8
+ - python_package:
9
+ package_name: nmdc_runtime.site.repository
10
+ attribute: biosample_export
11
+ - python_package:
12
+ package_name: nmdc_runtime.site.repository
13
+ attribute: database_records_stitching