nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
import re
|
|
3
3
|
from collections import namedtuple
|
|
4
4
|
from datetime import datetime
|
|
5
|
+
from decimal import Decimal
|
|
5
6
|
from enum import Enum
|
|
6
7
|
from functools import lru_cache
|
|
7
8
|
from importlib import resources
|
|
@@ -47,6 +48,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
|
|
|
47
48
|
(INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
|
|
48
49
|
}
|
|
49
50
|
|
|
51
|
+
UNIT_OVERRIDES: dict[str, dict[str, str]] = {
|
|
52
|
+
"Biosample": {
|
|
53
|
+
"depth": "m",
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
50
57
|
|
|
51
58
|
class EnvironmentPackage(Enum):
|
|
52
59
|
r"""
|
|
@@ -139,6 +146,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
139
146
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
140
147
|
study_category: Optional[str] = None,
|
|
141
148
|
study_pi_image_url: Optional[str] = None,
|
|
149
|
+
study_id: Optional[str] = None,
|
|
142
150
|
# Additional biosample-level metadata with optional column mapping information not captured
|
|
143
151
|
# by the submission portal currently.
|
|
144
152
|
# See: https://github.com/microbiomedata/submission-schema/issues/162
|
|
@@ -159,6 +167,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
159
167
|
nmdc.StudyCategoryEnum(study_category) if study_category else None
|
|
160
168
|
)
|
|
161
169
|
self.study_pi_image_url = study_pi_image_url
|
|
170
|
+
self.study_id = study_id
|
|
162
171
|
|
|
163
172
|
self.biosample_extras = group_dicts_by_key(
|
|
164
173
|
BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
|
|
@@ -168,6 +177,13 @@ class SubmissionPortalTranslator(Translator):
|
|
|
168
177
|
)
|
|
169
178
|
|
|
170
179
|
self.schema_view: SchemaView = _get_schema_view()
|
|
180
|
+
self._material_processing_subclass_names = []
|
|
181
|
+
for class_name in self.schema_view.class_descendants(
|
|
182
|
+
"MaterialProcessing", reflexive=False
|
|
183
|
+
):
|
|
184
|
+
class_def = self.schema_view.get_class(class_name)
|
|
185
|
+
if not class_def.abstract:
|
|
186
|
+
self._material_processing_subclass_names.append(class_name)
|
|
171
187
|
|
|
172
188
|
def _get_pi(
|
|
173
189
|
self, metadata_submission: JSON_OBJECT
|
|
@@ -270,63 +286,39 @@ class SubmissionPortalTranslator(Translator):
|
|
|
270
286
|
return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
|
|
271
287
|
|
|
272
288
|
def _get_quantity_value(
|
|
273
|
-
self,
|
|
289
|
+
self,
|
|
290
|
+
raw_value: Optional[str | int | float],
|
|
291
|
+
slot_definition: SlotDefinition,
|
|
292
|
+
unit: Optional[str] = None,
|
|
274
293
|
) -> Union[nmdc.QuantityValue, None]:
|
|
275
|
-
"""Construct a nmdc:QuantityValue from a raw value string
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
294
|
+
"""Construct a nmdc:QuantityValue from a raw value string"""
|
|
295
|
+
|
|
296
|
+
# If the storage_units annotation is present on the slot and it only contains one unit (i.e.
|
|
297
|
+
# not a pipe-separated list of units) then use that unit.
|
|
298
|
+
if "storage_units" in slot_definition.annotations:
|
|
299
|
+
storage_units = slot_definition.annotations["storage_units"].value
|
|
300
|
+
if storage_units and "|" not in storage_units:
|
|
301
|
+
unit = storage_units
|
|
302
|
+
|
|
303
|
+
# If the raw_value is numeric, directly construct a QuantityValue with the inferred unit.
|
|
304
|
+
if isinstance(raw_value, (int, float)):
|
|
305
|
+
if unit is None:
|
|
306
|
+
raise ValueError(
|
|
307
|
+
f"While processing value for slot {slot_definition.name}, a numeric value was provided but no unit could be inferred."
|
|
308
|
+
)
|
|
309
|
+
# Constructing a Decimal directly from a float will maintain the full precision of the
|
|
310
|
+
# float (i.e. numbers like 0.5 cannot be represented exactly). Converting the float to
|
|
311
|
+
# a string first and then constructing the Decimal from that string will give a more
|
|
312
|
+
# expected value.
|
|
313
|
+
value_as_str = str(raw_value)
|
|
314
|
+
return nmdc.QuantityValue(
|
|
315
|
+
has_raw_value=value_as_str,
|
|
316
|
+
has_numeric_value=Decimal(value_as_str),
|
|
317
|
+
has_unit=unit,
|
|
318
|
+
type="nmdc:QuantityValue",
|
|
319
|
+
)
|
|
301
320
|
|
|
302
|
-
|
|
303
|
-
has_raw_value=raw_value,
|
|
304
|
-
type="nmdc:QuantityValue",
|
|
305
|
-
)
|
|
306
|
-
if match.group(2):
|
|
307
|
-
# having group 2 means the value is a range like "0 - 1". Either
|
|
308
|
-
# group 1 or group 2 might be the minimum especially when handling
|
|
309
|
-
# negative ranges like "0 - -1"
|
|
310
|
-
num_1 = float(match.group(1))
|
|
311
|
-
num_2 = float(match.group(2))
|
|
312
|
-
qv.has_minimum_numeric_value = min(num_1, num_2)
|
|
313
|
-
qv.has_maximum_numeric_value = max(num_1, num_2)
|
|
314
|
-
else:
|
|
315
|
-
# otherwise we just have a single numeric value
|
|
316
|
-
qv.has_numeric_value = float(match.group(1))
|
|
317
|
-
|
|
318
|
-
if unit:
|
|
319
|
-
# a unit was manually specified
|
|
320
|
-
if match.group(3) and unit != match.group(3):
|
|
321
|
-
# a unit was also found in the raw string; issue a warning
|
|
322
|
-
# if they don't agree, but keep the manually specified one
|
|
323
|
-
logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
324
|
-
qv.has_unit = unit
|
|
325
|
-
elif match.group(3):
|
|
326
|
-
# a unit was found in the raw string
|
|
327
|
-
qv.has_unit = match.group(3)
|
|
328
|
-
|
|
329
|
-
return qv
|
|
321
|
+
return self._parse_quantity_value(raw_value, unit)
|
|
330
322
|
|
|
331
323
|
def _get_ontology_class(
|
|
332
324
|
self, raw_value: Optional[str]
|
|
@@ -475,6 +467,50 @@ class SubmissionPortalTranslator(Translator):
|
|
|
475
467
|
|
|
476
468
|
return value
|
|
477
469
|
|
|
470
|
+
def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
|
|
471
|
+
"""Collect and format DOIs from submission portal schema in nmdc format DOIs
|
|
472
|
+
|
|
473
|
+
If there were no DOIs, None is returned.
|
|
474
|
+
|
|
475
|
+
:param metadata_submission: submission portal entry
|
|
476
|
+
:return: list of nmdc.DOI objects
|
|
477
|
+
"""
|
|
478
|
+
data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
|
|
479
|
+
award_dois = self._get_from(
|
|
480
|
+
metadata_submission, ["multiOmicsForm", "awardDois"]
|
|
481
|
+
)
|
|
482
|
+
if data_dois and len(data_dois) > 0:
|
|
483
|
+
updated_data_dois = [
|
|
484
|
+
nmdc.Doi(
|
|
485
|
+
doi_category="dataset_doi",
|
|
486
|
+
doi_provider=doi["provider"],
|
|
487
|
+
doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
|
|
488
|
+
type="nmdc:Doi",
|
|
489
|
+
)
|
|
490
|
+
for doi in data_dois
|
|
491
|
+
]
|
|
492
|
+
else:
|
|
493
|
+
updated_data_dois = []
|
|
494
|
+
|
|
495
|
+
if award_dois and len(award_dois) > 0:
|
|
496
|
+
updated_award_dois = [
|
|
497
|
+
nmdc.Doi(
|
|
498
|
+
doi_category="award_doi",
|
|
499
|
+
doi_provider=doi["provider"],
|
|
500
|
+
doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
|
|
501
|
+
type="nmdc:Doi",
|
|
502
|
+
)
|
|
503
|
+
for doi in award_dois
|
|
504
|
+
]
|
|
505
|
+
else:
|
|
506
|
+
updated_award_dois = []
|
|
507
|
+
|
|
508
|
+
return_val = updated_data_dois + updated_award_dois
|
|
509
|
+
if len(return_val) == 0:
|
|
510
|
+
return_val = None
|
|
511
|
+
|
|
512
|
+
return return_val
|
|
513
|
+
|
|
478
514
|
def _get_data_objects_from_fields(
|
|
479
515
|
self,
|
|
480
516
|
sample_data: JSON_OBJECT,
|
|
@@ -544,6 +580,14 @@ class SubmissionPortalTranslator(Translator):
|
|
|
544
580
|
|
|
545
581
|
return data_objects, manifest
|
|
546
582
|
|
|
583
|
+
def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
|
|
584
|
+
"""Parse a sample link in the form of `ProcessingName:SampleName,..."""
|
|
585
|
+
pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
|
|
586
|
+
match = re.match(pattern, sample_link)
|
|
587
|
+
if not match:
|
|
588
|
+
return None
|
|
589
|
+
return match.group(1), split_strip(match.group(2), ",")
|
|
590
|
+
|
|
547
591
|
def _translate_study(
|
|
548
592
|
self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
|
|
549
593
|
) -> nmdc.Study:
|
|
@@ -591,6 +635,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
591
635
|
websites=self._get_from(
|
|
592
636
|
metadata_submission, ["studyForm", "linkOutWebpage"]
|
|
593
637
|
),
|
|
638
|
+
associated_dois=self._get_study_dois(metadata_submission),
|
|
594
639
|
)
|
|
595
640
|
|
|
596
641
|
def _transform_value_for_slot(
|
|
@@ -605,6 +650,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
605
650
|
elif slot.range == "QuantityValue":
|
|
606
651
|
transformed_value = self._get_quantity_value(
|
|
607
652
|
value,
|
|
653
|
+
slot,
|
|
608
654
|
unit=unit,
|
|
609
655
|
)
|
|
610
656
|
elif slot.range == "ControlledIdentifiedTermValue":
|
|
@@ -660,6 +706,17 @@ class SubmissionPortalTranslator(Translator):
|
|
|
660
706
|
logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
|
|
661
707
|
continue
|
|
662
708
|
|
|
709
|
+
# This step handles cases where the submission portal/schema instructs a user to
|
|
710
|
+
# provide a value in a specific unit. The unit cannot be parsed out of the raw value
|
|
711
|
+
# in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
|
|
712
|
+
# go away once units are encoded in the schema itself.
|
|
713
|
+
# See: https://github.com/microbiomedata/nmdc-schema/issues/2517
|
|
714
|
+
if class_name in UNIT_OVERRIDES:
|
|
715
|
+
# If the class has unit overrides, check if the slot is in the overrides
|
|
716
|
+
unit_overrides = UNIT_OVERRIDES[class_name]
|
|
717
|
+
if slot_name in unit_overrides:
|
|
718
|
+
unit = unit_overrides[slot_name]
|
|
719
|
+
|
|
663
720
|
slot_definition = self.schema_view.induced_slot(slot_name, class_name)
|
|
664
721
|
if slot_definition.multivalued:
|
|
665
722
|
value_list = value
|
|
@@ -742,11 +799,14 @@ class SubmissionPortalTranslator(Translator):
|
|
|
742
799
|
"metadata_submission", {}
|
|
743
800
|
)
|
|
744
801
|
|
|
745
|
-
# Generate one Study instance based on the metadata submission
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
802
|
+
# Generate one Study instance based on the metadata submission, if a study_id wasn't provided
|
|
803
|
+
if self.study_id:
|
|
804
|
+
nmdc_study_id = self.study_id
|
|
805
|
+
else:
|
|
806
|
+
nmdc_study_id = self._id_minter("nmdc:Study")[0]
|
|
807
|
+
database.study_set = [
|
|
808
|
+
self._translate_study(metadata_submission_data, nmdc_study_id)
|
|
809
|
+
]
|
|
750
810
|
|
|
751
811
|
# Automatically populate the `env_package` field in the sample data based on which
|
|
752
812
|
# environmental data tab the sample data came from.
|
|
@@ -778,15 +838,63 @@ class SubmissionPortalTranslator(Translator):
|
|
|
778
838
|
)
|
|
779
839
|
|
|
780
840
|
# Translate the sample data into nmdc:Biosample objects
|
|
781
|
-
database.biosample_set = [
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
841
|
+
database.biosample_set = []
|
|
842
|
+
for sample_data_id, sample_data in sample_data_by_id.items():
|
|
843
|
+
# This shouldn't happen, but just in case skip empty sample data
|
|
844
|
+
if not sample_data:
|
|
845
|
+
continue
|
|
846
|
+
|
|
847
|
+
# Find the first tab that has a sample_link value and attempt to parse it
|
|
848
|
+
sample_link = ""
|
|
849
|
+
for tab in sample_data:
|
|
850
|
+
if tab.get("sample_link"):
|
|
851
|
+
sample_link = tab.get("sample_link")
|
|
852
|
+
break
|
|
853
|
+
parsed_sample_link = self._parse_sample_link(sample_link)
|
|
854
|
+
|
|
855
|
+
# If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
|
|
856
|
+
# format, then create a ProcessedSample and MaterialProcessing instance instead of a
|
|
857
|
+
# Biosample instance. The input samples must be present in the submission for this to
|
|
858
|
+
# work. An exception is raised if any of the referenced input samples are missing.
|
|
859
|
+
if parsed_sample_link is not None:
|
|
860
|
+
processing_type, processing_inputs = parsed_sample_link
|
|
861
|
+
if not all(
|
|
862
|
+
input_id in sample_data_to_nmdc_biosample_ids
|
|
863
|
+
for input_id in processing_inputs
|
|
864
|
+
):
|
|
865
|
+
raise ValueError(
|
|
866
|
+
f"Could not find all input samples in sample_link '{sample_link}'"
|
|
867
|
+
)
|
|
868
|
+
processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
|
|
869
|
+
database.processed_sample_set.append(
|
|
870
|
+
nmdc.ProcessedSample(
|
|
871
|
+
id=processed_sample_id,
|
|
872
|
+
type="nmdc:ProcessedSample",
|
|
873
|
+
name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
|
|
874
|
+
)
|
|
875
|
+
)
|
|
876
|
+
|
|
877
|
+
processing_class = getattr(nmdc, processing_type)
|
|
878
|
+
material_processing = processing_class(
|
|
879
|
+
id=self._id_minter(f"nmdc:{processing_type}")[0],
|
|
880
|
+
type=f"nmdc:{processing_type}",
|
|
881
|
+
has_input=[
|
|
882
|
+
sample_data_to_nmdc_biosample_ids[input_id]
|
|
883
|
+
for input_id in processing_inputs
|
|
884
|
+
],
|
|
885
|
+
has_output=[processed_sample_id],
|
|
886
|
+
)
|
|
887
|
+
database.material_processing_set.append(material_processing)
|
|
888
|
+
|
|
889
|
+
# If there was no sample_link or it doesn't follow the expected format, create a
|
|
890
|
+
# Biosample instance as normal.
|
|
891
|
+
else:
|
|
892
|
+
biosample = self._translate_biosample(
|
|
893
|
+
sample_data,
|
|
894
|
+
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
|
|
895
|
+
nmdc_study_id=nmdc_study_id,
|
|
896
|
+
)
|
|
897
|
+
database.biosample_set.append(biosample)
|
|
790
898
|
|
|
791
899
|
# This section handles the translation of information in the external sequencing tabs into
|
|
792
900
|
# various NMDC objects.
|
|
@@ -972,3 +1080,42 @@ class SubmissionPortalTranslator(Translator):
|
|
|
972
1080
|
database.data_generation_set.append(nucleotide_sequencing)
|
|
973
1081
|
|
|
974
1082
|
return database
|
|
1083
|
+
|
|
1084
|
+
@staticmethod
|
|
1085
|
+
def set_study_images(
|
|
1086
|
+
nmdc_study: nmdc.Study,
|
|
1087
|
+
pi_image_url: Optional[str],
|
|
1088
|
+
primary_study_image_url: Optional[str],
|
|
1089
|
+
study_images_url: Optional[list[str]],
|
|
1090
|
+
) -> None:
|
|
1091
|
+
"""Set images for a study based on provided URLs."""
|
|
1092
|
+
|
|
1093
|
+
if pi_image_url:
|
|
1094
|
+
if not nmdc_study.principal_investigator:
|
|
1095
|
+
nmdc_study.principal_investigator = nmdc.PersonValue(
|
|
1096
|
+
type="nmdc:PersonValue"
|
|
1097
|
+
)
|
|
1098
|
+
nmdc_study.principal_investigator.profile_image_url = pi_image_url
|
|
1099
|
+
|
|
1100
|
+
if primary_study_image_url:
|
|
1101
|
+
if not nmdc_study.study_image:
|
|
1102
|
+
nmdc_study.study_image = []
|
|
1103
|
+
nmdc_study.study_image.append(
|
|
1104
|
+
nmdc.ImageValue(
|
|
1105
|
+
type="nmdc:ImageValue",
|
|
1106
|
+
url=primary_study_image_url,
|
|
1107
|
+
display_order=0,
|
|
1108
|
+
)
|
|
1109
|
+
)
|
|
1110
|
+
|
|
1111
|
+
if study_images_url:
|
|
1112
|
+
if not nmdc_study.study_image:
|
|
1113
|
+
nmdc_study.study_image = []
|
|
1114
|
+
for idx, image_url in enumerate(study_images_url, start=1):
|
|
1115
|
+
nmdc_study.study_image.append(
|
|
1116
|
+
nmdc.ImageValue(
|
|
1117
|
+
type="nmdc:ImageValue",
|
|
1118
|
+
url=image_url,
|
|
1119
|
+
display_order=idx,
|
|
1120
|
+
)
|
|
1121
|
+
)
|
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
1
3
|
from abc import ABC, abstractmethod
|
|
2
|
-
from
|
|
4
|
+
from decimal import Decimal
|
|
5
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
3
6
|
from nmdc_schema import nmdc
|
|
4
7
|
|
|
5
8
|
JSON_OBJECT = Dict[str, Any]
|
|
6
9
|
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
7
12
|
|
|
8
13
|
class Translator(ABC):
|
|
9
14
|
def __init__(
|
|
@@ -27,3 +32,61 @@ class Translator(ABC):
|
|
|
27
32
|
@abstractmethod
|
|
28
33
|
def get_database(self) -> nmdc.Database:
|
|
29
34
|
pass
|
|
35
|
+
|
|
36
|
+
def _parse_quantity_value(
|
|
37
|
+
self, raw_value: Optional[str], unit: Optional[str] = None
|
|
38
|
+
) -> Union[nmdc.QuantityValue, None]:
|
|
39
|
+
"""Construct a nmdc:QuantityValue from a raw value string
|
|
40
|
+
|
|
41
|
+
The regex pattern minimally matches on a single numeric value (possibly
|
|
42
|
+
floating point). The pattern can also identify a range represented by
|
|
43
|
+
two numeric values separated by a hyphen. It can also identify non-numeric
|
|
44
|
+
characters at the end of the string which are interpreted as a unit. A unit
|
|
45
|
+
may also be explicitly provided as an argument to this function. If parsing
|
|
46
|
+
identifies a unit and a unit argument is provided, the unit argument is used.
|
|
47
|
+
If the pattern is not matched at all None is returned.
|
|
48
|
+
|
|
49
|
+
:param raw_value: string to parse
|
|
50
|
+
:param unit: optional unit, defaults to None. If None, the unit is extracted from the
|
|
51
|
+
raw_value. If a unit is provided, it will override the unit extracted from the
|
|
52
|
+
raw_value.
|
|
53
|
+
:return: nmdc:QuantityValue
|
|
54
|
+
"""
|
|
55
|
+
if raw_value is None:
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
match = re.fullmatch(
|
|
59
|
+
"([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
|
|
60
|
+
raw_value,
|
|
61
|
+
)
|
|
62
|
+
if not match:
|
|
63
|
+
return None
|
|
64
|
+
|
|
65
|
+
quantity_value_kwargs = {
|
|
66
|
+
"has_raw_value": raw_value,
|
|
67
|
+
"type": "nmdc:QuantityValue",
|
|
68
|
+
}
|
|
69
|
+
if match.group(2):
|
|
70
|
+
# having group 2 means the value is a range like "0 - 1". Either
|
|
71
|
+
# group 1 or group 2 might be the minimum especially when handling
|
|
72
|
+
# negative ranges like "0 - -1"
|
|
73
|
+
num_1 = Decimal(match.group(1))
|
|
74
|
+
num_2 = Decimal(match.group(2))
|
|
75
|
+
quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
|
|
76
|
+
quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
|
|
77
|
+
else:
|
|
78
|
+
# otherwise we just have a single numeric value
|
|
79
|
+
quantity_value_kwargs["has_numeric_value"] = Decimal(match.group(1))
|
|
80
|
+
|
|
81
|
+
if unit:
|
|
82
|
+
# a unit was manually specified
|
|
83
|
+
if match.group(3) and unit != match.group(3):
|
|
84
|
+
# a unit was also found in the raw string; issue a warning
|
|
85
|
+
# if they don't agree, but keep the manually specified one
|
|
86
|
+
logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
|
|
87
|
+
quantity_value_kwargs["has_unit"] = unit
|
|
88
|
+
elif match.group(3):
|
|
89
|
+
# a unit was found in the raw string
|
|
90
|
+
quantity_value_kwargs["has_unit"] = match.group(3)
|
|
91
|
+
|
|
92
|
+
return nmdc.QuantityValue(**quantity_value_kwargs)
|
nmdc_runtime/site/util.py
CHANGED
|
@@ -3,10 +3,11 @@ import os
|
|
|
3
3
|
from functools import lru_cache
|
|
4
4
|
from pymongo.database import Database as MongoDatabase
|
|
5
5
|
from subprocess import Popen, PIPE, STDOUT, CalledProcessError
|
|
6
|
-
from
|
|
6
|
+
from refscan.lib.helpers import get_collection_names_from_schema
|
|
7
7
|
|
|
8
|
-
from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
|
|
9
8
|
from nmdc_runtime.site.resources import mongo_resource
|
|
9
|
+
from nmdc_runtime.util import nmdc_schema_view
|
|
10
|
+
|
|
10
11
|
|
|
11
12
|
mode_test = {
|
|
12
13
|
"resource_defs": {"mongo": mongo_resource}
|
|
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
|
|
|
37
38
|
|
|
38
39
|
@lru_cache
|
|
39
40
|
def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
|
|
41
|
+
"""
|
|
42
|
+
TODO: Document this function.
|
|
43
|
+
"""
|
|
44
|
+
schema_view = nmdc_schema_view()
|
|
40
45
|
present_collection_names = set(mdb.list_collection_names())
|
|
41
46
|
return {
|
|
42
47
|
name: (
|
|
43
48
|
name in present_collection_names and "id_1" in mdb[name].index_information()
|
|
44
49
|
)
|
|
45
|
-
for name in get_collection_names_from_schema()
|
|
50
|
+
for name in get_collection_names_from_schema(schema_view)
|
|
46
51
|
}
|
|
47
52
|
|
|
48
53
|
|
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
from dagster import op, AssetMaterialization, AssetKey,
|
|
2
|
-
from
|
|
3
|
-
from nmdc_runtime.util import get_nmdc_jsonschema_dict
|
|
1
|
+
from dagster import op, AssetMaterialization, AssetKey, MetadataValue
|
|
2
|
+
from nmdc_runtime.util import get_nmdc_schema_validator
|
|
4
3
|
from toolz import dissoc
|
|
5
4
|
|
|
6
5
|
from nmdc_runtime.site.resources import mongo_resource
|
|
@@ -61,19 +60,19 @@ def validate_mongo_collection(context, collection_name: str):
|
|
|
61
60
|
collection = mongo_db[collection_name] # get mongo collection
|
|
62
61
|
db_set = collection_name.split(".")[0]
|
|
63
62
|
|
|
64
|
-
validator =
|
|
63
|
+
validator = get_nmdc_schema_validator()
|
|
65
64
|
validation_errors = []
|
|
66
65
|
|
|
67
66
|
for count, doc in enumerate(collection.find()):
|
|
68
67
|
# add logging for progress?
|
|
69
68
|
# e.g.: if count % 1000 == 0: context.log.info(“done X of Y")
|
|
70
69
|
doc = dissoc(doc, "_id") # dissoc _id
|
|
71
|
-
|
|
72
|
-
if len(
|
|
70
|
+
report = validator.validate({f"{db_set}": [doc]}, target_class="Database")
|
|
71
|
+
if len(report.results) > 0:
|
|
73
72
|
if "id" in doc.keys():
|
|
74
|
-
errors = {doc["id"]: [
|
|
73
|
+
errors = {doc["id"]: [r.message for r in report.results]}
|
|
75
74
|
else:
|
|
76
|
-
errors = {f"missing id ({count})": [
|
|
75
|
+
errors = {f"missing id ({count})": [r.message for r in report.results]}
|
|
77
76
|
validation_errors.append(errors)
|
|
78
77
|
|
|
79
78
|
return {"collection_name": collection_name, "errors": validation_errors}
|
|
@@ -92,10 +91,15 @@ def announce_validation_report(context, report, api_object):
|
|
|
92
91
|
asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
|
|
93
92
|
description=f"{collection_name} translation validation",
|
|
94
93
|
metadata={
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
|
|
94
|
+
# Note: When this code was originally written, it used Dagster's `EventMetadata` class,
|
|
95
|
+
# which has since been replaced by Dagster's `MetadataValue` class.
|
|
96
|
+
#
|
|
97
|
+
# Reference:
|
|
98
|
+
# - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
|
|
99
|
+
# - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
|
|
100
|
+
#
|
|
101
|
+
"n_errors": MetadataValue.int(len(report["errors"])),
|
|
102
|
+
"object_id": MetadataValue.text(api_object["id"]),
|
|
99
103
|
},
|
|
100
104
|
)
|
|
101
105
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
load_from:
|
|
2
|
+
- python_package:
|
|
3
|
+
package_name: nmdc_runtime.site.repository
|
|
4
|
+
attribute: repo
|
|
5
|
+
- python_package:
|
|
6
|
+
package_name: nmdc_runtime.site.repository
|
|
7
|
+
attribute: biosample_submission_ingest
|
|
8
|
+
- python_package:
|
|
9
|
+
package_name: nmdc_runtime.site.repository
|
|
10
|
+
attribute: biosample_export
|
|
11
|
+
- python_package:
|
|
12
|
+
package_name: nmdc_runtime.site.repository
|
|
13
|
+
attribute: database_records_stitching
|