nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/site/translation/translator.py CHANGED
@@ -1,9 +1,14 @@
1
+ import logging
2
+ import re
1
3
  from abc import ABC, abstractmethod
2
- from typing import Any, Callable, Dict, List, Optional
4
+ from decimal import Decimal
5
+ from typing import Any, Callable, Dict, List, Optional, Union
3
6
  from nmdc_schema import nmdc
4
7
 
5
8
  JSON_OBJECT = Dict[str, Any]
6
9
 
10
+ logger = logging.getLogger(__name__)
11
+
7
12
 
8
13
  class Translator(ABC):
9
14
  def __init__(
@@ -14,9 +19,74 @@ class Translator(ABC):
14
19
  def _index_by_id(self, collection, id):
15
20
  return {item[id]: item for item in collection}
16
21
 
17
- def _get_curie(self, prefix: str, local: str) -> str:
18
- return f"{prefix}:{local}"
22
+ @staticmethod
23
+ def _ensure_curie(identifier: str, *, default_prefix: str) -> str:
24
+ identifier_parts = identifier.split(":", 1)
25
+
26
+ # Don't add prefix if identifier is already a CURIE
27
+ if len(identifier_parts) == 2:
28
+ return identifier
29
+
30
+ return f"{default_prefix}:{identifier_parts[0]}"
19
31
 
20
32
  @abstractmethod
21
33
  def get_database(self) -> nmdc.Database:
22
34
  pass
35
+
36
+ def _parse_quantity_value(
37
+ self, raw_value: Optional[str], unit: Optional[str] = None
38
+ ) -> Union[nmdc.QuantityValue, None]:
39
+ """Construct a nmdc:QuantityValue from a raw value string
40
+
41
+ The regex pattern minimally matches on a single numeric value (possibly
42
+ floating point). The pattern can also identify a range represented by
43
+ two numeric values separated by a hyphen. It can also identify non-numeric
44
+ characters at the end of the string which are interpreted as a unit. A unit
45
+ may also be explicitly provided as an argument to this function. If parsing
46
+ identifies a unit and a unit argument is provided, the unit argument is used.
47
+ If the pattern is not matched at all None is returned.
48
+
49
+ :param raw_value: string to parse
50
+ :param unit: optional unit, defaults to None. If None, the unit is extracted from the
51
+ raw_value. If a unit is provided, it will override the unit extracted from the
52
+ raw_value.
53
+ :return: nmdc:QuantityValue
54
+ """
55
+ if raw_value is None:
56
+ return None
57
+
58
+ match = re.fullmatch(
59
+ "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
60
+ raw_value,
61
+ )
62
+ if not match:
63
+ return None
64
+
65
+ quantity_value_kwargs = {
66
+ "has_raw_value": raw_value,
67
+ "type": "nmdc:QuantityValue",
68
+ }
69
+ if match.group(2):
70
+ # having group 2 means the value is a range like "0 - 1". Either
71
+ # group 1 or group 2 might be the minimum especially when handling
72
+ # negative ranges like "0 - -1"
73
+ num_1 = Decimal(match.group(1))
74
+ num_2 = Decimal(match.group(2))
75
+ quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
76
+ quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
77
+ else:
78
+ # otherwise we just have a single numeric value
79
+ quantity_value_kwargs["has_numeric_value"] = Decimal(match.group(1))
80
+
81
+ if unit:
82
+ # a unit was manually specified
83
+ if match.group(3) and unit != match.group(3):
84
+ # a unit was also found in the raw string; issue a warning
85
+ # if they don't agree, but keep the manually specified one
86
+ logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
87
+ quantity_value_kwargs["has_unit"] = unit
88
+ elif match.group(3):
89
+ # a unit was found in the raw string
90
+ quantity_value_kwargs["has_unit"] = match.group(3)
91
+
92
+ return nmdc.QuantityValue(**quantity_value_kwargs)
nmdc_runtime/site/util.py CHANGED
@@ -1,10 +1,13 @@
1
1
  import os
2
- from functools import lru_cache
3
- from subprocess import Popen, PIPE, STDOUT, CalledProcessError
4
2
 
3
+ from functools import lru_cache
5
4
  from pymongo.database import Database as MongoDatabase
5
+ from subprocess import Popen, PIPE, STDOUT, CalledProcessError
6
+ from refscan.lib.helpers import get_collection_names_from_schema
6
7
 
7
8
  from nmdc_runtime.site.resources import mongo_resource
9
+ from nmdc_runtime.util import nmdc_schema_view
10
+
8
11
 
9
12
  mode_test = {
10
13
  "resource_defs": {"mongo": mongo_resource}
@@ -34,14 +37,30 @@ def run_and_log(shell_cmd, context):
34
37
 
35
38
 
36
39
  @lru_cache
37
- def collection_indexed_on_id(mdb: MongoDatabase) -> dict:
38
- set_collection_names = [
39
- name for name in mdb.list_collection_names() if name.endswith("_set")
40
- ]
40
+ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
41
+ """
42
+ TODO: Document this function.
43
+ """
44
+ schema_view = nmdc_schema_view()
45
+ present_collection_names = set(mdb.list_collection_names())
41
46
  return {
42
- name: ("id_1" in mdb[name].index_information()) for name in set_collection_names
47
+ name: (
48
+ name in present_collection_names and "id_1" in mdb[name].index_information()
49
+ )
50
+ for name in get_collection_names_from_schema(schema_view)
43
51
  }
44
52
 
45
53
 
46
54
  def get_basename(filename: str) -> str:
47
55
  return os.path.basename(filename)
56
+
57
+
58
+ def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
59
+ return nmdc_study_id.replace(":", "_").replace("-", "_")
60
+
61
+
62
+ def get_instruments_by_id(mdb: MongoDatabase) -> dict[str, dict]:
63
+ """Get all documents from the instrument_set collection in a dict keyed by id."""
64
+ return {
65
+ instrument["id"]: instrument for instrument in mdb["instrument_set"].find({})
66
+ }
nmdc_runtime/site/validation/emsl.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """
2
2
  Validates data in the EMSL collection in the nmdc_etl_staging database.
3
3
  """
4
+
4
5
  from dagster import op, graph
5
6
  from nmdc_runtime.site.validation.util import (
6
7
  preset_prod,
nmdc_runtime/site/validation/gold.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """
2
2
  Validates data in the GOLD collection in the nmdc_etl_staging database.
3
3
  """
4
+
4
5
  from dagster import op, graph
5
6
  from nmdc_runtime.site.validation.util import (
6
7
  preset_prod,
nmdc_runtime/site/validation/util.py CHANGED
@@ -1,6 +1,5 @@
1
- from dagster import op, AssetMaterialization, AssetKey, EventMetadata
2
- from jsonschema import Draft7Validator
3
- from nmdc_runtime.util import get_nmdc_jsonschema_dict
1
+ from dagster import op, AssetMaterialization, AssetKey, MetadataValue
2
+ from nmdc_runtime.util import get_nmdc_schema_validator
4
3
  from toolz import dissoc
5
4
 
6
5
  from nmdc_runtime.site.resources import mongo_resource
@@ -61,19 +60,19 @@ def validate_mongo_collection(context, collection_name: str):
61
60
  collection = mongo_db[collection_name] # get mongo collection
62
61
  db_set = collection_name.split(".")[0]
63
62
 
64
- validator = Draft7Validator(get_nmdc_jsonschema_dict())
63
+ validator = get_nmdc_schema_validator()
65
64
  validation_errors = []
66
65
 
67
66
  for count, doc in enumerate(collection.find()):
68
67
  # add logging for progress?
69
68
  # e.g.: if count % 1000 == 0: context.log.info(“done X of Y")
70
69
  doc = dissoc(doc, "_id") # dissoc _id
71
- errors = list(validator.iter_errors({f"{db_set}": [doc]}))
72
- if len(errors) > 0:
70
+ report = validator.validate({f"{db_set}": [doc]}, target_class="Database")
71
+ if len(report.results) > 0:
73
72
  if "id" in doc.keys():
74
- errors = {doc["id"]: [e.message for e in errors]}
73
+ errors = {doc["id"]: [r.message for r in report.results]}
75
74
  else:
76
- errors = {f"missing id ({count})": [e.message for e in errors]}
75
+ errors = {f"missing id ({count})": [r.message for r in report.results]}
77
76
  validation_errors.append(errors)
78
77
 
79
78
  return {"collection_name": collection_name, "errors": validation_errors}
@@ -92,10 +91,15 @@ def announce_validation_report(context, report, api_object):
92
91
  asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
93
92
  description=f"{collection_name} translation validation",
94
93
  metadata={
95
- # https://docs.dagster.io/_apidocs/solids#event-metadata
96
- # also .json, .md, .path, .url, .python_artifact, ...
97
- "n_errors": EventMetadata.int(len(report["errors"])),
98
- "object_id": EventMetadata.text(api_object["id"]),
94
+ # Note: When this code was originally written, it used Dagster's `EventMetadata` class,
95
+ # which has since been replaced by Dagster's `MetadataValue` class.
96
+ #
97
+ # Reference:
98
+ # - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
99
+ # - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
100
+ #
101
+ "n_errors": MetadataValue.int(len(report["errors"])),
102
+ "object_id": MetadataValue.text(api_object["id"]),
99
103
  },
100
104
  )
101
105
 
nmdc_runtime/site/workspace.yaml ADDED
@@ -0,0 +1,13 @@
1
+ load_from:
2
+ - python_package:
3
+ package_name: nmdc_runtime.site.repository
4
+ attribute: repo
5
+ - python_package:
6
+ package_name: nmdc_runtime.site.repository
7
+ attribute: biosample_submission_ingest
8
+ - python_package:
9
+ package_name: nmdc_runtime.site.repository
10
+ attribute: biosample_export
11
+ - python_package:
12
+ package_name: nmdc_runtime.site.repository
13
+ attribute: database_records_stitching