nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -14,12 +14,10 @@ from dagster import (
14
14
  StringSource,
15
15
  InitResourceContext,
16
16
  )
17
- from fastjsonschema import JsonSchemaValueException
18
17
  from frozendict import frozendict
19
18
  from linkml_runtime.dumpers import json_dumper
20
19
  from pydantic import BaseModel, AnyUrl
21
20
  from pymongo import MongoClient, ReplaceOne, InsertOne
22
- from terminusdb_client import WOQLClient
23
21
  from toolz import get_in
24
22
  from toolz import merge
25
23
 
@@ -28,7 +26,7 @@ from nmdc_runtime.api.models.object import DrsObject, AccessURL, DrsObjectIn
28
26
  from nmdc_runtime.api.models.operation import ListOperationsResponse
29
27
  from nmdc_runtime.api.models.util import ListRequest
30
28
  from nmdc_runtime.site.normalization.gold import normalize_gold_id
31
- from nmdc_runtime.util import unfreeze, nmdc_jsonschema_validator_noidpatterns
29
+ from nmdc_runtime.util import unfreeze, get_nmdc_schema_validator
32
30
  from nmdc_schema import nmdc
33
31
 
34
32
 
@@ -110,7 +108,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
110
108
  },
111
109
  )
112
110
  response.raise_for_status()
113
- return response.json()["cursor"]["firstBatch"]
111
+ return response.json()["cursor"]["batch"]
114
112
 
115
113
  def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
116
114
  gold_project_id = normalize_gold_id(gold_project_id)
@@ -127,19 +125,39 @@ class RuntimeApiUserClient(RuntimeApiClient):
127
125
  },
128
126
  )
129
127
  response.raise_for_status()
130
- return response.json()["cursor"]["firstBatch"]
128
+ return response.json()["cursor"]["batch"]
131
129
 
132
130
  def get_biosamples_for_study(self, study_id: str):
131
+ # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
132
+ # The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
133
+ # but the tradeoff there is that we would need to make multiple requests to step through the
134
+ # each of the pages. By picking a large number for max_page_size, we can get all the results
135
+ # in a single request.
136
+ # This method previously used the /queries:run endpoint but the problem with that was that
137
+ # it used to truncate the number of results returned to 100.
133
138
  response = self.request(
134
- "POST",
135
- f"/queries:run",
139
+ "GET",
140
+ f"/nmdcschema/biosample_set",
136
141
  {
137
- "find": "biosample_set",
138
- "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}},
142
+ "filter": json.dumps({"associated_studies": study_id}),
143
+ "max_page_size": 10000,
139
144
  },
140
145
  )
141
146
  response.raise_for_status()
142
- return response.json()["cursor"]["firstBatch"]
147
+ return response.json()["resources"]
148
+
149
+ def get_data_generation_records_for_study(self, study_id: str):
150
+ # TODO: same as above, we are using a large max_page_size to avoid pagination.
151
+ response = self.request(
152
+ "GET",
153
+ f"/nmdcschema/data_generation_set",
154
+ {
155
+ "filter": json.dumps({"associated_studies": study_id}),
156
+ "max_page_size": 10000,
157
+ },
158
+ )
159
+ response.raise_for_status()
160
+ return response.json()["resources"]
143
161
 
144
162
  def get_omics_processing_by_name(self, name: str):
145
163
  response = self.request(
@@ -151,7 +169,19 @@ class RuntimeApiUserClient(RuntimeApiClient):
151
169
  },
152
170
  )
153
171
  response.raise_for_status()
154
- return response.json()["cursor"]["firstBatch"]
172
+ return response.json()["cursor"]["batch"]
173
+
174
+ def get_study(self, study_id: str):
175
+ response = self.request(
176
+ "POST",
177
+ f"/queries:run",
178
+ {
179
+ "find": "study_set",
180
+ "filter": {"id": study_id},
181
+ },
182
+ )
183
+ response.raise_for_status()
184
+ return response.json()["cursor"]["batch"]
155
185
 
156
186
 
157
187
  class RuntimeApiSiteClient(RuntimeApiClient):
@@ -332,9 +362,26 @@ class GoldApiClient(BasicAuthClient):
332
362
  """
333
363
  return id.replace("gold:", "")
334
364
 
335
- def fetch_biosamples_by_study(self, study_id: str) -> List[Dict[str, Any]]:
365
+ def fetch_biosamples_by_study(
366
+ self, study_id: str, include_project=True
367
+ ) -> List[Dict[str, Any]]:
336
368
  id = self._normalize_id(study_id)
337
369
  results = self.request("/biosamples", params={"studyGoldId": id})
370
+ if include_project:
371
+ projects = self.fetch_projects_by_study(id)
372
+ biosamples_by_id = {
373
+ biosample["biosampleGoldId"]: biosample for biosample in results
374
+ }
375
+ for project in projects:
376
+ sample_id = project.get("biosampleGoldId")
377
+ if not sample_id:
378
+ continue
379
+ if sample_id not in biosamples_by_id:
380
+ continue
381
+ biosample = biosamples_by_id[sample_id]
382
+ if "projects" not in biosample:
383
+ biosample["projects"] = []
384
+ biosample["projects"].append(project)
338
385
  return results
339
386
 
340
387
  def fetch_projects_by_study(self, study_id: str) -> List[Dict[str, Any]]:
@@ -354,6 +401,18 @@ class GoldApiClient(BasicAuthClient):
354
401
  return None
355
402
  return results[0]
356
403
 
404
+ def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]:
405
+ id = self._normalize_id(biosample_id)
406
+ results = self.request("/projects", params={"biosampleGoldId": id})
407
+ return results
408
+
409
+ def fetch_biosample_by_biosample_id(
410
+ self, biosample_id: str
411
+ ) -> List[Dict[str, Any]]:
412
+ id = self._normalize_id(biosample_id)
413
+ results = self.request("/biosamples", params={"biosampleGoldId": id})
414
+ return results
415
+
357
416
 
358
417
  @resource(
359
418
  config_schema={
@@ -372,15 +431,47 @@ def gold_api_client_resource(context: InitResourceContext):
372
431
 
373
432
  @dataclass
374
433
  class NmdcPortalApiClient:
434
+
375
435
  base_url: str
376
- # Using a cookie for authentication is not ideal and should be replaced
377
- # when this API has an another authentication method
378
- session_cookie: str
436
+ refresh_token: str
437
+ access_token: Optional[str] = None
438
+ access_token_expires_at: Optional[datetime] = None
439
+
440
+ def _request(self, method: str, endpoint: str, **kwargs):
441
+ r"""
442
+ Submits a request to the specified API endpoint;
443
+ after refreshing the access token, if necessary.
444
+ """
445
+ if self.access_token is None or datetime.now() > self.access_token_expires_at:
446
+ refresh_response = requests.post(
447
+ f"{self.base_url}/auth/refresh",
448
+ json={"refresh_token": self.refresh_token},
449
+ )
450
+ refresh_response.raise_for_status()
451
+ refresh_body = refresh_response.json()
452
+ self.access_token_expires_at = datetime.now() + timedelta(
453
+ seconds=refresh_body["expires_in"]
454
+ )
455
+ self.access_token = refresh_body["access_token"]
456
+
457
+ headers = kwargs.get("headers", {})
458
+ headers["Authorization"] = f"Bearer {self.access_token}"
459
+ return requests.request(
460
+ method, f"{self.base_url}{endpoint}", **kwargs, headers=headers
461
+ )
379
462
 
380
463
  def fetch_metadata_submission(self, id: str) -> Dict[str, Any]:
381
- response = requests.get(
382
- f"{self.base_url}/api/metadata_submission/{id}",
383
- cookies={"session": self.session_cookie},
464
+ response = self._request("GET", f"/api/metadata_submission/{id}")
465
+ response.raise_for_status()
466
+ return response.json()
467
+
468
+ def make_submission_images_public(
469
+ self, submission_id: str, *, study_id: str
470
+ ) -> Dict[str, Any]:
471
+ response = self._request(
472
+ "POST",
473
+ f"/api/metadata_submission/{submission_id}/image/make_public",
474
+ json={"study_id": study_id},
384
475
  )
385
476
  response.raise_for_status()
386
477
  return response.json()
@@ -389,13 +480,13 @@ class NmdcPortalApiClient:
389
480
  @resource(
390
481
  config_schema={
391
482
  "base_url": StringSource,
392
- "session_cookie": StringSource,
483
+ "refresh_token": StringSource,
393
484
  }
394
485
  )
395
486
  def nmdc_portal_api_client_resource(context: InitResourceContext):
396
487
  return NmdcPortalApiClient(
397
488
  base_url=context.resource_config["base_url"],
398
- session_cookie=context.resource_config["session_cookie"],
489
+ refresh_token=context.resource_config["refresh_token"],
399
490
  )
400
491
 
401
492
 
@@ -439,36 +530,49 @@ class MongoDB:
439
530
  self.db = self.client[dbname]
440
531
 
441
532
  def add_docs(self, docs, validate=True, replace=True):
442
- try:
443
- if validate:
444
- nmdc_jsonschema_validator_noidpatterns(docs)
445
- rv = {}
446
- for collection_name, docs in docs.items():
447
- rv[collection_name] = self.db[collection_name].bulk_write(
448
- [
449
- (
450
- ReplaceOne({"id": d["id"]}, d, upsert=True)
451
- if replace
452
- else InsertOne(d)
453
- )
454
- for d in docs
455
- ]
456
- )
457
- now = datetime.now(timezone.utc)
458
- self.db.txn_log.insert_many(
459
- [
460
- {
461
- "tgt": {"id": d.get("id"), "c": collection_name},
462
- "type": "upsert",
463
- "ts": now,
464
- # "dtl": {},
465
- }
466
- for d in docs
467
- ]
468
- )
469
- return rv
470
- except JsonSchemaValueException as e:
471
- raise ValueError(e.message)
533
+ """
534
+ TODO: Document this function.
535
+ """
536
+ if validate:
537
+ validator = get_nmdc_schema_validator()
538
+ # Fail fast on first validation error.
539
+ for result in validator.iter_results(docs, target_class="Database"):
540
+ raise ValueError(result.message)
541
+ rv = {}
542
+ for collection_name, collection_docs in docs.items():
543
+ # If `collection_docs` is empty, abort this iteration.
544
+ #
545
+ # Note: We do this because the `bulk_write` method called below will raise
546
+ # an `InvalidOperation` exception if it is passed 0 operations.
547
+ #
548
+ # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
549
+ #
550
+ if len(collection_docs) == 0:
551
+ continue
552
+
553
+ rv[collection_name] = self.db[collection_name].bulk_write(
554
+ [
555
+ (
556
+ ReplaceOne({"id": d["id"]}, d, upsert=True)
557
+ if replace
558
+ else InsertOne(d)
559
+ )
560
+ for d in collection_docs
561
+ ]
562
+ )
563
+ now = datetime.now(timezone.utc)
564
+ self.db.txn_log.insert_many(
565
+ [
566
+ {
567
+ "tgt": {"id": d.get("id"), "c": collection_name},
568
+ "type": "upsert",
569
+ "ts": now,
570
+ # "dtl": {},
571
+ }
572
+ for d in collection_docs
573
+ ]
574
+ )
575
+ return rv
472
576
 
473
577
 
474
578
  @resource(
@@ -512,33 +616,3 @@ def get_mongo(run_config: frozendict):
512
616
  )
513
617
  )
514
618
  return mongo_resource(resource_context)
515
-
516
-
517
- class TerminusDB:
518
- def __init__(self, server_url, user, key, account, dbid):
519
- self.client = WOQLClient(server_url=server_url)
520
- self.client.connect(user=user, key=key, account=account)
521
- db_info = self.client.get_database(dbid=dbid, account=account)
522
- if db_info is None:
523
- self.client.create_database(dbid=dbid, accountid=account, label=dbid)
524
- self.client.create_graph(graph_type="inference", graph_id="main")
525
- self.client.connect(user=user, key=key, account=account, db=dbid)
526
-
527
-
528
- @resource(
529
- config_schema={
530
- "server_url": StringSource,
531
- "user": StringSource,
532
- "key": StringSource,
533
- "account": StringSource,
534
- "dbid": StringSource,
535
- }
536
- )
537
- def terminus_resource(context):
538
- return TerminusDB(
539
- server_url=context.resource_config["server_url"],
540
- user=context.resource_config["user"],
541
- key=context.resource_config["key"],
542
- account=context.resource_config["account"],
543
- dbid=context.resource_config["dbid"],
544
- )