nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,5 @@
1
1
  from functools import lru_cache
2
- from typing import Any, Dict, List
2
+ from typing import Any, Dict, List, Union
3
3
  import pandas as pd
4
4
  from nmdc_runtime.site.resources import (
5
5
  RuntimeApiUserClient,
@@ -18,6 +18,8 @@ class DatabaseUpdater:
18
18
  gold_api_client: GoldApiClient,
19
19
  study_id: str,
20
20
  gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
21
+ include_field_site_info: bool = False,
22
+ enable_biosample_filtering: bool = True,
21
23
  ):
22
24
  """This class serves as an API for repairing connections in the database by
23
25
  adding records that are essentially missing "links"/"connections". As we identify
@@ -39,6 +41,8 @@ class DatabaseUpdater:
39
41
  self.gold_api_client = gold_api_client
40
42
  self.study_id = study_id
41
43
  self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
44
+ self.include_field_site_info = include_field_site_info
45
+ self.enable_biosample_filtering = enable_biosample_filtering
42
46
 
43
47
  @lru_cache
44
48
  def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
@@ -95,6 +99,8 @@ class DatabaseUpdater:
95
99
  biosamples=all_gold_biosamples,
96
100
  projects=all_gold_projects,
97
101
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
102
+ include_field_site_info=self.include_field_site_info,
103
+ enable_biosample_filtering=self.enable_biosample_filtering,
98
104
  )
99
105
 
100
106
  # The GoldStudyTranslator class has some pre-processing logic which filters out
@@ -214,6 +220,8 @@ class DatabaseUpdater:
214
220
  projects=gold_sequencing_projects_for_study,
215
221
  analysis_projects=gold_analysis_projects_for_study,
216
222
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
223
+ include_field_site_info=self.include_field_site_info,
224
+ enable_biosample_filtering=self.enable_biosample_filtering,
217
225
  )
218
226
 
219
227
  translated_biosamples = gold_study_translator.biosamples
@@ -240,3 +248,204 @@ class DatabaseUpdater:
240
248
  ]
241
249
 
242
250
  return database
251
+
252
def queries_run_script_to_update_insdc_identifiers(
    self,
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """Build a `/queries:run`-compatible update script for INSDC identifiers.

    The script adds missing values to the `insdc_biosample_identifiers` field of
    biosample_set records and to the `insdc_bioproject_identifiers` field of
    data_generation_set records belonging to `self.study_id`.

    The values to assert come from the `ncbiBioSampleAccession` and
    `ncbiBioProjectAccession` fields of the projects returned by
    `self.gold_api_client.fetch_projects_by_biosample`.

    Identifiers already present on a record are kept, in their existing order,
    and new ones are appended in the order they were encountered, so the
    generated script is the same on every run. A record only gets an update
    entry when at least one identifier is actually missing from it.

    :return: A single `biosample_set` update command (a dictionary) when there
        are no data_generation_set updates; otherwise a list holding the
        `biosample_set` command followed by the `data_generation_set` command.
    """

    def _clean(accession: Any) -> str:
        """Return the accession without surrounding whitespace ("" if unusable)."""
        return accession.strip() if isinstance(accession, str) else ""

    def _update_entry(record_id: str, field: str, values: List[str]) -> Dict[str, Any]:
        """Return one `updates` entry that sets `field` on the record `record_id`."""
        return {"q": {"id": record_id}, "u": {"$set": {field: values}}}

    # Fetch all biosamples associated with the study
    biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
        self.study_id
    )

    # Fetch all data_generation records associated with the study
    data_generation_set = (
        self.runtime_api_user_client.get_data_generation_records_for_study(
            self.study_id
        )
    )

    biosample_updates: List[Dict[str, Any]] = []
    data_generation_updates: List[Dict[str, Any]] = []

    # GOLD project id (without the "gold:" prefix) -> NCBI BioProject accession,
    # filled in while walking the biosamples and reused for data_generation records.
    gold_project_to_bioproject: Dict[str, str] = {}

    # Process biosamples for insdc_biosample_identifiers
    for biosample in biosample_set:
        gold_biosample_identifiers = biosample.get("gold_biosample_identifiers") or []
        biosample_id = biosample.get("id")
        if not gold_biosample_identifiers or not biosample_id:
            continue

        found_biosample_curies: List[str] = []
        for gold_biosample_id in gold_biosample_identifiers:
            # Fetch the projects associated with this GOLD biosample
            gold_projects = self.gold_api_client.fetch_projects_by_biosample(
                gold_biosample_id.replace("gold:", "")
            )
            for project in gold_projects:
                biosample_accession = _clean(project.get("ncbiBioSampleAccession"))
                if biosample_accession:
                    found_biosample_curies.append(f"biosample:{biosample_accession}")

                project_gold_id = project.get("projectGoldId")
                bioproject_accession = _clean(project.get("ncbiBioProjectAccession"))
                if project_gold_id and bioproject_accession:
                    gold_project_to_bioproject[project_gold_id] = bioproject_accession

        # Compare prefixed values with prefixed values: the identifiers stored on
        # the record carry the "biosample:" prefix, the raw GOLD accessions do not.
        existing_biosample_curies = list(
            dict.fromkeys(biosample.get("insdc_biosample_identifiers") or [])
        )
        already_present = set(existing_biosample_curies)
        missing_biosample_curies = [
            curie
            for curie in dict.fromkeys(found_biosample_curies)
            if curie not in already_present
        ]
        if missing_biosample_curies:
            biosample_updates.append(
                _update_entry(
                    biosample_id,
                    "insdc_biosample_identifiers",
                    existing_biosample_curies + missing_biosample_curies,
                )
            )

    # Process data_generation records for insdc_bioproject_identifiers
    for data_generation in data_generation_set:
        data_generation_id = data_generation.get("id")
        if not data_generation_id:
            continue

        existing_bioproject_curies = list(
            dict.fromkeys(data_generation.get("insdc_bioproject_identifiers") or [])
        )
        already_present = set(existing_bioproject_curies)

        # Every GOLD project seen above that has a usable BioProject accession is
        # in the mapping, so a lookup miss means there is nothing to assert.
        found_bioproject_curies = [
            f"bioproject:{gold_project_to_bioproject[normalized_id]}"
            for normalized_id in (
                gold_project_id.replace("gold:", "")
                for gold_project_id in (
                    data_generation.get("gold_sequencing_project_identifiers") or []
                )
            )
            if normalized_id in gold_project_to_bioproject
        ]
        missing_bioproject_curies = [
            curie
            for curie in dict.fromkeys(found_bioproject_curies)
            if curie not in already_present
        ]
        if missing_bioproject_curies:
            data_generation_updates.append(
                _update_entry(
                    data_generation_id,
                    "insdc_bioproject_identifiers",
                    existing_bioproject_curies + missing_bioproject_curies,
                )
            )

    # Return updates for both collections
    biosample_command = {"update": "biosample_set", "updates": biosample_updates}
    if data_generation_updates:
        return [
            biosample_command,
            {"update": "data_generation_set", "updates": data_generation_updates},
        ]
    return biosample_command
@@ -1,6 +1,5 @@
1
1
  import json
2
2
 
3
- from typing import Any
4
3
 
5
4
  from dagster import (
6
5
  repository,
@@ -29,8 +28,6 @@ from nmdc_runtime.site.graphs import (
29
28
  translate_metadata_submission_to_nmdc_schema_database,
30
29
  ingest_metadata_submission,
31
30
  gold_study_to_database,
32
- gold_translation,
33
- gold_translation_curation,
34
31
  create_objects_from_site_object_puts,
35
32
  housekeeping,
36
33
  ensure_jobs,
@@ -44,8 +41,10 @@ from nmdc_runtime.site.graphs import (
44
41
  ingest_neon_benthic_metadata,
45
42
  ingest_neon_surface_water_metadata,
46
43
  ensure_alldocs,
44
+ run_ontology_load,
47
45
  nmdc_study_to_ncbi_submission_export,
48
46
  generate_data_generation_set_for_biosamples_in_nmdc_study,
47
+ generate_update_script_for_insdc_biosample_identifiers,
49
48
  )
50
49
  from nmdc_runtime.site.resources import (
51
50
  get_mongo,
@@ -59,9 +58,6 @@ from nmdc_runtime.site.resources import (
59
58
  from nmdc_runtime.site.resources import (
60
59
  get_runtime_api_site_client,
61
60
  )
62
- from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
63
- from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
64
- from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
65
61
  from nmdc_runtime.util import freeze
66
62
  from nmdc_runtime.util import unfreeze
67
63
 
@@ -123,6 +119,55 @@ ensure_alldocs_daily = ScheduleDefinition(
123
119
  )
124
120
 
125
121
 
122
def _weekly_ontology_load_schedule(source_ontology, hour):
    """Build the weekly schedule that runs `run_ontology_load` for one ontology.

    The schedule fires every Monday at `hour`:00 in the America/New_York
    timezone. The job runs with the normal-environment run config, with the
    `load_ontology` op configured to load `source_ontology`.

    :param source_ontology: Value for the `source_ontology` op config (e.g. "envo").
    :param hour: Hour of the day (0-23) used in the cron expression.
    :return: The `ScheduleDefinition` for that ontology.
    """
    return ScheduleDefinition(
        name=f"weekly_load_{source_ontology}_ontology",
        cron_schedule=f"0 {hour} * * 1",
        execution_timezone="America/New_York",
        job=run_ontology_load.to_job(
            name=f"scheduled_{source_ontology}_ontology_load",
            config=unfreeze(
                merge(
                    run_config_frozen__normal_env,
                    {
                        "ops": {
                            "load_ontology": {
                                "config": {"source_ontology": source_ontology}
                            }
                        }
                    },
                )
            ),
            resource_defs=resource_defs,
        ),
    )


# The three loads start one hour apart on Monday mornings.
# NOTE(review): presumably staggered so the loads do not overlap - confirm.
load_envo_ontology_weekly = _weekly_ontology_load_schedule("envo", 7)
load_uberon_ontology_weekly = _weekly_ontology_load_schedule("uberon", 8)
load_po_ontology_weekly = _weekly_ontology_load_schedule("po", 9)
169
+
170
+
126
171
  def asset_materialization_metadata(asset_event, key):
127
172
  """Get metadata from an asset materialization event.
128
173
 
@@ -197,82 +242,6 @@ def process_workflow_job_triggers(_context):
197
242
  yield SkipReason("No new jobs required")
198
243
 
199
244
 
200
- @asset_sensor(
201
- asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
202
- job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
203
- )
204
- def ensure_gold_translation_job(_context, asset_event):
205
- mdb = get_mongo(run_config_frozen__normal_env).db
206
- gold_etl_latest = mdb.objects.find_one(
207
- {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
208
- )
209
- sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
210
- if gold_etl_latest is None:
211
- yield SkipReason("can't find sensed asset object_id in database")
212
- return
213
- elif gold_etl_latest["id"] != sensed_object_id:
214
- yield SkipReason("later object than sensed materialization")
215
- return
216
-
217
- run_config = merge(
218
- run_config_frozen__normal_env,
219
- {
220
- "solids": {
221
- "construct_jobs": {
222
- "config": {
223
- "base_jobs": [
224
- {
225
- "workflow": {"id": "gold-translation-1.0.0"},
226
- "config": {"object_id": gold_etl_latest["id"]},
227
- }
228
- ]
229
- }
230
- }
231
- }
232
- },
233
- )
234
- yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
235
-
236
-
237
- @asset_sensor(
238
- asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
239
- job=gold_translation_curation.to_job(**preset_normal),
240
- )
241
- def claim_and_run_gold_translation_curation(_context, asset_event):
242
- client = get_runtime_api_site_client(run_config_frozen__normal_env)
243
- mdb = get_mongo(run_config_frozen__normal_env).db
244
- object_id_latest = asset_materialization_metadata(
245
- asset_event, "object_id_latest"
246
- ).text
247
- job = mdb.jobs.find_one(
248
- {
249
- "workflow.id": "gold-translation-1.0.0",
250
- "config.object_id_latest": object_id_latest,
251
- }
252
- )
253
- if job is not None:
254
- rv = client.claim_job(job["id"])
255
- if rv.status_code == status.HTTP_200_OK:
256
- operation = rv.json()
257
- run_config = merge(
258
- run_config_frozen__normal_env,
259
- {
260
- "ops": {
261
- "get_operation": {
262
- "config": {
263
- "operation_id": operation["id"],
264
- }
265
- }
266
- }
267
- },
268
- )
269
- yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
270
- else:
271
- yield SkipReason("Job found, but already claimed by this site")
272
- else:
273
- yield SkipReason("No job found")
274
-
275
-
276
245
  @sensor(
277
246
  job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
278
247
  default_status=DefaultSensorStatus.RUNNING,
@@ -411,11 +380,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
411
380
  yield SkipReason("; ".join(skip_notes))
412
381
 
413
382
 
414
- # TODO ensure data_object_type values from file_type_enum
415
- # see /metadata-translation/notebooks/202106_curation_updates.ipynb
416
- # for details ("Create file_type_enum collection" section).
417
-
418
-
419
383
  @sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
420
384
  def done_object_put_ops(_context):
421
385
  client = get_runtime_api_site_client(run_config_frozen__normal_env)
@@ -455,18 +419,21 @@ def on_run_fail(context: RunStatusSensorContext):
455
419
  @repository
456
420
  def repo():
457
421
  graph_jobs = [
458
- gold_translation.to_job(**preset_normal),
459
422
  hello_graph.to_job(name="hello_job"),
460
423
  ensure_jobs.to_job(**preset_normal),
461
424
  apply_metadata_in.to_job(**preset_normal),
462
425
  export_study_biosamples_metadata.to_job(**preset_normal),
463
426
  ensure_alldocs.to_job(**preset_normal),
464
427
  ]
465
- schedules = [housekeeping_weekly, ensure_alldocs_daily]
428
+ schedules = [
429
+ housekeeping_weekly,
430
+ ensure_alldocs_daily,
431
+ load_envo_ontology_weekly,
432
+ load_uberon_ontology_weekly,
433
+ load_po_ontology_weekly,
434
+ ]
466
435
  sensors = [
467
436
  done_object_put_ops,
468
- ensure_gold_translation_job,
469
- claim_and_run_gold_translation_curation,
470
437
  process_workflow_job_triggers,
471
438
  claim_and_run_apply_changesheet_jobs,
472
439
  claim_and_run_metadata_in_jobs,
@@ -476,20 +443,6 @@ def repo():
476
443
  return graph_jobs + schedules + sensors
477
444
 
478
445
 
479
- @repository
480
- def translation():
481
- graph_jobs = [jgi_job, gold_job, emsl_job]
482
-
483
- return graph_jobs
484
-
485
-
486
- @repository
487
- def test_translation():
488
- graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
489
-
490
- return graph_jobs
491
-
492
-
493
446
  @repository
494
447
  def biosample_submission_ingest():
495
448
  normal_resources = run_config_frozen__normal_env["resources"]
@@ -516,6 +469,7 @@ def biosample_submission_ingest():
516
469
  "study_type": "research_study",
517
470
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
518
471
  "include_field_site_info": False,
472
+ "enable_biosample_filtering": True,
519
473
  },
520
474
  },
521
475
  "export_json_to_drs": {"config": {"username": ""}},
@@ -548,6 +502,7 @@ def biosample_submission_ingest():
548
502
  "data_object_mapping_file_url": None,
549
503
  "biosample_extras_file_url": None,
550
504
  "biosample_extras_slot_mapping_file_url": None,
505
+ "study_id": None,
551
506
  }
552
507
  },
553
508
  "translate_portal_submission_to_nmdc_schema_database": {
@@ -584,6 +539,7 @@ def biosample_submission_ingest():
584
539
  "data_object_mapping_file_url": None,
585
540
  "biosample_extras_file_url": None,
586
541
  "biosample_extras_slot_mapping_file_url": None,
542
+ "study_id": None,
587
543
  }
588
544
  },
589
545
  "translate_portal_submission_to_nmdc_schema_database": {
@@ -960,6 +916,8 @@ def database_records_stitching():
960
916
  "config": {
961
917
  "nmdc_study_id": "",
962
918
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
919
+ "include_field_site_info": False,
920
+ "enable_biosample_filtering": True,
963
921
  }
964
922
  },
965
923
  "export_json_to_drs": {"config": {"username": ""}},
@@ -1002,22 +960,55 @@ def database_records_stitching():
1002
960
  "config": {
1003
961
  "nmdc_study_id": "",
1004
962
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
963
+ "include_field_site_info": False,
964
+ "enable_biosample_filtering": True,
1005
965
  }
1006
966
  },
1007
967
  "export_json_to_drs": {"config": {"username": ""}},
1008
968
  },
1009
969
  },
1010
970
  ),
971
+ generate_update_script_for_insdc_biosample_identifiers.to_job(
972
+ description="This job generates a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.",
973
+ resource_defs=resource_defs,
974
+ config={
975
+ "resources": merge(
976
+ unfreeze(normal_resources),
977
+ {
978
+ "runtime_api_user_client": {
979
+ "config": {
980
+ "base_url": {"env": "API_HOST"},
981
+ "username": {"env": "API_ADMIN_USER"},
982
+ "password": {"env": "API_ADMIN_PASS"},
983
+ },
984
+ },
985
+ "runtime_api_site_client": {
986
+ "config": {
987
+ "base_url": {"env": "API_HOST"},
988
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
989
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
990
+ "site_id": {"env": "API_SITE_ID"},
991
+ },
992
+ },
993
+ "gold_api_client": {
994
+ "config": {
995
+ "base_url": {"env": "GOLD_API_BASE_URL"},
996
+ "username": {"env": "GOLD_API_USERNAME"},
997
+ "password": {"env": "GOLD_API_PASSWORD"},
998
+ },
999
+ },
1000
+ },
1001
+ ),
1002
+ "ops": {
1003
+ "get_database_updater_inputs": {
1004
+ "config": {
1005
+ "nmdc_study_id": "",
1006
+ "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1007
+ "include_field_site_info": False,
1008
+ "enable_biosample_filtering": True,
1009
+ }
1010
+ },
1011
+ },
1012
+ },
1013
+ ),
1011
1014
  ]
1012
-
1013
-
1014
- # @repository
1015
- # def validation():
1016
- # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
1017
- # return graph_jobs
1018
- #
1019
- #
1020
- # @repository
1021
- # def test_validation():
1022
- # graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
1023
- # return graph_jobs