nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/repair/database_updater.py
CHANGED

@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 import pandas as pd
 from nmdc_runtime.site.resources import (
     RuntimeApiUserClient,
@@ -18,6 +18,8 @@ class DatabaseUpdater:
         gold_api_client: GoldApiClient,
         study_id: str,
         gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+        include_field_site_info: bool = False,
+        enable_biosample_filtering: bool = True,
     ):
         """This class serves as an API for repairing connections in the database by
         adding records that are essentially missing "links"/"connections". As we identify
@@ -39,6 +41,8 @@ class DatabaseUpdater:
         self.gold_api_client = gold_api_client
         self.study_id = study_id
         self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
+        self.include_field_site_info = include_field_site_info
+        self.enable_biosample_filtering = enable_biosample_filtering

     @lru_cache
     def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
@@ -95,6 +99,8 @@ class DatabaseUpdater:
             biosamples=all_gold_biosamples,
             projects=all_gold_projects,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+            include_field_site_info=self.include_field_site_info,
+            enable_biosample_filtering=self.enable_biosample_filtering,
         )

         # The GoldStudyTranslator class has some pre-processing logic which filters out
@@ -214,6 +220,8 @@ class DatabaseUpdater:
             projects=gold_sequencing_projects_for_study,
             analysis_projects=gold_analysis_projects_for_study,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+            include_field_site_info=self.include_field_site_info,
+            enable_biosample_filtering=self.enable_biosample_filtering,
         )

         translated_biosamples = gold_study_translator.biosamples
@@ -240,3 +248,204 @@ class DatabaseUpdater:
         ]

         return database
+
+    def queries_run_script_to_update_insdc_identifiers(
+        self,
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """This method creates a `/queries:run` API endpoint compatible update script that can be run
+        using that API endpoint to update/add information on the `insdc_biosample_identifiers` field
+        of biosample_set records and the `insdc_bioproject_identifiers` field on data_generation_set records.
+
+        The information to be asserted is retrieved from the `ncbiBioSampleAccession` and
+        `ncbiBioProjectAccession` fields on the GOLD `/projects` API endpoint.
+
+        :return: A `/queries:run` update query compatible script serialized as a dictionary/JSON.
+        """
+        # Fetch all biosamples associated with the study
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+
+        # Fetch all data_generation records associated with the study
+        data_generation_set = (
+            self.runtime_api_user_client.get_data_generation_records_for_study(
+                self.study_id
+            )
+        )
+
+        biosample_updates = []
+        data_generation_updates = []
+
+        # Dictionary to store gold_project_id -> ncbi_bioproject_accession mapping
+        gold_project_to_bioproject = {}
+
+        # Dictionary to store all project data we gather during biosample processing
+        all_processed_projects = {}
+
+        # Process biosamples for insdc_biosample_identifiers
+        for biosample in biosample_set:
+            # get the list (usually one) of GOLD biosample identifiers on the gold_biosample_identifiers slot
+            gold_biosample_identifiers = biosample.get("gold_biosample_identifiers", [])
+            if not gold_biosample_identifiers:
+                continue
+
+            biosample_id = biosample.get("id")
+            if not biosample_id:
+                continue
+
+            insdc_biosample_identifiers = []
+
+            for gold_biosample_id in gold_biosample_identifiers:
+                normalized_id = gold_biosample_id.replace("gold:", "")
+
+                # fetch projects associated with a GOLD biosample from the GOLD `/projects` API endpoint
+                gold_projects = self.gold_api_client.fetch_projects_by_biosample(
+                    normalized_id
+                )
+
+                for project in gold_projects:
+                    # Store each project for later use
+                    project_gold_id = project.get("projectGoldId")
+                    if project_gold_id:
+                        all_processed_projects[project_gold_id] = project
+
+                    # Collect ncbi_biosample_accession for biosample updates
+                    ncbi_biosample_accession = project.get("ncbiBioSampleAccession")
+                    if ncbi_biosample_accession and ncbi_biosample_accession.strip():
+                        insdc_biosample_identifiers.append(ncbi_biosample_accession)
+
+                    # Collect ncbi_bioproject_accession for data_generation records
+                    ncbi_bioproject_accession = project.get("ncbiBioProjectAccession")
+                    if (
+                        project_gold_id
+                        and ncbi_bioproject_accession
+                        and ncbi_bioproject_accession.strip()
+                    ):
+                        gold_project_to_bioproject[project_gold_id] = (
+                            ncbi_bioproject_accession
+                        )
+
+            if insdc_biosample_identifiers:
+                existing_insdc_biosample_identifiers = biosample.get(
+                    "insdc_biosample_identifiers", []
+                )
+                new_insdc_biosample_identifiers = list(
+                    set(insdc_biosample_identifiers)
+                    - set(existing_insdc_biosample_identifiers)
+                )
+
+                if new_insdc_biosample_identifiers:
+                    prefixed_new_biosample_identifiers = [
+                        f"biosample:{id}" for id in new_insdc_biosample_identifiers
+                    ]
+
+                    if existing_insdc_biosample_identifiers:
+                        all_biosample_identifiers = list(
+                            set(
+                                existing_insdc_biosample_identifiers
+                                + prefixed_new_biosample_identifiers
+                            )
+                        )
+                        biosample_updates.append(
+                            {
+                                "q": {"id": biosample_id},
+                                "u": {
+                                    "$set": {
+                                        "insdc_biosample_identifiers": all_biosample_identifiers
+                                    }
+                                },
+                            }
+                        )
+                    else:
+                        biosample_updates.append(
+                            {
+                                "q": {"id": biosample_id},
+                                "u": {
+                                    "$set": {
+                                        "insdc_biosample_identifiers": prefixed_new_biosample_identifiers
+                                    }
+                                },
+                            }
+                        )
+
+        # Process data_generation records for insdc_bioproject_identifiers
+        for data_generation in data_generation_set:
+            data_generation_id = data_generation.get("id")
+            if not data_generation_id:
+                continue
+
+            # Extract existing insdc_bioproject_identifiers
+            existing_insdc_bioproject_identifiers = data_generation.get(
+                "insdc_bioproject_identifiers", []
+            )
+
+            collected_insdc_bioproject_identifiers = set()
+
+            # Add any project identifiers already on the record
+            if "insdc_bioproject_identifiers" in data_generation:
+                for identifier in data_generation["insdc_bioproject_identifiers"]:
+                    collected_insdc_bioproject_identifiers.add(identifier)
+
+            # If there are gold_sequencing_project_identifiers, use our pre-collected mapping
+            gold_project_identifiers = data_generation.get(
+                "gold_sequencing_project_identifiers", []
+            )
+            for gold_project_id in gold_project_identifiers:
+                normalized_id = gold_project_id.replace("gold:", "")
+
+                # Check if we have a bioproject ID for this GOLD project ID
+                if normalized_id in gold_project_to_bioproject:
+                    ncbi_bioproject_accession = gold_project_to_bioproject[
+                        normalized_id
+                    ]
+                    collected_insdc_bioproject_identifiers.add(
+                        f"bioproject:{ncbi_bioproject_accession}"
+                    )
+                else:
+                    # Only if we don't have it in our mapping, try to fetch it
+                    # Instead of making a direct API request, check if we've already seen this project
+                    if normalized_id in all_processed_projects:
+                        project_data = all_processed_projects[normalized_id]
+                        ncbi_bioproject_accession = project_data.get(
+                            "ncbiBioProjectAccession"
+                        )
+                        if (
+                            ncbi_bioproject_accession
+                            and ncbi_bioproject_accession.strip()
+                        ):
+                            collected_insdc_bioproject_identifiers.add(
+                                f"bioproject:{ncbi_bioproject_accession}"
+                            )
+                            # Add to our mapping for future reference
+                            gold_project_to_bioproject[normalized_id] = (
+                                ncbi_bioproject_accession
+                            )
+
+            # Create a list from the set of collected identifiers
+            collected_insdc_bioproject_identifiers = list(
+                collected_insdc_bioproject_identifiers
+            )
+
+            # Only update if there are identifiers to add
+            if collected_insdc_bioproject_identifiers and set(
+                collected_insdc_bioproject_identifiers
+            ) != set(existing_insdc_bioproject_identifiers):
+                data_generation_updates.append(
+                    {
+                        "q": {"id": data_generation_id},
+                        "u": {
+                            "$set": {
+                                "insdc_bioproject_identifiers": collected_insdc_bioproject_identifiers
+                            }
+                        },
+                    }
+                )

+        # Return updates for both collections
+        if data_generation_updates:
+            return [
+                {"update": "biosample_set", "updates": biosample_updates},
+                {"update": "data_generation_set", "updates": data_generation_updates},
+            ]
+        else:
+            return {"update": "biosample_set", "updates": biosample_updates}
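Example (not part of the diff): the docstring above says the generated script is meant to be run via the Runtime's `/queries:run` API endpoint. The sketch below shows one plausible way to submit it; the base URL, bearer-token auth, the helper name `submit_update_script`, and the assumption that the endpoint accepts one MongoDB-style update command per POST are illustrative and not confirmed by this diff.

# Minimal sketch, assuming a Runtime API host and bearer-token auth (both hypothetical here).
import requests

API_BASE = "https://api.example.org"  # hypothetical Runtime API base URL
TOKEN = "..."                         # hypothetical access token

def submit_update_script(script):
    """POST each update command produced by
    DatabaseUpdater.queries_run_script_to_update_insdc_identifiers()."""
    # The method returns either a single command dict or a list of them.
    commands = script if isinstance(script, list) else [script]
    for command in commands:
        # e.g. command == {"update": "biosample_set", "updates": [...]}
        resp = requests.post(
            f"{API_BASE}/queries:run",
            json=command,
            headers={"Authorization": f"Bearer {TOKEN}"},
        )
        resp.raise_for_status()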
nmdc_runtime/site/repository.py
CHANGED
@@ -1,6 +1,5 @@
 import json

-from typing import Any

 from dagster import (
     repository,
@@ -29,8 +28,6 @@ from nmdc_runtime.site.graphs import (
     translate_metadata_submission_to_nmdc_schema_database,
     ingest_metadata_submission,
     gold_study_to_database,
-    gold_translation,
-    gold_translation_curation,
     create_objects_from_site_object_puts,
     housekeeping,
     ensure_jobs,
@@ -44,8 +41,10 @@ from nmdc_runtime.site.graphs import (
     ingest_neon_benthic_metadata,
     ingest_neon_surface_water_metadata,
     ensure_alldocs,
+    run_ontology_load,
     nmdc_study_to_ncbi_submission_export,
     generate_data_generation_set_for_biosamples_in_nmdc_study,
+    generate_update_script_for_insdc_biosample_identifiers,
 )
 from nmdc_runtime.site.resources import (
     get_mongo,
@@ -59,9 +58,6 @@ from nmdc_runtime.site.resources import (
 from nmdc_runtime.site.resources import (
     get_runtime_api_site_client,
 )
-from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
-from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
-from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
 from nmdc_runtime.util import freeze
 from nmdc_runtime.util import unfreeze

@@ -123,6 +119,55 @@ ensure_alldocs_daily = ScheduleDefinition(
 )


+load_envo_ontology_weekly = ScheduleDefinition(
+    name="weekly_load_envo_ontology",
+    cron_schedule="0 7 * * 1",
+    execution_timezone="America/New_York",
+    job=run_ontology_load.to_job(
+        name="scheduled_envo_ontology_load",
+        config=unfreeze(
+            merge(
+                run_config_frozen__normal_env,
+                {"ops": {"load_ontology": {"config": {"source_ontology": "envo"}}}},
+            )
+        ),
+        resource_defs=resource_defs,
+    ),
+)
+
+load_uberon_ontology_weekly = ScheduleDefinition(
+    name="weekly_load_uberon_ontology",
+    cron_schedule="0 8 * * 1",
+    execution_timezone="America/New_York",
+    job=run_ontology_load.to_job(
+        name="scheduled_uberon_ontology_load",
+        config=unfreeze(
+            merge(
+                run_config_frozen__normal_env,
+                {"ops": {"load_ontology": {"config": {"source_ontology": "uberon"}}}},
+            )
+        ),
+        resource_defs=resource_defs,
+    ),
+)
+
+load_po_ontology_weekly = ScheduleDefinition(
+    name="weekly_load_po_ontology",
+    cron_schedule="0 9 * * 1",
+    execution_timezone="America/New_York",
+    job=run_ontology_load.to_job(
+        name="scheduled_po_ontology_load",
+        config=unfreeze(
+            merge(
+                run_config_frozen__normal_env,
+                {"ops": {"load_ontology": {"config": {"source_ontology": "po"}}}},
+            )
+        ),
+        resource_defs=resource_defs,
+    ),
+)
+
+
 def asset_materialization_metadata(asset_event, key):
     """Get metadata from an asset materialization event.

@@ -197,82 +242,6 @@ def process_workflow_job_triggers(_context):
     yield SkipReason("No new jobs required")


-@asset_sensor(
-    asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
-    job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
-)
-def ensure_gold_translation_job(_context, asset_event):
-    mdb = get_mongo(run_config_frozen__normal_env).db
-    gold_etl_latest = mdb.objects.find_one(
-        {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
-    )
-    sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
-    if gold_etl_latest is None:
-        yield SkipReason("can't find sensed asset object_id in database")
-        return
-    elif gold_etl_latest["id"] != sensed_object_id:
-        yield SkipReason("later object than sensed materialization")
-        return
-
-    run_config = merge(
-        run_config_frozen__normal_env,
-        {
-            "solids": {
-                "construct_jobs": {
-                    "config": {
-                        "base_jobs": [
-                            {
-                                "workflow": {"id": "gold-translation-1.0.0"},
-                                "config": {"object_id": gold_etl_latest["id"]},
-                            }
-                        ]
-                    }
-                }
-            }
-        },
-    )
-    yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
-
-
-@asset_sensor(
-    asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
-    job=gold_translation_curation.to_job(**preset_normal),
-)
-def claim_and_run_gold_translation_curation(_context, asset_event):
-    client = get_runtime_api_site_client(run_config_frozen__normal_env)
-    mdb = get_mongo(run_config_frozen__normal_env).db
-    object_id_latest = asset_materialization_metadata(
-        asset_event, "object_id_latest"
-    ).text
-    job = mdb.jobs.find_one(
-        {
-            "workflow.id": "gold-translation-1.0.0",
-            "config.object_id_latest": object_id_latest,
-        }
-    )
-    if job is not None:
-        rv = client.claim_job(job["id"])
-        if rv.status_code == status.HTTP_200_OK:
-            operation = rv.json()
-            run_config = merge(
-                run_config_frozen__normal_env,
-                {
-                    "ops": {
-                        "get_operation": {
-                            "config": {
-                                "operation_id": operation["id"],
-                            }
-                        }
-                    }
-                },
-            )
-            yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
-        else:
-            yield SkipReason("Job found, but already claimed by this site")
-    else:
-        yield SkipReason("No job found")
-
-
 @sensor(
     job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
     default_status=DefaultSensorStatus.RUNNING,
@@ -411,11 +380,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
         yield SkipReason("; ".join(skip_notes))


-# TODO ensure data_object_type values from file_type_enum
-# see /metadata-translation/notebooks/202106_curation_updates.ipynb
-# for details ("Create file_type_enum collection" section).
-
-
 @sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
 def done_object_put_ops(_context):
     client = get_runtime_api_site_client(run_config_frozen__normal_env)
@@ -455,18 +419,21 @@ def on_run_fail(context: RunStatusSensorContext):
 @repository
 def repo():
     graph_jobs = [
-        gold_translation.to_job(**preset_normal),
         hello_graph.to_job(name="hello_job"),
         ensure_jobs.to_job(**preset_normal),
         apply_metadata_in.to_job(**preset_normal),
         export_study_biosamples_metadata.to_job(**preset_normal),
         ensure_alldocs.to_job(**preset_normal),
     ]
-    schedules = [
+    schedules = [
+        housekeeping_weekly,
+        ensure_alldocs_daily,
+        load_envo_ontology_weekly,
+        load_uberon_ontology_weekly,
+        load_po_ontology_weekly,
+    ]
     sensors = [
         done_object_put_ops,
-        ensure_gold_translation_job,
-        claim_and_run_gold_translation_curation,
         process_workflow_job_triggers,
         claim_and_run_apply_changesheet_jobs,
         claim_and_run_metadata_in_jobs,
@@ -476,20 +443,6 @@ def repo():
     return graph_jobs + schedules + sensors


-@repository
-def translation():
-    graph_jobs = [jgi_job, gold_job, emsl_job]
-
-    return graph_jobs
-
-
-@repository
-def test_translation():
-    graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
-
-    return graph_jobs
-
-
 @repository
 def biosample_submission_ingest():
     normal_resources = run_config_frozen__normal_env["resources"]
@@ -516,6 +469,7 @@ def biosample_submission_ingest():
                             "study_type": "research_study",
                             "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
                             "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
                         },
                     },
                    "export_json_to_drs": {"config": {"username": ""}},
@@ -548,6 +502,7 @@ def biosample_submission_ingest():
                            "data_object_mapping_file_url": None,
                            "biosample_extras_file_url": None,
                            "biosample_extras_slot_mapping_file_url": None,
+                            "study_id": None,
                        }
                    },
                    "translate_portal_submission_to_nmdc_schema_database": {
@@ -584,6 +539,7 @@ def biosample_submission_ingest():
                            "data_object_mapping_file_url": None,
                            "biosample_extras_file_url": None,
                            "biosample_extras_slot_mapping_file_url": None,
+                            "study_id": None,
                        }
                    },
                    "translate_portal_submission_to_nmdc_schema_database": {
@@ -960,6 +916,8 @@ def database_records_stitching():
                        "config": {
                            "nmdc_study_id": "",
                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
                        }
                    },
                    "export_json_to_drs": {"config": {"username": ""}},
@@ -1002,22 +960,55 @@ def database_records_stitching():
                        "config": {
                            "nmdc_study_id": "",
                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
                        }
                    },
                    "export_json_to_drs": {"config": {"username": ""}},
                },
            },
        ),
+        generate_update_script_for_insdc_biosample_identifiers.to_job(
+            description="This job generates a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
+                        }
+                    },
+                },
+            },
+        ),
     ]
-
-
-# @repository
-# def validation():
-#     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
-#     return graph_jobs
-#
-#
-# @repository
-# def test_validation():
-#     graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
-#     return graph_jobs
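Example (not part of the diff): the three weekly ontology-load schedules added above differ only in the ontology name and the hour of the staggered Monday cron expression, so the same pattern could be expressed with one helper. The sketch below assumes it lives in repository.py next to the definitions shown in this diff (run_ontology_load, resource_defs, run_config_frozen__normal_env, unfreeze, merge); the helper name make_ontology_schedule is hypothetical.

# Sketch only: factoring the repeated ScheduleDefinition blocks into a helper.
# ScheduleDefinition and to_job usage mirror the diff; merge is toolz.merge as used there.
from dagster import ScheduleDefinition
from toolz import merge

def make_ontology_schedule(source_ontology: str, hour: int) -> ScheduleDefinition:
    """Build a Monday-morning schedule that runs run_ontology_load for one ontology."""
    return ScheduleDefinition(
        name=f"weekly_load_{source_ontology}_ontology",
        cron_schedule=f"0 {hour} * * 1",  # Mondays at <hour>:00
        execution_timezone="America/New_York",
        job=run_ontology_load.to_job(
            name=f"scheduled_{source_ontology}_ontology_load",
            config=unfreeze(
                merge(
                    run_config_frozen__normal_env,
                    {"ops": {"load_ontology": {"config": {"source_ontology": source_ontology}}}},
                )
            ),
            resource_defs=resource_defs,
        ),
    )

# Equivalent to the three ScheduleDefinitions added in the diff:
ontology_schedules = [
    make_ontology_schedule("envo", 7),
    make_ontology_schedule("uberon", 8),
    make_ontology_schedule("po", 9),
]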