nmdc-runtime 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. See the registry's advisory page for this release for more details.

nmdc_runtime/config.py ADDED
@@ -0,0 +1 @@
1
+ DATABASE_CLASS_NAME = "Database"
@@ -66,7 +66,7 @@ class NCBISubmissionXML:
66
66
  element.append(child)
67
67
  return element
68
68
 
69
- def set_description(self, email, user, first, last, org, date=None):
69
+ def set_description(self, email, first, last, org, date=None):
70
70
  date = date or datetime.datetime.now().strftime("%Y-%m-%d")
71
71
  description = self.set_element(
72
72
  "Description",
@@ -74,7 +74,6 @@ class NCBISubmissionXML:
74
74
  self.set_element(
75
75
  "Comment", f"NMDC Submission for {self.nmdc_study_id}"
76
76
  ),
77
- self.set_element("Submitter", attrib={"user_name": user}),
78
77
  self.set_element(
79
78
  "Organization",
80
79
  attrib={"role": "owner", "type": "center"},
@@ -159,7 +158,6 @@ class NCBISubmissionXML:
159
158
  org,
160
159
  bioproject_id,
161
160
  nmdc_biosamples,
162
- nmdc_omics_processing,
163
161
  ):
164
162
  attribute_mappings, slot_range_mappings = load_mappings(
165
163
  self.nmdc_ncbi_attribute_mapping_file_url
@@ -206,7 +204,7 @@ class NCBISubmissionXML:
206
204
  children=[
207
205
  self.set_element(
208
206
  "Title",
209
- f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study",
207
+ f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
210
208
  ),
211
209
  ],
212
210
  ),
@@ -230,6 +228,13 @@ class NCBISubmissionXML:
230
228
  "Attribute", attributes[key], {"attribute_name": key}
231
229
  )
232
230
  for key in sorted(attributes)
231
+ ]
232
+ + [
233
+ self.set_element(
234
+ "Attribute",
235
+ "National Microbiome Data Collaborative",
236
+ {"attribute_name": "broker name"},
237
+ )
233
238
  ],
234
239
  ),
235
240
  ]
@@ -278,29 +283,63 @@ class NCBISubmissionXML:
278
283
  biosample_data_objects: list,
279
284
  bioproject_id: str,
280
285
  org: str,
286
+ nmdc_omics_processing: list,
287
+ nmdc_biosamples: list,
288
+ nmdc_library_preparation: list,
281
289
  ):
290
+ bsm_id_name_dict = {
291
+ biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
292
+ }
293
+
282
294
  for entry in biosample_data_objects:
283
295
  fastq_files = []
284
296
  biosample_ids = []
297
+ omics_processing_ids = {}
298
+ lib_prep_protocol_names = {}
299
+ instrument_name = ""
300
+ omics_type = ""
301
+ library_name = ""
285
302
 
286
303
  for biosample_id, data_objects in entry.items():
287
304
  biosample_ids.append(biosample_id)
288
305
  for data_object in data_objects:
289
306
  if "url" in data_object:
290
307
  url = urlparse(data_object["url"])
291
- file_path = os.path.join(
292
- os.path.basename(os.path.dirname(url.path)),
293
- os.path.basename(url.path),
294
- )
308
+ file_path = os.path.basename(url.path)
295
309
  fastq_files.append(file_path)
296
310
 
311
+ for omprc_dict in nmdc_omics_processing:
312
+ if biosample_id in omprc_dict:
313
+ for omprc in omprc_dict[biosample_id]:
314
+ omics_processing_ids[biosample_id] = omprc.get("id", "")
315
+ instrument_name = omprc.get("instrument_name", "")
316
+ omics_type = (
317
+ omprc.get("omics_type", {})
318
+ .get("has_raw_value", "")
319
+ .lower()
320
+ )
321
+ library_name = bsm_id_name_dict.get(biosample_id, "")
322
+
323
+ for lib_prep_dict in nmdc_library_preparation:
324
+ if biosample_id in lib_prep_dict:
325
+ lib_prep_protocol_names[biosample_id] = (
326
+ lib_prep_dict[biosample_id]
327
+ .get("protocol_link", {})
328
+ .get("name", "")
329
+ )
330
+
297
331
  if fastq_files:
298
332
  files_elements = [
299
333
  self.set_element(
300
334
  "File",
301
335
  "",
302
336
  {"file_path": f},
303
- [self.set_element("DataType", "generic-data")],
337
+ [
338
+ self.set_element(
339
+ "DataType",
340
+ "sra-run-fastq" if ".fastq" in f else "generic-data",
341
+ )
342
+ ],
304
343
  )
305
344
  for f in fastq_files
306
345
  ]
@@ -344,35 +383,122 @@ class NCBISubmissionXML:
344
383
  )
345
384
  )
346
385
 
347
- identifier_element = self.set_element(
348
- "Identifier",
349
- children=[
386
+ sra_attributes = []
387
+ if instrument_name.lower().startswith("illumina"):
388
+ sra_attributes.append(
389
+ self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
390
+ )
391
+ if "nextseq550" in instrument_name.lower():
392
+ sra_attributes.append(
393
+ self.set_element(
394
+ "Attribute", "NextSeq 550", {"name": "instrument_model"}
395
+ )
396
+ )
397
+
398
+ if omics_type == "metagenome":
399
+ sra_attributes.append(
350
400
  self.set_element(
351
- "SPUID", bioproject_id, {"spuid_namespace": org}
401
+ "Attribute", "WGS", {"name": "library_strategy"}
352
402
  )
353
- ],
354
- )
403
+ )
404
+ sra_attributes.append(
405
+ self.set_element(
406
+ "Attribute", "METAGENOMIC", {"name": "library_source"}
407
+ )
408
+ )
409
+ sra_attributes.append(
410
+ self.set_element(
411
+ "Attribute", "RANDOM", {"name": "library_selection"}
412
+ )
413
+ )
355
414
 
356
- action = self.set_element(
357
- "Action",
358
- children=[
415
+ if omics_type == "metatranscriptome":
416
+ sra_attributes.append(
359
417
  self.set_element(
360
- "AddFiles",
361
- attrib={"target_db": "SRA"},
362
- children=files_elements
363
- + attribute_elements
364
- + [identifier_element],
365
- ),
366
- ],
418
+ "Attribute",
419
+ "METATRANSCRIPTOMIC",
420
+ {"name": "library_source"},
421
+ )
422
+ )
423
+
424
+ has_paired_reads = any(
425
+ data_object.get("data_object_type", "").lower()
426
+ == "metagenome raw reads"
427
+ for data_object in data_objects
428
+ ) or (
429
+ any(
430
+ data_object.get("data_object_type", "").lower()
431
+ == "metagenome raw read 1"
432
+ for data_object in data_objects
433
+ )
434
+ and any(
435
+ data_object.get("data_object_type", "").lower()
436
+ == "metagenome raw read 2"
437
+ for data_object in data_objects
438
+ )
367
439
  )
368
440
 
369
- self.root.append(action)
441
+ if has_paired_reads:
442
+ sra_attributes.append(
443
+ self.set_element(
444
+ "Attribute", "paired", {"name": "library_layout"}
445
+ )
446
+ )
447
+ else:
448
+ sra_attributes.append(
449
+ self.set_element(
450
+ "Attribute", "single", {"name": "library_layout"}
451
+ )
452
+ )
453
+
454
+ if library_name:
455
+ sra_attributes.append(
456
+ self.set_element(
457
+ "Attribute", library_name, {"name": "library_name"}
458
+ )
459
+ )
460
+
461
+ for biosample_id, lib_prep_name in lib_prep_protocol_names.items():
462
+ sra_attributes.append(
463
+ self.set_element(
464
+ "Attribute",
465
+ lib_prep_name,
466
+ {"name": "library_construction_protocol"},
467
+ )
468
+ )
469
+
470
+ for biosample_id, omics_processing_id in omics_processing_ids.items():
471
+ identifier_element = self.set_element(
472
+ "Identifier",
473
+ children=[
474
+ self.set_element(
475
+ "SPUID", omics_processing_id, {"spuid_namespace": org}
476
+ )
477
+ ],
478
+ )
479
+
480
+ action = self.set_element(
481
+ "Action",
482
+ children=[
483
+ self.set_element(
484
+ "AddFiles",
485
+ attrib={"target_db": "SRA"},
486
+ children=files_elements
487
+ + attribute_elements
488
+ + sra_attributes
489
+ + [identifier_element],
490
+ ),
491
+ ],
492
+ )
493
+
494
+ self.root.append(action)
370
495
 
371
496
  def get_submission_xml(
372
497
  self,
373
498
  biosamples_list: list,
374
499
  biosample_omics_processing_list: list,
375
500
  biosample_data_objects_list: list,
501
+ biosample_library_preparation_list: list,
376
502
  ):
377
503
  data_type = None
378
504
  ncbi_project_id = None
@@ -387,7 +513,6 @@ class NCBISubmissionXML:
387
513
 
388
514
  self.set_description(
389
515
  email=self.nmdc_pi_email,
390
- user="National Microbiome Data Collaborative (NMDC)",
391
516
  first=self.first_name,
392
517
  last=self.last_name,
393
518
  org=self.ncbi_submission_metadata.get("organization", ""),
@@ -407,13 +532,15 @@ class NCBISubmissionXML:
407
532
  org=self.ncbi_submission_metadata.get("organization", ""),
408
533
  bioproject_id=ncbi_project_id,
409
534
  nmdc_biosamples=biosamples_list,
410
- nmdc_omics_processing=biosample_omics_processing_list,
411
535
  )
412
536
 
413
537
  self.set_fastq(
414
538
  biosample_data_objects=biosample_data_objects_list,
415
539
  bioproject_id=ncbi_project_id,
416
540
  org=self.ncbi_submission_metadata.get("organization", ""),
541
+ nmdc_omics_processing=biosample_omics_processing_list,
542
+ nmdc_biosamples=biosamples_list,
543
+ nmdc_library_preparation=biosample_library_preparation_list,
417
544
  )
418
545
 
419
546
  rough_string = ET.tostring(self.root, "unicode")
@@ -96,6 +96,38 @@ def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list)
96
96
  return biosample_data_objects
97
97
 
98
98
 
99
+ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
100
+ biosample_lib_prep = []
101
+
102
+ for biosample in biosamples_list:
103
+ biosample_id = biosample["id"]
104
+
105
+ # Step 1: Find any document with biosample id as has_input
106
+ initial_query = {"has_input": biosample_id}
107
+ initial_document = all_docs_collection.find_one(initial_query)
108
+
109
+ if not initial_document:
110
+ continue
111
+
112
+ initial_output = initial_document.get("has_output")
113
+ if not initial_output:
114
+ continue
115
+
116
+ # Step 2: Use has_output to find the library preparation document
117
+ for output_id in initial_output:
118
+ lib_prep_query = {
119
+ "has_input": output_id,
120
+ "designated_class": "nmdc:LibraryPreparation",
121
+ }
122
+ lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
123
+
124
+ if lib_prep_doc:
125
+ biosample_lib_prep.append({biosample_id: lib_prep_doc})
126
+ break # Stop at the first document that meets the criteria
127
+
128
+ return biosample_lib_prep
129
+
130
+
99
131
  def handle_quantity_value(slot_value):
100
132
  if "has_numeric_value" in slot_value and "has_unit" in slot_value:
101
133
  return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}"
@@ -48,9 +48,11 @@ from nmdc_runtime.site.ops import (
48
48
  get_neon_pipeline_inputs,
49
49
  get_df_from_url,
50
50
  site_code_mapping,
51
+ materialize_alldocs,
51
52
  get_ncbi_export_pipeline_study,
52
53
  get_data_objects_from_biosamples,
53
54
  get_omics_processing_from_biosamples,
55
+ get_library_preparation_from_biosamples,
54
56
  get_ncbi_export_pipeline_inputs,
55
57
  ncbi_submission_xml_from_nmdc_study,
56
58
  ncbi_submission_xml_asset,
@@ -98,6 +100,11 @@ def housekeeping():
98
100
  delete_operations(list_operations(filter_ops_undone_expired()))
99
101
 
100
102
 
103
+ @graph
104
+ def ensure_alldocs():
105
+ materialize_alldocs()
106
+
107
+
101
108
  @graph
102
109
  def ensure_jobs():
103
110
  jobs = construct_jobs()
@@ -384,12 +391,14 @@ def nmdc_study_to_ncbi_submission_export():
384
391
  ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
385
392
  biosamples = get_biosamples_by_study_id(nmdc_study)
386
393
  omics_processing_records = get_omics_processing_from_biosamples(biosamples)
387
- data_objects = get_data_objects_from_biosamples(biosamples)
394
+ data_object_records = get_data_objects_from_biosamples(biosamples)
395
+ library_preparation_records = get_library_preparation_from_biosamples(biosamples)
388
396
  xml_data = ncbi_submission_xml_from_nmdc_study(
389
397
  nmdc_study,
390
398
  ncbi_submission_metadata,
391
399
  biosamples,
392
400
  omics_processing_records,
393
- data_objects,
401
+ data_object_records,
402
+ library_preparation_records,
394
403
  )
395
404
  ncbi_submission_xml_asset(xml_data)
nmdc_runtime/site/ops.py CHANGED
@@ -13,6 +13,7 @@ from zipfile import ZipFile
13
13
  import pandas as pd
14
14
  import requests
15
15
 
16
+
16
17
  from bson import ObjectId, json_util
17
18
  from dagster import (
18
19
  Any,
@@ -65,6 +66,7 @@ from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
65
66
  from nmdc_runtime.site.export.ncbi_xml_utils import (
66
67
  fetch_data_objects_from_biosamples,
67
68
  fetch_omics_processing_from_biosamples,
69
+ fetch_library_preparation_from_biosamples,
68
70
  )
69
71
  from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
70
72
  from nmdc_runtime.site.resources import (
@@ -73,6 +75,7 @@ from nmdc_runtime.site.resources import (
73
75
  RuntimeApiSiteClient,
74
76
  RuntimeApiUserClient,
75
77
  NeonApiClient,
78
+ MongoDB as MongoDBResource,
76
79
  )
77
80
  from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
78
81
  from nmdc_runtime.site.translation.neon_soil_translator import NeonSoilDataTranslator
@@ -85,15 +88,19 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
85
88
  from nmdc_runtime.site.translation.submission_portal_translator import (
86
89
  SubmissionPortalTranslator,
87
90
  )
88
- from nmdc_runtime.site.util import collection_indexed_on_id, run_and_log
91
+ from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
89
92
  from nmdc_runtime.util import (
90
93
  drs_object_in_for,
91
94
  pluralize,
92
95
  put_object,
93
96
  validate_json,
94
97
  specialize_activity_set_docs,
98
+ collection_name_to_class_names,
99
+ class_hierarchy_as_list,
100
+ populated_schema_collection_names_with_id_field,
95
101
  )
96
102
  from nmdc_schema import nmdc
103
+ from nmdc_schema.nmdc import Database as NMDCDatabase
97
104
  from pydantic import BaseModel
98
105
  from pymongo.database import Database as MongoDatabase
99
106
  from starlette import status
@@ -521,29 +528,45 @@ def perform_mongo_updates(context, json_in):
521
528
  if rv["result"] == "errors":
522
529
  raise Failure(str(rv["detail"]))
523
530
 
524
- coll_has_id_index = collection_indexed_on_id(mongo.db)
525
- if all(coll_has_id_index[coll] for coll in docs.keys()):
531
+ # TODO containing op `perform_mongo_updates` needs test coverage, as below line had trivial bug.
532
+ # ref: https://github.com/microbiomedata/nmdc-runtime/issues/631
533
+ add_docs_result = _add_schema_docs_with_or_without_replacement(mongo, docs)
534
+ op_patch = UpdateOperationRequest(
535
+ done=True,
536
+ result=add_docs_result,
537
+ metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
538
+ )
539
+ op_doc = client.update_operation(op_id, op_patch).json()
540
+ return ["/operations/" + op_doc["id"]]
541
+
542
+
543
+ def _add_schema_docs_with_or_without_replacement(
544
+ mongo: MongoDBResource, docs: Dict[str, list]
545
+ ):
546
+ coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
547
+ if all(coll_index_on_id_map[coll] for coll in docs.keys()):
526
548
  replace = True
527
- elif all(not coll_has_id_index[coll] for coll in docs.keys()):
549
+ elif all(not coll_index_on_id_map[coll] for coll in docs.keys()):
550
+ # FIXME: XXX: This is a hack because e.g. <https://w3id.org/nmdc/FunctionalAnnotationAggMember>
551
+ # documents should be unique with compound key (metagenome_annotation_id, gene_function_id)
552
+ # and yet this is not explicit in the schema. One potential solution is to auto-generate an `id`
553
+ # as a deterministic hash of the compound key.
554
+ #
555
+ # For now, decision is to potentially re-insert "duplicate" documents, i.e. to interpret
556
+ # lack of `id` as lack of unique document identity for de-duplication.
528
557
  replace = False # wasting time trying to upsert by `id`.
529
558
  else:
530
559
  colls_not_id_indexed = [
531
- coll for coll in docs.keys() if not coll_has_id_index[coll]
560
+ coll for coll in docs.keys() if not coll_index_on_id_map[coll]
532
561
  ]
533
- colls_id_indexed = [coll for coll in docs.keys() if coll_has_id_index[coll]]
562
+ colls_id_indexed = [coll for coll in docs.keys() if coll_index_on_id_map[coll]]
534
563
  raise Failure(
535
564
  "Simultaneous addition of non-`id`ed collections and `id`-ed collections"
536
565
  " is not supported at this time."
537
566
  f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
538
567
  )
539
568
  op_result = mongo.add_docs(docs, validate=False, replace=replace)
540
- op_patch = UpdateOperationRequest(
541
- done=True,
542
- result=mongo_add_docs_result_as_dict(op_result),
543
- metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
544
- )
545
- op_doc = client.update_operation(op_id, op_patch).json()
546
- return ["/operations/" + op_doc["id"]]
569
+ return mongo_add_docs_result_as_dict(op_result)
547
570
 
548
571
 
549
572
  @op(required_resource_keys={"mongo"})
@@ -659,7 +682,6 @@ def translate_portal_submission_to_nmdc_schema_database(
659
682
  study_category: Optional[str],
660
683
  study_doi_category: Optional[str],
661
684
  study_doi_provider: Optional[str],
662
- study_funding_sources: Optional[List[str]],
663
685
  study_pi_image_url: Optional[str],
664
686
  biosample_extras: Optional[list[dict]],
665
687
  biosample_extras_slot_mapping: Optional[list[dict]],
@@ -678,7 +700,6 @@ def translate_portal_submission_to_nmdc_schema_database(
678
700
  study_category=study_category,
679
701
  study_doi_category=study_doi_category,
680
702
  study_doi_provider=study_doi_provider,
681
- study_funding_sources=study_funding_sources,
682
703
  study_pi_image_url=study_pi_image_url,
683
704
  biosample_extras=biosample_extras,
684
705
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
@@ -973,6 +994,61 @@ def site_code_mapping() -> dict:
973
994
  )
974
995
 
975
996
 
997
+ @op(required_resource_keys={"mongo"})
998
+ def materialize_alldocs(context) -> int:
999
+ mdb = context.resources.mongo.db
1000
+ collection_names = populated_schema_collection_names_with_id_field(mdb)
1001
+
1002
+ for name in collection_names:
1003
+ assert (
1004
+ len(collection_name_to_class_names[name]) == 1
1005
+ ), f"{name} collection has class name of {collection_name_to_class_names[name]} and len {len(collection_name_to_class_names[name])}"
1006
+
1007
+ context.log.info(f"{collection_names=}")
1008
+
1009
+ # Drop any existing `alldocs` collection (e.g. from previous use of this op).
1010
+ mdb.alldocs.drop()
1011
+
1012
+ # Build alldocs
1013
+ context.log.info("constructing `alldocs` collection")
1014
+
1015
+ for collection in collection_names:
1016
+ # Calculate class_hierarchy_as_list once per collection, using the first document in list
1017
+ try:
1018
+ nmdcdb = NMDCDatabase(
1019
+ **{collection: [dissoc(mdb[collection].find_one(), "_id")]}
1020
+ )
1021
+ exemplar = getattr(nmdcdb, collection)[0]
1022
+ newdoc_type: list[str] = class_hierarchy_as_list(exemplar)
1023
+ except ValueError as e:
1024
+ context.log.info(f"Collection {collection} does not exist.")
1025
+ raise e
1026
+
1027
+ context.log.info(
1028
+ f"Found {mdb[collection].estimated_document_count()} estimated documents for {collection=}."
1029
+ )
1030
+ # For each document in this collection, replace the value of the `type` field with
1031
+ # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
1032
+ # and insert the resulting document into the `alldocs` collection.
1033
+
1034
+ inserted_many_result = mdb.alldocs.insert_many(
1035
+ [
1036
+ assoc(dissoc(doc, "type", "_id"), "type", newdoc_type)
1037
+ for doc in mdb[collection].find()
1038
+ ]
1039
+ )
1040
+ context.log.info(
1041
+ f"Inserted {len(inserted_many_result.inserted_ids)} documents for {collection=}."
1042
+ )
1043
+
1044
+ # Re-idx for `alldocs` collection
1045
+ mdb.alldocs.create_index("id", unique=True)
1046
+ context.log.info(
1047
+ f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
1048
+ )
1049
+ return mdb.alldocs.estimated_document_count()
1050
+
1051
+
976
1052
  @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
977
1053
  def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
978
1054
  nmdc_study = find_study_by_id(
@@ -1039,6 +1115,18 @@ def get_omics_processing_from_biosamples(context: OpExecutionContext, biosamples
1039
1115
  return biosample_omics_processing
1040
1116
 
1041
1117
 
1118
+ @op(required_resource_keys={"mongo"})
1119
+ def get_library_preparation_from_biosamples(
1120
+ context: OpExecutionContext, biosamples: list
1121
+ ):
1122
+ mdb = context.resources.mongo.db
1123
+ alldocs_collection = mdb["alldocs"]
1124
+ biosample_lib_prep = fetch_library_preparation_from_biosamples(
1125
+ alldocs_collection, biosamples
1126
+ )
1127
+ return biosample_lib_prep
1128
+
1129
+
1042
1130
  @op
1043
1131
  def ncbi_submission_xml_from_nmdc_study(
1044
1132
  context: OpExecutionContext,
@@ -1046,10 +1134,14 @@ def ncbi_submission_xml_from_nmdc_study(
1046
1134
  ncbi_exporter_metadata: dict,
1047
1135
  biosamples: list,
1048
1136
  omics_processing_records: list,
1049
- data_objects: list,
1137
+ data_object_records: list,
1138
+ library_preparation_records: list,
1050
1139
  ) -> str:
1051
1140
  ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
1052
1141
  ncbi_xml = ncbi_exporter.get_submission_xml(
1053
- biosamples, omics_processing_records, data_objects
1142
+ biosamples,
1143
+ omics_processing_records,
1144
+ data_object_records,
1145
+ library_preparation_records,
1054
1146
  )
1055
1147
  return ncbi_xml
@@ -42,6 +42,7 @@ from nmdc_runtime.site.graphs import (
42
42
  ingest_neon_soil_metadata,
43
43
  ingest_neon_benthic_metadata,
44
44
  ingest_neon_surface_water_metadata,
45
+ ensure_alldocs,
45
46
  nmdc_study_to_ncbi_submission_export,
46
47
  )
47
48
  from nmdc_runtime.site.resources import (
@@ -450,6 +451,7 @@ def repo():
450
451
  ensure_jobs.to_job(**preset_normal),
451
452
  apply_metadata_in.to_job(**preset_normal),
452
453
  export_study_biosamples_metadata.to_job(**preset_normal),
454
+ ensure_alldocs.to_job(**preset_normal),
453
455
  ]
454
456
  schedules = [housekeeping_weekly]
455
457
  sensors = [
@@ -537,7 +539,6 @@ def biosample_submission_ingest():
537
539
  "study_category": None,
538
540
  "study_doi_category": None,
539
541
  "study_doi_provider": None,
540
- "study_funding_sources": None,
541
542
  "study_pi_image_url": None,
542
543
  }
543
544
  },
@@ -576,7 +577,6 @@ def biosample_submission_ingest():
576
577
  "study_category": None,
577
578
  "study_doi_category": None,
578
579
  "study_doi_provider": None,
579
- "study_funding_sources": None,
580
580
  "study_pi_image_url": None,
581
581
  }
582
582
  },
@@ -404,7 +404,9 @@ class SubmissionPortalTranslator(Translator):
404
404
  description=self._get_from(
405
405
  metadata_submission, ["studyForm", "description"]
406
406
  ),
407
- funding_sources=self.study_funding_sources,
407
+ funding_sources=self._get_from(
408
+ metadata_submission, ["studyForm", "fundingSources"]
409
+ ),
408
410
  # emsl_proposal_identifier=self._get_from(
409
411
  # metadata_submission, ["multiOmicsForm", "studyNumber"]
410
412
  # ),
nmdc_runtime/site/util.py CHANGED
@@ -4,6 +4,7 @@ from subprocess import Popen, PIPE, STDOUT, CalledProcessError
4
4
 
5
5
  from pymongo.database import Database as MongoDatabase
6
6
 
7
+ from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
7
8
  from nmdc_runtime.site.resources import mongo_resource
8
9
 
9
10
  mode_test = {
@@ -34,12 +35,13 @@ def run_and_log(shell_cmd, context):
34
35
 
35
36
 
36
37
  @lru_cache
37
- def collection_indexed_on_id(mdb: MongoDatabase) -> dict:
38
- set_collection_names = [
39
- name for name in mdb.list_collection_names() if name.endswith("_set")
40
- ]
38
+ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
39
+ present_collection_names = set(mdb.list_collection_names())
41
40
  return {
42
- name: ("id_1" in mdb[name].index_information()) for name in set_collection_names
41
+ name: (
42
+ name in present_collection_names and "id_1" in mdb[name].index_information()
43
+ )
44
+ for name in get_collection_names_from_schema()
43
45
  }
44
46
 
45
47
 
nmdc_runtime/util.py CHANGED
@@ -8,6 +8,7 @@ from copy import deepcopy
8
8
  from datetime import datetime, timezone
9
9
  from functools import lru_cache
10
10
  from io import BytesIO
11
+ from itertools import chain
11
12
  from pathlib import Path
12
13
  from uuid import uuid4
13
14
  from typing import List, Optional, Set, Dict
@@ -369,13 +370,38 @@ def specialize_activity_set_docs(docs):
369
370
 
370
371
  # Define a mapping from collection name to a list of class names allowable for that collection's documents.
371
372
  collection_name_to_class_names: Dict[str, List[str]] = {
372
- collection_name: get_class_names_from_collection_spec(spec)
373
+ collection_name: list(
374
+ set(
375
+ chain.from_iterable(
376
+ nmdc_schema_view().class_descendants(cls_name)
377
+ for cls_name in get_class_names_from_collection_spec(spec)
378
+ )
379
+ )
380
+ )
373
381
  for collection_name, spec in nmdc_jsonschema["$defs"]["Database"][
374
382
  "properties"
375
383
  ].items()
376
384
  }
377
385
 
378
386
 
387
+ def class_hierarchy_as_list(obj) -> list[str]:
388
+ """
389
+ get list of inherited classes for each concrete class
390
+ """
391
+ rv = []
392
+ current_class = obj.__class__
393
+
394
+ def recurse_through_bases(cls):
395
+ if cls.__name__ == "YAMLRoot":
396
+ return rv
397
+ rv.append(cls.__name__)
398
+ for base in cls.__bases__:
399
+ recurse_through_bases(base)
400
+ return rv
401
+
402
+ return recurse_through_bases(current_class)
403
+
404
+
379
405
  @lru_cache
380
406
  def schema_collection_names_with_id_field() -> Set[str]:
381
407
  """
@@ -393,6 +419,11 @@ def schema_collection_names_with_id_field() -> Set[str]:
393
419
  return target_collection_names
394
420
 
395
421
 
422
+ def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[str]:
423
+ collection_names = sorted(schema_collection_names_with_id_field())
424
+ return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]
425
+
426
+
396
427
  def ensure_unique_id_indexes(mdb: MongoDatabase):
397
428
  """Ensure that any collections with an "id" field have an index on "id"."""
398
429
  candidate_names = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nmdc_runtime
3
- Version: 1.7.0
3
+ Version: 1.9.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -1,7 +1,8 @@
1
1
  nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
2
3
  nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
3
4
  nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- nmdc_runtime/util.py,sha256=3mHVEUdMOv73XgT6NTuzMuMCL5Gs6NJ4Mk0bkgQQaQU,19844
5
+ nmdc_runtime/util.py,sha256=Wd2GuuskyUqf1eV5mHLZws8BHAOsqnc0Qj7_4WhSvAM,20736
5
6
  nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
7
  nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
8
  nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -35,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
35
36
  nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
37
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
37
38
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- nmdc_runtime/site/graphs.py,sha256=_vCyQnICis4OQGH91i1ZwpvHYcXOG6Nfg04f5DVdy2M,12040
39
- nmdc_runtime/site/ops.py,sha256=G6X3YgSmDNxOnsMEByLUMfB0peY4o21o0_Ig3V7v6M4,35835
40
- nmdc_runtime/site/repository.py,sha256=-dOk9BEnLSrmAN6bZoIu_WnFSqriIpO0c5P76PuHW1M,37472
39
+ nmdc_runtime/site/graphs.py,sha256=jqfwhrCVUBszt9168au_DVvZBtgIfpUf1OXFiyPHI6U,12304
40
+ nmdc_runtime/site/ops.py,sha256=DchVsC0v7J3noZMhVXUZgSGrm_sC78Y9_z_Nfhuq21E,39632
41
+ nmdc_runtime/site/repository.py,sha256=ge3LW_5izCgL6x1Ios8z2Hrt--aY6LXqhGjnAjcIJkI,37422
41
42
  nmdc_runtime/site/resources.py,sha256=ZSH1yvA-li0R7Abc22_v0XLbjBYf5igETr2G01J3hnc,17557
42
- nmdc_runtime/site/util.py,sha256=6hyVPpb6ZkWEG8Nm7uQxnZ-QmuPOG9hgWvl0mUBr5JU,1303
43
+ nmdc_runtime/site/util.py,sha256=zAY0oIY7GRf63ecqWelmS27N7PVrAXVwEhtnpescBSw,1415
43
44
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
45
  nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
45
46
  nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -50,8 +51,8 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
50
51
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
51
52
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
52
53
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Z2qsaGIBvY2OdOkf8kJEZl1T_8R_YzhAlXxJ1gMQwnk,16946
54
- nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=CqrtjwzmUbZXEW8aD-KpnCV_PlXVH-Gqp309nw3vbeo,6464
54
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=KMKHZJEjTGECI2N2Hp0yDSMGrkjEC7GmlOnptaZCy2E,22297
55
+ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=jY4YJt5P7EMsy8gSOPI33K6VcEfOXaVR_zQINZOBUKU,7561
55
56
  nmdc_runtime/site/export/study_metadata.py,sha256=WRU0F1ksWfNX3k9LD91Pn2DuLA-IOpGvYPJd6DnguEs,4819
56
57
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
58
  nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
@@ -64,7 +65,7 @@ nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=e_7tXFrP0PpdhqUC
64
65
  nmdc_runtime/site/translation/neon_soil_translator.py,sha256=cJJ_QPva5G5SIT_7DjCSsqbDvgbiKGqUYrxK3nx7_Lw,37634
65
66
  nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=6LaFwBnVx6TN9v1D-G6LFrDxY0TK05AvMklx0E1tTeQ,26590
66
67
  nmdc_runtime/site/translation/neon_utils.py,sha256=mdxJVPb3zbD4DiKW3Fwgk22kjczKMwkcozvy7fwteTE,5203
67
- nmdc_runtime/site/translation/submission_portal_translator.py,sha256=KiVO1vohhrJGfwzLJOumRfyHjcbYfswBIBvkYIdFxv8,28097
68
+ nmdc_runtime/site/translation/submission_portal_translator.py,sha256=aNGIXTiJEXGC_29qWeol2C426bAt5VlY3In_YhplPU0,28169
68
69
  nmdc_runtime/site/translation/translator.py,sha256=xM9dM-nTgSWwu5HFoUVNHf8kqk9iiH4PgWdSx4OKxEk,601
69
70
  nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
70
71
  nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,9 +73,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
72
73
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
73
74
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
74
75
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
75
- nmdc_runtime-1.7.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
76
- nmdc_runtime-1.7.0.dist-info/METADATA,sha256=FnoXHNgR6o5PEe6XhqRGdqOjbIX_ry-SKY5uMtZJQXY,7302
77
- nmdc_runtime-1.7.0.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
78
- nmdc_runtime-1.7.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
79
- nmdc_runtime-1.7.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
80
- nmdc_runtime-1.7.0.dist-info/RECORD,,
76
+ nmdc_runtime-1.9.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
77
+ nmdc_runtime-1.9.0.dist-info/METADATA,sha256=6PtNVNbnAQR1l8MWaC6jtXv9YcJzoaoumZkDpvAQ7jE,7302
78
+ nmdc_runtime-1.9.0.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
79
+ nmdc_runtime-1.9.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
80
+ nmdc_runtime-1.9.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
81
+ nmdc_runtime-1.9.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.1.1)
2
+ Generator: setuptools (73.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5