nmdc-runtime 1.7.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -159,7 +159,6 @@ class NCBISubmissionXML:
159
159
  org,
160
160
  bioproject_id,
161
161
  nmdc_biosamples,
162
- nmdc_omics_processing,
163
162
  ):
164
163
  attribute_mappings, slot_range_mappings = load_mappings(
165
164
  self.nmdc_ncbi_attribute_mapping_file_url
@@ -278,22 +277,41 @@ class NCBISubmissionXML:
278
277
  biosample_data_objects: list,
279
278
  bioproject_id: str,
280
279
  org: str,
280
+ nmdc_omics_processing: list,
281
+ nmdc_biosamples: list,
281
282
  ):
283
+ bsm_id_name_dict = {
284
+ biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
285
+ }
286
+
282
287
  for entry in biosample_data_objects:
283
288
  fastq_files = []
284
289
  biosample_ids = []
290
+ omics_processing_ids = {}
291
+ instrument_name = ""
292
+ omics_type = ""
293
+ library_name = ""
285
294
 
286
295
  for biosample_id, data_objects in entry.items():
287
296
  biosample_ids.append(biosample_id)
288
297
  for data_object in data_objects:
289
298
  if "url" in data_object:
290
299
  url = urlparse(data_object["url"])
291
- file_path = os.path.join(
292
- os.path.basename(os.path.dirname(url.path)),
293
- os.path.basename(url.path),
294
- )
300
+ file_path = os.path.basename(url.path)
295
301
  fastq_files.append(file_path)
296
302
 
303
+ for omprc_dict in nmdc_omics_processing:
304
+ if biosample_id in omprc_dict:
305
+ for omprc in omprc_dict[biosample_id]:
306
+ omics_processing_ids[biosample_id] = omprc.get("id", "")
307
+ instrument_name = omprc.get("instrument_name", "")
308
+ omics_type = (
309
+ omprc.get("omics_type", {})
310
+ .get("has_raw_value", "")
311
+ .lower()
312
+ )
313
+ library_name = bsm_id_name_dict.get(biosample_id, "")
314
+
297
315
  if fastq_files:
298
316
  files_elements = [
299
317
  self.set_element(
@@ -344,29 +362,106 @@ class NCBISubmissionXML:
344
362
  )
345
363
  )
346
364
 
347
- identifier_element = self.set_element(
348
- "Identifier",
349
- children=[
365
+ sra_attributes = []
366
+ if instrument_name.lower().startswith("illumina"):
367
+ sra_attributes.append(
368
+ self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
369
+ )
370
+ if "nextseq550" in instrument_name.lower():
371
+ sra_attributes.append(
372
+ self.set_element(
373
+ "Attribute", "NextSeq 550", {"name": "instrument_model"}
374
+ )
375
+ )
376
+
377
+ if omics_type == "metagenome":
378
+ sra_attributes.append(
350
379
  self.set_element(
351
- "SPUID", bioproject_id, {"spuid_namespace": org}
380
+ "Attribute", "WGS", {"name": "library_strategy"}
352
381
  )
353
- ],
354
- )
382
+ )
383
+ sra_attributes.append(
384
+ self.set_element(
385
+ "Attribute", "METAGENOMIC", {"name": "library_source"}
386
+ )
387
+ )
388
+ sra_attributes.append(
389
+ self.set_element(
390
+ "Attribute", "RANDOM", {"name": "library_selection"}
391
+ )
392
+ )
355
393
 
356
- action = self.set_element(
357
- "Action",
358
- children=[
394
+ if omics_type == "metatranscriptome":
395
+ sra_attributes.append(
359
396
  self.set_element(
360
- "AddFiles",
361
- attrib={"target_db": "SRA"},
362
- children=files_elements
363
- + attribute_elements
364
- + [identifier_element],
365
- ),
366
- ],
397
+ "Attribute",
398
+ "METATRANSCRIPTOMIC",
399
+ {"name": "library_source"},
400
+ )
401
+ )
402
+
403
+ has_paired_reads = any(
404
+ data_object.get("data_object_type", "").lower()
405
+ == "metagenome raw reads"
406
+ for data_object in data_objects
407
+ ) or (
408
+ any(
409
+ data_object.get("data_object_type", "").lower()
410
+ == "metagenome raw read 1"
411
+ for data_object in data_objects
412
+ )
413
+ and any(
414
+ data_object.get("data_object_type", "").lower()
415
+ == "metagenome raw read 2"
416
+ for data_object in data_objects
417
+ )
367
418
  )
368
419
 
369
- self.root.append(action)
420
+ if has_paired_reads:
421
+ sra_attributes.append(
422
+ self.set_element(
423
+ "Attribute", "paired", {"name": "library_layout"}
424
+ )
425
+ )
426
+ else:
427
+ sra_attributes.append(
428
+ self.set_element(
429
+ "Attribute", "single", {"name": "library_layout"}
430
+ )
431
+ )
432
+
433
+ if library_name:
434
+ sra_attributes.append(
435
+ self.set_element(
436
+ "Attribute", library_name, {"name": "library_name"}
437
+ )
438
+ )
439
+
440
+ for biosample_id, omics_processing_id in omics_processing_ids.items():
441
+ identifier_element = self.set_element(
442
+ "Identifier",
443
+ children=[
444
+ self.set_element(
445
+ "SPUID", omics_processing_id, {"spuid_namespace": org}
446
+ )
447
+ ],
448
+ )
449
+
450
+ action = self.set_element(
451
+ "Action",
452
+ children=[
453
+ self.set_element(
454
+ "AddFiles",
455
+ attrib={"target_db": "SRA"},
456
+ children=files_elements
457
+ + attribute_elements
458
+ + sra_attributes
459
+ + [identifier_element],
460
+ ),
461
+ ],
462
+ )
463
+
464
+ self.root.append(action)
370
465
 
371
466
  def get_submission_xml(
372
467
  self,
@@ -407,13 +502,14 @@ class NCBISubmissionXML:
407
502
  org=self.ncbi_submission_metadata.get("organization", ""),
408
503
  bioproject_id=ncbi_project_id,
409
504
  nmdc_biosamples=biosamples_list,
410
- nmdc_omics_processing=biosample_omics_processing_list,
411
505
  )
412
506
 
413
507
  self.set_fastq(
414
508
  biosample_data_objects=biosample_data_objects_list,
415
509
  bioproject_id=ncbi_project_id,
416
510
  org=self.ncbi_submission_metadata.get("organization", ""),
511
+ nmdc_omics_processing=biosample_omics_processing_list,
512
+ nmdc_biosamples=biosamples_list,
417
513
  )
418
514
 
419
515
  rough_string = ET.tostring(self.root, "unicode")
@@ -48,6 +48,7 @@ from nmdc_runtime.site.ops import (
48
48
  get_neon_pipeline_inputs,
49
49
  get_df_from_url,
50
50
  site_code_mapping,
51
+ materialize_alldocs,
51
52
  get_ncbi_export_pipeline_study,
52
53
  get_data_objects_from_biosamples,
53
54
  get_omics_processing_from_biosamples,
@@ -98,6 +99,11 @@ def housekeeping():
98
99
  delete_operations(list_operations(filter_ops_undone_expired()))
99
100
 
100
101
 
102
+ @graph
103
+ def ensure_alldocs():
104
+ materialize_alldocs()
105
+
106
+
101
107
  @graph
102
108
  def ensure_jobs():
103
109
  jobs = construct_jobs()
nmdc_runtime/site/ops.py CHANGED
@@ -13,6 +13,7 @@ from zipfile import ZipFile
13
13
  import pandas as pd
14
14
  import requests
15
15
 
16
+
16
17
  from bson import ObjectId, json_util
17
18
  from dagster import (
18
19
  Any,
@@ -92,8 +93,12 @@ from nmdc_runtime.util import (
92
93
  put_object,
93
94
  validate_json,
94
95
  specialize_activity_set_docs,
96
+ collection_name_to_class_names,
97
+ class_hierarchy_as_list,
98
+ populated_schema_collection_names_with_id_field,
95
99
  )
96
100
  from nmdc_schema import nmdc
101
+ from nmdc_schema.nmdc import Database as NMDCDatabase
97
102
  from pydantic import BaseModel
98
103
  from pymongo.database import Database as MongoDatabase
99
104
  from starlette import status
@@ -973,6 +978,61 @@ def site_code_mapping() -> dict:
973
978
  )
974
979
 
975
980
 
981
+ @op(required_resource_keys={"mongo"})
982
+ def materialize_alldocs(context) -> int:
983
+ mdb = context.resources.mongo.db
984
+ collection_names = populated_schema_collection_names_with_id_field(mdb)
985
+
986
+ for name in collection_names:
987
+ assert (
988
+ len(collection_name_to_class_names[name]) == 1
989
+ ), f"{name} collection has class name of {collection_name_to_class_names[name]} and len {len(collection_name_to_class_names[name])}"
990
+
991
+ context.log.info(f"{collection_names=}")
992
+
993
+ # Drop any existing `alldocs` collection (e.g. from previous use of this op).
994
+ mdb.alldocs.drop()
995
+
996
+ # Build alldocs
997
+ context.log.info("constructing `alldocs` collection")
998
+
999
+ for collection in collection_names:
1000
+ # Calculate class_hierarchy_as_list once per collection, using the first document in list
1001
+ try:
1002
+ nmdcdb = NMDCDatabase(
1003
+ **{collection: [dissoc(mdb[collection].find_one(), "_id")]}
1004
+ )
1005
+ exemplar = getattr(nmdcdb, collection)[0]
1006
+ newdoc_type: list[str] = class_hierarchy_as_list(exemplar)
1007
+ except ValueError as e:
1008
+ context.log.info(f"Collection {collection} does not exist.")
1009
+ raise e
1010
+
1011
+ context.log.info(
1012
+ f"Found {mdb[collection].estimated_document_count()} estimated documents for {collection=}."
1013
+ )
1014
+ # For each document in this collection, replace the value of the `type` field with
1015
+ # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
1016
+ # and insert the resulting document into the `alldocs` collection.
1017
+
1018
+ inserted_many_result = mdb.alldocs.insert_many(
1019
+ [
1020
+ assoc(dissoc(doc, "type", "_id"), "type", newdoc_type)
1021
+ for doc in mdb[collection].find()
1022
+ ]
1023
+ )
1024
+ context.log.info(
1025
+ f"Inserted {len(inserted_many_result.inserted_ids)} documents for {collection=}."
1026
+ )
1027
+
1028
+ # Re-idx for `alldocs` collection
1029
+ mdb.alldocs.create_index("id", unique=True)
1030
+ context.log.info(
1031
+ f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
1032
+ )
1033
+ return mdb.alldocs.estimated_document_count()
1034
+
1035
+
976
1036
  @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
977
1037
  def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
978
1038
  nmdc_study = find_study_by_id(
@@ -42,6 +42,7 @@ from nmdc_runtime.site.graphs import (
42
42
  ingest_neon_soil_metadata,
43
43
  ingest_neon_benthic_metadata,
44
44
  ingest_neon_surface_water_metadata,
45
+ ensure_alldocs,
45
46
  nmdc_study_to_ncbi_submission_export,
46
47
  )
47
48
  from nmdc_runtime.site.resources import (
@@ -450,6 +451,7 @@ def repo():
450
451
  ensure_jobs.to_job(**preset_normal),
451
452
  apply_metadata_in.to_job(**preset_normal),
452
453
  export_study_biosamples_metadata.to_job(**preset_normal),
454
+ ensure_alldocs.to_job(**preset_normal),
453
455
  ]
454
456
  schedules = [housekeeping_weekly]
455
457
  sensors = [
nmdc_runtime/util.py CHANGED
@@ -376,6 +376,24 @@ collection_name_to_class_names: Dict[str, List[str]] = {
376
376
  }
377
377
 
378
378
 
379
+ def class_hierarchy_as_list(obj) -> list[str]:
380
+ """
381
+ get list of inherited classes for each concrete class
382
+ """
383
+ rv = []
384
+ current_class = obj.__class__
385
+
386
+ def recurse_through_bases(cls):
387
+ if cls.__name__ == "YAMLRoot":
388
+ return rv
389
+ rv.append(cls.__name__)
390
+ for base in cls.__bases__:
391
+ recurse_through_bases(base)
392
+ return rv
393
+
394
+ return recurse_through_bases(current_class)
395
+
396
+
379
397
  @lru_cache
380
398
  def schema_collection_names_with_id_field() -> Set[str]:
381
399
  """
@@ -393,6 +411,11 @@ def schema_collection_names_with_id_field() -> Set[str]:
393
411
  return target_collection_names
394
412
 
395
413
 
414
+ def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[str]:
415
+ collection_names = sorted(schema_collection_names_with_id_field())
416
+ return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]
417
+
418
+
396
419
  def ensure_unique_id_indexes(mdb: MongoDatabase):
397
420
  """Ensure that any collections with an "id" field have an index on "id"."""
398
421
  candidate_names = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nmdc_runtime
3
- Version: 1.7.0
3
+ Version: 1.8.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -1,7 +1,7 @@
1
1
  nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
3
3
  nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- nmdc_runtime/util.py,sha256=3mHVEUdMOv73XgT6NTuzMuMCL5Gs6NJ4Mk0bkgQQaQU,19844
4
+ nmdc_runtime/util.py,sha256=nfj1MjZzVaxs9pKrHo6A98yGAzL-jHQ0zTGs_sOkBnM,20531
5
5
  nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -35,9 +35,9 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
35
35
  nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
37
37
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- nmdc_runtime/site/graphs.py,sha256=_vCyQnICis4OQGH91i1ZwpvHYcXOG6Nfg04f5DVdy2M,12040
39
- nmdc_runtime/site/ops.py,sha256=G6X3YgSmDNxOnsMEByLUMfB0peY4o21o0_Ig3V7v6M4,35835
40
- nmdc_runtime/site/repository.py,sha256=-dOk9BEnLSrmAN6bZoIu_WnFSqriIpO0c5P76PuHW1M,37472
38
+ nmdc_runtime/site/graphs.py,sha256=QdmNvdtDLCgpJyKviLUj-IIF1gPS_vYzl1Kzv2mSF4g,12122
39
+ nmdc_runtime/site/ops.py,sha256=btdgcGBwNOFnVCzAa-vO4Gs1lMxgnjcRFd8B28X0who,38222
40
+ nmdc_runtime/site/repository.py,sha256=xTHAfokzbZVqlRFG65VuHxTfZfhyKZskOaCSGyrW_hw,37540
41
41
  nmdc_runtime/site/resources.py,sha256=ZSH1yvA-li0R7Abc22_v0XLbjBYf5igETr2G01J3hnc,17557
42
42
  nmdc_runtime/site/util.py,sha256=6hyVPpb6ZkWEG8Nm7uQxnZ-QmuPOG9hgWvl0mUBr5JU,1303
43
43
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,7 +50,7 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
50
50
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
51
51
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
52
52
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Z2qsaGIBvY2OdOkf8kJEZl1T_8R_YzhAlXxJ1gMQwnk,16946
53
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=-GflgZO_Q4Y2rm53QIkI7vYY6pWwCf_l7tolGgTXiBg,21026
54
54
  nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=CqrtjwzmUbZXEW8aD-KpnCV_PlXVH-Gqp309nw3vbeo,6464
55
55
  nmdc_runtime/site/export/study_metadata.py,sha256=WRU0F1ksWfNX3k9LD91Pn2DuLA-IOpGvYPJd6DnguEs,4819
56
56
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,9 +72,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
72
72
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
73
73
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
74
74
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
75
- nmdc_runtime-1.7.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
76
- nmdc_runtime-1.7.0.dist-info/METADATA,sha256=FnoXHNgR6o5PEe6XhqRGdqOjbIX_ry-SKY5uMtZJQXY,7302
77
- nmdc_runtime-1.7.0.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
78
- nmdc_runtime-1.7.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
79
- nmdc_runtime-1.7.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
80
- nmdc_runtime-1.7.0.dist-info/RECORD,,
75
+ nmdc_runtime-1.8.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
76
+ nmdc_runtime-1.8.0.dist-info/METADATA,sha256=lBQzzEEXtwobBObmYmDogAdFKQMLvSJn3wmjG8lHQ5I,7302
77
+ nmdc_runtime-1.8.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
78
+ nmdc_runtime-1.8.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
79
+ nmdc_runtime-1.8.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
80
+ nmdc_runtime-1.8.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.1.1)
2
+ Generator: setuptools (72.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5