nmdc-runtime 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -4,7 +4,7 @@ import datetime
4
4
  import xml.etree.ElementTree as ET
5
5
  import xml.dom.minidom
6
6
 
7
- from typing import Any
7
+ from typing import Any, List, Union
8
8
  from urllib.parse import urlparse
9
9
  from nmdc_runtime.site.export.ncbi_xml_utils import (
10
10
  get_instruments,
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
366
366
  )
367
367
  # Currently, we are making the assumption that only one instrument
368
368
  # is used to sequence a Biosample
369
- instrument_id = ntseq.get("instrument_used", "")[0]
369
+ instrument_used: List[str] = ntseq.get(
370
+ "instrument_used", []
371
+ )
372
+ if not instrument_used:
373
+ instrument_id = None
374
+ else:
375
+ instrument_id = instrument_used[0]
376
+
370
377
  instrument = all_instruments.get(instrument_id, {})
371
378
  instrument_vendor = instrument.get("vendor", "")
372
379
  instrument_model = instrument.get("model", "")
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
448
455
  "Attribute", "NextSeq 550", {"name": "instrument_model"}
449
456
  )
450
457
  )
458
+ elif instrument_model == "novaseq_6000":
459
+ sra_attributes.append(
460
+ self.set_element(
461
+ "Attribute",
462
+ "NovaSeq 6000",
463
+ {"name": "instrument_model"},
464
+ )
465
+ )
466
+ elif instrument_model == "hiseq":
467
+ sra_attributes.append(
468
+ self.set_element(
469
+ "Attribute", "HiSeq", {"name": "instrument_model"}
470
+ )
471
+ )
451
472
 
452
473
  if analyte_category == "metagenome":
453
474
  sra_attributes.append(
@@ -1,6 +1,10 @@
1
1
  from io import BytesIO, StringIO
2
+ from typing import Any, Dict, List, Union
3
+
4
+ from nmdc_runtime.api.endpoints.util import strip_oid
2
5
  from nmdc_runtime.minter.config import typecodes
3
6
  from lxml import etree
7
+ from pymongo.collection import Collection
4
8
 
5
9
  import csv
6
10
  import requests
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
45
49
  raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
46
50
 
47
51
 
48
- def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
52
+ def fetch_data_objects_from_biosamples(
53
+ all_docs_collection: Collection,
54
+ data_object_set: Collection,
55
+ biosamples_list: List[Dict[str, Any]],
56
+ ) -> List[Dict[str, Dict[str, Any]]]:
57
+ """This method fetches the data objects that are "associated" (derived from/products of)
58
+ with their respective biosamples by iterating over the alldocs collection recursively.
59
+ The methods returns a dictionary with biosample ids as keys and the associated list of
60
+ data objects as values.
61
+
62
+ :param all_docs_collection: reference to the alldocs collection
63
+ :param data_object_set: reference to the data_object_set collection
64
+ :param biosamples_list: list of biosamples as JSON documents
65
+ :return: list of dictionaries with biosample ids as keys and associated data objects as values
66
+ """
67
+ biosample_data_objects = []
68
+
69
+ def collect_data_objects(doc_ids, collected_objects, unique_ids):
70
+ for doc_id in doc_ids:
71
+ if (
72
+ get_classname_from_typecode(doc_id) == "DataObject"
73
+ and doc_id not in unique_ids
74
+ ):
75
+ data_obj = data_object_set.find_one({"id": doc_id})
76
+ if data_obj:
77
+ collected_objects.append(strip_oid(data_obj))
78
+ unique_ids.add(doc_id)
79
+
49
80
  biosample_data_objects = []
50
81
 
51
82
  for biosample in biosamples_list:
52
83
  current_ids = [biosample["id"]]
53
84
  collected_data_objects = []
85
+ unique_ids = set()
54
86
 
55
87
  while current_ids:
56
88
  new_current_ids = []
57
89
  for current_id in current_ids:
58
- query = {"has_input": current_id}
59
- document = all_docs_collection.find_one(query)
90
+ for doc in all_docs_collection.find({"has_input": current_id}):
91
+ has_output = doc.get("has_output", [])
60
92
 
61
- if not document:
62
- continue
63
-
64
- has_output = document.get("has_output")
65
- if not has_output:
66
- continue
67
-
68
- for output_id in has_output:
69
- if get_classname_from_typecode(output_id) == "DataObject":
70
- data_object_doc = all_docs_collection.find_one(
71
- {"id": output_id}
72
- )
73
- if data_object_doc:
74
- collected_data_objects.append(data_object_doc)
75
- else:
76
- new_current_ids.append(output_id)
93
+ collect_data_objects(has_output, collected_data_objects, unique_ids)
94
+ new_current_ids.extend(
95
+ op
96
+ for op in has_output
97
+ if get_classname_from_typecode(op) != "DataObject"
98
+ )
77
99
 
78
100
  current_ids = new_current_ids
79
101
 
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
83
105
  return biosample_data_objects
84
106
 
85
107
 
86
- def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
87
- biosample_data_objects = []
108
+ def fetch_nucleotide_sequencing_from_biosamples(
109
+ all_docs_collection: Collection,
110
+ data_generation_set: Collection,
111
+ biosamples_list: List[Dict[str, Any]],
112
+ ) -> List[Dict[str, Dict[str, Any]]]:
113
+ """This method fetches the nucleotide sequencing process records that create data objects
114
+ for biosamples by iterating over the alldocs collection recursively.
115
+
116
+ :param all_docs_collection: reference to the alldocs collection
117
+ :param data_generation_set: reference to the data_generation_set collection
118
+ :param biosamples_list: list of biosamples as JSON documents
119
+ :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
120
+ process objects as values
121
+ """
122
+ biosample_ntseq_objects = []
88
123
 
89
124
  for biosample in biosamples_list:
90
125
  current_ids = [biosample["id"]]
91
- collected_data_objects = []
126
+ collected_ntseq_objects = []
92
127
 
93
128
  while current_ids:
94
129
  new_current_ids = []
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
105
140
 
106
141
  for output_id in has_output:
107
142
  if get_classname_from_typecode(output_id) == "DataObject":
108
- nucleotide_sequencing_doc = all_docs_collection.find_one(
143
+ nucleotide_sequencing_doc = data_generation_set.find_one(
109
144
  {"id": document["id"]}
110
145
  )
111
146
  if nucleotide_sequencing_doc:
112
- collected_data_objects.append(nucleotide_sequencing_doc)
147
+ collected_ntseq_objects.append(
148
+ strip_oid(nucleotide_sequencing_doc)
149
+ )
113
150
  else:
114
151
  new_current_ids.append(output_id)
115
152
 
116
153
  current_ids = new_current_ids
117
154
 
118
- if collected_data_objects:
119
- biosample_data_objects.append({biosample["id"]: collected_data_objects})
155
+ if collected_ntseq_objects:
156
+ biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
157
+
158
+ return biosample_ntseq_objects
120
159
 
121
- return biosample_data_objects
122
160
 
161
+ def fetch_library_preparation_from_biosamples(
162
+ all_docs_collection: Collection,
163
+ material_processing_set: Collection,
164
+ biosamples_list: List[Dict[str, Any]],
165
+ ) -> List[Dict[str, Dict[str, Any]]]:
166
+ """This method fetches the library preparation process records that create processed samples,
167
+ which are further fed/inputted into (by `has_input` slot) a nucleotide sequencing process
168
+ for biosamples by iterating over the alldocs collection recursively.
123
169
 
124
- def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
170
+ :param all_docs_collection: reference to the alldocs collection
171
+ :param material_processing_set: reference to the material_processing_set collection
172
+ :param biosamples_list: list of biosamples as JSON documents
173
+ :return: list of dictionaries with biosample ids as keys and associated library preparation process
174
+ objects as values
175
+ """
125
176
  biosample_lib_prep = []
126
177
 
127
178
  for biosample in biosamples_list:
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
144
195
  "has_input": output_id,
145
196
  "type": {"$in": ["LibraryPreparation"]},
146
197
  }
147
- lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
198
+ lib_prep_doc = material_processing_set.find_one(lib_prep_query)
148
199
 
149
200
  if lib_prep_doc:
150
- biosample_lib_prep.append({biosample_id: lib_prep_doc})
201
+ biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
151
202
  break # Stop at the first document that meets the criteria
152
203
 
153
204
  return biosample_lib_prep
nmdc_runtime/site/ops.py CHANGED
@@ -1188,8 +1188,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
1188
1188
  def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
1189
1189
  mdb = context.resources.mongo.db
1190
1190
  alldocs_collection = mdb["alldocs"]
1191
+ data_object_set = mdb["data_object_set"]
1191
1192
  biosample_data_objects = fetch_data_objects_from_biosamples(
1192
- alldocs_collection, biosamples
1193
+ alldocs_collection, data_object_set, biosamples
1193
1194
  )
1194
1195
  return biosample_data_objects
1195
1196
 
@@ -1200,8 +1201,9 @@ def get_nucleotide_sequencing_from_biosamples(
1200
1201
  ):
1201
1202
  mdb = context.resources.mongo.db
1202
1203
  alldocs_collection = mdb["alldocs"]
1204
+ data_generation_set = mdb["data_generation_set"]
1203
1205
  biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
1204
- alldocs_collection, biosamples
1206
+ alldocs_collection, data_generation_set, biosamples
1205
1207
  )
1206
1208
  return biosample_omics_processing
1207
1209
 
@@ -1212,8 +1214,9 @@ def get_library_preparation_from_biosamples(
1212
1214
  ):
1213
1215
  mdb = context.resources.mongo.db
1214
1216
  alldocs_collection = mdb["alldocs"]
1217
+ material_processing_set = mdb["material_processing_set"]
1215
1218
  biosample_lib_prep = fetch_library_preparation_from_biosamples(
1216
- alldocs_collection, biosamples
1219
+ alldocs_collection, material_processing_set, biosamples
1217
1220
  )
1218
1221
  return biosample_lib_prep
1219
1222
 
@@ -652,7 +652,7 @@ def biosample_submission_ingest():
652
652
  "inputs": {
653
653
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
654
654
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
655
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
655
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
656
656
  }
657
657
  },
658
658
  },
@@ -694,7 +694,7 @@ def biosample_submission_ingest():
694
694
  "inputs": {
695
695
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
696
696
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
697
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
697
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
698
698
  }
699
699
  },
700
700
  },
@@ -737,7 +737,7 @@ def biosample_submission_ingest():
737
737
  "inputs": {
738
738
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
739
739
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
740
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
740
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
741
741
  }
742
742
  },
743
743
  "get_neon_pipeline_benthic_data_product": {
@@ -779,7 +779,7 @@ def biosample_submission_ingest():
779
779
  "inputs": {
780
780
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
781
781
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
782
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
782
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
783
783
  }
784
784
  },
785
785
  },
@@ -822,14 +822,14 @@ def biosample_submission_ingest():
822
822
  "inputs": {
823
823
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
824
824
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
825
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
825
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
826
826
  }
827
827
  },
828
828
  "get_neon_pipeline_surface_water_data_product": {
829
829
  "config": {
830
830
  "surface_water_data_product": {
831
831
  "product_id": "DP1.20281.001",
832
- "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
832
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
833
833
  }
834
834
  }
835
835
  },
@@ -856,7 +856,7 @@ def biosample_submission_ingest():
856
856
  "config": {
857
857
  "surface_water_data_product": {
858
858
  "product_id": "DP1.20281.001",
859
- "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
859
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
860
860
  }
861
861
  }
862
862
  },
@@ -864,7 +864,7 @@ def biosample_submission_ingest():
864
864
  "inputs": {
865
865
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
866
866
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
867
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
867
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
868
868
  }
869
869
  },
870
870
  },
@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
71
71
  neon_amb_data_tables = (
72
72
  "mms_swMetagenomeSequencing",
73
73
  "mms_swMetagenomeDnaExtraction",
74
+ "mms_swRawDataFiles",
74
75
  "amc_fieldGenetic",
75
76
  "amc_fieldSuperParent",
76
77
  )
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
88
89
  if_exists="replace",
89
90
  index=False,
90
91
  )
92
+ surface_water_data["mms_swRawDataFiles"].to_sql(
93
+ "mms_swRawDataFiles", self.conn, if_exists="replace", index=False
94
+ )
91
95
  surface_water_data["amc_fieldGenetic"].to_sql(
92
96
  "amc_fieldGenetic", self.conn, if_exists="replace", index=False
93
97
  )
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
103
107
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
104
108
  )
105
109
 
106
- self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
107
- self.neon_raw_data_file_mappings_df.to_sql(
108
- "neonRawDataFile", self.conn, if_exists="replace", index=False
109
- )
110
+ self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
110
111
 
111
112
  self.site_code_mapping = site_code_mapping
112
113
 
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
371
372
  )
372
373
 
373
374
  def _translate_data_object(
374
- self, do_id: str, url: str, do_type: str, checksum: str
375
+ self, do_id: str, url: str, do_type: str, manifest_id: str
375
376
  ) -> nmdc.DataObject:
376
377
  """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
377
378
  object mainly contains information about the sequencing file that was generated as
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
395
396
  url=url,
396
397
  description=f"sequencing results for {basename}",
397
398
  type="nmdc:DataObject",
398
- md5_checksum=checksum,
399
399
  data_object_type=do_type,
400
+ in_manifest=manifest_id,
401
+ )
402
+
403
+ def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
404
+ return nmdc.Manifest(
405
+ id=manifest_id,
406
+ manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
407
+ type="nmdc:Manifest",
400
408
  )
401
409
 
402
410
  def get_database(self):
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
477
485
  """
478
486
  surface_water_samples = pd.read_sql_query(query, self.conn)
479
487
 
488
+ # --------------------------------------------------
489
+ # Create mappings for minted NMDC IDs
490
+ # --------------------------------------------------
480
491
  neon_biosample_ids = surface_water_samples["parentSampleID"]
481
492
  nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
482
493
  neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
511
522
  zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
512
523
  )
513
524
 
514
- neon_omprc_ids = surface_water_samples["parentSampleID"]
515
- nmdc_omprc_ids = self._id_minter(
516
- "nmdc:NucleotideSequencing", len(neon_omprc_ids)
517
- )
518
- neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
519
-
520
- neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
521
- neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
522
- nmdc_data_object_ids = self._id_minter(
523
- "nmdc:DataObject", len(neon_raw_file_paths)
524
- )
525
- neon_to_nmdc_data_object_ids = dict(
526
- zip(neon_raw_file_paths, nmdc_data_object_ids)
527
- )
528
-
525
+ # --------------------------------------------------
526
+ # STEP 1: Insert Biosamples
527
+ # --------------------------------------------------
529
528
  for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
530
529
  biosample_row = surface_water_samples[
531
530
  surface_water_samples["parentSampleID"] == neon_id
532
531
  ]
532
+ # database.biosample_set.append(
533
+ # self._translate_biosample(neon_id, nmdc_id, biosample_row)
534
+ # )
533
535
 
534
- database.biosample_set.append(
535
- self._translate_biosample(neon_id, nmdc_id, biosample_row)
536
- )
537
-
536
+ # --------------------------------------------------
537
+ # STEP 2: Insert Extraction Processes
538
+ # --------------------------------------------------
538
539
  for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
539
540
  extraction_row = surface_water_samples[
540
541
  surface_water_samples["parentSampleID"] == neon_id
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
557
558
  extraction_row, "genomicsSampleID"
558
559
  )
559
560
 
561
+ # Each Extraction process output => ProcessedSample
560
562
  database.processed_sample_set.append(
561
563
  self._translate_processed_sample(
562
564
  processed_sample_id,
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
564
566
  )
565
567
  )
566
568
 
567
- query = """
568
- SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
569
- FROM neonRawDataFile
570
- GROUP BY dnaSampleID
571
- """
572
- neon_raw_data_files = pd.read_sql_query(query, self.conn)
573
- neon_raw_data_files_dict = (
574
- neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
575
- .str.split("|")
576
- .to_dict()
577
- )
578
- filtered_neon_raw_data_files_dict = {
579
- key: value
580
- for key, value in neon_raw_data_files_dict.items()
581
- if len(value) <= 2
582
- }
583
-
569
+ # --------------------------------------------------
570
+ # STEP 3: Insert LibraryPreparation Processes
571
+ # --------------------------------------------------
584
572
  for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
585
573
  lib_prep_row = surface_water_samples[
586
574
  surface_water_samples["parentSampleID"] == neon_id
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
601
589
 
602
590
  dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
603
591
 
592
+ # Each LibraryPreparation process output => ProcessedSample
604
593
  database.processed_sample_set.append(
605
594
  self._translate_processed_sample(
606
595
  processed_sample_id,
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
608
597
  )
609
598
  )
610
599
 
611
- has_output = None
612
- has_output_do_ids = []
613
-
614
- if dna_sample_id in filtered_neon_raw_data_files_dict:
615
- has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
616
- for item in has_output:
617
- if item in neon_to_nmdc_data_object_ids:
618
- has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
619
-
620
- checksum = None
621
- do_type = None
622
-
623
- checksum = neon_raw_data_file_mappings_df[
624
- neon_raw_data_file_mappings_df["rawDataFilePath"] == item
625
- ]["checkSum"].values[0]
626
- if "_R1.fastq.gz" in item:
627
- do_type = "Metagenome Raw Read 1"
628
- elif "_R2.fastq.gz" in item:
629
- do_type = "Metagenome Raw Read 2"
630
-
631
- database.data_object_set.append(
632
- self._translate_data_object(
633
- neon_to_nmdc_data_object_ids.get(item),
634
- item,
635
- do_type,
636
- checksum,
637
- )
638
- )
639
-
640
- database.data_generation_set.append(
641
- self._translate_nucleotide_sequencing(
642
- neon_to_nmdc_omprc_ids.get(neon_id),
643
- processed_sample_id,
644
- has_output_do_ids,
645
- lib_prep_row,
646
- )
600
+ # --------------------------------------------------
601
+ # STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
602
+ # and insert DataObjects + DataGeneration processes
603
+ # --------------------------------------------------
604
+ raw_query = """
605
+ SELECT dnaSampleID, sequencerRunID, rawDataFilePath
606
+ FROM mms_swRawDataFiles
607
+ """
608
+ neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
609
+
610
+ for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
611
+ # 1) Pull out the row that corresponds to this parentSampleID
612
+ lib_prep_row = surface_water_samples[
613
+ surface_water_samples["parentSampleID"] == neon_id
614
+ ]
615
+
616
+ # 2) Grab the dnaSampleID from that row
617
+ dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
618
+ if not dna_sample_id:
619
+ # No dnaSampleID => skip
620
+ continue
621
+
622
+ # 3) Find all raw files for that dnaSampleID
623
+ dna_files = neon_raw_data_files_df[
624
+ neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
625
+ ]
626
+ if dna_files.empty:
627
+ # No raw files => skip
628
+ continue
629
+
630
+ # -----------------------------------------
631
+ # LOOKUP DICT: get "has_input" for this neon_id
632
+ # -----------------------------------------
633
+ has_input_value = self.samp_procsm_dict.get(neon_id)
634
+ # If some neon_id isn't in the dictionary, handle it as needed
635
+ if not has_input_value:
636
+ # Could skip, or raise an error, or set a default
637
+ continue
638
+
639
+ # -------------------------------------------
640
+ # 4) CREATE A MANIFEST IF MULTIPLE RAW FILES
641
+ # for this row's dnaSampleID
642
+ # -------------------------------------------
643
+ manifest_id = None
644
+ if len(dna_files) > 2:
645
+ # For each row that references a dnaSampleID with multiple raw files,
646
+ # mint exactly one new manifest record
647
+ manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
648
+ new_manifest = self._translate_manifest(manifest_id)
649
+ # Add to the database
650
+ database.manifest_set.append(new_manifest)
651
+
652
+ # -------------------------------------------
653
+ # 5) NOW GROUP FILES BY sequencerRunID
654
+ # => one data_generation record per run
655
+ # -------------------------------------------
656
+ lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
657
+ neon_id
658
+ )
659
+ if not lib_prep_processed_sample_id:
660
+ # If we don't have a ProcessedSample for some reason, skip
661
+ continue
662
+
663
+ for run_id, group_df in dna_files.groupby("sequencerRunID"):
664
+ # a) Mint new data_generation (NucleotideSequencing) ID for this run
665
+ data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
666
+
667
+ # b) Create DataObjects for each raw file in this run
668
+ data_object_ids = []
669
+ for raw_fp in group_df["rawDataFilePath"]:
670
+ do_id = self._id_minter("nmdc:DataObject", 1)[0]
671
+
672
+ # Distinguish read type
673
+ do_type = None
674
+ if "_R1.fastq.gz" in raw_fp:
675
+ do_type = "Metagenome Raw Read 1"
676
+ elif "_R2.fastq.gz" in raw_fp:
677
+ do_type = "Metagenome Raw Read 2"
678
+
679
+ # Create the DataObject
680
+ data_obj = self._translate_data_object(
681
+ do_id=do_id,
682
+ url=raw_fp,
683
+ do_type=do_type,
684
+ manifest_id=manifest_id, # link to the new Manifest if it exists
685
+ )
686
+ database.data_object_set.append(data_obj)
687
+ data_object_ids.append(do_id)
688
+
689
+ # c) Finally, create the data generation record for this run
690
+ database.data_generation_set.append(
691
+ self._translate_nucleotide_sequencing(
692
+ nucleotide_sequencing_id=data_generation_id,
693
+ processed_sample_id=has_input_value,
694
+ raw_data_file_data=data_object_ids,
695
+ nucleotide_sequencing_row=lib_prep_row,
647
696
  )
697
+ )
648
698
 
649
699
  return database
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: nmdc_runtime
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -37,7 +37,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
37
37
  * [nmdc-server](https://github.com/microbiomedata/nmdc-server)
38
38
  houses code specific to the data portal -- its database, back-end API, and front-end application.
39
39
 
40
- * [workflow_documentation](https://nmdc-workflow-documentation.readthedocs.io/en/latest/index.html)
40
+ * [workflow_documentation](https://docs.microbiomedata.org/workflows/)
41
41
  references workflow code spread across several repositories, that take source data and produce computed data.
42
42
 
43
43
  * This repo (nmdc-runtime)
@@ -37,8 +37,8 @@ nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
37
37
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
38
38
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
39
  nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
40
- nmdc_runtime/site/ops.py,sha256=OrTQLSBToih-kI754RtVqjUMRojgYJQmb7B_VRjZWtg,46223
41
- nmdc_runtime/site/repository.py,sha256=b3UVQznelU8wDOfuc9_vE_eqFGOoFRiHtQJJH7or73E,43875
40
+ nmdc_runtime/site/ops.py,sha256=p4F5SrDbFdKOrAHu1TUhWQA33QB7hdoQmCCuU-00Eqo,46445
41
+ nmdc_runtime/site/repository.py,sha256=pfx7WAVgdNaPhtfF2pak-tllqPMf4-yUeOXSpr4hu30,43861
42
42
  nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
43
43
  nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
44
44
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -51,8 +51,8 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
51
51
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
52
52
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
53
53
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Vb4rNP3uhnGlHqrwUGgA2DzpOotCf3S8G4sIJml7gl4,25287
55
- nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZynO1yPSLX_aHs,8390
54
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
55
+ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
56
56
  nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
57
57
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
@@ -65,7 +65,7 @@ nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-N
65
65
  nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
66
66
  nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
67
67
  nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
68
- nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=MQgjIfWPgoRe-bhzyfqHSe2mZwFsjcwjdT8tNqpIhlc,27729
68
+ nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
69
69
  nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
70
70
  nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
71
71
  nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
75
75
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
76
76
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
77
77
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
78
- nmdc_runtime-2.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
79
- nmdc_runtime-2.3.0.dist-info/METADATA,sha256=BEManThNKOEkfS9woYYiwm1ya6BENBC6vXE6b7L_z2E,7430
80
- nmdc_runtime-2.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
81
- nmdc_runtime-2.3.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
82
- nmdc_runtime-2.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
83
- nmdc_runtime-2.3.0.dist-info/RECORD,,
78
+ nmdc_runtime-2.4.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
79
+ nmdc_runtime-2.4.0.dist-info/METADATA,sha256=CeZZbucd3jrD0ZqGdreH2x7ALrM9pt4ksGV2olkkpPI,7401
80
+ nmdc_runtime-2.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
81
+ nmdc_runtime-2.4.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
82
+ nmdc_runtime-2.4.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
83
+ nmdc_runtime-2.4.0.dist-info/RECORD,,