nmdc-runtime: nmdc_runtime-2.3.0-py3-none-any.whl → nmdc_runtime-2.5.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -4,7 +4,7 @@ import datetime
4
4
  import xml.etree.ElementTree as ET
5
5
  import xml.dom.minidom
6
6
 
7
- from typing import Any
7
+ from typing import Any, List, Union
8
8
  from urllib.parse import urlparse
9
9
  from nmdc_runtime.site.export.ncbi_xml_utils import (
10
10
  get_instruments,
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
366
366
  )
367
367
  # Currently, we are making the assumption that only one instrument
368
368
  # is used to sequence a Biosample
369
- instrument_id = ntseq.get("instrument_used", "")[0]
369
+ instrument_used: List[str] = ntseq.get(
370
+ "instrument_used", []
371
+ )
372
+ if not instrument_used:
373
+ instrument_id = None
374
+ else:
375
+ instrument_id = instrument_used[0]
376
+
370
377
  instrument = all_instruments.get(instrument_id, {})
371
378
  instrument_vendor = instrument.get("vendor", "")
372
379
  instrument_model = instrument.get("model", "")
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
448
455
  "Attribute", "NextSeq 550", {"name": "instrument_model"}
449
456
  )
450
457
  )
458
+ elif instrument_model == "novaseq_6000":
459
+ sra_attributes.append(
460
+ self.set_element(
461
+ "Attribute",
462
+ "NovaSeq 6000",
463
+ {"name": "instrument_model"},
464
+ )
465
+ )
466
+ elif instrument_model == "hiseq":
467
+ sra_attributes.append(
468
+ self.set_element(
469
+ "Attribute", "HiSeq", {"name": "instrument_model"}
470
+ )
471
+ )
451
472
 
452
473
  if analyte_category == "metagenome":
453
474
  sra_attributes.append(
@@ -1,6 +1,10 @@
1
1
  from io import BytesIO, StringIO
2
+ from typing import Any, Dict, List, Union
3
+
4
+ from nmdc_runtime.api.endpoints.util import strip_oid
2
5
  from nmdc_runtime.minter.config import typecodes
3
6
  from lxml import etree
7
+ from pymongo.collection import Collection
4
8
 
5
9
  import csv
6
10
  import requests
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
45
49
  raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
46
50
 
47
51
 
48
- def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
52
+ def fetch_data_objects_from_biosamples(
53
+ all_docs_collection: Collection,
54
+ data_object_set: Collection,
55
+ biosamples_list: List[Dict[str, Any]],
56
+ ) -> List[Dict[str, Dict[str, Any]]]:
57
+ """This method fetches the data objects that are "associated" (derived from/products of)
58
+ with their respective biosamples by iterating over the alldocs collection recursively.
59
+ The methods returns a dictionary with biosample ids as keys and the associated list of
60
+ data objects as values.
61
+
62
+ :param all_docs_collection: reference to the alldocs collection
63
+ :param data_object_set: reference to the data_object_set collection
64
+ :param biosamples_list: list of biosamples as JSON documents
65
+ :return: list of dictionaries with biosample ids as keys and associated data objects as values
66
+ """
67
+ biosample_data_objects = []
68
+
69
+ def collect_data_objects(doc_ids, collected_objects, unique_ids):
70
+ for doc_id in doc_ids:
71
+ if (
72
+ get_classname_from_typecode(doc_id) == "DataObject"
73
+ and doc_id not in unique_ids
74
+ ):
75
+ data_obj = data_object_set.find_one({"id": doc_id})
76
+ if data_obj:
77
+ collected_objects.append(strip_oid(data_obj))
78
+ unique_ids.add(doc_id)
79
+
49
80
  biosample_data_objects = []
50
81
 
51
82
  for biosample in biosamples_list:
52
83
  current_ids = [biosample["id"]]
53
84
  collected_data_objects = []
85
+ unique_ids = set()
54
86
 
55
87
  while current_ids:
56
88
  new_current_ids = []
57
89
  for current_id in current_ids:
58
- query = {"has_input": current_id}
59
- document = all_docs_collection.find_one(query)
90
+ for doc in all_docs_collection.find({"has_input": current_id}):
91
+ has_output = doc.get("has_output", [])
60
92
 
61
- if not document:
62
- continue
63
-
64
- has_output = document.get("has_output")
65
- if not has_output:
66
- continue
67
-
68
- for output_id in has_output:
69
- if get_classname_from_typecode(output_id) == "DataObject":
70
- data_object_doc = all_docs_collection.find_one(
71
- {"id": output_id}
72
- )
73
- if data_object_doc:
74
- collected_data_objects.append(data_object_doc)
75
- else:
76
- new_current_ids.append(output_id)
93
+ collect_data_objects(has_output, collected_data_objects, unique_ids)
94
+ new_current_ids.extend(
95
+ op
96
+ for op in has_output
97
+ if get_classname_from_typecode(op) != "DataObject"
98
+ )
77
99
 
78
100
  current_ids = new_current_ids
79
101
 
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
83
105
  return biosample_data_objects
84
106
 
85
107
 
86
- def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
87
- biosample_data_objects = []
108
+ def fetch_nucleotide_sequencing_from_biosamples(
109
+ all_docs_collection: Collection,
110
+ data_generation_set: Collection,
111
+ biosamples_list: List[Dict[str, Any]],
112
+ ) -> List[Dict[str, Dict[str, Any]]]:
113
+ """This method fetches the nucleotide sequencing process records that create data objects
114
+ for biosamples by iterating over the alldocs collection recursively.
115
+
116
+ :param all_docs_collection: reference to the alldocs collection
117
+ :param data_generation_set: reference to the data_generation_set collection
118
+ :param biosamples_list: list of biosamples as JSON documents
119
+ :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
120
+ process objects as values
121
+ """
122
+ biosample_ntseq_objects = []
88
123
 
89
124
  for biosample in biosamples_list:
90
125
  current_ids = [biosample["id"]]
91
- collected_data_objects = []
126
+ collected_ntseq_objects = []
92
127
 
93
128
  while current_ids:
94
129
  new_current_ids = []
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
105
140
 
106
141
  for output_id in has_output:
107
142
  if get_classname_from_typecode(output_id) == "DataObject":
108
- nucleotide_sequencing_doc = all_docs_collection.find_one(
143
+ nucleotide_sequencing_doc = data_generation_set.find_one(
109
144
  {"id": document["id"]}
110
145
  )
111
146
  if nucleotide_sequencing_doc:
112
- collected_data_objects.append(nucleotide_sequencing_doc)
147
+ collected_ntseq_objects.append(
148
+ strip_oid(nucleotide_sequencing_doc)
149
+ )
113
150
  else:
114
151
  new_current_ids.append(output_id)
115
152
 
116
153
  current_ids = new_current_ids
117
154
 
118
- if collected_data_objects:
119
- biosample_data_objects.append({biosample["id"]: collected_data_objects})
155
+ if collected_ntseq_objects:
156
+ biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
157
+
158
+ return biosample_ntseq_objects
120
159
 
121
- return biosample_data_objects
122
160
 
161
+ def fetch_library_preparation_from_biosamples(
162
+ all_docs_collection: Collection,
163
+ material_processing_set: Collection,
164
+ biosamples_list: List[Dict[str, Any]],
165
+ ) -> List[Dict[str, Dict[str, Any]]]:
166
+ """This method fetches the library preparation process records that create processed samples,
167
+ which are further fed/inputted into (by `has_input` slot) a nucleotide sequencing process
168
+ for biosamples by iterating over the alldocs collection recursively.
123
169
 
124
- def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
170
+ :param all_docs_collection: reference to the alldocs collection
171
+ :param material_processing_set: reference to the material_processing_set collection
172
+ :param biosamples_list: list of biosamples as JSON documents
173
+ :return: list of dictionaries with biosample ids as keys and associated library preparation process
174
+ objects as values
175
+ """
125
176
  biosample_lib_prep = []
126
177
 
127
178
  for biosample in biosamples_list:
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
144
195
  "has_input": output_id,
145
196
  "type": {"$in": ["LibraryPreparation"]},
146
197
  }
147
- lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
198
+ lib_prep_doc = material_processing_set.find_one(lib_prep_query)
148
199
 
149
200
  if lib_prep_doc:
150
- biosample_lib_prep.append({biosample_id: lib_prep_doc})
201
+ biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
151
202
  break # Stop at the first document that meets the criteria
152
203
 
153
204
  return biosample_lib_prep
nmdc_runtime/site/ops.py CHANGED
@@ -1100,7 +1100,12 @@ def materialize_alldocs(context) -> int:
1100
1100
  write_operations = []
1101
1101
  documents_processed_counter = 0
1102
1102
  for doc in mdb[coll_name].find():
1103
- doc_type = doc["type"][5:] # lop off "nmdc:" prefix
1103
+ try:
1104
+ doc_type = doc["type"][5:] # lop off "nmdc:" prefix
1105
+ except KeyError:
1106
+ raise Exception(
1107
+ f"doc {doc['id']} in collection {coll_name} has no 'type'!"
1108
+ )
1104
1109
  slots_to_include = ["id", "type"] + document_reference_ranged_slots[
1105
1110
  doc_type
1106
1111
  ]
@@ -1188,8 +1193,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
1188
1193
  def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
1189
1194
  mdb = context.resources.mongo.db
1190
1195
  alldocs_collection = mdb["alldocs"]
1196
+ data_object_set = mdb["data_object_set"]
1191
1197
  biosample_data_objects = fetch_data_objects_from_biosamples(
1192
- alldocs_collection, biosamples
1198
+ alldocs_collection, data_object_set, biosamples
1193
1199
  )
1194
1200
  return biosample_data_objects
1195
1201
 
@@ -1200,8 +1206,9 @@ def get_nucleotide_sequencing_from_biosamples(
1200
1206
  ):
1201
1207
  mdb = context.resources.mongo.db
1202
1208
  alldocs_collection = mdb["alldocs"]
1209
+ data_generation_set = mdb["data_generation_set"]
1203
1210
  biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
1204
- alldocs_collection, biosamples
1211
+ alldocs_collection, data_generation_set, biosamples
1205
1212
  )
1206
1213
  return biosample_omics_processing
1207
1214
 
@@ -1212,8 +1219,9 @@ def get_library_preparation_from_biosamples(
1212
1219
  ):
1213
1220
  mdb = context.resources.mongo.db
1214
1221
  alldocs_collection = mdb["alldocs"]
1222
+ material_processing_set = mdb["material_processing_set"]
1215
1223
  biosample_lib_prep = fetch_library_preparation_from_biosamples(
1216
- alldocs_collection, biosamples
1224
+ alldocs_collection, material_processing_set, biosamples
1217
1225
  )
1218
1226
  return biosample_lib_prep
1219
1227
 
@@ -199,8 +199,20 @@ class DatabaseUpdater:
199
199
  if gbs.get("biosampleGoldId") not in nmdc_gold_ids
200
200
  ]
201
201
 
202
+ # use the GOLD study id to fetch all sequencing project records associated with the study
203
+ gold_sequencing_projects_for_study = (
204
+ self.gold_api_client.fetch_projects_by_study(gold_study_id)
205
+ )
206
+
207
+ # use the GOLD study id to fetch all analysis project records associated with the study
208
+ gold_analysis_projects_for_study = (
209
+ self.gold_api_client.fetch_analysis_projects_by_study(gold_study_id)
210
+ )
211
+
202
212
  gold_study_translator = GoldStudyTranslator(
203
213
  biosamples=missing_gold_biosamples,
214
+ projects=gold_sequencing_projects_for_study,
215
+ analysis_projects=gold_analysis_projects_for_study,
204
216
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
205
217
  )
206
218
 
@@ -652,7 +652,7 @@ def biosample_submission_ingest():
652
652
  "inputs": {
653
653
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
654
654
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
655
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
655
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
656
656
  }
657
657
  },
658
658
  },
@@ -694,7 +694,7 @@ def biosample_submission_ingest():
694
694
  "inputs": {
695
695
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
696
696
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
697
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
697
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
698
698
  }
699
699
  },
700
700
  },
@@ -737,14 +737,14 @@ def biosample_submission_ingest():
737
737
  "inputs": {
738
738
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
739
739
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
740
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
740
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
741
741
  }
742
742
  },
743
743
  "get_neon_pipeline_benthic_data_product": {
744
744
  "config": {
745
745
  "benthic_data_product": {
746
746
  "product_id": "DP1.20279.001",
747
- "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
747
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
748
748
  }
749
749
  }
750
750
  },
@@ -771,7 +771,7 @@ def biosample_submission_ingest():
771
771
  "config": {
772
772
  "benthic_data_product": {
773
773
  "product_id": "DP1.20279.001",
774
- "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
774
+ "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
775
775
  }
776
776
  }
777
777
  },
@@ -779,7 +779,7 @@ def biosample_submission_ingest():
779
779
  "inputs": {
780
780
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
781
781
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
782
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
782
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
783
783
  }
784
784
  },
785
785
  },
@@ -822,14 +822,14 @@ def biosample_submission_ingest():
822
822
  "inputs": {
823
823
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
824
824
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
825
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
825
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
826
826
  }
827
827
  },
828
828
  "get_neon_pipeline_surface_water_data_product": {
829
829
  "config": {
830
830
  "surface_water_data_product": {
831
831
  "product_id": "DP1.20281.001",
832
- "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
832
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
833
833
  }
834
834
  }
835
835
  },
@@ -856,7 +856,7 @@ def biosample_submission_ingest():
856
856
  "config": {
857
857
  "surface_water_data_product": {
858
858
  "product_id": "DP1.20281.001",
859
- "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
859
+ "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
860
860
  }
861
861
  }
862
862
  },
@@ -864,7 +864,7 @@ def biosample_submission_ingest():
864
864
  "inputs": {
865
865
  "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
866
866
  "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
867
- "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
867
+ "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
868
868
  }
869
869
  },
870
870
  },
@@ -1,6 +1,6 @@
1
1
  import re
2
2
  import sqlite3
3
- from typing import Union
3
+ from typing import Optional, Union
4
4
 
5
5
  import pandas as pd
6
6
  import requests_cache
@@ -61,6 +61,7 @@ class NeonBenthicDataTranslator(Translator):
61
61
  "mms_benthicMetagenomeSequencing",
62
62
  "mms_benthicMetagenomeDnaExtraction",
63
63
  "amb_fieldParent",
64
+ "mms_benthicRawDataFiles", # <--- ensure this is present
64
65
  )
65
66
 
66
67
  if all(k in benthic_data for k in neon_amb_data_tables):
@@ -79,6 +80,12 @@ class NeonBenthicDataTranslator(Translator):
79
80
  benthic_data["amb_fieldParent"].to_sql(
80
81
  "amb_fieldParent", self.conn, if_exists="replace", index=False
81
82
  )
83
+ benthic_data["mms_benthicRawDataFiles"].to_sql(
84
+ "mms_benthicRawDataFiles",
85
+ self.conn,
86
+ if_exists="replace",
87
+ index=False,
88
+ )
82
89
  else:
83
90
  raise ValueError(
84
91
  f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
@@ -88,14 +95,19 @@ class NeonBenthicDataTranslator(Translator):
88
95
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
89
96
  )
90
97
 
91
- self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
92
- self.neon_raw_data_file_mappings_df.to_sql(
93
- "neonRawDataFile", self.conn, if_exists="replace", index=False
94
- )
98
+ self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
95
99
 
96
100
  self.site_code_mapping = site_code_mapping
101
+
97
102
  self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
98
103
 
104
+ def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
105
+ return nmdc.Manifest(
106
+ id=manifest_id,
107
+ manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
108
+ type="nmdc:Manifest",
109
+ )
110
+
99
111
  def _translate_biosample(
100
112
  self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
101
113
  ) -> nmdc.Biosample:
@@ -313,7 +325,7 @@ class NeonBenthicDataTranslator(Translator):
313
325
  )
314
326
 
315
327
  def _translate_data_object(
316
- self, do_id: str, url: str, do_type: str, checksum: str
328
+ self, do_id: str, url: str, do_type: str, manifest_id: str
317
329
  ) -> nmdc.DataObject:
318
330
  """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
319
331
  object mainly contains information about the sequencing file that was generated as
@@ -324,7 +336,6 @@ class NeonBenthicDataTranslator(Translator):
324
336
  :param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
325
337
  by Hugh Cross at NEON.
326
338
  :param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
327
- :param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
328
339
  at NEON.
329
340
  :return: DataObject with all the sequencing file metadata.
330
341
  """
@@ -337,14 +348,14 @@ class NeonBenthicDataTranslator(Translator):
337
348
  url=url,
338
349
  description=f"sequencing results for {basename}",
339
350
  type="nmdc:DataObject",
340
- md5_checksum=checksum,
341
351
  data_object_type=do_type,
352
+ in_manifest=manifest_id,
342
353
  )
343
354
 
344
- def get_database(self):
355
+ def get_database(self) -> nmdc.Database:
345
356
  database = nmdc.Database()
346
357
 
347
- query = """
358
+ join_query = """
348
359
  SELECT
349
360
  merged.laboratoryName,
350
361
  merged.sequencingFacilityID,
@@ -372,202 +383,190 @@ class NeonBenthicDataTranslator(Translator):
372
383
  afp.siteID,
373
384
  afp.sampleID,
374
385
  afp.collectDate
375
- FROM
376
- (
377
- SELECT
378
- bs.collectDate,
379
- bs.laboratoryName,
380
- bs.sequencingFacilityID,
381
- bs.processedDate,
382
- bs.dnaSampleID,
383
- bs.dnaSampleCode,
384
- bs.internalLabID,
385
- bs.instrument_model,
386
- bs.sequencingMethod,
387
- bs.investigation_type,
388
- bs.qaqcStatus,
389
- bs.ncbiProjectID,
390
- bd.genomicsSampleID,
391
- bd.sequenceAnalysisType,
392
- bd.sampleMass,
393
- bd.nucleicAcidConcentration
394
- FROM
395
- mms_benthicMetagenomeSequencing AS bs
396
- JOIN
397
- mms_benthicMetagenomeDnaExtraction AS bd
398
- ON
399
- bs.dnaSampleID = bd.dnaSampleID
400
- ) AS merged
386
+ FROM (
387
+ SELECT
388
+ bs.collectDate,
389
+ bs.laboratoryName,
390
+ bs.sequencingFacilityID,
391
+ bs.processedDate,
392
+ bs.dnaSampleID,
393
+ bs.dnaSampleCode,
394
+ bs.internalLabID,
395
+ bs.instrument_model,
396
+ bs.sequencingMethod,
397
+ bs.investigation_type,
398
+ bs.qaqcStatus,
399
+ bs.ncbiProjectID,
400
+ bd.genomicsSampleID,
401
+ bd.sequenceAnalysisType,
402
+ bd.sampleMass,
403
+ bd.nucleicAcidConcentration
404
+ FROM mms_benthicMetagenomeSequencing AS bs
405
+ JOIN mms_benthicMetagenomeDnaExtraction AS bd
406
+ ON bs.dnaSampleID = bd.dnaSampleID
407
+ ) AS merged
401
408
  LEFT JOIN amb_fieldParent AS afp
402
- ON
403
- merged.genomicsSampleID = afp.geneticSampleID
409
+ ON merged.genomicsSampleID = afp.geneticSampleID
404
410
  """
405
- benthic_samples = pd.read_sql_query(query, self.conn)
411
+ benthic_samples = pd.read_sql_query(join_query, self.conn)
406
412
  benthic_samples.to_sql(
407
413
  "benthicSamples", self.conn, if_exists="replace", index=False
408
414
  )
409
415
 
410
- neon_biosample_ids = benthic_samples["sampleID"]
411
- nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
412
- neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
416
+ sample_ids = benthic_samples["sampleID"]
417
+ nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
418
+ neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
413
419
 
414
- neon_extraction_ids = benthic_samples["sampleID"]
415
- nmdc_extraction_ids = self._id_minter(
416
- "nmdc:Extraction", len(neon_extraction_ids)
417
- )
418
- neon_to_nmdc_extraction_ids = dict(
419
- zip(neon_extraction_ids, nmdc_extraction_ids)
420
- )
420
+ nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
421
+ neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
421
422
 
422
- neon_extraction_processed_ids = benthic_samples["sampleID"]
423
423
  nmdc_extraction_processed_ids = self._id_minter(
424
- "nmdc:ProcessedSample", len(neon_extraction_processed_ids)
424
+ "nmdc:ProcessedSample", len(sample_ids)
425
425
  )
426
426
  neon_to_nmdc_extraction_processed_ids = dict(
427
- zip(neon_extraction_processed_ids, nmdc_extraction_processed_ids)
427
+ zip(sample_ids, nmdc_extraction_processed_ids)
428
428
  )
429
429
 
430
- neon_lib_prep_ids = benthic_samples["sampleID"]
431
- nmdc_lib_prep_ids = self._id_minter(
432
- "nmdc:LibraryPreparation", len(neon_lib_prep_ids)
433
- )
434
- neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
430
+ nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
431
+ neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
435
432
 
436
- neon_lib_prep_processed_ids = benthic_samples["sampleID"]
437
- nmdc_lib_prep_processed_ids = self._id_minter(
438
- "nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
433
+ nmdc_libprep_processed_ids = self._id_minter(
434
+ "nmdc:ProcessedSample", len(sample_ids)
439
435
  )
440
- neon_to_nmdc_lib_prep_processed_ids = dict(
441
- zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
436
+ neon_to_nmdc_libprep_processed_ids = dict(
437
+ zip(sample_ids, nmdc_libprep_processed_ids)
442
438
  )
443
439
 
444
- neon_omprc_ids = benthic_samples["sampleID"]
445
- nmdc_omprc_ids = self._id_minter(
446
- "nmdc:NucleotideSequencing", len(neon_omprc_ids)
447
- )
448
- neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
440
+ nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
441
+ neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
449
442
 
450
- neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
451
- neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
452
- nmdc_data_object_ids = self._id_minter(
453
- "nmdc:DataObject", len(neon_raw_file_paths)
454
- )
455
- neon_to_nmdc_data_object_ids = dict(
456
- zip(neon_raw_file_paths, nmdc_data_object_ids)
457
- )
443
+ raw_df = self.neon_raw_data_file_mappings_df
444
+ raw_file_paths = raw_df["rawDataFilePath"]
445
+ dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
446
+ neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
458
447
 
459
- for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
460
- biosample_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
448
+ for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
449
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
450
+ if row.empty:
451
+ continue
461
452
 
453
+ # Example of how you might call _translate_biosample:
462
454
  database.biosample_set.append(
463
- self._translate_biosample(neon_id, nmdc_id, biosample_row)
455
+ self._translate_biosample(neon_id, biosample_id, row)
464
456
  )
465
457
 
466
- for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
467
- extraction_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
458
+ for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
459
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
460
+ if row.empty:
461
+ continue
468
462
 
469
- extraction_input = neon_to_nmdc_biosample_ids.get(neon_id)
470
- processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
463
+ biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
464
+ extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
471
465
 
472
- if extraction_input is not None and processed_sample_id is not None:
466
+ if biosample_id and extraction_ps_id:
473
467
  database.material_processing_set.append(
474
468
  self._translate_extraction_process(
475
- nmdc_id,
476
- extraction_input,
477
- processed_sample_id,
478
- extraction_row,
469
+ extraction_id, biosample_id, extraction_ps_id, row
479
470
  )
480
471
  )
481
-
482
- genomics_sample_id = _get_value_or_none(
483
- extraction_row, "genomicsSampleID"
484
- )
485
-
472
+ genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
486
473
  database.processed_sample_set.append(
487
474
  self._translate_processed_sample(
488
- processed_sample_id,
475
+ extraction_ps_id,
489
476
  f"Extracted DNA from {genomics_sample_id}",
490
477
  )
491
478
  )
492
479
 
493
- query = """
480
+ query2 = """
494
481
  SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
495
- FROM neonRawDataFile
482
+ FROM mms_benthicRawDataFiles
496
483
  GROUP BY dnaSampleID
497
484
  """
498
- neon_raw_data_files = pd.read_sql_query(query, self.conn)
499
- neon_raw_data_files_dict = (
500
- neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
485
+ raw_data_files_df = pd.read_sql_query(query2, self.conn)
486
+ dna_files_dict = (
487
+ raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
501
488
  .str.split("|")
502
489
  .to_dict()
503
490
  )
504
- filtered_neon_raw_data_files_dict = {
505
- key: value
506
- for key, value in neon_raw_data_files_dict.items()
507
- if len(value) <= 2
508
- }
509
491
 
510
- for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
511
- lib_prep_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
492
+ dna_sample_to_manifest_id: dict[str, str] = {}
512
493
 
513
- lib_prep_input = neon_to_nmdc_extraction_processed_ids.get(neon_id)
514
- processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
494
+ for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
495
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
496
+ if row.empty:
497
+ continue
515
498
 
516
- if lib_prep_input is not None and processed_sample_id is not None:
517
- database.material_processing_set.append(
518
- self._translate_library_preparation(
519
- nmdc_id,
520
- lib_prep_input,
521
- processed_sample_id,
522
- lib_prep_row,
523
- )
499
+ extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
500
+ libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
501
+ if not extr_ps_id or not libprep_ps_id:
502
+ continue
503
+
504
+ database.material_processing_set.append(
505
+ self._translate_library_preparation(
506
+ libprep_id, extr_ps_id, libprep_ps_id, row
524
507
  )
508
+ )
525
509
 
526
- dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
510
+ dna_sample_id = _get_value_or_none(row, "dnaSampleID")
511
+ database.processed_sample_set.append(
512
+ self._translate_processed_sample(
513
+ libprep_ps_id,
514
+ f"Library preparation for {dna_sample_id}",
515
+ )
516
+ )
527
517
 
528
- database.processed_sample_set.append(
529
- self._translate_processed_sample(
530
- processed_sample_id,
531
- f"Library preparation for {dna_sample_id}",
518
+ filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
519
+ if not filepaths_for_dna:
520
+ # no raw files => skip
521
+ ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
522
+ if ntseq_id:
523
+ continue
524
+ continue
525
+
526
+ # If multiple => we create a Manifest
527
+ manifest_id: Optional[str] = None
528
+ if len(filepaths_for_dna) > 2:
529
+ if dna_sample_id not in dna_sample_to_manifest_id:
530
+ new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
531
+ dna_sample_to_manifest_id[dna_sample_id] = new_man_id
532
+ database.manifest_set.append(self._translate_manifest(new_man_id))
533
+ manifest_id = dna_sample_to_manifest_id[dna_sample_id]
534
+
535
+ has_input_value = self.samp_procsm_dict.get(neon_id)
536
+ if not has_input_value:
537
+ continue
538
+
539
+ dataobject_ids_for_run: list[str] = []
540
+ for fp in filepaths_for_dna:
541
+ if fp not in neon_to_nmdc_dataobject_ids:
542
+ continue
543
+ do_id = neon_to_nmdc_dataobject_ids[fp]
544
+
545
+ do_type = None
546
+ if "_R1.fastq.gz" in fp:
547
+ do_type = "Metagenome Raw Read 1"
548
+ elif "_R2.fastq.gz" in fp:
549
+ do_type = "Metagenome Raw Read 2"
550
+
551
+ database.data_object_set.append(
552
+ self._translate_data_object(
553
+ do_id=do_id,
554
+ url=fp,
555
+ do_type=do_type,
556
+ manifest_id=manifest_id,
532
557
  )
533
558
  )
534
-
535
- has_output = None
536
- has_output_do_ids = []
537
-
538
- if dna_sample_id in filtered_neon_raw_data_files_dict:
539
- has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
540
- for item in has_output:
541
- if item in neon_to_nmdc_data_object_ids:
542
- has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
543
-
544
- checksum = None
545
- do_type = None
546
-
547
- checksum = neon_raw_data_file_mappings_df[
548
- neon_raw_data_file_mappings_df["rawDataFilePath"] == item
549
- ]["checkSum"].values[0]
550
- if "_R1.fastq.gz" in item:
551
- do_type = "Metagenome Raw Read 1"
552
- elif "_R2.fastq.gz" in item:
553
- do_type = "Metagenome Raw Read 2"
554
-
555
- database.data_object_set.append(
556
- self._translate_data_object(
557
- neon_to_nmdc_data_object_ids.get(item),
558
- item,
559
- do_type,
560
- checksum,
561
- )
562
- )
563
-
564
- database.data_generation_set.append(
565
- self._translate_nucleotide_sequencing(
566
- neon_to_nmdc_omprc_ids.get(neon_id),
567
- processed_sample_id,
568
- has_output_do_ids,
569
- lib_prep_row,
570
- )
559
+ dataobject_ids_for_run.append(do_id)
560
+
561
+ ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
562
+ if ntseq_id:
563
+ database.data_generation_set.append(
564
+ self._translate_nucleotide_sequencing(
565
+ ntseq_id,
566
+ has_input_value, # <--- from self.samp_procsm_dict
567
+ dataobject_ids_for_run,
568
+ row,
571
569
  )
570
+ )
572
571
 
573
572
  return database
@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
71
71
  neon_amb_data_tables = (
72
72
  "mms_swMetagenomeSequencing",
73
73
  "mms_swMetagenomeDnaExtraction",
74
+ "mms_swRawDataFiles",
74
75
  "amc_fieldGenetic",
75
76
  "amc_fieldSuperParent",
76
77
  )
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
88
89
  if_exists="replace",
89
90
  index=False,
90
91
  )
92
+ surface_water_data["mms_swRawDataFiles"].to_sql(
93
+ "mms_swRawDataFiles", self.conn, if_exists="replace", index=False
94
+ )
91
95
  surface_water_data["amc_fieldGenetic"].to_sql(
92
96
  "amc_fieldGenetic", self.conn, if_exists="replace", index=False
93
97
  )
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
103
107
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
104
108
  )
105
109
 
106
- self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
107
- self.neon_raw_data_file_mappings_df.to_sql(
108
- "neonRawDataFile", self.conn, if_exists="replace", index=False
109
- )
110
+ self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
110
111
 
111
112
  self.site_code_mapping = site_code_mapping
112
113
 
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
371
372
  )
372
373
 
373
374
  def _translate_data_object(
374
- self, do_id: str, url: str, do_type: str, checksum: str
375
+ self, do_id: str, url: str, do_type: str, manifest_id: str
375
376
  ) -> nmdc.DataObject:
376
377
  """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
377
378
  object mainly contains information about the sequencing file that was generated as
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
395
396
  url=url,
396
397
  description=f"sequencing results for {basename}",
397
398
  type="nmdc:DataObject",
398
- md5_checksum=checksum,
399
399
  data_object_type=do_type,
400
+ in_manifest=manifest_id,
401
+ )
402
+
403
+ def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
404
+ return nmdc.Manifest(
405
+ id=manifest_id,
406
+ manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
407
+ type="nmdc:Manifest",
400
408
  )
401
409
 
402
410
  def get_database(self):
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
477
485
  """
478
486
  surface_water_samples = pd.read_sql_query(query, self.conn)
479
487
 
488
+ # --------------------------------------------------
489
+ # Create mappings for minted NMDC IDs
490
+ # --------------------------------------------------
480
491
  neon_biosample_ids = surface_water_samples["parentSampleID"]
481
492
  nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
482
493
  neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
511
522
  zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
512
523
  )
513
524
 
514
- neon_omprc_ids = surface_water_samples["parentSampleID"]
515
- nmdc_omprc_ids = self._id_minter(
516
- "nmdc:NucleotideSequencing", len(neon_omprc_ids)
517
- )
518
- neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
519
-
520
- neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
521
- neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
522
- nmdc_data_object_ids = self._id_minter(
523
- "nmdc:DataObject", len(neon_raw_file_paths)
524
- )
525
- neon_to_nmdc_data_object_ids = dict(
526
- zip(neon_raw_file_paths, nmdc_data_object_ids)
527
- )
528
-
525
+ # --------------------------------------------------
526
+ # STEP 1: Insert Biosamples
527
+ # --------------------------------------------------
529
528
  for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
530
529
  biosample_row = surface_water_samples[
531
530
  surface_water_samples["parentSampleID"] == neon_id
532
531
  ]
532
+ # database.biosample_set.append(
533
+ # self._translate_biosample(neon_id, nmdc_id, biosample_row)
534
+ # )
533
535
 
534
- database.biosample_set.append(
535
- self._translate_biosample(neon_id, nmdc_id, biosample_row)
536
- )
537
-
536
+ # --------------------------------------------------
537
+ # STEP 2: Insert Extraction Processes
538
+ # --------------------------------------------------
538
539
  for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
539
540
  extraction_row = surface_water_samples[
540
541
  surface_water_samples["parentSampleID"] == neon_id
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
557
558
  extraction_row, "genomicsSampleID"
558
559
  )
559
560
 
561
+ # Each Extraction process output => ProcessedSample
560
562
  database.processed_sample_set.append(
561
563
  self._translate_processed_sample(
562
564
  processed_sample_id,
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
564
566
  )
565
567
  )
566
568
 
567
- query = """
568
- SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
569
- FROM neonRawDataFile
570
- GROUP BY dnaSampleID
571
- """
572
- neon_raw_data_files = pd.read_sql_query(query, self.conn)
573
- neon_raw_data_files_dict = (
574
- neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
575
- .str.split("|")
576
- .to_dict()
577
- )
578
- filtered_neon_raw_data_files_dict = {
579
- key: value
580
- for key, value in neon_raw_data_files_dict.items()
581
- if len(value) <= 2
582
- }
583
-
569
+ # --------------------------------------------------
570
+ # STEP 3: Insert LibraryPreparation Processes
571
+ # --------------------------------------------------
584
572
  for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
585
573
  lib_prep_row = surface_water_samples[
586
574
  surface_water_samples["parentSampleID"] == neon_id
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
601
589
 
602
590
  dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
603
591
 
592
+ # Each LibraryPreparation process output => ProcessedSample
604
593
  database.processed_sample_set.append(
605
594
  self._translate_processed_sample(
606
595
  processed_sample_id,
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
608
597
  )
609
598
  )
610
599
 
611
- has_output = None
612
- has_output_do_ids = []
613
-
614
- if dna_sample_id in filtered_neon_raw_data_files_dict:
615
- has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
616
- for item in has_output:
617
- if item in neon_to_nmdc_data_object_ids:
618
- has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
619
-
620
- checksum = None
621
- do_type = None
622
-
623
- checksum = neon_raw_data_file_mappings_df[
624
- neon_raw_data_file_mappings_df["rawDataFilePath"] == item
625
- ]["checkSum"].values[0]
626
- if "_R1.fastq.gz" in item:
627
- do_type = "Metagenome Raw Read 1"
628
- elif "_R2.fastq.gz" in item:
629
- do_type = "Metagenome Raw Read 2"
630
-
631
- database.data_object_set.append(
632
- self._translate_data_object(
633
- neon_to_nmdc_data_object_ids.get(item),
634
- item,
635
- do_type,
636
- checksum,
637
- )
638
- )
639
-
640
- database.data_generation_set.append(
641
- self._translate_nucleotide_sequencing(
642
- neon_to_nmdc_omprc_ids.get(neon_id),
643
- processed_sample_id,
644
- has_output_do_ids,
645
- lib_prep_row,
646
- )
600
+ # --------------------------------------------------
601
+ # STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
602
+ # and insert DataObjects + DataGeneration processes
603
+ # --------------------------------------------------
604
+ raw_query = """
605
+ SELECT dnaSampleID, sequencerRunID, rawDataFilePath
606
+ FROM mms_swRawDataFiles
607
+ """
608
+ neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
609
+
610
+ for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
611
+ # 1) Pull out the row that corresponds to this parentSampleID
612
+ lib_prep_row = surface_water_samples[
613
+ surface_water_samples["parentSampleID"] == neon_id
614
+ ]
615
+
616
+ # 2) Grab the dnaSampleID from that row
617
+ dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
618
+ if not dna_sample_id:
619
+ # No dnaSampleID => skip
620
+ continue
621
+
622
+ # 3) Find all raw files for that dnaSampleID
623
+ dna_files = neon_raw_data_files_df[
624
+ neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
625
+ ]
626
+ if dna_files.empty:
627
+ # No raw files => skip
628
+ continue
629
+
630
+ # -----------------------------------------
631
+ # LOOKUP DICT: get "has_input" for this neon_id
632
+ # -----------------------------------------
633
+ has_input_value = self.samp_procsm_dict.get(neon_id)
634
+ # If some neon_id isn't in the dictionary, handle it as needed
635
+ if not has_input_value:
636
+ # Could skip, or raise an error, or set a default
637
+ continue
638
+
639
+ # -------------------------------------------
640
+ # 4) CREATE A MANIFEST IF MULTIPLE RAW FILES
641
+ # for this row's dnaSampleID
642
+ # -------------------------------------------
643
+ manifest_id = None
644
+ if len(dna_files) > 2:
645
+ # For each row that references a dnaSampleID with multiple raw files,
646
+ # mint exactly one new manifest record
647
+ manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
648
+ new_manifest = self._translate_manifest(manifest_id)
649
+ # Add to the database
650
+ database.manifest_set.append(new_manifest)
651
+
652
+ # -------------------------------------------
653
+ # 5) NOW GROUP FILES BY sequencerRunID
654
+ # => one data_generation record per run
655
+ # -------------------------------------------
656
+ lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
657
+ neon_id
658
+ )
659
+ if not lib_prep_processed_sample_id:
660
+ # If we don't have a ProcessedSample for some reason, skip
661
+ continue
662
+
663
+ for run_id, group_df in dna_files.groupby("sequencerRunID"):
664
+ # a) Mint new data_generation (NucleotideSequencing) ID for this run
665
+ data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
666
+
667
+ # b) Create DataObjects for each raw file in this run
668
+ data_object_ids = []
669
+ for raw_fp in group_df["rawDataFilePath"]:
670
+ do_id = self._id_minter("nmdc:DataObject", 1)[0]
671
+
672
+ # Distinguish read type
673
+ do_type = None
674
+ if "_R1.fastq.gz" in raw_fp:
675
+ do_type = "Metagenome Raw Read 1"
676
+ elif "_R2.fastq.gz" in raw_fp:
677
+ do_type = "Metagenome Raw Read 2"
678
+
679
+ # Create the DataObject
680
+ data_obj = self._translate_data_object(
681
+ do_id=do_id,
682
+ url=raw_fp,
683
+ do_type=do_type,
684
+ manifest_id=manifest_id, # link to the new Manifest if it exists
685
+ )
686
+ database.data_object_set.append(data_obj)
687
+ data_object_ids.append(do_id)
688
+
689
+ # c) Finally, create the data generation record for this run
690
+ database.data_generation_set.append(
691
+ self._translate_nucleotide_sequencing(
692
+ nucleotide_sequencing_id=data_generation_id,
693
+ processed_sample_id=has_input_value,
694
+ raw_data_file_data=data_object_ids,
695
+ nucleotide_sequencing_row=lib_prep_row,
647
696
  )
697
+ )
648
698
 
649
699
  return database
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: nmdc_runtime
3
- Version: 2.3.0
3
+ Version: 2.5.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -17,6 +17,7 @@ Dynamic: classifier
17
17
  Dynamic: description
18
18
  Dynamic: description-content-type
19
19
  Dynamic: home-page
20
+ Dynamic: license-file
20
21
  Dynamic: requires-python
21
22
  Dynamic: summary
22
23
 
@@ -37,8 +38,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
37
38
  * [nmdc-server](https://github.com/microbiomedata/nmdc-server)
38
39
  houses code specific to the data portal -- its database, back-end API, and front-end application.
39
40
 
40
- * [workflow_documentation](https://nmdc-workflow-documentation.readthedocs.io/en/latest/index.html)
41
- references workflow code spread across several repositories, that take source data and produce computed data.
41
+ * Workflows — documented in the [workflows](https://docs.microbiomedata.org/workflows/) section of the NMDC documentation website — take source data and produce computed data.
42
42
 
43
43
  * This repo (nmdc-runtime)
44
44
  * houses code that takes source data and computed data, and transforms it
@@ -156,6 +156,9 @@ Tests can be found in `tests` and are run with the following commands:
156
156
  ```bash
157
157
  make up-test
158
158
  make test
159
+
160
+ # Run a specific test file, e.g. tests/test_api/test_endpoints.py
161
+ make test ARGS="tests/test_api/test_endpoints.py"
159
162
  ```
160
163
 
161
164
  As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
@@ -164,6 +167,16 @@ desired and does not break over time.
164
167
  [For hints on how to write tests for solids and pipelines in Dagster, see their documentation
165
168
  tutorial on Testing](https://docs.dagster.io/tutorial/testable).
166
169
 
170
+ ### RAM usage
171
+
172
+ The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
173
+ the `test` container shows "Error 137" (the process was killed, typically because it ran out of memory), here is something you can try as a workaround: In Docker Desktop, go to
174
+ "Settings > Resources > Advanced," and increase the memory limit. One of our team members has
175
+ found **12 GB** to be sufficient for running the tests.
176
+
177
+ > Dedicating 12 GB of RAM to Docker may be prohibitive for some prospective developers.
178
+ > There is an open [issue](https://github.com/microbiomedata/nmdc-runtime/issues/928) about the memory requirement.
179
+
167
180
  ## Publish to PyPI
168
181
 
169
182
  This repository contains a GitHub Actions workflow that publishes a Python package to [PyPI](https://pypi.org/project/nmdc-runtime/).
@@ -37,8 +37,8 @@ nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
37
37
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
38
38
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
39
  nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
40
- nmdc_runtime/site/ops.py,sha256=OrTQLSBToih-kI754RtVqjUMRojgYJQmb7B_VRjZWtg,46223
41
- nmdc_runtime/site/repository.py,sha256=b3UVQznelU8wDOfuc9_vE_eqFGOoFRiHtQJJH7or73E,43875
40
+ nmdc_runtime/site/ops.py,sha256=tg-zRlVSUSJ7B0cJbBsUwmMRmpIUmK5tsL8ABnY0wnY,46626
41
+ nmdc_runtime/site/repository.py,sha256=kVCoIMF2rgAMUAf9a6jk0WbejFpmWgxh6nN4U37Mgc8,43919
42
42
  nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
43
43
  nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
44
44
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -51,21 +51,21 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
51
51
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
52
52
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
53
53
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Vb4rNP3uhnGlHqrwUGgA2DzpOotCf3S8G4sIJml7gl4,25287
55
- nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZynO1yPSLX_aHs,8390
54
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
55
+ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
56
56
  nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
57
57
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
59
59
  nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
- nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
60
+ nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14j5rBqQWF8R7BheY,11525
61
61
  nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
62
  nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
63
63
  nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
64
64
  nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
65
65
  nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
66
- nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
66
+ nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
67
67
  nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
68
- nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=MQgjIfWPgoRe-bhzyfqHSe2mZwFsjcwjdT8tNqpIhlc,27729
68
+ nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
69
69
  nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
70
70
  nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
71
71
  nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
75
75
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
76
76
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
77
77
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
78
- nmdc_runtime-2.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
79
- nmdc_runtime-2.3.0.dist-info/METADATA,sha256=BEManThNKOEkfS9woYYiwm1ya6BENBC6vXE6b7L_z2E,7430
80
- nmdc_runtime-2.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
81
- nmdc_runtime-2.3.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
82
- nmdc_runtime-2.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
83
- nmdc_runtime-2.3.0.dist-info/RECORD,,
78
+ nmdc_runtime-2.5.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
79
+ nmdc_runtime-2.5.0.dist-info/METADATA,sha256=tli66QKJC-48TzLXbI9iHMzTLyugbRBKj9CJEeKHXLY,8139
80
+ nmdc_runtime-2.5.0.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
81
+ nmdc_runtime-2.5.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
82
+ nmdc_runtime-2.5.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
83
+ nmdc_runtime-2.5.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (78.0.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5