nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,5 +1,6 @@
1
1
  import re
2
2
  import sqlite3
3
+ from typing import Optional, Union
3
4
 
4
5
  import pandas as pd
5
6
  import requests_cache
@@ -10,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
10
11
  from nmdc_runtime.site.translation.neon_utils import (
11
12
  _get_value_or_none,
12
13
  _create_controlled_identified_term_value,
13
- _create_controlled_term_value,
14
14
  _create_geolocation_value,
15
15
  _create_quantity_value,
16
16
  _create_timestamp_value,
@@ -47,6 +47,7 @@ class NeonBenthicDataTranslator(Translator):
47
47
  site_code_mapping: dict,
48
48
  neon_envo_mappings_file: pd.DataFrame,
49
49
  neon_raw_data_file_mappings_file: pd.DataFrame,
50
+ neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
50
51
  *args,
51
52
  **kwargs,
52
53
  ) -> None:
@@ -59,6 +60,7 @@ class NeonBenthicDataTranslator(Translator):
59
60
  "mms_benthicMetagenomeSequencing",
60
61
  "mms_benthicMetagenomeDnaExtraction",
61
62
  "amb_fieldParent",
63
+ "mms_benthicRawDataFiles", # <--- ensure this is present
62
64
  )
63
65
 
64
66
  if all(k in benthic_data for k in neon_amb_data_tables):
@@ -77,6 +79,12 @@ class NeonBenthicDataTranslator(Translator):
77
79
  benthic_data["amb_fieldParent"].to_sql(
78
80
  "amb_fieldParent", self.conn, if_exists="replace", index=False
79
81
  )
82
+ benthic_data["mms_benthicRawDataFiles"].to_sql(
83
+ "mms_benthicRawDataFiles",
84
+ self.conn,
85
+ if_exists="replace",
86
+ index=False,
87
+ )
80
88
  else:
81
89
  raise ValueError(
82
90
  f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
@@ -86,19 +94,24 @@ class NeonBenthicDataTranslator(Translator):
86
94
  "neonEnvoTerms", self.conn, if_exists="replace", index=False
87
95
  )
88
96
 
89
- self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
90
- self.neon_raw_data_file_mappings_df.to_sql(
91
- "neonRawDataFile", self.conn, if_exists="replace", index=False
92
- )
97
+ self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
93
98
 
94
99
  self.site_code_mapping = site_code_mapping
95
100
 
101
+ self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
102
+
103
+ def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
104
+ return nmdc.Manifest(
105
+ id=manifest_id,
106
+ manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
107
+ type="nmdc:Manifest",
108
+ )
109
+
96
110
  def _translate_biosample(
97
111
  self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
98
112
  ) -> nmdc.Biosample:
99
113
  return nmdc.Biosample(
100
114
  id=nmdc_id,
101
- part_of="nmdc:sty-11-pzmd0x14",
102
115
  env_broad_scale=_create_controlled_identified_term_value(
103
116
  BENTHIC_BROAD_SCALE_MAPPINGS.get(
104
117
  biosample_row["aquaticSiteType"].values[0]
@@ -146,8 +159,10 @@ class NeonBenthicDataTranslator(Translator):
146
159
  depth=nmdc.QuantityValue(
147
160
  has_minimum_numeric_value=nmdc.Float("0"),
148
161
  has_maximum_numeric_value=nmdc.Float("1"),
149
- has_unit="meters",
162
+ has_unit="m",
163
+ type="nmdc:QuantityValue",
150
164
  ),
165
+ associated_studies=["nmdc:sty-11-pzmd0x14"],
151
166
  )
152
167
 
153
168
  def _translate_extraction_process(
@@ -185,10 +200,9 @@ class NeonBenthicDataTranslator(Translator):
185
200
  input_mass=_create_quantity_value(
186
201
  _get_value_or_none(extraction_row, "sampleMass"), "g"
187
202
  ),
188
- quality_control_report=nmdc.QualityControlReport(
189
- status=_get_value_or_none(extraction_row, "qaqcStatus")
190
- ),
203
+ qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
191
204
  processing_institution=processing_institution,
205
+ type="nmdc:Extraction",
192
206
  )
193
207
 
194
208
  def _translate_library_preparation(
@@ -201,13 +215,13 @@ class NeonBenthicDataTranslator(Translator):
201
215
  """
202
216
  Create LibraryPreparation process object. The input to LibraryPreparation process
203
217
  is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
204
- process is fed as input to an OmicsProcessing object.
218
+ process is fed as input to an NucleotideSequencing object.
205
219
 
206
220
  :param library_preparation_id: Minted id for LibraryPreparation process.
207
221
  :param library_preparation_input: Input to LibraryPreparation process is output from
208
222
  Extraction process.
209
223
  :param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
210
- is also input to OmicsProcessing.
224
+ is also input to NucleotideSequencing.
211
225
  :param library_preparation_row: Metadata required to populate LibraryPreparation.
212
226
  :return: Object that using LibraryPreparation process model.
213
227
  """
@@ -226,31 +240,47 @@ class NeonBenthicDataTranslator(Translator):
226
240
  start_date=_get_value_or_none(library_preparation_row, "collectDate"),
227
241
  end_date=_get_value_or_none(library_preparation_row, "processedDate"),
228
242
  processing_institution=processing_institution,
243
+ type="nmdc:LibraryPreparation",
229
244
  )
230
245
 
231
- def _translate_omics_processing(
246
+ def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
247
+ if not instrument_model:
248
+ raise ValueError(
249
+ f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
250
+ )
251
+
252
+ df = self.neon_nmdc_instrument_map_df
253
+ matching_row = df[
254
+ df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
255
+ ]
256
+
257
+ if not matching_row.empty:
258
+ nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
259
+ return nmdc_instrument_id
260
+
261
+ def _translate_nucleotide_sequencing(
232
262
  self,
233
- omics_processing_id: str,
263
+ nucleotide_sequencing_id: str,
234
264
  processed_sample_id: str,
235
265
  raw_data_file_data: str,
236
- omics_processing_row: pd.DataFrame,
237
- ) -> nmdc.OmicsProcessing:
238
- """Create nmdc OmicsProcessing object. This class typically models the run of a
239
- Bioinformatics workflow on sequence data from a biosample. The input to an OmicsProcessing
240
- process is the output from a LibraryPreparation process, and the output of OmicsProcessing
266
+ nucleotide_sequencing_row: pd.DataFrame,
267
+ ):
268
+ """Create nmdc NucleotideSequencing object. This class typically models the run of a
269
+ Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
270
+ process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
241
271
  is a DataObject which has the FASTQ sequence file URLs embedded in them.
242
272
 
243
- :param omics_processing_id: Minted id for an OmicsProcessing process.
273
+ :param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
244
274
  :param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
245
275
  :param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
246
276
  files embedded in them.
247
- :param omics_processing_row: DataFrame with metadata for an OmicsProcessing workflow
277
+ :param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
248
278
  process/run.
249
- :return: OmicsProcessing object that models a Bioinformatics workflow process/run.
279
+ :return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
250
280
  """
251
281
  processing_institution = None
252
282
  sequencing_facility = _get_value_or_none(
253
- omics_processing_row, "sequencingFacilityID"
283
+ nucleotide_sequencing_row, "sequencingFacilityID"
254
284
  )
255
285
  if sequencing_facility is not None:
256
286
  if re.search("Battelle", sequencing_facility, re.IGNORECASE):
@@ -258,19 +288,21 @@ class NeonBenthicDataTranslator(Translator):
258
288
  elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
259
289
  processing_institution = "ANL"
260
290
 
261
- return nmdc.OmicsProcessing(
262
- id=omics_processing_id,
291
+ return nmdc.NucleotideSequencing(
292
+ id=nucleotide_sequencing_id,
263
293
  has_input=processed_sample_id,
264
294
  has_output=raw_data_file_data,
265
295
  processing_institution=processing_institution,
266
- ncbi_project_name=_get_value_or_none(omics_processing_row, "ncbiProjectID"),
267
- omics_type=_create_controlled_term_value(
268
- omics_processing_row["investigation_type"].values[0]
296
+ ncbi_project_name=_get_value_or_none(
297
+ nucleotide_sequencing_row, "ncbiProjectID"
269
298
  ),
270
- instrument_name=f"{_get_value_or_none(omics_processing_row, 'sequencingMethod')} {_get_value_or_none(omics_processing_row, 'instrument_model')}",
271
- part_of="nmdc:sty-11-34xj1150",
272
- name=f"Terrestrial soil microbial communities - {_get_value_or_none(omics_processing_row, 'dnaSampleID')}",
273
- type="nmdc:OmicsProcessing",
299
+ instrument_used=self._get_instrument_id(
300
+ _get_value_or_none(nucleotide_sequencing_row, "instrument_model")
301
+ ),
302
+ name=f"Benthic microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
303
+ type="nmdc:NucleotideSequencing",
304
+ associated_studies=["nmdc:sty-11-pzmd0x14"],
305
+ analyte_category="metagenome",
274
306
  )
275
307
 
276
308
  def _translate_processed_sample(
@@ -287,12 +319,14 @@ class NeonBenthicDataTranslator(Translator):
287
319
  :param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
288
320
  :return: ProcessedSample objects to be stored in `processed_sample_set`.
289
321
  """
290
- return nmdc.ProcessedSample(id=processed_sample_id, name=sample_id)
322
+ return nmdc.ProcessedSample(
323
+ id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
324
+ )
291
325
 
292
326
  def _translate_data_object(
293
- self, do_id: str, url: str, do_type: str, checksum: str
327
+ self, do_id: str, url: str, do_type: str, manifest_id: str
294
328
  ) -> nmdc.DataObject:
295
- """Create nmdc DataObject which is the output of an OmicsProcessing process. This
329
+ """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
296
330
  object mainly contains information about the sequencing file that was generated as
297
331
  the result of running a Bioinformatics workflow on a certain ProcessedSample, which
298
332
  is the result of a LibraryPreparation process.
@@ -301,7 +335,6 @@ class NeonBenthicDataTranslator(Translator):
301
335
  :param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
302
336
  by Hugh Cross at NEON.
303
337
  :param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
304
- :param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
305
338
  at NEON.
306
339
  :return: DataObject with all the sequencing file metadata.
307
340
  """
@@ -314,14 +347,15 @@ class NeonBenthicDataTranslator(Translator):
314
347
  url=url,
315
348
  description=f"sequencing results for {basename}",
316
349
  type="nmdc:DataObject",
317
- md5_checksum=checksum,
318
350
  data_object_type=do_type,
351
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
352
+ in_manifest=manifest_id,
319
353
  )
320
354
 
321
- def get_database(self):
355
+ def get_database(self) -> nmdc.Database:
322
356
  database = nmdc.Database()
323
357
 
324
- query = """
358
+ join_query = """
325
359
  SELECT
326
360
  merged.laboratoryName,
327
361
  merged.sequencingFacilityID,
@@ -349,200 +383,190 @@ class NeonBenthicDataTranslator(Translator):
349
383
  afp.siteID,
350
384
  afp.sampleID,
351
385
  afp.collectDate
352
- FROM
353
- (
354
- SELECT
355
- bs.collectDate,
356
- bs.laboratoryName,
357
- bs.sequencingFacilityID,
358
- bs.processedDate,
359
- bs.dnaSampleID,
360
- bs.dnaSampleCode,
361
- bs.internalLabID,
362
- bs.instrument_model,
363
- bs.sequencingMethod,
364
- bs.investigation_type,
365
- bs.qaqcStatus,
366
- bs.ncbiProjectID,
367
- bd.genomicsSampleID,
368
- bd.sequenceAnalysisType,
369
- bd.sampleMass,
370
- bd.nucleicAcidConcentration
371
- FROM
372
- mms_benthicMetagenomeSequencing AS bs
373
- JOIN
374
- mms_benthicMetagenomeDnaExtraction AS bd
375
- ON
376
- bs.dnaSampleID = bd.dnaSampleID
377
- ) AS merged
386
+ FROM (
387
+ SELECT
388
+ bs.collectDate,
389
+ bs.laboratoryName,
390
+ bs.sequencingFacilityID,
391
+ bs.processedDate,
392
+ bs.dnaSampleID,
393
+ bs.dnaSampleCode,
394
+ bs.internalLabID,
395
+ bs.instrument_model,
396
+ bs.sequencingMethod,
397
+ bs.investigation_type,
398
+ bs.qaqcStatus,
399
+ bs.ncbiProjectID,
400
+ bd.genomicsSampleID,
401
+ bd.sequenceAnalysisType,
402
+ bd.sampleMass,
403
+ bd.nucleicAcidConcentration
404
+ FROM mms_benthicMetagenomeSequencing AS bs
405
+ JOIN mms_benthicMetagenomeDnaExtraction AS bd
406
+ ON bs.dnaSampleID = bd.dnaSampleID
407
+ ) AS merged
378
408
  LEFT JOIN amb_fieldParent AS afp
379
- ON
380
- merged.genomicsSampleID = afp.geneticSampleID
409
+ ON merged.genomicsSampleID = afp.geneticSampleID
381
410
  """
382
- benthic_samples = pd.read_sql_query(query, self.conn)
411
+ benthic_samples = pd.read_sql_query(join_query, self.conn)
383
412
  benthic_samples.to_sql(
384
413
  "benthicSamples", self.conn, if_exists="replace", index=False
385
414
  )
386
415
 
387
- neon_biosample_ids = benthic_samples["sampleID"]
388
- nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
389
- neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
416
+ sample_ids = benthic_samples["sampleID"]
417
+ nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
418
+ neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
390
419
 
391
- neon_extraction_ids = benthic_samples["sampleID"]
392
- nmdc_extraction_ids = self._id_minter(
393
- "nmdc:Extraction", len(neon_extraction_ids)
394
- )
395
- neon_to_nmdc_extraction_ids = dict(
396
- zip(neon_extraction_ids, nmdc_extraction_ids)
397
- )
420
+ nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
421
+ neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
398
422
 
399
- neon_extraction_processed_ids = benthic_samples["sampleID"]
400
423
  nmdc_extraction_processed_ids = self._id_minter(
401
- "nmdc:ProcessedSample", len(neon_extraction_processed_ids)
424
+ "nmdc:ProcessedSample", len(sample_ids)
402
425
  )
403
426
  neon_to_nmdc_extraction_processed_ids = dict(
404
- zip(neon_extraction_processed_ids, nmdc_extraction_processed_ids)
427
+ zip(sample_ids, nmdc_extraction_processed_ids)
405
428
  )
406
429
 
407
- neon_lib_prep_ids = benthic_samples["sampleID"]
408
- nmdc_lib_prep_ids = self._id_minter(
409
- "nmdc:LibraryPreparation", len(neon_lib_prep_ids)
410
- )
411
- neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
430
+ nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
431
+ neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
412
432
 
413
- neon_lib_prep_processed_ids = benthic_samples["sampleID"]
414
- nmdc_lib_prep_processed_ids = self._id_minter(
415
- "nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
433
+ nmdc_libprep_processed_ids = self._id_minter(
434
+ "nmdc:ProcessedSample", len(sample_ids)
416
435
  )
417
- neon_to_nmdc_lib_prep_processed_ids = dict(
418
- zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
436
+ neon_to_nmdc_libprep_processed_ids = dict(
437
+ zip(sample_ids, nmdc_libprep_processed_ids)
419
438
  )
420
439
 
421
- neon_omprc_ids = benthic_samples["sampleID"]
422
- nmdc_omprc_ids = self._id_minter("nmdc:OmicsProcessing", len(neon_omprc_ids))
423
- neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
440
+ nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
441
+ neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
424
442
 
425
- neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
426
- neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
427
- nmdc_data_object_ids = self._id_minter(
428
- "nmdc:DataObject", len(neon_raw_file_paths)
429
- )
430
- neon_to_nmdc_data_object_ids = dict(
431
- zip(neon_raw_file_paths, nmdc_data_object_ids)
432
- )
443
+ raw_df = self.neon_raw_data_file_mappings_df
444
+ raw_file_paths = raw_df["rawDataFilePath"]
445
+ dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
446
+ neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
433
447
 
434
- for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
435
- biosample_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
448
+ for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
449
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
450
+ if row.empty:
451
+ continue
436
452
 
453
+ # Example of how you might call _translate_biosample:
437
454
  database.biosample_set.append(
438
- self._translate_biosample(neon_id, nmdc_id, biosample_row)
455
+ self._translate_biosample(neon_id, biosample_id, row)
439
456
  )
440
457
 
441
- for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
442
- extraction_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
458
+ for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
459
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
460
+ if row.empty:
461
+ continue
443
462
 
444
- extraction_input = neon_to_nmdc_biosample_ids.get(neon_id)
445
- processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
463
+ biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
464
+ extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
446
465
 
447
- if extraction_input is not None and processed_sample_id is not None:
448
- database.extraction_set.append(
466
+ if biosample_id and extraction_ps_id:
467
+ database.material_processing_set.append(
449
468
  self._translate_extraction_process(
450
- nmdc_id,
451
- extraction_input,
452
- processed_sample_id,
453
- extraction_row,
469
+ extraction_id, biosample_id, extraction_ps_id, row
454
470
  )
455
471
  )
456
-
457
- genomics_sample_id = _get_value_or_none(
458
- extraction_row, "genomicsSampleID"
459
- )
460
-
472
+ genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
461
473
  database.processed_sample_set.append(
462
474
  self._translate_processed_sample(
463
- processed_sample_id,
475
+ extraction_ps_id,
464
476
  f"Extracted DNA from {genomics_sample_id}",
465
477
  )
466
478
  )
467
479
 
468
- query = """
480
+ query2 = """
469
481
  SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
470
- FROM neonRawDataFile
482
+ FROM mms_benthicRawDataFiles
471
483
  GROUP BY dnaSampleID
472
484
  """
473
- neon_raw_data_files = pd.read_sql_query(query, self.conn)
474
- neon_raw_data_files_dict = (
475
- neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
485
+ raw_data_files_df = pd.read_sql_query(query2, self.conn)
486
+ dna_files_dict = (
487
+ raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
476
488
  .str.split("|")
477
489
  .to_dict()
478
490
  )
479
- filtered_neon_raw_data_files_dict = {
480
- key: value
481
- for key, value in neon_raw_data_files_dict.items()
482
- if len(value) <= 2
483
- }
484
-
485
- for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
486
- lib_prep_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
487
-
488
- lib_prep_input = neon_to_nmdc_extraction_processed_ids.get(neon_id)
489
- processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
490
-
491
- if lib_prep_input is not None and processed_sample_id is not None:
492
- database.library_preparation_set.append(
493
- self._translate_library_preparation(
494
- nmdc_id,
495
- lib_prep_input,
496
- processed_sample_id,
497
- lib_prep_row,
498
- )
491
+
492
+ dna_sample_to_manifest_id: dict[str, str] = {}
493
+
494
+ for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
495
+ row = benthic_samples[benthic_samples["sampleID"] == neon_id]
496
+ if row.empty:
497
+ continue
498
+
499
+ extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
500
+ libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
501
+ if not extr_ps_id or not libprep_ps_id:
502
+ continue
503
+
504
+ database.material_processing_set.append(
505
+ self._translate_library_preparation(
506
+ libprep_id, extr_ps_id, libprep_ps_id, row
499
507
  )
508
+ )
500
509
 
501
- dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
510
+ dna_sample_id = _get_value_or_none(row, "dnaSampleID")
511
+ database.processed_sample_set.append(
512
+ self._translate_processed_sample(
513
+ libprep_ps_id,
514
+ f"Library preparation for {dna_sample_id}",
515
+ )
516
+ )
502
517
 
503
- database.processed_sample_set.append(
504
- self._translate_processed_sample(
505
- processed_sample_id,
506
- f"Library preparation for {dna_sample_id}",
518
+ filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
519
+ if not filepaths_for_dna:
520
+ # no raw files => skip
521
+ ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
522
+ if ntseq_id:
523
+ continue
524
+ continue
525
+
526
+ # If multiple => we create a Manifest
527
+ manifest_id: Optional[str] = None
528
+ if len(filepaths_for_dna) > 2:
529
+ if dna_sample_id not in dna_sample_to_manifest_id:
530
+ new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
531
+ dna_sample_to_manifest_id[dna_sample_id] = new_man_id
532
+ database.manifest_set.append(self._translate_manifest(new_man_id))
533
+ manifest_id = dna_sample_to_manifest_id[dna_sample_id]
534
+
535
+ has_input_value = self.samp_procsm_dict.get(neon_id)
536
+ if not has_input_value:
537
+ continue
538
+
539
+ dataobject_ids_for_run: list[str] = []
540
+ for fp in filepaths_for_dna:
541
+ if fp not in neon_to_nmdc_dataobject_ids:
542
+ continue
543
+ do_id = neon_to_nmdc_dataobject_ids[fp]
544
+
545
+ do_type = None
546
+ if "_R1.fastq.gz" in fp:
547
+ do_type = "Metagenome Raw Read 1"
548
+ elif "_R2.fastq.gz" in fp:
549
+ do_type = "Metagenome Raw Read 2"
550
+
551
+ database.data_object_set.append(
552
+ self._translate_data_object(
553
+ do_id=do_id,
554
+ url=fp,
555
+ do_type=do_type,
556
+ manifest_id=manifest_id,
507
557
  )
508
558
  )
509
-
510
- has_output = None
511
- has_output_do_ids = []
512
-
513
- if dna_sample_id in filtered_neon_raw_data_files_dict:
514
- has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
515
- for item in has_output:
516
- if item in neon_to_nmdc_data_object_ids:
517
- has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
518
-
519
- checksum = None
520
- do_type = None
521
-
522
- checksum = neon_raw_data_file_mappings_df[
523
- neon_raw_data_file_mappings_df["rawDataFilePath"] == item
524
- ]["checkSum"].values[0]
525
- if "_R1.fastq.gz" in item:
526
- do_type = "Metagenome Raw Read 1"
527
- elif "_R2.fastq.gz" in item:
528
- do_type = "Metagenome Raw Read 2"
529
-
530
- database.data_object_set.append(
531
- self._translate_data_object(
532
- neon_to_nmdc_data_object_ids.get(item),
533
- item,
534
- do_type,
535
- checksum,
536
- )
537
- )
538
-
539
- database.omics_processing_set.append(
540
- self._translate_omics_processing(
541
- neon_to_nmdc_omprc_ids.get(neon_id),
542
- processed_sample_id,
543
- has_output_do_ids,
544
- lib_prep_row,
545
- )
559
+ dataobject_ids_for_run.append(do_id)
560
+
561
+ ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
562
+ if ntseq_id:
563
+ database.data_generation_set.append(
564
+ self._translate_nucleotide_sequencing(
565
+ ntseq_id,
566
+ has_input_value, # <--- from self.samp_procsm_dict
567
+ dataobject_ids_for_run,
568
+ row,
546
569
  )
570
+ )
547
571
 
548
572
  return database