nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,698 @@
1
+ import re
2
+ import sqlite3
3
+ from typing import Dict, Optional, Union
4
+
5
+ import pandas as pd
6
+ import requests_cache
7
+
8
+ from nmdc_schema import nmdc
9
+ from nmdc_runtime.site.translation.translator import Translator
10
+ from nmdc_runtime.site.util import get_basename
11
+ from nmdc_runtime.site.translation.neon_utils import (
12
+ _get_value_or_none,
13
+ _create_controlled_identified_term_value,
14
+ _create_geolocation_value,
15
+ _create_quantity_value,
16
+ _create_timestamp_value,
17
+ _create_text_value,
18
+ )
19
+
20
+
21
# ENVO biome terms keyed by NEON aquaticSiteType; used for env_broad_scale.
SURFACE_WATER_BROAD_SCALE_MAPPINGS = {
    "lake": {"term_id": "ENVO:01000252", "term_name": "freshwater lake biome"},
    "river": {"term_id": "ENVO:01000253", "term_name": "freshwater river biome"},
    "stream": {"term_id": "ENVO:03605008", "term_name": "freshwater stream biome"},
}

# ENVO terms for env_local_scale. Lake samples are further resolved by
# matching a fragment of the NEON namedLocation (e.g. "buoy.c1"); river and
# stream site types map to a single term directly.
SURFACE_WATER_LOCAL_SCALE_MAPPINGS = {
    "lake": {
        "buoy.c0": {"term_id": "ENVO:01001191", "term_name": "water surface"},
        "buoy.c1": {"term_id": "ENVO:00002131", "term_name": "epilimnion"},
        "buoy.c2": {"term_id": "ENVO:00002269", "term_name": "thermocline"},
        "buoy.c3": {"term_id": "ENVO:00002130", "term_name": "hypolimnion"},
        "littoral": {
            "term_id": "ENVO:01000409",
            "term_name": "freshwater littoral zone",
        },
        "inflow": {"term_id": "ENVO:00000476", "term_name": "lake inlet"},
    },
    "river": {"term_id": "ENVO:01000297", "term_name": "freshwater river"},
    "stream": {"term_id": "ENVO:03605007", "term_name": "freshwater stream"},
}

# ENVO terms keyed by NEON aquaticSiteType; used for env_medium.
SURFACE_WATER_MEDIUM_MAPPINGS = {
    "lake": {"term_id": "ENVO:04000007", "term_name": "lake water"},
    "river": {"term_id": "ENVO:01000599", "term_name": "river water"},
    "stream": {"term_id": "ENVO:03605006", "term_name": "stream water"},
}
51
+
52
+
53
+ class NeonSurfaceWaterDataTranslator(Translator):
54
+ def __init__(
55
+ self,
56
+ surface_water_data: dict,
57
+ site_code_mapping: dict,
58
+ neon_envo_mappings_file: pd.DataFrame,
59
+ neon_raw_data_file_mappings_file: pd.DataFrame,
60
+ neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
61
+ *args,
62
+ **kwargs,
63
+ ) -> None:
64
+ super().__init__(*args, **kwargs)
65
+
66
+ self.conn = sqlite3.connect("neon.db")
67
+ requests_cache.install_cache("neon_api_cache")
68
+
69
+ neon_amb_data_tables = (
70
+ "mms_swMetagenomeSequencing",
71
+ "mms_swMetagenomeDnaExtraction",
72
+ "mms_swRawDataFiles",
73
+ "amc_fieldGenetic",
74
+ "amc_fieldSuperParent",
75
+ )
76
+
77
+ if all(k in surface_water_data for k in neon_amb_data_tables):
78
+ surface_water_data["mms_swMetagenomeSequencing"].to_sql(
79
+ "mms_swMetagenomeSequencing",
80
+ self.conn,
81
+ if_exists="replace",
82
+ index=False,
83
+ )
84
+ surface_water_data["mms_swMetagenomeDnaExtraction"].to_sql(
85
+ "mms_swMetagenomeDnaExtraction",
86
+ self.conn,
87
+ if_exists="replace",
88
+ index=False,
89
+ )
90
+ surface_water_data["mms_swRawDataFiles"].to_sql(
91
+ "mms_swRawDataFiles", self.conn, if_exists="replace", index=False
92
+ )
93
+ surface_water_data["amc_fieldGenetic"].to_sql(
94
+ "amc_fieldGenetic", self.conn, if_exists="replace", index=False
95
+ )
96
+ surface_water_data["amc_fieldSuperParent"].to_sql(
97
+ "amc_fieldSuperParent", self.conn, if_exists="replace", index=False
98
+ )
99
+ else:
100
+ raise ValueError(
101
+ f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
102
+ )
103
+
104
+ neon_envo_mappings_file.to_sql(
105
+ "neonEnvoTerms", self.conn, if_exists="replace", index=False
106
+ )
107
+
108
+ self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
109
+
110
+ self.site_code_mapping = site_code_mapping
111
+
112
+ self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
113
+
114
+ def _translate_biosample(
115
+ self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
116
+ ) -> nmdc.Biosample:
117
+ def map_local_scale(
118
+ aquatic_site_type: str, named_location: Optional[str] = None
119
+ ) -> Dict[str, str]:
120
+ if aquatic_site_type == "lake":
121
+ for key in SURFACE_WATER_LOCAL_SCALE_MAPPINGS.get(
122
+ aquatic_site_type, {}
123
+ ):
124
+ if key in named_location:
125
+ return SURFACE_WATER_LOCAL_SCALE_MAPPINGS[aquatic_site_type][
126
+ key
127
+ ]
128
+ elif aquatic_site_type == "river" or aquatic_site_type == "stream":
129
+ return SURFACE_WATER_LOCAL_SCALE_MAPPINGS.get(aquatic_site_type, {})
130
+ return {}
131
+
132
+ depth = None
133
+ minimum_depth = biosample_row["lakeSampleDepth1"].values[0]
134
+ maximum_depth = biosample_row["lakeSampleDepth2"].values[0]
135
+
136
+ if not pd.isna(minimum_depth):
137
+ if not pd.isna(maximum_depth):
138
+ depth = nmdc.QuantityValue(
139
+ has_minimum_numeric_value=nmdc.Float(minimum_depth),
140
+ has_maximum_numeric_value=nmdc.Float(maximum_depth),
141
+ has_unit="m",
142
+ type="nmdc:QuantityValue",
143
+ )
144
+ else:
145
+ depth = nmdc.QuantityValue(
146
+ has_numeric_value=nmdc.Float(minimum_depth),
147
+ has_unit="m",
148
+ type="nmdc:QuantityValue",
149
+ )
150
+
151
+ return nmdc.Biosample(
152
+ id=nmdc_id,
153
+ env_broad_scale=_create_controlled_identified_term_value(
154
+ SURFACE_WATER_BROAD_SCALE_MAPPINGS.get(
155
+ biosample_row["aquaticSiteType"].values[0]
156
+ ).get("term_id"),
157
+ SURFACE_WATER_BROAD_SCALE_MAPPINGS.get(
158
+ biosample_row["aquaticSiteType"].values[0]
159
+ ).get("term_name"),
160
+ ),
161
+ env_local_scale=_create_controlled_identified_term_value(
162
+ map_local_scale(
163
+ biosample_row["aquaticSiteType"].values[0],
164
+ biosample_row["namedLocation"].values[0],
165
+ ).get("term_id"),
166
+ map_local_scale(
167
+ biosample_row["aquaticSiteType"].values[0],
168
+ biosample_row["namedLocation"].values[0],
169
+ ).get("term_name"),
170
+ ),
171
+ env_medium=_create_controlled_identified_term_value(
172
+ SURFACE_WATER_MEDIUM_MAPPINGS.get(
173
+ biosample_row["aquaticSiteType"].values[0]
174
+ ).get("term_id"),
175
+ SURFACE_WATER_MEDIUM_MAPPINGS.get(
176
+ biosample_row["aquaticSiteType"].values[0]
177
+ ).get("term_name"),
178
+ ),
179
+ name=neon_id,
180
+ lat_lon=_create_geolocation_value(
181
+ biosample_row["decimalLatitude"].values[0],
182
+ biosample_row["decimalLongitude"].values[0],
183
+ ),
184
+ elev=nmdc.Float(biosample_row["elevation"].values[0]),
185
+ collection_date=_create_timestamp_value(
186
+ biosample_row["seqCollectDate"].values[0]
187
+ ),
188
+ geo_loc_name=_create_text_value(
189
+ self.site_code_mapping[biosample_row["siteID"].values[0]]
190
+ if biosample_row["siteID"].values[0]
191
+ else None
192
+ ),
193
+ samp_collec_device=biosample_row["samplerType"].values[0],
194
+ diss_oxygen=_create_quantity_value(
195
+ biosample_row["dissolvedOxygen"].values[0], "mg/L"
196
+ ),
197
+ conduc=_create_quantity_value(
198
+ biosample_row["specificConductance"].values[0], "uS/cm"
199
+ ),
200
+ temp=_create_quantity_value(biosample_row["waterTemp"].values[0], "Cel"),
201
+ type="nmdc:Biosample",
202
+ analysis_type="metagenomics",
203
+ biosample_categories="NEON",
204
+ depth=depth,
205
+ samp_size=_create_quantity_value(
206
+ biosample_row["geneticFilteredSampleVolume"].values[0], "mL"
207
+ ),
208
+ env_package=nmdc.TextValue(has_raw_value="water", type="nmdc:TextValue"),
209
+ associated_studies=["nmdc:sty-11-hht5sb92"],
210
+ )
211
+
212
+ def _translate_extraction_process(
213
+ self,
214
+ extraction_id: str,
215
+ extraction_input: str,
216
+ processed_sample_id: str,
217
+ extraction_row: pd.DataFrame,
218
+ ) -> nmdc.Extraction:
219
+ """
220
+ Create an nmdc Extraction process, which is a process to model the DNA extraction in
221
+ a metagenome sequencing experiment. The input to an Extraction process is the
222
+ output from a Pooling process.
223
+
224
+ :param extraction_id: Minted id for Extraction process.
225
+ :param extraction_input: Input to an Extraction process is the output from a Pooling process.
226
+ :param processed_sample_id: Output of Extraction process is a ProcessedSample.
227
+ :param extraction_row: DataFrame with Extraction process metadata.
228
+ :return: Extraction process object.
229
+ """
230
+ processing_institution = None
231
+ laboratory_name = _get_value_or_none(extraction_row, "laboratoryName")
232
+ if laboratory_name is not None:
233
+ if re.search("Battelle", laboratory_name, re.IGNORECASE):
234
+ processing_institution = "Battelle"
235
+ elif re.search("Argonne", laboratory_name, re.IGNORECASE):
236
+ processing_institution = "ANL"
237
+
238
+ return nmdc.Extraction(
239
+ id=extraction_id,
240
+ has_input=extraction_input,
241
+ has_output=processed_sample_id,
242
+ start_date=_get_value_or_none(extraction_row, "extrCollectDate"),
243
+ end_date=_get_value_or_none(extraction_row, "extrProcessedDate"),
244
+ input_mass=_create_quantity_value(
245
+ _get_value_or_none(extraction_row, "sampleMass"), "g"
246
+ ),
247
+ qc_status=nmdc.StatusEnum(
248
+ _get_value_or_none(extraction_row, "extrQaqcStatus")
249
+ ),
250
+ processing_institution=processing_institution,
251
+ type="nmdc:Extraction",
252
+ )
253
+
254
+ def _translate_library_preparation(
255
+ self,
256
+ library_preparation_id: str,
257
+ library_preparation_input: str,
258
+ processed_sample_id: str,
259
+ library_preparation_row: pd.DataFrame,
260
+ ):
261
+ """
262
+ Create LibraryPreparation process object. The input to LibraryPreparation process
263
+ is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
264
+ process is fed as input to an NucleotideSequencing object.
265
+
266
+ :param library_preparation_id: Minted id for LibraryPreparation process.
267
+ :param library_preparation_input: Input to LibraryPreparation process is output from
268
+ Extraction process.
269
+ :param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
270
+ is also input to NucleotideSequencing.
271
+ :param library_preparation_row: Metadata required to populate LibraryPreparation.
272
+ :return: Object that using LibraryPreparation process model.
273
+ """
274
+ processing_institution = None
275
+ laboratory_name = _get_value_or_none(library_preparation_row, "laboratoryName")
276
+ if laboratory_name is not None:
277
+ if re.search("Battelle", laboratory_name, re.IGNORECASE):
278
+ processing_institution = "Battelle"
279
+ elif re.search("Argonne", laboratory_name, re.IGNORECASE):
280
+ processing_institution = "ANL"
281
+
282
+ return nmdc.LibraryPreparation(
283
+ id=library_preparation_id,
284
+ has_input=library_preparation_input,
285
+ has_output=processed_sample_id,
286
+ start_date=_get_value_or_none(library_preparation_row, "seqCollectDate"),
287
+ end_date=_get_value_or_none(library_preparation_row, "seqProcessedDate"),
288
+ processing_institution=processing_institution,
289
+ type="nmdc:LibraryPreparation",
290
+ )
291
+
292
+ def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
293
+ if not instrument_model:
294
+ raise ValueError(
295
+ f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
296
+ )
297
+
298
+ df = self.neon_nmdc_instrument_map_df
299
+ matching_row = df[
300
+ df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
301
+ ]
302
+
303
+ if not matching_row.empty:
304
+ nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
305
+ return nmdc_instrument_id
306
+
307
+ def _translate_nucleotide_sequencing(
308
+ self,
309
+ nucleotide_sequencing_id: str,
310
+ processed_sample_id: str,
311
+ raw_data_file_data: str,
312
+ nucleotide_sequencing_row: pd.DataFrame,
313
+ ):
314
+ """Create nmdc NucleotideSequencing object. This class typically models the run of a
315
+ Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
316
+ process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
317
+ is a DataObject which has the FASTQ sequence file URLs embedded in them.
318
+
319
+ :param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
320
+ :param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
321
+ :param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
322
+ files embedded in them.
323
+ :param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
324
+ process/run.
325
+ :return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
326
+ """
327
+ processing_institution = None
328
+ sequencing_facility = _get_value_or_none(
329
+ nucleotide_sequencing_row, "sequencingFacilityID"
330
+ )
331
+ if sequencing_facility is not None:
332
+ if re.search("Battelle", sequencing_facility, re.IGNORECASE):
333
+ processing_institution = "Battelle"
334
+ elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
335
+ processing_institution = "ANL"
336
+
337
+ return nmdc.NucleotideSequencing(
338
+ id=nucleotide_sequencing_id,
339
+ has_input=processed_sample_id,
340
+ has_output=raw_data_file_data,
341
+ processing_institution=processing_institution,
342
+ ncbi_project_name=_get_value_or_none(
343
+ nucleotide_sequencing_row, "ncbiProjectID"
344
+ ),
345
+ instrument_used=self._get_instrument_id(
346
+ _get_value_or_none(nucleotide_sequencing_row, "instrument_model")
347
+ ),
348
+ name=f"Surface water microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
349
+ type="nmdc:NucleotideSequencing",
350
+ associated_studies=["nmdc:sty-11-hht5sb92"],
351
+ analyte_category="metagenome",
352
+ )
353
+
354
+ def _translate_processed_sample(
355
+ self, processed_sample_id: str, sample_id: str
356
+ ) -> nmdc.ProcessedSample:
357
+ """
358
+ Create an nmdc ProcessedSample. ProcessedSample is typically the output of a PlannedProcess
359
+ like Pooling, Extraction, LibraryPreparation, etc. We are using this to create a
360
+ reference for the nmdc minted ProcessedSample ids in `processed_sample_set`. We are
361
+ associating the minted ids with the name of the sample it is coming from which can be
362
+ a value from either the `genomicsSampleID` column or from the `dnaSampleID` column.
363
+
364
+ :param processed_sample_id: NMDC minted ProcessedSampleID.
365
+ :param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
366
+ :return: ProcessedSample objects to be stored in `processed_sample_set`.
367
+ """
368
+ return nmdc.ProcessedSample(
369
+ id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
370
+ )
371
+
372
+ def _translate_data_object(
373
+ self, do_id: str, url: str, do_type: str, manifest_id: str
374
+ ) -> nmdc.DataObject:
375
+ """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
376
+ object mainly contains information about the sequencing file that was generated as
377
+ the result of running a Bioinformatics workflow on a certain ProcessedSample, which
378
+ is the result of a LibraryPreparation process.
379
+
380
+ :param do_id: NMDC minted DataObject id.
381
+ :param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
382
+ by Hugh Cross at NEON.
383
+ :param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
384
+ :param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
385
+ at NEON.
386
+ :return: DataObject with all the sequencing file metadata.
387
+ """
388
+ file_name = get_basename(url)
389
+ basename = file_name.split(".", 1)[0]
390
+
391
+ return nmdc.DataObject(
392
+ id=do_id,
393
+ name=file_name,
394
+ url=url,
395
+ description=f"sequencing results for {basename}",
396
+ type="nmdc:DataObject",
397
+ data_object_type=do_type,
398
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
399
+ in_manifest=manifest_id,
400
+ )
401
+
402
+ def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
403
+ return nmdc.Manifest(
404
+ id=manifest_id,
405
+ manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
406
+ type="nmdc:Manifest",
407
+ )
408
+
409
+ def get_database(self):
410
+ database = nmdc.Database()
411
+
412
+ query = """
413
+ SELECT
414
+ merged.laboratoryName,
415
+ merged.sequencingFacilityID,
416
+ merged.extrProcessedDate,
417
+ merged.seqProcessedDate,
418
+ merged.dnaSampleID,
419
+ merged.internalLabID,
420
+ merged.instrument_model,
421
+ merged.sequencingMethod,
422
+ merged.investigation_type,
423
+ merged.extrQaqcStatus,
424
+ merged.ncbiProjectID,
425
+ merged.genomicsSampleID,
426
+ merged.sequenceAnalysisType,
427
+ merged.sampleMass,
428
+ merged.nucleicAcidConcentration,
429
+ merged.siteID,
430
+ merged.seqCollectDate,
431
+ merged.extrCollectDate,
432
+ afg.geneticSampleID,
433
+ afg.geneticFilteredSampleVolume,
434
+ afg.sampleMaterial,
435
+ afs.parentSampleID,
436
+ afs.namedLocation,
437
+ afs.decimalLatitude,
438
+ afs.decimalLongitude,
439
+ afs.elevation,
440
+ afs.aquaticSiteType,
441
+ afs.samplerType,
442
+ afs.dissolvedOxygen,
443
+ afs.specificConductance,
444
+ afs.waterTemp,
445
+ afs.lakeSampleDepth1,
446
+ afs.lakeSampleDepth2
447
+ FROM
448
+ (
449
+ SELECT
450
+ msq.collectDate AS seqCollectDate,
451
+ mde.collectDate AS extrCollectDate,
452
+ msq.laboratoryName,
453
+ msq.sequencingFacilityID,
454
+ msq.processedDate AS seqProcessedDate,
455
+ mde.processedDate AS extrProcessedDate,
456
+ msq.dnaSampleID,
457
+ msq.internalLabID,
458
+ msq.instrument_model,
459
+ msq.sequencingMethod,
460
+ msq.investigation_type,
461
+ mde.qaqcStatus AS extrQaqcStatus,
462
+ msq.ncbiProjectID,
463
+ msq.siteID,
464
+ msq.labPrepMethod,
465
+ mde.genomicsSampleID,
466
+ mde.sequenceAnalysisType,
467
+ mde.sampleMass,
468
+ mde.nucleicAcidConcentration,
469
+ mde.nucleicAcidQuantMethod,
470
+ mde.nucleicAcidPurity
471
+ FROM
472
+ mms_swMetagenomeSequencing AS msq
473
+ JOIN
474
+ mms_swMetagenomeDnaExtraction AS mde
475
+ ON
476
+ msq.dnaSampleID = mde.dnaSampleID
477
+ ) AS merged
478
+ JOIN amc_fieldGenetic AS afg
479
+ ON
480
+ merged.genomicsSampleID = afg.geneticSampleID
481
+ JOIN amc_fieldSuperParent AS afs
482
+ ON
483
+ afg.parentSampleID = afs.parentSampleID
484
+ """
485
+ surface_water_samples = pd.read_sql_query(query, self.conn)
486
+
487
+ # --------------------------------------------------
488
+ # Create mappings for minted NMDC IDs
489
+ # --------------------------------------------------
490
+ neon_biosample_ids = surface_water_samples["parentSampleID"]
491
+ nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
492
+ neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
493
+
494
+ neon_extraction_ids = surface_water_samples["parentSampleID"]
495
+ nmdc_extraction_ids = self._id_minter(
496
+ "nmdc:Extraction", len(neon_extraction_ids)
497
+ )
498
+ neon_to_nmdc_extraction_ids = dict(
499
+ zip(neon_extraction_ids, nmdc_extraction_ids)
500
+ )
501
+
502
+ neon_extraction_processed_ids = surface_water_samples["parentSampleID"]
503
+ nmdc_extraction_processed_ids = self._id_minter(
504
+ "nmdc:ProcessedSample", len(neon_extraction_processed_ids)
505
+ )
506
+ neon_to_nmdc_extraction_processed_ids = dict(
507
+ zip(neon_extraction_processed_ids, nmdc_extraction_processed_ids)
508
+ )
509
+
510
+ neon_lib_prep_ids = surface_water_samples["parentSampleID"]
511
+ nmdc_lib_prep_ids = self._id_minter(
512
+ "nmdc:LibraryPreparation", len(neon_lib_prep_ids)
513
+ )
514
+ neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
515
+
516
+ neon_lib_prep_processed_ids = surface_water_samples["parentSampleID"]
517
+ nmdc_lib_prep_processed_ids = self._id_minter(
518
+ "nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
519
+ )
520
+ neon_to_nmdc_lib_prep_processed_ids = dict(
521
+ zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
522
+ )
523
+
524
+ # --------------------------------------------------
525
+ # STEP 1: Insert Biosamples
526
+ # --------------------------------------------------
527
+ for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
528
+ biosample_row = surface_water_samples[
529
+ surface_water_samples["parentSampleID"] == neon_id
530
+ ]
531
+ # database.biosample_set.append(
532
+ # self._translate_biosample(neon_id, nmdc_id, biosample_row)
533
+ # )
534
+
535
+ # --------------------------------------------------
536
+ # STEP 2: Insert Extraction Processes
537
+ # --------------------------------------------------
538
+ for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
539
+ extraction_row = surface_water_samples[
540
+ surface_water_samples["parentSampleID"] == neon_id
541
+ ]
542
+
543
+ extraction_input = neon_to_nmdc_biosample_ids.get(neon_id)
544
+ processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
545
+
546
+ if extraction_input is not None and processed_sample_id is not None:
547
+ database.material_processing_set.append(
548
+ self._translate_extraction_process(
549
+ nmdc_id,
550
+ extraction_input,
551
+ processed_sample_id,
552
+ extraction_row,
553
+ )
554
+ )
555
+
556
+ genomics_sample_id = _get_value_or_none(
557
+ extraction_row, "genomicsSampleID"
558
+ )
559
+
560
+ # Each Extraction process output => ProcessedSample
561
+ database.processed_sample_set.append(
562
+ self._translate_processed_sample(
563
+ processed_sample_id,
564
+ f"Extracted DNA from {genomics_sample_id}",
565
+ )
566
+ )
567
+
568
+ # --------------------------------------------------
569
+ # STEP 3: Insert LibraryPreparation Processes
570
+ # --------------------------------------------------
571
+ for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
572
+ lib_prep_row = surface_water_samples[
573
+ surface_water_samples["parentSampleID"] == neon_id
574
+ ]
575
+
576
+ lib_prep_input = neon_to_nmdc_extraction_processed_ids.get(neon_id)
577
+ processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
578
+
579
+ if lib_prep_input is not None and processed_sample_id is not None:
580
+ database.material_processing_set.append(
581
+ self._translate_library_preparation(
582
+ nmdc_id,
583
+ lib_prep_input,
584
+ processed_sample_id,
585
+ lib_prep_row,
586
+ )
587
+ )
588
+
589
+ dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
590
+
591
+ # Each LibraryPreparation process output => ProcessedSample
592
+ database.processed_sample_set.append(
593
+ self._translate_processed_sample(
594
+ processed_sample_id,
595
+ f"Library preparation for {dna_sample_id}",
596
+ )
597
+ )
598
+
599
+ # --------------------------------------------------
600
+ # STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
601
+ # and insert DataObjects + DataGeneration processes
602
+ # --------------------------------------------------
603
+ raw_query = """
604
+ SELECT dnaSampleID, sequencerRunID, rawDataFilePath
605
+ FROM mms_swRawDataFiles
606
+ """
607
+ neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
608
+
609
+ for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
610
+ # 1) Pull out the row that corresponds to this parentSampleID
611
+ lib_prep_row = surface_water_samples[
612
+ surface_water_samples["parentSampleID"] == neon_id
613
+ ]
614
+
615
+ # 2) Grab the dnaSampleID from that row
616
+ dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
617
+ if not dna_sample_id:
618
+ # No dnaSampleID => skip
619
+ continue
620
+
621
+ # 3) Find all raw files for that dnaSampleID
622
+ dna_files = neon_raw_data_files_df[
623
+ neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
624
+ ]
625
+ if dna_files.empty:
626
+ # No raw files => skip
627
+ continue
628
+
629
+ # -----------------------------------------
630
+ # LOOKUP DICT: get "has_input" for this neon_id
631
+ # -----------------------------------------
632
+ has_input_value = self.samp_procsm_dict.get(neon_id)
633
+ # If some neon_id isn't in the dictionary, handle it as needed
634
+ if not has_input_value:
635
+ # Could skip, or raise an error, or set a default
636
+ continue
637
+
638
+ # -------------------------------------------
639
+ # 4) CREATE A MANIFEST IF MULTIPLE RAW FILES
640
+ # for this row's dnaSampleID
641
+ # -------------------------------------------
642
+ manifest_id = None
643
+ if len(dna_files) > 2:
644
+ # For each row that references a dnaSampleID with multiple raw files,
645
+ # mint exactly one new manifest record
646
+ manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
647
+ new_manifest = self._translate_manifest(manifest_id)
648
+ # Add to the database
649
+ database.manifest_set.append(new_manifest)
650
+
651
+ # -------------------------------------------
652
+ # 5) NOW GROUP FILES BY sequencerRunID
653
+ # => one data_generation record per run
654
+ # -------------------------------------------
655
+ lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
656
+ neon_id
657
+ )
658
+ if not lib_prep_processed_sample_id:
659
+ # If we don't have a ProcessedSample for some reason, skip
660
+ continue
661
+
662
+ for run_id, group_df in dna_files.groupby("sequencerRunID"):
663
+ # a) Mint new data_generation (NucleotideSequencing) ID for this run
664
+ data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
665
+
666
+ # b) Create DataObjects for each raw file in this run
667
+ data_object_ids = []
668
+ for raw_fp in group_df["rawDataFilePath"]:
669
+ do_id = self._id_minter("nmdc:DataObject", 1)[0]
670
+
671
+ # Distinguish read type
672
+ do_type = None
673
+ if "_R1.fastq.gz" in raw_fp:
674
+ do_type = "Metagenome Raw Read 1"
675
+ elif "_R2.fastq.gz" in raw_fp:
676
+ do_type = "Metagenome Raw Read 2"
677
+
678
+ # Create the DataObject
679
+ data_obj = self._translate_data_object(
680
+ do_id=do_id,
681
+ url=raw_fp,
682
+ do_type=do_type,
683
+ manifest_id=manifest_id, # link to the new Manifest if it exists
684
+ )
685
+ database.data_object_set.append(data_obj)
686
+ data_object_ids.append(do_id)
687
+
688
+ # c) Finally, create the data generation record for this run
689
+ database.data_generation_set.append(
690
+ self._translate_nucleotide_sequencing(
691
+ nucleotide_sequencing_id=data_generation_id,
692
+ processed_sample_id=has_input_value,
693
+ raw_data_file_data=data_object_ids,
694
+ nucleotide_sequencing_row=lib_prep_row,
695
+ )
696
+ )
697
+
698
+ return database