nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,402 +0,0 @@
1
- ## author: Bill Duncan
2
- ## summary: Contains class with methods and properties for transforming data in NMDC ETL pipeline.
3
-
4
- import pickle
5
- import nmdc_runtime.lib.transform_nmdc_data as tx
6
- import nmdc_runtime.lib.extract_nmdc_data as ex
7
- import nmdc_runtime.lib.load_nmdc_data as lx
8
- import nmdc_runtime.lib.nmdc_dataframes as nmdc_dfs
9
- from nmdc_schema import nmdc
10
- import pandas as pds
11
- import jq
12
- import jsonasobj
13
- import json
14
- import zipfile
15
- import yaml
16
- from yaml import CLoader as Loader, CDumper as Dumper
17
- from dotted_dict import DottedDict
18
- from collections import namedtuple
19
-
20
-
21
class NMDC_ETL:
    """
    Encapsulates ETL operations on NMDC data.

    An instance holds:
      * ``data_source_spec`` -- the data source specification loaded from a YAML file,
      * ``sssom_map``        -- the attribute map built by ``tx.make_attribute_map``,
      * ``nmdc_data``        -- an ``NMDC_DATA`` instance (built from the merged data
                                file, or restored from a pickle file).

    Each ``transform_*`` method stores its result on the instance (e.g. ``study_dict``)
    and returns it; the matching ``save_*`` method hands that stored result to
    ``lx.save_nmdc_dict``.
    """

    ####### BEGIN INNER CLASS #############
    class NMDC_DATA:
        """
        Encapsulates data used during ETL.
        It is useful b/c it allows for the class to be created from a pickle file.
        Using pickled data speeds up load times, and is useful for testing.
        """

        ## merged dataframe that holds all the data
        merged_dataframe = None

        ## tables from merged dataset
        study_table = None
        contact_table = None
        proposals_table = None
        project_table = None
        jgi_emsl_table = None
        emsl_table = None
        emsl_biosample_table = None
        fastq_table = None
        project_biosample_table = None
        biosample_table = None
        soil_package_table = None
        water_package_table = None

        ## dataframes built from tables
        study = None  # gold studies
        emsl = None  # emsl projects / data objects
        # data_objects = None # jgi data objects
        fastq = None
        biosample = None  # gold biosamples
        project = None  # gold projects

        def __init__(self, merged_data_file, pickled_data=""):
            """
            Read the merged data file and build every table and dataframe from it.

            Args:
                merged_data_file: path (or file-like object) of the tab-separated
                    merged dataset; every column is read as ``str``.
                pickled_data: not used by this constructor; kept so existing
                    callers that pass it keep working.
            """
            ## create merged dataframe
            self.merged_dataframe = pds.read_csv(merged_data_file, sep="\t", dtype=str)

            def table(name):
                # Every table is cut out of the same merged dataframe.
                return ex.extract_table(self.merged_dataframe, name)

            ## Extract tables from merged dataset
            self.study_table = table("study_table")
            self.contact_table = table("contact_table")
            self.proposals_table = table("proposals_table")
            self.project_table = table("project_table")
            self.jgi_emsl_table = table("ficus_jgi_emsl")
            self.emsl_table = table("ficus_emsl")
            self.emsl_biosample_table = table("ficus_emsl_biosample")
            self.fastq_table = table("ficus_fastq_table")
            self.project_biosample_table = table("project_biosample_table")
            self.biosample_table = table("biosample_table")
            self.soil_package_table = table("soil_package_table")
            self.water_package_table = table("water_package_table")

            ## build dataframes from tables
            self.study = nmdc_dfs.make_study_dataframe(
                self.study_table, self.contact_table, self.proposals_table
            )  # gold studies

            self.emsl = nmdc_dfs.make_emsl_dataframe(
                self.emsl_table,
                self.jgi_emsl_table,
                self.study_table,
                self.emsl_biosample_table,
            )  # emsl projects / data objects

            # self.data_objects = nmdc_dfs.make_data_objects_dataframe(
            #     self.faa_table, self.fna_table, self.fastq_table, self.project_table
            # )  # jgi data objects

            self.fastq = nmdc_dfs.make_jgi_fastq_dataframe(
                self.fastq_table, self.project_table
            )

            self.biosample = nmdc_dfs.make_biosample_dataframe(
                self.biosample_table,
                self.soil_package_table,
                self.water_package_table,
                self.project_biosample_table,
                self.project_table,
                self.study_table,
            )  # gold biosamples

            # The project dataframe depends on the fastq and biosample dataframes,
            # so it has to be built last.
            self.project = nmdc_dfs.make_project_dataframe(
                self.project_table,
                self.study_table,
                self.contact_table,
                self.fastq,
                self.project_biosample_table,
                self.biosample,
            )  # gold projects

        def save_as_pickle(self, save_path):
            """Pickle this object (all tables and dataframes) to ``save_path``."""
            with open(save_path, "wb") as handle:
                pickle.dump(self, handle)

    ####### END INNER CLASS #############

    ## dicts that result from transformation methods
    study_dict = None
    omics_processing_dict = None
    biosample_dict = None
    emsl_omics_processing_dict = None
    emsl_data_object_dict = None
    jgi_data_object_dict = None

    ## dict to hold the datasource spec
    data_source_spec = None

    ## dict to hold sssom mappings
    sssom_map = None

    ## variable to hold nmdc data
    nmdc_data = None

    def __init__(
        self, merged_data_file, data_source_spec_file, sssom_file, pickled_data=""
    ):
        """
        Load the data source spec and SSSOM map, then load or build the NMDC data.

        Args:
            merged_data_file: merged dataset passed to ``NMDC_DATA`` when no pickle
                file is given.
            data_source_spec_file: path of the YAML data source specification.
            sssom_file: passed to ``tx.make_attribute_map`` to build the attribute map.
            pickled_data: path of a pickle file written by ``pickle_nmdc_data``. When
                empty (or ``None``) the data is built from ``merged_data_file``.
        """
        ## build data source specification
        with open(data_source_spec_file, "r") as input_file:
            # NOTE(review): the (C)Loader can construct arbitrary Python objects;
            # only load spec files from a trusted source.
            self.data_source_spec = yaml.load(input_file, Loader=Loader)

        ## build sssom mapping
        self.sssom_map = tx.make_attribute_map(sssom_file)

        ## load NMDC DATA class
        if pickled_data:
            # NOTE(review): unpickling executes code embedded in the file;
            # only load pickles this pipeline produced itself.
            with open(pickled_data, "rb") as handle:
                self.nmdc_data = pickle.load(handle)
        else:
            self.nmdc_data = self.NMDC_DATA(merged_data_file, pickled_data)

    def pickle_nmdc_data(self, save_path):
        """Pickle the held ``nmdc_data`` object to ``save_path``."""
        self.nmdc_data.save_as_pickle(save_path)

    @staticmethod
    def transform_dataframe(
        nmdc_df: pds.DataFrame,
        nmdc_class,
        constructor_map=None,
        attribute_fields=None,
        attribute_map=None,
        transform_map=None,
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform a dataframe via ``tx.dataframe_to_dict``.

        Args:
            nmdc_df: dataframe to transform.
            nmdc_class: class forwarded to ``tx.dataframe_to_dict``.
            constructor_map: forwarded as-is; ``None`` means an empty dict.
            attribute_fields: forwarded as-is; ``None`` means an empty list.
            attribute_map: forwarded as-is; ``None`` means an empty dict.
            transform_map: forwarded as-is; ``None`` means an empty dict.
            test_rows: when non-zero, only ``nmdc_df.head(test_rows)`` is transformed.
            print_df: print the (possibly truncated) dataframe before transforming.
            print_dict: print the transformation result.

        Returns:
            Whatever ``tx.dataframe_to_dict`` returns.
        """
        # Fresh containers per call: mutable defaults in the signature would be
        # shared between calls.
        constructor_map = {} if constructor_map is None else constructor_map
        attribute_fields = [] if attribute_fields is None else attribute_fields
        attribute_map = {} if attribute_map is None else attribute_map
        transform_map = {} if transform_map is None else transform_map

        ## used for testing
        if test_rows != 0:
            nmdc_df = nmdc_df.head(test_rows)
        if print_df:
            print(nmdc_df)

        ## create nmdc dict of data from dataframe
        nmdc_dict = tx.dataframe_to_dict(
            nmdc_df,
            nmdc_class,
            constructor_map=constructor_map,
            attribute_fields=attribute_fields,
            attribute_map=attribute_map,
            transform_map=transform_map,
        )

        ## used for testing
        if print_dict:
            print(nmdc_dict)

        return nmdc_dict

    def _transform_with_spec(
        self,
        nmdc_df,
        nmdc_class,
        data_source_class,
        test_rows,
        print_df,
        print_dict,
        use_transforms=False,
    ):
        """Run ``transform_dataframe`` with the spec entries of ``data_source_class``."""
        class_spec = self.data_source_spec["classes"][data_source_class]

        ## specify constructor args and attributes
        spec_kwargs = {
            "constructor_map": class_spec["constructor"],
            "attribute_fields": class_spec["attributes"],
            "attribute_map": self.sssom_map,
        }
        # Only some data source classes define transforms; the key is read
        # solely for those, so a missing "transforms" entry is not an error
        # for the others.
        if use_transforms:
            spec_kwargs["transform_map"] = class_spec["transforms"]

        return NMDC_ETL.transform_dataframe(
            nmdc_df=nmdc_df,
            nmdc_class=nmdc_class,
            test_rows=test_rows,
            print_df=print_df,
            print_dict=print_dict,
            **spec_kwargs,
        )

    def transform_study(
        self,
        data_source_class="gold_study",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the study dataframe into ``nmdc.Study`` data; stored in ``study_dict``."""
        self.study_dict = self._transform_with_spec(
            self.nmdc_data.study,
            nmdc.Study,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.study_dict

    def save_study(
        self, file_path="output/nmdc_etl/gold_study.json", data_format="json"
    ):
        """Save ``study_dict`` via ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.study_dict, file_path, data_format)

    def transform_omics_processing(
        self,
        data_source_class="gold_omics_processing",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the project dataframe into ``nmdc.OmicsProcessing`` data
        (using the spec's transforms); stored in ``omics_processing_dict``.
        """
        self.omics_processing_dict = self._transform_with_spec(
            self.nmdc_data.project,
            nmdc.OmicsProcessing,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
            use_transforms=True,
        )
        return self.omics_processing_dict

    def save_omics_processing(
        self, file_path="output/nmdc_etl/gold_omics_processing.json", data_format="json"
    ):
        """Save ``omics_processing_dict`` via ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.omics_processing_dict, file_path, data_format)

    def transform_biosample(
        self,
        data_source_class="gold_biosample",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the biosample dataframe into ``nmdc.Biosample`` data
        (using the spec's transforms); stored in ``biosample_dict``.
        """
        self.biosample_dict = self._transform_with_spec(
            self.nmdc_data.biosample,
            nmdc.Biosample,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
            use_transforms=True,
        )
        return self.biosample_dict

    def save_biosample(
        self, file_path="output/nmdc_etl/gold_biosample.json", data_format="json"
    ):
        """Save ``biosample_dict`` via ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.biosample_dict, file_path, data_format)

    def transform_emsl_omics_processing(
        self,
        data_source_class="emsl_omics_processing",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the EMSL dataframe into ``nmdc.OmicsProcessing`` data;
        stored in ``emsl_omics_processing_dict``.
        """
        self.emsl_omics_processing_dict = self._transform_with_spec(
            self.nmdc_data.emsl,
            nmdc.OmicsProcessing,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.emsl_omics_processing_dict

    def save_emsl_omics_processing(
        self, file_path="output/nmdc_etl/emsl_omics_processing.json", data_format="json"
    ):
        """Save ``emsl_omics_processing_dict`` via ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(
            self.emsl_omics_processing_dict, file_path, data_format
        )

    def transform_emsl_data_object(
        self,
        data_source_class="emsl_data_object",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the EMSL dataframe into ``nmdc.DataObject`` data;
        stored in ``emsl_data_object_dict``.
        """
        self.emsl_data_object_dict = self._transform_with_spec(
            self.nmdc_data.emsl,
            nmdc.DataObject,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.emsl_data_object_dict

    def save_emsl_data_object(
        self, file_path="output/nmdc_etl/emsl_data_objects.json", data_format="json"
    ):
        """Save ``emsl_data_object_dict`` via ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.emsl_data_object_dict, file_path, data_format)

    def transform_jgi_data_object(
        self,
        data_source_class="jgi_data_object",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the fastq dataframe into ``nmdc.DataObject`` data;
        stored in ``jgi_data_object_dict``.
        """
        self.jgi_data_object_dict = self._transform_with_spec(
            self.nmdc_data.fastq,
            nmdc.DataObject,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.jgi_data_object_dict

    def save_jgi_data_object(
        self,
        file_path="output/nmdc_etl/jgi_fastq_data_objects.json",
        data_format="json",
    ):
        """Save ``jgi_data_object_dict`` via ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.jgi_data_object_dict, file_path, data_format)