nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (77)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,396 +0,0 @@
1
- ## author: Bill Duncan
2
- ## summary: Contains class with methods and properties for transforming data in NMDC ETL pipeline.
3
-
4
- import pickle
5
- import nmdc_runtime.lib.transform_nmdc_data as tx
6
- import nmdc_runtime.lib.extract_nmdc_data as ex
7
- import nmdc_runtime.lib.load_nmdc_data as lx
8
- import nmdc_runtime.lib.nmdc_dataframes as nmdc_dfs
9
- from nmdc_schema import nmdc
10
- import pandas as pds
11
- import yaml
12
- from yaml import CLoader as Loader
13
-
14
-
15
class NMDC_ETL:
    """
    Encapsulates ETL operations on data.

    On construction the data source spec (YAML) and the SSSOM attribute map
    are loaded, and the source data is either unpickled or rebuilt from the
    merged data file. Each ``transform_*`` method converts one dataframe of
    ``nmdc_data`` with ``tx.dataframe_to_dict`` and caches the result on the
    instance; the matching ``save_*`` method hands that cached result to
    ``lx.save_nmdc_dict``.
    """

    ####### BEGIN INNER CLASS #############
    class NMDC_DATA:
        """
        Encapsulates data used during ETL.
        It is useful b/c it allows for the class to be created from a pickle file.
        Using pickled data speeds up load times, and is useful for testing.
        """

        ## merged dataframe that holds all the data
        merged_dataframe = None

        ## tables from merged dataset
        study_table = None
        contact_table = None
        proposals_table = None
        project_table = None
        jgi_emsl_table = None
        emsl_table = None
        emsl_biosample_table = None
        fastq_table = None
        project_biosample_table = None
        biosample_table = None
        soil_package_table = None
        water_package_table = None

        ## dataframes built from tables
        study = None  # gold studies
        emsl = None  # emsl projects / data objects
        # data_objects = None # jgi data objects
        fastq = None
        biosample = None  # gold biosamples
        project = None  # gold projects

        ## (attribute name, table name in the merged dataset), in extraction order
        _TABLE_SOURCES = (
            ("study_table", "study_table"),
            ("contact_table", "contact_table"),
            ("proposals_table", "proposals_table"),
            ("project_table", "project_table"),
            ("jgi_emsl_table", "ficus_jgi_emsl"),
            ("emsl_table", "ficus_emsl"),
            ("emsl_biosample_table", "ficus_emsl_biosample"),
            ("fastq_table", "ficus_fastq_table"),
            ("project_biosample_table", "project_biosample_table"),
            ("biosample_table", "biosample_table"),
            ("soil_package_table", "soil_package_table"),
            ("water_package_table", "water_package_table"),
        )

        def __init__(self, merged_data_file, pickled_data=""):
            """
            Build every table and dataframe from the merged data file.

            Args:
                merged_data_file: tab-separated file (path or buffer accepted
                    by ``pandas.read_csv``); all columns are read as strings.
                pickled_data: not used by this constructor; kept so existing
                    callers that pass it keep working.
            """
            ## create merged dataframe
            self.merged_dataframe = pds.read_csv(merged_data_file, sep="\t", dtype=str)

            ## Extract tables from merged dataset
            for attribute_name, table_name in self._TABLE_SOURCES:
                setattr(
                    self,
                    attribute_name,
                    ex.extract_table(self.merged_dataframe, table_name),
                )

            ## build dataframes from tables
            self.study = nmdc_dfs.make_study_dataframe(
                self.study_table, self.contact_table, self.proposals_table
            )  # gold studies

            self.emsl = nmdc_dfs.make_emsl_dataframe(
                self.emsl_table,
                self.jgi_emsl_table,
                self.study_table,
                self.emsl_biosample_table,
            )  # emsl projects / data objects

            # self.data_objects = nmdc_dfs.make_data_objects_dataframe(
            #     self.faa_table, self.fna_table, self.fastq_table, self.project_table
            # ) # jgi data objects

            self.fastq = nmdc_dfs.make_jgi_fastq_dataframe(
                self.fastq_table, self.project_table
            )

            self.biosample = nmdc_dfs.make_biosample_dataframe(
                self.biosample_table,
                self.soil_package_table,
                self.water_package_table,
                self.project_biosample_table,
                self.project_table,
                self.study_table,
            )  # gold biosamples

            ## the project dataframe depends on the fastq and biosample
            ## dataframes, so it has to be built last
            self.project = nmdc_dfs.make_project_dataframe(
                self.project_table,
                self.study_table,
                self.contact_table,
                self.fastq,
                self.project_biosample_table,
                self.biosample,
            )  # gold projects

        def save_as_pickle(self, save_path):
            """Pickle this object to the file at ``save_path``."""
            with open(save_path, "wb") as handle:
                pickle.dump(self, handle)

    ####### END INNER CLASS #############

    ## dicts that result from transformation methods
    study_dict = None
    omics_processing_dict = None
    biosample_dict = None
    emsl_omics_processing_dict = None
    emsl_data_object_dict = None
    jgi_data_object_dict = None

    ## dict to hold the datasource spec
    data_source_spec = None

    ## dict to hold sssom mappings
    sssom_map = None

    ## variable to hold nmdc data
    nmdc_data = None

    def __init__(
        self, merged_data_file, data_source_spec_file, sssom_file, pickled_data=""
    ):
        """
        Load the data source spec, the SSSOM map and the NMDC data.

        Args:
            merged_data_file: tab-separated merged data file; only read when
                no pickle is given.
            data_source_spec_file: path of the YAML data source spec.
            sssom_file: passed to ``tx.make_attribute_map``.
            pickled_data: path of a pickled ``NMDC_DATA`` object. When empty
                (or None) the data is rebuilt from ``merged_data_file``.
        """
        ## build data source specification
        with open(data_source_spec_file, "r") as input_file:
            # NOTE(security): CLoader is not a safe loader (it can construct
            # arbitrary Python objects); only load trusted spec files.
            self.data_source_spec = yaml.load(input_file, Loader=Loader)

        ## build sssom mapping
        self.sssom_map = tx.make_attribute_map(sssom_file)

        ## load NMDC DATA class
        if pickled_data:
            with open(pickled_data, "rb") as handle:
                # NOTE(security): unpickling can execute arbitrary code; only
                # load pickle files from a trusted source.
                self.nmdc_data = pickle.load(handle)
        else:
            self.nmdc_data = self.NMDC_DATA(merged_data_file, pickled_data)

    def pickle_nmdc_data(self, save_path):
        """Pickle the loaded NMDC data to the file at ``save_path``."""
        self.nmdc_data.save_as_pickle(save_path)

    @staticmethod
    def transform_dataframe(
        nmdc_df: pds.DataFrame,
        nmdc_class,
        constructor_map=None,
        attribute_fields=None,
        attribute_map=None,
        transform_map=None,
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Convert a dataframe with ``tx.dataframe_to_dict``.

        Args:
            nmdc_df: dataframe to convert.
            nmdc_class: class forwarded to ``tx.dataframe_to_dict``.
            constructor_map: forwarded as-is; None means an empty dict.
            attribute_fields: forwarded as-is; None means an empty list.
            attribute_map: forwarded as-is; None means an empty dict.
            transform_map: forwarded as-is; None means an empty dict.
            test_rows: when non-zero, only the first ``test_rows`` rows are
                converted (used for testing).
            print_df: print the dataframe before converting it.
            print_dict: print the conversion result.

        Returns:
            Whatever ``tx.dataframe_to_dict`` returns.
        """
        ## None stands in for "empty" so that no mutable default object is
        ## shared between calls
        constructor_map = {} if constructor_map is None else constructor_map
        attribute_fields = [] if attribute_fields is None else attribute_fields
        attribute_map = {} if attribute_map is None else attribute_map
        transform_map = {} if transform_map is None else transform_map

        ## used for testing
        if test_rows != 0:
            nmdc_df = nmdc_df.head(test_rows)
        if print_df:
            print(nmdc_df)

        ## create nmdc dict of data from dataframe
        nmdc_dict = tx.dataframe_to_dict(
            nmdc_df,
            nmdc_class,
            constructor_map=constructor_map,
            attribute_fields=attribute_fields,
            attribute_map=attribute_map,
            transform_map=transform_map,
        )

        ## used for testing
        if print_dict:
            print(nmdc_dict)

        return nmdc_dict

    def _transform(
        self,
        frame_name,
        nmdc_class,
        data_source_class,
        test_rows,
        print_df,
        print_dict,
        use_transforms=False,
    ):
        """
        Transform the ``nmdc_data`` dataframe named ``frame_name``.

        The constructor args and attributes (and, when ``use_transforms`` is
        set, the transforms) are read from the ``data_source_class`` entry of
        the data source spec.
        """
        class_spec = self.data_source_spec["classes"][data_source_class]

        ## specify constructor args and attributes
        options = {
            "constructor_map": class_spec["constructor"],
            "attribute_fields": class_spec["attributes"],
        }
        if use_transforms:
            options["transform_map"] = class_spec["transforms"]

        return NMDC_ETL.transform_dataframe(
            nmdc_df=getattr(self.nmdc_data, frame_name),
            nmdc_class=nmdc_class,
            attribute_map=self.sssom_map,
            test_rows=test_rows,
            print_df=print_df,
            print_dict=print_dict,
            **options,
        )

    def transform_study(
        self,
        data_source_class="gold_study",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the study dataframe into ``nmdc.Study`` data and cache it in ``study_dict``."""
        self.study_dict = self._transform(
            "study", nmdc.Study, data_source_class, test_rows, print_df, print_dict
        )
        return self.study_dict

    def save_study(
        self, file_path="output/nmdc_etl/gold_study.json", data_format="json"
    ):
        """Save ``study_dict`` with ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.study_dict, file_path, data_format)

    def transform_omics_processing(
        self,
        data_source_class="gold_omics_processing",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the project dataframe into ``nmdc.OmicsProcessing`` data
        (applying the spec's transforms) and cache it in
        ``omics_processing_dict``.
        """
        self.omics_processing_dict = self._transform(
            "project",
            nmdc.OmicsProcessing,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
            use_transforms=True,
        )
        return self.omics_processing_dict

    def save_omics_processing(
        self, file_path="output/nmdc_etl/gold_omics_processing.json", data_format="json"
    ):
        """Save ``omics_processing_dict`` with ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.omics_processing_dict, file_path, data_format)

    def transform_biosample(
        self,
        data_source_class="gold_biosample",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the biosample dataframe into ``nmdc.Biosample`` data
        (applying the spec's transforms) and cache it in ``biosample_dict``.
        """
        self.biosample_dict = self._transform(
            "biosample",
            nmdc.Biosample,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
            use_transforms=True,
        )
        return self.biosample_dict

    def save_biosample(
        self, file_path="output/nmdc_etl/gold_biosample.json", data_format="json"
    ):
        """Save ``biosample_dict`` with ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.biosample_dict, file_path, data_format)

    def transform_emsl_omics_processing(
        self,
        data_source_class="emsl_omics_processing",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the emsl dataframe into ``nmdc.OmicsProcessing`` data and
        cache it in ``emsl_omics_processing_dict``.
        """
        self.emsl_omics_processing_dict = self._transform(
            "emsl",
            nmdc.OmicsProcessing,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.emsl_omics_processing_dict

    def save_emsl_omics_processing(
        self, file_path="output/nmdc_etl/emsl_omics_processing.json", data_format="json"
    ):
        """Save ``emsl_omics_processing_dict`` with ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(
            self.emsl_omics_processing_dict, file_path, data_format
        )

    def transform_emsl_data_object(
        self,
        data_source_class="emsl_data_object",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the emsl dataframe into ``nmdc.DataObject`` data and cache
        it in ``emsl_data_object_dict``.
        """
        self.emsl_data_object_dict = self._transform(
            "emsl",
            nmdc.DataObject,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.emsl_data_object_dict

    def save_emsl_data_object(
        self, file_path="output/nmdc_etl/emsl_data_objects.json", data_format="json"
    ):
        """Save ``emsl_data_object_dict`` with ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.emsl_data_object_dict, file_path, data_format)

    def transform_jgi_data_object(
        self,
        data_source_class="jgi_data_object",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform the fastq dataframe into ``nmdc.DataObject`` data and cache
        it in ``jgi_data_object_dict``.
        """
        self.jgi_data_object_dict = self._transform(
            "fastq",
            nmdc.DataObject,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.jgi_data_object_dict

    def save_jgi_data_object(
        self,
        file_path="output/nmdc_etl/jgi_fastq_data_objects.json",
        data_format="json",
    ):
        """Save ``jgi_data_object_dict`` with ``lx.save_nmdc_dict``."""
        return lx.save_nmdc_dict(self.jgi_data_object_dict, file_path, data_format)