nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,402 +0,0 @@
|
|
|
1
|
-
## author: Bill Duncan
|
|
2
|
-
## summary: Contains class with methods and properties for transforming data in NMDC ETL pipeline.
|
|
3
|
-
|
|
4
|
-
import pickle
|
|
5
|
-
import nmdc_runtime.lib.transform_nmdc_data as tx
|
|
6
|
-
import nmdc_runtime.lib.extract_nmdc_data as ex
|
|
7
|
-
import nmdc_runtime.lib.load_nmdc_data as lx
|
|
8
|
-
import nmdc_runtime.lib.nmdc_dataframes as nmdc_dfs
|
|
9
|
-
from nmdc_schema import nmdc
|
|
10
|
-
import pandas as pds
|
|
11
|
-
import jq
|
|
12
|
-
import jsonasobj
|
|
13
|
-
import json
|
|
14
|
-
import zipfile
|
|
15
|
-
import yaml
|
|
16
|
-
from yaml import CLoader as Loader, CDumper as Dumper
|
|
17
|
-
from dotted_dict import DottedDict
|
|
18
|
-
from collections import namedtuple
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class NMDC_ETL:
    """
    Encapsulates ETL operations on NMDC data.

    An instance holds:
      * ``data_source_spec`` - the data source specification loaded from a YAML file;
      * ``sssom_map`` - the attribute map built by ``tx.make_attribute_map``;
      * ``nmdc_data`` - an ``NMDC_DATA`` object (built from the merged data file
        or restored from a pickle file);
      * one ``*_dict`` attribute per ``transform_*`` method, holding the most
        recent result of that transformation (``None`` until it has been run).
    """

    ####### BEGIN INNER CLASS #############
    class NMDC_DATA:
        """
        Encapsulates data used during ETL.
        It is useful b/c it allows for the class to be created from a pickle file.
        Using pickled data speeds up load times, and is useful for testing.
        """

        ## merged dataframe that holds all the data
        merged_dataframe = None

        ## tables from merged dataset
        study_table = None
        contact_table = None
        proposals_table = None
        project_table = None
        jgi_emsl_table = None
        emsl_table = None
        emsl_biosample_table = None
        fastq_table = None
        project_biosample_table = None
        biosample_table = None
        soil_package_table = None
        water_package_table = None

        ## dataframes built from tables
        study = None  # gold studies
        emsl = None  # emsl projects / data objects
        fastq = None
        biosample = None  # gold biosamples
        project = None  # gold projects

        def __init__(self, merged_data_file, pickled_data=""):
            """
            Build every table and dataframe from the merged data file.

            :param merged_data_file: tab-separated file read with pandas; every
                column is read as ``str``.
            :param pickled_data: not used by this constructor; kept so existing
                callers that pass it keep working.
            """
            ## create merged dataframe
            self.merged_dataframe = pds.read_csv(merged_data_file, sep="\t", dtype=str)

            ## Extract tables from merged dataset
            merged = self.merged_dataframe
            self.study_table = ex.extract_table(merged, "study_table")
            self.contact_table = ex.extract_table(merged, "contact_table")
            self.proposals_table = ex.extract_table(merged, "proposals_table")
            self.project_table = ex.extract_table(merged, "project_table")
            self.jgi_emsl_table = ex.extract_table(merged, "ficus_jgi_emsl")
            self.emsl_table = ex.extract_table(merged, "ficus_emsl")
            self.emsl_biosample_table = ex.extract_table(
                merged, "ficus_emsl_biosample"
            )
            self.fastq_table = ex.extract_table(merged, "ficus_fastq_table")
            self.project_biosample_table = ex.extract_table(
                merged, "project_biosample_table"
            )
            self.biosample_table = ex.extract_table(merged, "biosample_table")
            self.soil_package_table = ex.extract_table(merged, "soil_package_table")
            self.water_package_table = ex.extract_table(merged, "water_package_table")

            ## build dataframes from tables
            self.study = nmdc_dfs.make_study_dataframe(
                self.study_table, self.contact_table, self.proposals_table
            )  # gold studies

            self.emsl = nmdc_dfs.make_emsl_dataframe(
                self.emsl_table,
                self.jgi_emsl_table,
                self.study_table,
                self.emsl_biosample_table,
            )  # emsl projects / data objects

            # NOTE: building a combined JGI data-objects dataframe
            # (nmdc_dfs.make_data_objects_dataframe) is disabled; only the
            # fastq dataframe below is built.
            self.fastq = nmdc_dfs.make_jgi_fastq_dataframe(
                self.fastq_table, self.project_table
            )

            self.biosample = nmdc_dfs.make_biosample_dataframe(
                self.biosample_table,
                self.soil_package_table,
                self.water_package_table,
                self.project_biosample_table,
                self.project_table,
                self.study_table,
            )  # gold biosamples

            # The project dataframe depends on the fastq and biosample
            # dataframes, so it has to be built last.
            self.project = nmdc_dfs.make_project_dataframe(
                self.project_table,
                self.study_table,
                self.contact_table,
                self.fastq,
                self.project_biosample_table,
                self.biosample,
            )  # gold projects

        def save_as_pickle(self, save_path):
            """Pickle this object to the file at ``save_path`` (overwriting it)."""
            with open(save_path, "wb") as handle:
                pickle.dump(self, handle)

    ####### END INNER CLASS #############

    ## dicts that result from transformation methods
    study_dict = None
    omics_processing_dict = None
    biosample_dict = None
    emsl_omics_processing_dict = None
    emsl_data_object_dict = None
    jgi_data_object_dict = None

    ## dict to hold the datasource spec
    data_source_spec = None

    ## dict to hold sssom mappings
    sssom_map = None

    ## variable to hold nmdc data
    nmdc_data = None

    def __init__(
        self, merged_data_file, data_source_spec_file, sssom_file, pickled_data=""
    ):
        """
        Load the data source spec, the sssom mapping and the NMDC data.

        :param merged_data_file: merged tab-separated data file; only read when
            no pickle file is given.
        :param data_source_spec_file: YAML file holding the data source spec.
        :param sssom_file: file passed to ``tx.make_attribute_map``.
        :param pickled_data: optional path of a pickle file to restore the
            NMDC data from; when empty the data is built from
            ``merged_data_file``.
        """
        ## build data source specification
        with open(data_source_spec_file, "r") as input_file:
            # NOTE(security): CLoader is PyYAML's full (unsafe) loader and can
            # construct arbitrary Python objects. Only use trusted spec files.
            self.data_source_spec = yaml.load(input_file, Loader=Loader)

        ## build sssom mapping
        self.sssom_map = tx.make_attribute_map(sssom_file)

        ## load NMDC DATA class
        # Truthiness (rather than len()) also copes with None and path objects.
        if pickled_data:
            with open(pickled_data, "rb") as handle:
                # NOTE(security): unpickling can execute arbitrary code.
                # Only load pickle files produced by a trusted run.
                self.nmdc_data = pickle.load(handle)
        else:
            self.nmdc_data = self.NMDC_DATA(merged_data_file, pickled_data)

    def pickle_nmdc_data(self, save_path):
        """Pickle the held NMDC data object to ``save_path``."""
        self.nmdc_data.save_as_pickle(save_path)

    @staticmethod
    def transform_dataframe(
        nmdc_df: pds.DataFrame,
        nmdc_class,
        constructor_map=None,
        attribute_fields=None,
        attribute_map=None,
        transform_map=None,
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """
        Transform a dataframe with ``tx.dataframe_to_dict``.

        :param nmdc_df: dataframe to transform.
        :param nmdc_class: class forwarded to ``tx.dataframe_to_dict``.
        :param constructor_map: forwarded as-is; ``None`` means an empty dict.
        :param attribute_fields: forwarded as-is; ``None`` means an empty list.
        :param attribute_map: forwarded as-is; ``None`` means an empty dict.
        :param transform_map: forwarded as-is; ``None`` means an empty dict.
        :param test_rows: when non-zero, only the first ``test_rows`` rows are
            transformed (used for testing).
        :param print_df: print the (possibly truncated) dataframe first.
        :param print_dict: print the transformation result.
        :return: whatever ``tx.dataframe_to_dict`` returns.
        """
        # Fresh containers per call: mutable defaults in the signature would be
        # shared between calls and could leak state if the callee mutates them.
        constructor_map = {} if constructor_map is None else constructor_map
        attribute_fields = [] if attribute_fields is None else attribute_fields
        attribute_map = {} if attribute_map is None else attribute_map
        transform_map = {} if transform_map is None else transform_map

        ## used for testing
        if test_rows != 0:
            nmdc_df = nmdc_df.head(test_rows)
        if print_df:
            print(nmdc_df)

        ## create nmdc dict of data from dataframe
        nmdc_dict = tx.dataframe_to_dict(
            nmdc_df,
            nmdc_class,
            constructor_map=constructor_map,
            attribute_fields=attribute_fields,
            attribute_map=attribute_map,
            transform_map=transform_map,
        )

        ## used for testing
        if print_dict:
            print(nmdc_dict)

        return nmdc_dict

    def _transform(
        self,
        nmdc_df,
        nmdc_class,
        data_source_class,
        test_rows,
        print_df,
        print_dict,
        use_transforms=False,
    ):
        """
        Shared implementation of the ``transform_*`` methods.

        Looks up the "constructor" and "attributes" entries (and, when
        ``use_transforms`` is true, the "transforms" entry) of
        ``data_source_class`` in the data source spec and runs
        ``transform_dataframe`` with them and the sssom map.
        """
        ## specify constructor args and attributes
        class_spec = self.data_source_spec["classes"][data_source_class]
        options = {
            "constructor_map": class_spec["constructor"],
            "attribute_fields": class_spec["attributes"],
            "attribute_map": self.sssom_map,
            "test_rows": test_rows,
            "print_df": print_df,
            "print_dict": print_dict,
        }
        # Only some classes are transformed with a "transforms" entry; for the
        # others the key is never looked up, so it may be absent from the spec.
        if use_transforms:
            options["transform_map"] = class_spec["transforms"]

        return NMDC_ETL.transform_dataframe(
            nmdc_df=nmdc_df, nmdc_class=nmdc_class, **options
        )

    def transform_study(
        self,
        data_source_class="gold_study",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the study dataframe into ``nmdc.Study`` data; the result is stored in ``study_dict`` and returned."""
        self.study_dict = self._transform(
            self.nmdc_data.study,
            nmdc.Study,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.study_dict

    def save_study(
        self, file_path="output/nmdc_etl/gold_study.json", data_format="json"
    ):
        """Save ``study_dict`` via ``lx.save_nmdc_dict`` and return its result."""
        return lx.save_nmdc_dict(self.study_dict, file_path, data_format)

    def transform_omics_processing(
        self,
        data_source_class="gold_omics_processing",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the project dataframe into ``nmdc.OmicsProcessing`` data (using the spec's "transforms"); the result is stored in ``omics_processing_dict`` and returned."""
        self.omics_processing_dict = self._transform(
            self.nmdc_data.project,
            nmdc.OmicsProcessing,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
            use_transforms=True,
        )
        return self.omics_processing_dict

    def save_omics_processing(
        self, file_path="output/nmdc_etl/gold_omics_processing.json", data_format="json"
    ):
        """Save ``omics_processing_dict`` via ``lx.save_nmdc_dict`` and return its result."""
        return lx.save_nmdc_dict(self.omics_processing_dict, file_path, data_format)

    def transform_biosample(
        self,
        data_source_class="gold_biosample",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the biosample dataframe into ``nmdc.Biosample`` data (using the spec's "transforms"); the result is stored in ``biosample_dict`` and returned."""
        self.biosample_dict = self._transform(
            self.nmdc_data.biosample,
            nmdc.Biosample,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
            use_transforms=True,
        )
        return self.biosample_dict

    def save_biosample(
        self, file_path="output/nmdc_etl/gold_biosample.json", data_format="json"
    ):
        """Save ``biosample_dict`` via ``lx.save_nmdc_dict`` and return its result."""
        return lx.save_nmdc_dict(self.biosample_dict, file_path, data_format)

    def transform_emsl_omics_processing(
        self,
        data_source_class="emsl_omics_processing",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the emsl dataframe into ``nmdc.OmicsProcessing`` data; the result is stored in ``emsl_omics_processing_dict`` and returned."""
        self.emsl_omics_processing_dict = self._transform(
            self.nmdc_data.emsl,
            nmdc.OmicsProcessing,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.emsl_omics_processing_dict

    def save_emsl_omics_processing(
        self, file_path="output/nmdc_etl/emsl_omics_processing.json", data_format="json"
    ):
        """Save ``emsl_omics_processing_dict`` via ``lx.save_nmdc_dict`` and return its result."""
        return lx.save_nmdc_dict(
            self.emsl_omics_processing_dict, file_path, data_format
        )

    def transform_emsl_data_object(
        self,
        data_source_class="emsl_data_object",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the emsl dataframe into ``nmdc.DataObject`` data; the result is stored in ``emsl_data_object_dict`` and returned."""
        self.emsl_data_object_dict = self._transform(
            self.nmdc_data.emsl,
            nmdc.DataObject,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.emsl_data_object_dict

    def save_emsl_data_object(
        self, file_path="output/nmdc_etl/emsl_data_objects.json", data_format="json"
    ):
        """Save ``emsl_data_object_dict`` via ``lx.save_nmdc_dict`` and return its result."""
        return lx.save_nmdc_dict(self.emsl_data_object_dict, file_path, data_format)

    def transform_jgi_data_object(
        self,
        data_source_class="jgi_data_object",
        test_rows=0,
        print_df=False,
        print_dict=False,
    ) -> list:
        """Transform the fastq dataframe into ``nmdc.DataObject`` data; the result is stored in ``jgi_data_object_dict`` and returned."""
        self.jgi_data_object_dict = self._transform(
            self.nmdc_data.fastq,
            nmdc.DataObject,
            data_source_class,
            test_rows,
            print_df,
            print_dict,
        )
        return self.jgi_data_object_dict

    def save_jgi_data_object(
        self,
        file_path="output/nmdc_etl/jgi_fastq_data_objects.json",
        data_format="json",
    ):
        """Save ``jgi_data_object_dict`` via ``lx.save_nmdc_dict`` and return its result."""
        return lx.save_nmdc_dict(self.jgi_data_object_dict, file_path, data_format)
|