nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
--- nmdc_runtime/lib/nmdc_dataframes.py
+++ /dev/null
@@ -1,829 +0,0 @@
-## author: Bill Duncan
-## summary: Contains methods for creating dataframes needed for NMDC ETL pipeline.
-
-## system level modules
-import pandas as pds
-import jsonasobj
-import json
-import zipfile
-import yaml
-from pandas.core.dtypes.missing import notnull
-from yaml import CLoader as Loader, CDumper as Dumper
-from dotted_dict import DottedDict
-from collections import namedtuple
-
-
-def make_dataframe(
-    file_name,
-    subset_cols=[],
-    exclude_cols=[],
-    nrows=None,
-    lowercase_col_names=True,
-    replace_spaces=True,
-    replace_hash=True,
-    strip_spaces=True,
-    comment_str=None,
-    file_type="tsv",
-    delimiter="\t",
-    sheet_name=0,
-    file_archive_name="",
-):
-    """
-    Builds a pandas dataframe from the designated file.
-
-    Args:
-        file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name.
-        subset_cols: Specifies a specific of subset of columns to be included in the dataframe.
-        exclude_cols: Specifies a specific of subset of columns to be excluded from the dataframe.
-        nrows: Specifies the number of rows to be returned in the dataframe (useful for testing).
-        lowercase_col_names: If true, the column names are converted to lower case.
-        replace_spaces: If true, spaces in column names are replaced with underscores.
-        replace_hash: If true, hashes ('#') in column names are replaced with empty strings.
-        strip_spaces: If true, extra surrounding spaces are stripped from the column names.
-        comment_str: Specifies the string that is used for comments with the data.
-        file_type: Speicfies the type of file. Current acceptable file types are tsv, csv, and excel. Note that when using excel, you may need to specify a sheet name.
-        delimiter: Specifies the delimiter character used between fields.
-        sheet_name: If the files is an Excel spreadsheet, this parameter specifies a particular sheet.
-        archive_name: If the file_name is contained in an zip or file, this is the name of archive file.
-    Returns:
-        Pandas dataframe
-    """
-    ## normalize paramaters for use with pandas
-    if len(subset_cols) < 1:
-        subset_cols = None
-    if len(exclude_cols) < 1:
-        exclude_cols = None
-
-    ## check if file is contained in an archive
-    file_archive = None
-    if len(file_archive_name) > 1:
-        file_archive = zipfile.ZipFile(file_archive_name, "r")
-
-    ## load data from file
-    if "tsv" == file_type.lower() or "csv" == file_type.lower():
-        if None != file_archive:
-            df = pds.read_csv(
-                file_archive.open(file_name),
-                sep=delimiter,
-                nrows=nrows,
-                comment=comment_str,
-            )
-        else:
-            df = pds.read_csv(
-                file_name, sep=delimiter, nrows=nrows, comment=comment_str
-            )
-    elif "excel" == file_type.lower():
-        if None != file_archive:
-            df = pds.read_excel(
-                file_archive.open(file_name),
-                sheet_name=sheet_name,
-                nrows=nrows,
-                comment=comment_str,
-                engine="openpyxl",
-            )
-        else:
-            df = pds.read_excel(
-                file_name,
-                sheet_name=sheet_name,
-                nrows=nrows,
-                comment=comment_str,
-                engine="openpyxl",
-            )
-    elif "multi-sheet-excel" == file_type.lower():
-        if None != file_archive:
-            df = pds.concat(
-                pds.read_excel(
-                    file_archive.open(file_name),
-                    sheet_name=None,
-                    index_col=None,
-                    nrows=nrows,
-                    comment=comment_str,
-                    engine="openpyxl",
-                )
-            )
-        else:
-            df = pds.concat(
-                pds.read_excel(
-                    file_name,
-                    sheet_name=None,
-                    index_col=None,
-                    nrows=nrows,
-                    comment=comment_str,
-                    engine="openpyxl",
-                )
-            )
-
-    ## clean column names
-    df = clean_dataframe_column_names(
-        df, lowercase_col_names, replace_spaces, replace_hash, strip_spaces
-    )
-
-    ## create subset of columns
-    ## note: since column names are case sensitive, this needs to happen after cleaning column names
-    if subset_cols:
-        df = df[subset_cols]
-
-    ## return dataframe
-    return df
-
-
-def clean_dataframe_column_names(
-    df,
-    lowercase_col_names=True,
-    replace_spaces=True,
-    replace_hash=True,
-    strip_spaces=True,
-):
-    """
-    Changes the column names of a dataframe into a standard format. The default settings change the column names to:
-    - lower case
-    - replace spaces with underscores
-    - replace hash ('#') with empty string
-    Args:
-        df: The dataframe whose columns will be cleaned.
-        lowercase_col_names: If true, the column names are converted to lower case.
-        replace_spaces: If true, spaces in column names are replaced with underscores.
-        replace_hash: If true, hashes ('#') in column names are replaced with empty strings.
-        strip_spaces: If true, extra surrounding spaces are stripped from the column names.
-    Returns:
-        Pandas dataframe
-    """
-
-    ## clean column names
-    if lowercase_col_names:
-        df.columns = [c.strip().lower() for c in df.columns]
-
-    if replace_spaces:
-        df.columns = [c.replace(" ", "_") for c in df.columns]
-
-    if replace_hash:
-        df.columns = [c.replace("#", "") for c in df.columns]
-
-    if strip_spaces:
-        df.columns = [c.strip() for c in df.columns]
-
-    return df
-
-
-def merge_dataframes(dataframes: list, data_source_names=[]):
-    merged_df = pds.DataFrame(
-        columns=["nmdc_data_source", "nmdc_record_id", "attribute", "value"]
-    )
-
-    for idx, df in enumerate(dataframes):
-        if "pandas.core.frame.DataFrame" == type(df):
-            data_source_name = data_source_names[idx]
-            data = df
-        else:
-            data_source_name = df.name
-            data = df.data
-
-        ## convert data into an EAV structure
-        eavdf = data.melt(id_vars=["nmdc_record_id"], var_name="attribute")
-        eavdf["nmdc_data_source"] = data_source_name
-        # print(data_source_name, len(eavdf))
-
-        merged_df = merged_df.append(eavdf, ignore_index=True)
-
-    return merged_df
-
-
-def make_dataframe_dictionary(
-    file_name,
-    subset_cols=[],
-    exclude_cols=[],
-    nrows=None,
-    lowercase_col_names=True,
-    replace_spaces=True,
-    file_type="tsv",
-    delimiter="\t",
-    sheet_name=0,
-    file_archive_name="",
-):
-    """
-    Builds a dictionary based on the structure of the pandas dataframe generated from the designated file.
-    The dictionary is oriented for records.
-    E.g.:
-      [
-        {
-          'col1': 1,
-          'col2': 0.5
-        },
-        {
-          'col1': 2,
-          'col2': 0.75
-        }
-      ]
-
-    Essentially, this function is a shortcut for calling make_dataframe() and then transforming the result into a dictionary.
-    E.g.:
-      df = make_dataframe(file_name)
-      dictdf = dictdf = df.to_dict(orient="records")
-
-
-    Args:
-        file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name.
-        subset_cols: Specifies a specific of subset of columns to be included in the dataframe.
-        exclude_cols: Specifies a specific of subset of columns to be excluded from the dataframe.
-        nrows: Specifies the number of rows to be returned in the dataframe (useful for testing).
-        lowercase_col_names: If true, the column names are converted to lower case.
-        replace_spaces: If true, spaces in column names are replaced with spaces.
-        file_type: Speicfies the type of file. Current acceptable file types are tsv, csv, and excel. Note that when using excel, you may need to specify a sheet name.
-        delimiter: Specifies the delimiter character used between fields.
-        sheet_name: If the files is an Excel spreadsheet, this parameter specifies a particular sheet.
-        archive_name: If the file_name is contained in an zip or file, this is the name of archive file.
-    Returns:
-        Dictionary built from a Pandas dataframe.
-    """
-    df = make_dataframe(
-        file_name,
-        subset_cols=[],
-        exclude_cols=[],
-        nrows=None,
-        lowercase_col_names=True,
-        replace_spaces=True,
-        file_type="tsv",
-        delimiter=delimiter,
-        sheet_name=sheet_name,
-        file_archive_name=file_archive_name,
-    )
-    return df.to_dict(orient="records")
-
-
-def make_collection_date(year_val, month_val, day_val, hour_val="", minute_val=""):
-    def pad_value(val, pad_len=2):
-        s = str(val)
-        return s.zfill(pad_len)
-
-    return_val = ""
-    year_val = year_val.strip()
-    month_val = month_val.strip()
-    day_val = day_val.strip()
-    hour_val = hour_val.strip()
-    minute_val = minute_val.strip()
-    return_val = ""
-
-    ## if a year isn't provided simply return the empty string
-    if len(year_val) < 1:
-        return ""
-    else:
-        return_val = pad_value(year_val, 4)
-
-    if len(month_val) > 0:
-        return_val = return_val + "-" + pad_value(month_val)
-
-    ## we only days that have months assocated with them
-    if (len(month_val) > 0) and (len(day_val) > 0):
-        return_val = return_val + "-" + pad_value(day_val)
-
-    ## we only want times with months and days associated with them
-    if (len(month_val) > 0) and (len(day_val) > 0):
-        if (len(hour_val) > 0) and (len(minute_val) > 0):
-            return_val = return_val + "T" + pad_value(hour_val) + ":" + minute_val
-        elif len(hour_val) > 0:
-            return_val = (
-                return_val + "T" + pad_value(hour_val) + "00"
-            )  # case for when no minute val is given
-
-    return return_val
-
-
-def make_lat_lon(latitude, longitude):
-    # latitude = "" if pds.isnull(latitude) else str(latitude).strip().replace('\n', '')
-    # longitude = "" if pds.isnull(longitude) else str(longitude).strip().replace('\n', '')
-    latitude = None if pds.isnull(latitude) else float(latitude)
-    longitude = None if pds.isnull(longitude) else float(longitude)
-
-    if (not (latitude is None)) and (not (longitude is None)):
-        return f"{latitude} {longitude}".strip()
-    else:
-        return None
-
-
-def make_study_dataframe(study_table, contact_table, proposals_table, result_cols=[]):
-    ## subset dataframes
-    contact_table_splice = contact_table[
-        ["contact_id", "principal_investigator_name"]
-    ].copy()
-    proposals_table_splice = proposals_table[["gold_study", "doi"]].copy()
-
-    ## make sure the contact ids are strings with the ".0" removed from the end (i.e., the strings aren't floats)
-    study_table["contact_id"] = (
-        study_table["contact_id"].astype(str).replace("\.0", "", regex=True)
-    )
-    contact_table_splice["contact_id"] = (
-        contact_table_splice["contact_id"].astype(str).replace("\.0", "", regex=True)
-    )
-    # print(study_table[['contact_id', 'principal_investigator_name']].head())
-
-    ## left join data from contact
-    temp1_df = pds.merge(study_table, contact_table_splice, how="left", on="contact_id")
-
-    ## left join data from proposals
-    temp2_df = pds.merge(
-        temp1_df,
-        proposals_table_splice,
-        how="left",
-        left_on="gold_id",
-        right_on="gold_study",
-    )
-
-    ## add prefix
-    temp2_df.gold_id = "gold:" + temp2_df.gold_id
-    temp2_df.gold_study = "gold:" + temp2_df.gold_study
-
-    if len(result_cols) > 0:
-        return temp2_df[result_cols]
-    else:
-        return temp2_df
-
-
-def make_project_dataframe(
-    project_table,
-    study_table,
-    contact_table,
-    data_object_table,
-    project_biosample_table=None,
-    biosample_table=None,
-    result_cols=[],
-):
-    ## subset data
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-    contact_table_splice = contact_table[
-        ["contact_id", "principal_investigator_name"]
-    ].copy()
-
-    ## rename study.gold_id to study_gold_id
-    study_table_splice.rename(columns={"gold_id": "study_gold_id"}, inplace=True)
-
-    ## inner join on study (project must be part of study)
-    temp1_df = pds.merge(
-        project_table,
-        study_table_splice,
-        how="inner",
-        left_on="master_study_id",
-        right_on="study_id",
-    )
-
-    ## left join contact data
-    temp2_df = pds.merge(
-        temp1_df,
-        contact_table_splice,
-        how="left",
-        left_on="pi_id",
-        right_on="contact_id",
-    )
-
-    ## add prefix
-    temp2_df.gold_id = "gold:" + temp2_df.gold_id
-    temp2_df.study_gold_id = "gold:" + temp2_df.study_gold_id
-
-    ## if present join data objects as output of project
-    if not (data_object_table is None):
-        ## make copy and add prefix
-        data_object_table = data_object_table.copy()
-        data_object_table.gold_project_id = data_object_table.gold_project_id.map(
-            lambda x: x if "gold:" == x[0:5] else "gold:" + x
-        )
-
-        ## create a group concat for all file ids in the data objects
-        groups = data_object_table.groupby("gold_project_id")["file_id"]
-        output_files = (
-            pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x))))
-            .drop_duplicates()
-            .reset_index()
-        )
-        output_files.rename(columns={"file_id": "output_file_ids"}, inplace=True)
-        output_files["output_file_ids"] = output_files["output_file_ids"].astype(str)
-
-        ## left join output files for projects
-        temp2_df = pds.merge(
-            temp2_df,
-            output_files,
-            how="left",
-            left_on="gold_id",
-            right_on="gold_project_id",
-        )
-
-    ## if present join biosamples as inputs to project
-    if (not (project_biosample_table is None)) and (not (biosample_table is None)):
-        ## make local copies & rename column
-        project_biosample_table = project_biosample_table.copy()
-        biosample_table = biosample_table[["biosample_id", "gold_id"]].copy()
-        biosample_table.rename(columns={"gold_id": "biosample_gold_id"}, inplace=True)
-
-        ## add prefix
-        biosample_table["biosample_gold_id"] = biosample_table["biosample_gold_id"].map(
-            lambda x: x if "gold:" == x[0:5] else "gold:" + x
-        )
-
-        ## join project biosamples to biosamples
-        input_samples = pds.merge(
-            project_biosample_table, biosample_table, how="inner", on="biosample_id"
-        )
-        # require input samples (i.e., inner join)
-        temp2_df = pds.merge(temp2_df, input_samples, how="inner", on="project_id")
-
-    if len(result_cols) > 0:
-        return temp2_df[result_cols]
-    else:
-        return temp2_df
-
-
-def make_biosample_dataframe(
-    biosample_table,
-    soil_package_table,
-    water_package_table,
-    project_biosample_table,
-    project_table,
-    study_table,
-    result_cols=[],
-):
-    def make_collection_date_from_row(row):
-        def _format_date_part_value(val):
-            if pds.isnull(val):
-                return ""
-
-            if type("") == type(val):
-                if "." in val:
-                    return val[0 : val.find(".")].strip()
-                else:
-                    return val.strip()
-            else:
-                return str(int(val)).strip()
-
-        year_val = _format_date_part_value(row["sample_collection_year"])
-        month_val = _format_date_part_value(row["sample_collection_month"])
-        day_val = _format_date_part_value(row["sample_collection_day"])
-        hour_val = _format_date_part_value(row["sample_collection_hour"])
-        minute_val = _format_date_part_value(row["sample_collection_minute"])
-
-        return make_collection_date(year_val, month_val, day_val, hour_val, minute_val)
-
-    ## subset data
-    project_biosample_table_splice = project_biosample_table[
-        ["biosample_id", "project_id"]
-    ].copy()
-    project_table_splice = project_table[
-        ["project_id", "gold_id", "master_study_id"]
-    ].copy()
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-
-    ## add prefix
-    project_table_splice.gold_id = "gold:" + project_table_splice.gold_id
-    study_table_splice.gold_id = "gold:" + study_table_splice.gold_id
-
-    ## rename columns
-    project_table_splice.rename(columns={"gold_id": "project_gold_id"}, inplace=True)
-    study_table_splice.rename(columns={"gold_id": "study_gold_id"}, inplace=True)
-
-    ## inner join projects and studies
-    project_table_splice = pds.merge(
-        project_table_splice,
-        study_table_splice,
-        how="inner",
-        left_on="master_study_id",
-        right_on="study_id",
-    )
-
-    ## drop biosample rows that don't have required fields
-    biosample_table = biosample_table[biosample_table["env_broad_scale"].notnull()]
-    biosample_table = biosample_table[biosample_table["env_local_scale"].notnull()]
-    biosample_table = biosample_table[biosample_table["env_medium"].notnull()]
-
-    ## left join package tables to biosample table
-    temp0_df = pds.merge(
-        biosample_table, soil_package_table, how="left", on="soil_package_id"
-    )
-    temp0_df = pds.merge(
-        temp0_df, water_package_table, how="left", on="water_package_id"
-    )
-
-    ## inner join on project_biosample and project; i.e., biosamples must be linked to project
-    temp1_df = pds.merge(
-        temp0_df, project_biosample_table_splice, how="inner", on="biosample_id"
-    )
-    temp2_df = pds.merge(temp1_df, project_table_splice, how="inner", on="project_id")
-
-    ## add collection date and lat_lon columns
-    temp2_df["collection_date"] = temp2_df.apply(
-        lambda row: make_collection_date_from_row(row), axis=1
-    )
-    temp2_df["lat_lon"] = temp2_df.apply(
-        lambda row: make_lat_lon(row.latitude, row.longitude), axis=1
-    )
-
-    ## convert latitude and longitute columns to floats
-    temp2_df["latitude"] = temp2_df["latitude"].map(
-        lambda x: None if pds.isnull(x) else float(x)
-    )
-    temp2_df["longitude"] = temp2_df["longitude"].map(
-        lambda x: None if pds.isnull(x) else float(x)
-    )
-
-    ## add gold prefix
-    temp2_df["gold_id"] = "gold:" + temp2_df["gold_id"]
-
-    ## biosample might belong to more than one project; so do the equivalent of a group_cat
-    ## see: https://queirozf.com/entries/pandas-dataframe-groupby-examples
-    ## see: https://stackoverflow.com/questions/18138693/replicating-group-concat-for-pandas-dataframe
-    groups = (
-        temp2_df.groupby("biosample_id")["project_gold_id"]
-        .apply(lambda pid: ",".join(filter(None, pid)))
-        .reset_index()
-    )
-    groups.rename(columns={"project_gold_id": "project_gold_ids"}, inplace=True)
-
-    # join concat groups to dataframe
-    temp3_df = pds.merge(temp2_df, groups, how="left", on="biosample_id")
-
-    ## A biosample may belong to multiple projects
-    # E.g. see biosample_id 247352 with gold_id "Gb0247352", belongs to projects 467278, 467306
-    ## So, remove uneeded columns & drop dups
-    temp3_df.drop(columns=["project_gold_id"], inplace=True)
-    temp3_df.drop(columns=["project_id"], inplace=True)
-    temp3_df.drop_duplicates(inplace=True)
-
-    ## for 'env_broad_scale', 'env_local_scale', 'env_medium' fields change 'ENVO_' to 'ENVO:'
-    # temp3_df['env_broad_scale'] = temp3_df
-    for idx in temp3_df.index:
-        if pds.notnull(temp3_df.loc[idx, "env_broad_scale"]):
-            temp3_df.loc[idx, "env_broad_scale"] = str(
-                temp3_df.loc[idx, "env_broad_scale"]
-            ).replace("_", ":", 1)
-        if pds.notnull(temp3_df.loc[idx, "env_local_scale"]):
-            temp3_df.loc[idx, "env_local_scale"] = str(
-                temp3_df.loc[idx, "env_local_scale"]
-            ).replace("_", ":", 1)
-        if pds.notnull(temp3_df.loc[idx, "env_medium"]):
-            temp3_df.loc[idx, "env_medium"] = str(
-                temp3_df.loc[idx, "env_medium"]
-            ).replace("_", ":", 1)
-
-    if len(result_cols) > 0:
-        return temp3_df[result_cols]
-    else:
-        return temp3_df
-
-
-def make_jgi_emsl_dataframe(jgi_emsl_table, study_table, result_cols=[]):
-    ## subset data
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-
-    ## inner join jgi-emsl data on study (must be part of study)
-    temp1_df = pds.merge(
-        jgi_emsl_table,
-        study_table_splice,
-        how="inner",
-        left_on="gold_study_id",
-        right_on="gold_id",
-    )
-
-    ## add prefix
-    temp1_df.gold_id = "gold:" + temp1_df.gold_id
-    temp1_df.gold_study_id = "gold:" + temp1_df.gold_study_id
-
-    if len(result_cols) > 0:
-        return temp1_df[result_cols]
-    else:
-        return temp1_df
-
-
-def make_emsl_dataframe(
-    emsl_table, jgi_emsl_table, study_table, emsl_biosample_table, result_cols=[]
-):
-    ## subset data
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-    jgi_emsl_table_splice = jgi_emsl_table[["gold_study_id", "emsl_proposal_id"]].copy()
-    biosample_slice = emsl_biosample_table[["dataset_id", "biosample_gold_id"]].copy()
-    biosample_slice["biosample_gold_id"] = (
-        "gold:" + biosample_slice["biosample_gold_id"]
-    )  # add prefix
-
-    ## inner join jgi-emsl data on study (must be part of study)
-    temp1_df = pds.merge(
-        jgi_emsl_table_splice,
-        study_table_splice,
-        how="inner",
-        left_on="gold_study_id",
-        right_on="gold_id",
-    )
-
-    ## inner join emsl data on jgi-emsl proposal ids
-    temp2_df = pds.merge(emsl_table, temp1_df, how="inner", on="emsl_proposal_id")
-
-    ## add data obect id column
-    temp2_df["data_object_id"] = "output_"
-    temp2_df["data_object_id"] = temp2_df["data_object_id"] + temp2_df[
-        "dataset_id"
-    ].map(
-        str
-    )  # build data object id
-
-    ## add data object name column
-    temp2_df["data_object_name"] = "output: "
-    temp2_df["data_object_name"] = temp2_df["data_object_name"] + temp2_df[
-        "dataset_name"
-    ].map(
-        str
-    )  # build data object id
-
-    ## group concat & join the biosample ids that are inputs to the omics process
-    ## With filter function as None, the function defaults to Identity function,
-    ## and each element in random_list is checked if it's true or not.
-    ## see https://www.programiz.com/python-programming/methods/built-in/filter
-    groups = biosample_slice.groupby("dataset_id")["biosample_gold_id"]
-    input_biosamples = (
-        pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x))))
-        .drop_duplicates()
-        .reset_index()
-    )
-
-    input_biosamples.reset_index(inplace=True)  # make dataset_id a column
-    input_biosamples.rename(
-        columns={"biosample_gold_id": "biosample_gold_ids"}, inplace=True
-    )  # change column name
-
-    input_biosamples["biosample_gold_ids"] = input_biosamples[
-        "biosample_gold_ids"
-    ].astype(
-        str
-    )  # make sure biosample_ids are strings
-
-    temp2_df = pds.merge(temp2_df, input_biosamples, how="left", on="dataset_id")
-
-    ## add "emsl:TBD" id for missing biosamples
-    temp2_df["biosample_gold_ids"] = temp2_df["biosample_gold_ids"].map(
-        lambda x: "emsl:TBD" if pds.isnull(x) else x
-    )
-
-    ## add prefix
-    temp2_df.gold_id = "gold:" + temp2_df.gold_id
-    temp2_df.gold_study_id = "gold:" + temp2_df.gold_study_id
-    temp2_df.dataset_id = "emsl:" + temp2_df.dataset_id
-    temp2_df.data_object_id = "emsl:" + temp2_df.data_object_id
-
-    ## replace NaNs with None
-    temp2_df = temp2_df.where(pds.notnull(temp2_df), None)
-
-    ## drop duplicates
-    temp2_df.drop_duplicates(inplace=True)
-
-    if len(result_cols) > 0:
-        return temp2_df[result_cols]
-    else:
-        return temp2_df
-
-
-def make_data_objects_dataframe(
-    faa_table, fna_table, fastq_table, project_table, result_cols=[]
-):
-    ## subset data
-    project_table_splice = project_table[["gold_id"]].copy()
-
-    ## copy tables
-    faa_df = faa_table.copy()
-    fna_df = fna_table.copy()
-    fastq_df = fastq_table.copy()
-
-    ## add prefixes for faa, fna, and fastq files
-    faa_df.file_id = "nmdc:" + faa_df.file_id
-    fna_df.file_id = "nmdc:" + fna_df.file_id
-    fastq_df.file_id = "jgi:" + fastq_df.file_id
-
-    ## merge tables
-    data_objects = pds.concat([faa_df, fna_df, fastq_df], axis=0)
-
-    ## inner joing data objects (e.g., faa, fna, fasq) to projects
-    temp1_df = pds.merge(
-        data_objects,
-        project_table_splice,
-        how="inner",
-        left_on="gold_project_id",
-        right_on="gold_id",
-    )
-
-    ## add prefix for gold
-    temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id
-    temp1_df.gold_id = "gold:" + temp1_df.gold_id
-
-    if len(result_cols) > 0:
-        return temp1_df[result_cols]
-    else:
-        return temp1_df[data_objects.columns]
-
-
-def make_jgi_fastq_dataframe(fastq_table, project_table, result_cols=[]):
-    ## subset data
-    project_table_splice = project_table[["gold_id"]].copy()
-
-    ## copy tables
-    fastq_df = fastq_table.copy()
-
-    ## add prefixes for fastq file id
-    fastq_df.file_id = "jgi:" + fastq_df.file_id
-
-    ## inner join to projects
-    temp1_df = pds.merge(
-        fastq_df,
-        project_table_splice,
-        how="inner",
-        left_on="gold_project_id",
-        right_on="gold_id",
-    )
-
-    ## add prefix for gold
-    temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id
-    temp1_df.gold_id = "gold:" + temp1_df.gold_id
-
-    if len(result_cols) > 0:
-        return temp1_df[result_cols]
-    else:
-        return temp1_df[fastq_df.columns]
-
-
-def make_dataframe_from_spec_file(data_spec_file, nrows=None):
-    def make_df_from_file(data_source, nrows):
-        file_type = data_source["file_type"]
-        fname = data_source["file_name"]
-
-        if "file_archive_name" in data_source.keys():
-            farchive = data_source["file_archive_name"]
-            df = make_dataframe(
-                fname, file_archive_name=farchive, file_type=file_type, nrows=nrows
-            )
-        else:
-            df = make_dataframe(fname, file_type=file_type, nrows=nrows)
-
-        return df
-
-    def make_df(source, source_type="file_name"):
-        name = source[0]
-        data = source[1]
-        data_source = source[1]["data_source"]
-
-        if source_type not in data_source.keys():
-            return None
-
-        ## get data from file
-        if "file_name" in data_source.keys():
-            df = make_df_from_file(data_source, nrows=nrows)
-
-        ## add extra columns
-        if "append_columns" in data.keys():
-            for col in data["append_columns"]:
-                df[col["name"]] = col["value"]
-
-        ## filter rows by specific values
-        if "filters" in data.keys():
-            for fltr in data["filters"]:
-                if "include" in fltr:
-                    df = df[
-                        df[fltr["include"]["field"]].isin(fltr["include"]["values"])
-                    ]
-                elif "exclude" in fltr:
-                    df = df[
-                        ~df[fltr["exclude"]["field"]].isin(fltr["exclude"]["values"])
-                    ]
-                else:
-                    df = df[df[fltr["field"]].isin(fltr["values"])]
-
-        ## select a subset of the columns
-        if "subset_cols" in data.keys():
-            df = df[data["subset_cols"]]
-
-        ## rename columns
-        if "rename_slots" in data.keys():
-            for slot in data["rename_slots"]:
-                df.rename(columns={slot["old_name"]: slot["new_name"]}, inplace=True)
-
-        ## add 'nmdc_record_id' as a primary key
-        if "id_key" in data.keys():
-            df["nmdc_record_id"] = df[data["id_key"]]
-            df["nmdc_record_id"] = df["nmdc_record_id"].astype(
-                str
-            )  # ensure all keys are strings
-        else:
-            df.index.name = "nmdc_record_id"  # rename the current index
-            df.reset_index(inplace=True)  # turn index into a column
-            df["nmdc_record_id"] = df["nmdc_record_id"].astype(
-                str
-            )  # ensure all keys are strings
-
-        return df
-
-    with open(data_spec_file, "r") as input_file:
-        # spec = DottedDict(yaml.load(input_file, Loader=Loader))
-        spec = yaml.load(input_file, Loader=Loader)
-
-    Data_source = namedtuple("Data_source", "data name")
-
-    dataframes = []
-    for source in spec["data_sources"].items():
-        df = make_df(source)
-        ds = Data_source(df, source[0])
-        dataframes.append(ds)
-        print(source[0], len(df))
-
-    merged_df = merge_dataframes(dataframes)
-    return merged_df
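
For readers tracking what was dropped: the removed `merge_dataframes()` unpivoted each source table into entity-attribute-value (EAV) rows and stacked them into one frame. Below is a minimal, self-contained sketch of that pattern; the table contents are illustrative, not NMDC data. Note that the removed code's `"pandas.core.frame.DataFrame" == type(df)` test compared a string against a type (always false), and its `DataFrame.append()` call has since been removed from pandas, so the sketch uses `pandas.concat` instead.

```python
import pandas as pd

def to_eav(df: pd.DataFrame, source_name: str) -> pd.DataFrame:
    """Unpivot one table into entity-attribute-value (EAV) rows."""
    eav = df.melt(id_vars=["nmdc_record_id"], var_name="attribute")
    eav["nmdc_data_source"] = source_name
    return eav[["nmdc_data_source", "nmdc_record_id", "attribute", "value"]]

# Hypothetical source tables keyed by nmdc_record_id.
biosamples = pd.DataFrame(
    {"nmdc_record_id": ["b1", "b2"], "env_medium": ["ENVO:00001998", "ENVO:00002006"]}
)
studies = pd.DataFrame(
    {"nmdc_record_id": ["s1"], "principal_investigator_name": ["Doe"]}
)

# pd.concat replaces the DataFrame.append() call used in the removed code.
merged = pd.concat(
    [to_eav(biosamples, "gold_biosample_table"), to_eav(studies, "gold_study_table")],
    ignore_index=True,
)
print(merged)
```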
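
The removed `make_dataframe_from_spec_file()` drove the whole ETL from a YAML spec. The spec files themselves are not shown in this diff, but the keys the function read (`data_source`, `append_columns`, `filters`, `subset_cols`, `rename_slots`, `id_key`) imply a layout along the lines of the sketch below; the source name, file path, and values are hypothetical.

```python
import yaml

# Hypothetical spec text; only the key names are taken from the removed code.
spec_text = """
data_sources:
  study_table:
    data_source:
      file_name: data/study.tsv
      file_type: tsv
    id_key: study_id
    subset_cols: [study_id, gold_id, contact_id]
    filters:
      - include:
          field: ecosystem
          values: [Environmental]
    rename_slots:
      - old_name: gold_id
        new_name: study_gold_id
"""

# safe_load stands in for the yaml.load(..., Loader=CLoader) call in the
# removed code; each entry under data_sources became one dataframe, which
# merge_dataframes() then combined into a single EAV frame.
spec = yaml.safe_load(spec_text)
for name, data in spec["data_sources"].items():
    print(name, "->", data["data_source"]["file_name"], sorted(data.keys()))
```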