nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nmdc-runtime might be problematic.

Files changed (77)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,825 +0,0 @@
-## author: Bill Duncan
-## summary: Contains methods for creating dataframes needed for NMDC ETL pipeline.
-
-## system level modules
-import pandas as pds
-import zipfile
-import yaml
-from yaml import CLoader as Loader
-from collections import namedtuple
-
-
-def make_dataframe(
-    file_name,
-    subset_cols=[],
-    exclude_cols=[],
-    nrows=None,
-    lowercase_col_names=True,
-    replace_spaces=True,
-    replace_hash=True,
-    strip_spaces=True,
-    comment_str=None,
-    file_type="tsv",
-    delimiter="\t",
-    sheet_name=0,
-    file_archive_name="",
-):
-    """
-    Builds a pandas dataframe from the designated file.
-
-    Args:
-        file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name.
-        subset_cols: Specifies a specific of subset of columns to be included in the dataframe.
-        exclude_cols: Specifies a specific of subset of columns to be excluded from the dataframe.
-        nrows: Specifies the number of rows to be returned in the dataframe (useful for testing).
-        lowercase_col_names: If true, the column names are converted to lower case.
-        replace_spaces: If true, spaces in column names are replaced with underscores.
-        replace_hash: If true, hashes ('#') in column names are replaced with empty strings.
-        strip_spaces: If true, extra surrounding spaces are stripped from the column names.
-        comment_str: Specifies the string that is used for comments with the data.
-        file_type: Speicfies the type of file. Current acceptable file types are tsv, csv, and excel. Note that when using excel, you may need to specify a sheet name.
-        delimiter: Specifies the delimiter character used between fields.
-        sheet_name: If the files is an Excel spreadsheet, this parameter specifies a particular sheet.
-        archive_name: If the file_name is contained in an zip or file, this is the name of archive file.
-    Returns:
-        Pandas dataframe
-    """
-    ## normalize paramaters for use with pandas
-    if len(subset_cols) < 1:
-        subset_cols = None
-    if len(exclude_cols) < 1:
-        exclude_cols = None
-
-    ## check if file is contained in an archive
-    file_archive = None
-    if len(file_archive_name) > 1:
-        file_archive = zipfile.ZipFile(file_archive_name, "r")
-
-    ## load data from file
-    if "tsv" == file_type.lower() or "csv" == file_type.lower():
-        if None != file_archive:
-            df = pds.read_csv(
-                file_archive.open(file_name),
-                sep=delimiter,
-                nrows=nrows,
-                comment=comment_str,
-            )
-        else:
-            df = pds.read_csv(
-                file_name, sep=delimiter, nrows=nrows, comment=comment_str
-            )
-    elif "excel" == file_type.lower():
-        if None != file_archive:
-            df = pds.read_excel(
-                file_archive.open(file_name),
-                sheet_name=sheet_name,
-                nrows=nrows,
-                comment=comment_str,
-                engine="openpyxl",
-            )
-        else:
-            df = pds.read_excel(
-                file_name,
-                sheet_name=sheet_name,
-                nrows=nrows,
-                comment=comment_str,
-                engine="openpyxl",
-            )
-    elif "multi-sheet-excel" == file_type.lower():
-        if None != file_archive:
-            df = pds.concat(
-                pds.read_excel(
-                    file_archive.open(file_name),
-                    sheet_name=None,
-                    index_col=None,
-                    nrows=nrows,
-                    comment=comment_str,
-                    engine="openpyxl",
-                )
-            )
-        else:
-            df = pds.concat(
-                pds.read_excel(
-                    file_name,
-                    sheet_name=None,
-                    index_col=None,
-                    nrows=nrows,
-                    comment=comment_str,
-                    engine="openpyxl",
-                )
-            )
-
-    ## clean column names
-    df = clean_dataframe_column_names(
-        df, lowercase_col_names, replace_spaces, replace_hash, strip_spaces
-    )
-
-    ## create subset of columns
-    ## note: since column names are case sensitive, this needs to happen after cleaning column names
-    if subset_cols:
-        df = df[subset_cols]
-
-    ## return dataframe
-    return df
-
-
-def clean_dataframe_column_names(
-    df,
-    lowercase_col_names=True,
-    replace_spaces=True,
-    replace_hash=True,
-    strip_spaces=True,
-):
-    """
-    Changes the column names of a dataframe into a standard format. The default settings change the column names to:
-    - lower case
-    - replace spaces with underscores
-    - replace hash ('#') with empty string
-    Args:
-        df: The dataframe whose columns will be cleaned.
-        lowercase_col_names: If true, the column names are converted to lower case.
-        replace_spaces: If true, spaces in column names are replaced with underscores.
-        replace_hash: If true, hashes ('#') in column names are replaced with empty strings.
-        strip_spaces: If true, extra surrounding spaces are stripped from the column names.
-    Returns:
-        Pandas dataframe
-    """
-
-    ## clean column names
-    if lowercase_col_names:
-        df.columns = [c.strip().lower() for c in df.columns]
-
-    if replace_spaces:
-        df.columns = [c.replace(" ", "_") for c in df.columns]
-
-    if replace_hash:
-        df.columns = [c.replace("#", "") for c in df.columns]
-
-    if strip_spaces:
-        df.columns = [c.strip() for c in df.columns]
-
-    return df
-
-
-def merge_dataframes(dataframes: list, data_source_names=[]):
-    merged_df = pds.DataFrame(
-        columns=["nmdc_data_source", "nmdc_record_id", "attribute", "value"]
-    )
-
-    for idx, df in enumerate(dataframes):
-        if "pandas.core.frame.DataFrame" == type(df):
-            data_source_name = data_source_names[idx]
-            data = df
-        else:
-            data_source_name = df.name
-            data = df.data
-
-        ## convert data into an EAV structure
-        eavdf = data.melt(id_vars=["nmdc_record_id"], var_name="attribute")
-        eavdf["nmdc_data_source"] = data_source_name
-        # print(data_source_name, len(eavdf))
-
-        merged_df = merged_df.append(eavdf, ignore_index=True)
-
-    return merged_df
-
-
-def make_dataframe_dictionary(
-    file_name,
-    subset_cols=[],
-    exclude_cols=[],
-    nrows=None,
-    lowercase_col_names=True,
-    replace_spaces=True,
-    file_type="tsv",
-    delimiter="\t",
-    sheet_name=0,
-    file_archive_name="",
-):
-    """
-    Builds a dictionary based on the structure of the pandas dataframe generated from the designated file.
-    The dictionary is oriented for records.
-    E.g.:
-      [
-        {
-          'col1': 1,
-          'col2': 0.5
-        },
-        {
-          'col1': 2,
-          'col2': 0.75
-        }
-      ]
-
-    Essentially, this function is a shortcut for calling make_dataframe() and then transforming the result into a dictionary.
-    E.g.:
-      df = make_dataframe(file_name)
-      dictdf = dictdf = df.to_dict(orient="records")
-
-
-    Args:
-        file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name.
-        subset_cols: Specifies a specific of subset of columns to be included in the dataframe.
-        exclude_cols: Specifies a specific of subset of columns to be excluded from the dataframe.
-        nrows: Specifies the number of rows to be returned in the dataframe (useful for testing).
-        lowercase_col_names: If true, the column names are converted to lower case.
-        replace_spaces: If true, spaces in column names are replaced with spaces.
-        file_type: Speicfies the type of file. Current acceptable file types are tsv, csv, and excel. Note that when using excel, you may need to specify a sheet name.
-        delimiter: Specifies the delimiter character used between fields.
-        sheet_name: If the files is an Excel spreadsheet, this parameter specifies a particular sheet.
-        archive_name: If the file_name is contained in an zip or file, this is the name of archive file.
-    Returns:
-        Dictionary built from a Pandas dataframe.
-    """
-    df = make_dataframe(
-        file_name,
-        subset_cols=[],
-        exclude_cols=[],
-        nrows=None,
-        lowercase_col_names=True,
-        replace_spaces=True,
-        file_type="tsv",
-        delimiter=delimiter,
-        sheet_name=sheet_name,
-        file_archive_name=file_archive_name,
-    )
-    return df.to_dict(orient="records")
-
-
-def make_collection_date(year_val, month_val, day_val, hour_val="", minute_val=""):
-    def pad_value(val, pad_len=2):
-        s = str(val)
-        return s.zfill(pad_len)
-
-    return_val = ""
-    year_val = year_val.strip()
-    month_val = month_val.strip()
-    day_val = day_val.strip()
-    hour_val = hour_val.strip()
-    minute_val = minute_val.strip()
-    return_val = ""
-
-    ## if a year isn't provided simply return the empty string
-    if len(year_val) < 1:
-        return ""
-    else:
-        return_val = pad_value(year_val, 4)
-
-    if len(month_val) > 0:
-        return_val = return_val + "-" + pad_value(month_val)
-
-    ## we only days that have months assocated with them
-    if (len(month_val) > 0) and (len(day_val) > 0):
-        return_val = return_val + "-" + pad_value(day_val)
-
-    ## we only want times with months and days associated with them
-    if (len(month_val) > 0) and (len(day_val) > 0):
-        if (len(hour_val) > 0) and (len(minute_val) > 0):
-            return_val = return_val + "T" + pad_value(hour_val) + ":" + minute_val
-        elif len(hour_val) > 0:
-            return_val = (
-                return_val + "T" + pad_value(hour_val) + "00"
-            ) # case for when no minute val is given
-
-    return return_val
-
-
-def make_lat_lon(latitude, longitude):
-    # latitude = "" if pds.isnull(latitude) else str(latitude).strip().replace('\n', '')
-    # longitude = "" if pds.isnull(longitude) else str(longitude).strip().replace('\n', '')
-    latitude = None if pds.isnull(latitude) else float(latitude)
-    longitude = None if pds.isnull(longitude) else float(longitude)
-
-    if (not (latitude is None)) and (not (longitude is None)):
-        return f"{latitude} {longitude}".strip()
-    else:
-        return None
-
-
-def make_study_dataframe(study_table, contact_table, proposals_table, result_cols=[]):
-    ## subset dataframes
-    contact_table_splice = contact_table[
-        ["contact_id", "principal_investigator_name"]
-    ].copy()
-    proposals_table_splice = proposals_table[["gold_study", "doi"]].copy()
-
-    ## make sure the contact ids are strings with the ".0" removed from the end (i.e., the strings aren't floats)
-    study_table["contact_id"] = (
-        study_table["contact_id"].astype(str).replace(r"\.0", "", regex=True)
-    )
-    contact_table_splice["contact_id"] = (
-        contact_table_splice["contact_id"].astype(str).replace(r"\.0", "", regex=True)
-    )
-    # print(study_table[['contact_id', 'principal_investigator_name']].head())
-
-    ## left join data from contact
-    temp1_df = pds.merge(study_table, contact_table_splice, how="left", on="contact_id")
-
-    ## left join data from proposals
-    temp2_df = pds.merge(
-        temp1_df,
-        proposals_table_splice,
-        how="left",
-        left_on="gold_id",
-        right_on="gold_study",
-    )
-
-    ## add prefix
-    temp2_df.gold_id = "gold:" + temp2_df.gold_id
-    temp2_df.gold_study = "gold:" + temp2_df.gold_study
-
-    if len(result_cols) > 0:
-        return temp2_df[result_cols]
-    else:
-        return temp2_df
-
-
-def make_project_dataframe(
-    project_table,
-    study_table,
-    contact_table,
-    data_object_table,
-    project_biosample_table=None,
-    biosample_table=None,
-    result_cols=[],
-):
-    ## subset data
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-    contact_table_splice = contact_table[
-        ["contact_id", "principal_investigator_name"]
-    ].copy()
-
-    ## rename study.gold_id to study_gold_id
-    study_table_splice.rename(columns={"gold_id": "study_gold_id"}, inplace=True)
-
-    ## inner join on study (project must be part of study)
-    temp1_df = pds.merge(
-        project_table,
-        study_table_splice,
-        how="inner",
-        left_on="master_study_id",
-        right_on="study_id",
-    )
-
-    ## left join contact data
-    temp2_df = pds.merge(
-        temp1_df,
-        contact_table_splice,
-        how="left",
-        left_on="pi_id",
-        right_on="contact_id",
-    )
-
-    ## add prefix
-    temp2_df.gold_id = "gold:" + temp2_df.gold_id
-    temp2_df.study_gold_id = "gold:" + temp2_df.study_gold_id
-
-    ## if present join data objects as output of project
-    if not (data_object_table is None):
-        ## make copy and add prefix
-        data_object_table = data_object_table.copy()
-        data_object_table.gold_project_id = data_object_table.gold_project_id.map(
-            lambda x: x if "gold:" == x[0:5] else "gold:" + x
-        )
-
-        ## create a group concat for all file ids in the data objects
-        groups = data_object_table.groupby("gold_project_id")["file_id"]
-        output_files = (
-            pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x))))
-            .drop_duplicates()
-            .reset_index()
-        )
-        output_files.rename(columns={"file_id": "output_file_ids"}, inplace=True)
-        output_files["output_file_ids"] = output_files["output_file_ids"].astype(str)
-
-        ## left join output files for projects
-        temp2_df = pds.merge(
-            temp2_df,
-            output_files,
-            how="left",
-            left_on="gold_id",
-            right_on="gold_project_id",
-        )
-
-    ## if present join biosamples as inputs to project
-    if (not (project_biosample_table is None)) and (not (biosample_table is None)):
-        ## make local copies & rename column
-        project_biosample_table = project_biosample_table.copy()
-        biosample_table = biosample_table[["biosample_id", "gold_id"]].copy()
-        biosample_table.rename(columns={"gold_id": "biosample_gold_id"}, inplace=True)
-
-        ## add prefix
-        biosample_table["biosample_gold_id"] = biosample_table["biosample_gold_id"].map(
-            lambda x: x if "gold:" == x[0:5] else "gold:" + x
-        )
-
-        ## join project biosamples to biosamples
-        input_samples = pds.merge(
-            project_biosample_table, biosample_table, how="inner", on="biosample_id"
-        )
-        # require input samples (i.e., inner join)
-        temp2_df = pds.merge(temp2_df, input_samples, how="inner", on="project_id")
-
-    if len(result_cols) > 0:
-        return temp2_df[result_cols]
-    else:
-        return temp2_df
-
-
-def make_biosample_dataframe(
-    biosample_table,
-    soil_package_table,
-    water_package_table,
-    project_biosample_table,
-    project_table,
-    study_table,
-    result_cols=[],
-):
-    def make_collection_date_from_row(row):
-        def _format_date_part_value(val):
-            if pds.isnull(val):
-                return ""
-
-            if type("") == type(val):
-                if "." in val:
-                    return val[0 : val.find(".")].strip()
-                else:
-                    return val.strip()
-            else:
-                return str(int(val)).strip()
-
-        year_val = _format_date_part_value(row["sample_collection_year"])
-        month_val = _format_date_part_value(row["sample_collection_month"])
-        day_val = _format_date_part_value(row["sample_collection_day"])
-        hour_val = _format_date_part_value(row["sample_collection_hour"])
-        minute_val = _format_date_part_value(row["sample_collection_minute"])
-
-        return make_collection_date(year_val, month_val, day_val, hour_val, minute_val)
-
-    ## subset data
-    project_biosample_table_splice = project_biosample_table[
-        ["biosample_id", "project_id"]
-    ].copy()
-    project_table_splice = project_table[
-        ["project_id", "gold_id", "master_study_id"]
-    ].copy()
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-
-    ## add prefix
-    project_table_splice.gold_id = "gold:" + project_table_splice.gold_id
-    study_table_splice.gold_id = "gold:" + study_table_splice.gold_id
-
-    ## rename columns
-    project_table_splice.rename(columns={"gold_id": "project_gold_id"}, inplace=True)
-    study_table_splice.rename(columns={"gold_id": "study_gold_id"}, inplace=True)
-
-    ## inner join projects and studies
-    project_table_splice = pds.merge(
-        project_table_splice,
-        study_table_splice,
-        how="inner",
-        left_on="master_study_id",
-        right_on="study_id",
-    )
-
-    ## drop biosample rows that don't have required fields
-    biosample_table = biosample_table[biosample_table["env_broad_scale"].notnull()]
-    biosample_table = biosample_table[biosample_table["env_local_scale"].notnull()]
-    biosample_table = biosample_table[biosample_table["env_medium"].notnull()]
-
-    ## left join package tables to biosample table
-    temp0_df = pds.merge(
-        biosample_table, soil_package_table, how="left", on="soil_package_id"
-    )
-    temp0_df = pds.merge(
-        temp0_df, water_package_table, how="left", on="water_package_id"
-    )
-
-    ## inner join on project_biosample and project; i.e., biosamples must be linked to project
-    temp1_df = pds.merge(
-        temp0_df, project_biosample_table_splice, how="inner", on="biosample_id"
-    )
-    temp2_df = pds.merge(temp1_df, project_table_splice, how="inner", on="project_id")
-
-    ## add collection date and lat_lon columns
-    temp2_df["collection_date"] = temp2_df.apply(
-        lambda row: make_collection_date_from_row(row), axis=1
-    )
-    temp2_df["lat_lon"] = temp2_df.apply(
-        lambda row: make_lat_lon(row.latitude, row.longitude), axis=1
-    )
-
-    ## convert latitude and longitute columns to floats
-    temp2_df["latitude"] = temp2_df["latitude"].map(
-        lambda x: None if pds.isnull(x) else float(x)
-    )
-    temp2_df["longitude"] = temp2_df["longitude"].map(
-        lambda x: None if pds.isnull(x) else float(x)
-    )
-
-    ## add gold prefix
-    temp2_df["gold_id"] = "gold:" + temp2_df["gold_id"]
-
-    ## biosample might belong to more than one project; so do the equivalent of a group_cat
-    ## see: https://queirozf.com/entries/pandas-dataframe-groupby-examples
-    ## see: https://stackoverflow.com/questions/18138693/replicating-group-concat-for-pandas-dataframe
-    groups = (
-        temp2_df.groupby("biosample_id")["project_gold_id"]
-        .apply(lambda pid: ",".join(filter(None, pid)))
-        .reset_index()
-    )
-    groups.rename(columns={"project_gold_id": "project_gold_ids"}, inplace=True)
-
-    # join concat groups to dataframe
-    temp3_df = pds.merge(temp2_df, groups, how="left", on="biosample_id")
-
-    ## A biosample may belong to multiple projects
-    # E.g. see biosample_id 247352 with gold_id "Gb0247352", belongs to projects 467278, 467306
-    ## So, remove uneeded columns & drop dups
-    temp3_df.drop(columns=["project_gold_id"], inplace=True)
-    temp3_df.drop(columns=["project_id"], inplace=True)
-    temp3_df.drop_duplicates(inplace=True)
-
-    ## for 'env_broad_scale', 'env_local_scale', 'env_medium' fields change 'ENVO_' to 'ENVO:'
-    # temp3_df['env_broad_scale'] = temp3_df
-    for idx in temp3_df.index:
-        if pds.notnull(temp3_df.loc[idx, "env_broad_scale"]):
-            temp3_df.loc[idx, "env_broad_scale"] = str(
-                temp3_df.loc[idx, "env_broad_scale"]
-            ).replace("_", ":", 1)
-        if pds.notnull(temp3_df.loc[idx, "env_local_scale"]):
-            temp3_df.loc[idx, "env_local_scale"] = str(
-                temp3_df.loc[idx, "env_local_scale"]
-            ).replace("_", ":", 1)
-        if pds.notnull(temp3_df.loc[idx, "env_medium"]):
-            temp3_df.loc[idx, "env_medium"] = str(
-                temp3_df.loc[idx, "env_medium"]
-            ).replace("_", ":", 1)
-
-    if len(result_cols) > 0:
-        return temp3_df[result_cols]
-    else:
-        return temp3_df
-
-
-def make_jgi_emsl_dataframe(jgi_emsl_table, study_table, result_cols=[]):
-    ## subset data
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-
-    ## inner join jgi-emsl data on study (must be part of study)
-    temp1_df = pds.merge(
-        jgi_emsl_table,
-        study_table_splice,
-        how="inner",
-        left_on="gold_study_id",
-        right_on="gold_id",
-    )
-
-    ## add prefix
-    temp1_df.gold_id = "gold:" + temp1_df.gold_id
-    temp1_df.gold_study_id = "gold:" + temp1_df.gold_study_id
-
-    if len(result_cols) > 0:
-        return temp1_df[result_cols]
-    else:
-        return temp1_df
-
-
-def make_emsl_dataframe(
-    emsl_table, jgi_emsl_table, study_table, emsl_biosample_table, result_cols=[]
-):
-    ## subset data
-    study_table_splice = study_table[["study_id", "gold_id"]].copy()
-    jgi_emsl_table_splice = jgi_emsl_table[["gold_study_id", "emsl_proposal_id"]].copy()
-    biosample_slice = emsl_biosample_table[["dataset_id", "biosample_gold_id"]].copy()
-    biosample_slice["biosample_gold_id"] = (
-        "gold:" + biosample_slice["biosample_gold_id"]
-    ) # add prefix
-
-    ## inner join jgi-emsl data on study (must be part of study)
-    temp1_df = pds.merge(
-        jgi_emsl_table_splice,
-        study_table_splice,
-        how="inner",
-        left_on="gold_study_id",
-        right_on="gold_id",
-    )
-
-    ## inner join emsl data on jgi-emsl proposal ids
-    temp2_df = pds.merge(emsl_table, temp1_df, how="inner", on="emsl_proposal_id")
-
-    ## add data obect id column
-    temp2_df["data_object_id"] = "output_"
-    temp2_df["data_object_id"] = temp2_df["data_object_id"] + temp2_df[
-        "dataset_id"
-    ].map(
-        str
-    ) # build data object id
-
-    ## add data object name column
-    temp2_df["data_object_name"] = "output: "
-    temp2_df["data_object_name"] = temp2_df["data_object_name"] + temp2_df[
-        "dataset_name"
-    ].map(
-        str
-    ) # build data object id
-
-    ## group concat & join the biosample ids that are inputs to the omics process
-    ## With filter function as None, the function defaults to Identity function,
-    ## and each element in random_list is checked if it's true or not.
-    ## see https://www.programiz.com/python-programming/methods/built-in/filter
-    groups = biosample_slice.groupby("dataset_id")["biosample_gold_id"]
-    input_biosamples = (
-        pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x))))
-        .drop_duplicates()
-        .reset_index()
-    )
-
-    input_biosamples.reset_index(inplace=True) # make dataset_id a column
-    input_biosamples.rename(
-        columns={"biosample_gold_id": "biosample_gold_ids"}, inplace=True
-    ) # change column name
-
-    input_biosamples["biosample_gold_ids"] = input_biosamples[
-        "biosample_gold_ids"
-    ].astype(
-        str
-    ) # make sure biosample_ids are strings
-
-    temp2_df = pds.merge(temp2_df, input_biosamples, how="left", on="dataset_id")
-
-    ## add "emsl:TBD" id for missing biosamples
-    temp2_df["biosample_gold_ids"] = temp2_df["biosample_gold_ids"].map(
-        lambda x: "emsl:TBD" if pds.isnull(x) else x
-    )
-
-    ## add prefix
-    temp2_df.gold_id = "gold:" + temp2_df.gold_id
-    temp2_df.gold_study_id = "gold:" + temp2_df.gold_study_id
-    temp2_df.dataset_id = "emsl:" + temp2_df.dataset_id
-    temp2_df.data_object_id = "emsl:" + temp2_df.data_object_id
-
-    ## replace NaNs with None
-    temp2_df = temp2_df.where(pds.notnull(temp2_df), None)
-
-    ## drop duplicates
-    temp2_df.drop_duplicates(inplace=True)
-
-    if len(result_cols) > 0:
-        return temp2_df[result_cols]
-    else:
-        return temp2_df
-
-
-def make_data_objects_dataframe(
-    faa_table, fna_table, fastq_table, project_table, result_cols=[]
-):
-    ## subset data
-    project_table_splice = project_table[["gold_id"]].copy()
-
-    ## copy tables
-    faa_df = faa_table.copy()
-    fna_df = fna_table.copy()
-    fastq_df = fastq_table.copy()
-
-    ## add prefixes for faa, fna, and fastq files
-    faa_df.file_id = "nmdc:" + faa_df.file_id
-    fna_df.file_id = "nmdc:" + fna_df.file_id
-    fastq_df.file_id = "jgi:" + fastq_df.file_id
-
-    ## merge tables
-    data_objects = pds.concat([faa_df, fna_df, fastq_df], axis=0)
-
-    ## inner joing data objects (e.g., faa, fna, fasq) to projects
-    temp1_df = pds.merge(
-        data_objects,
-        project_table_splice,
-        how="inner",
-        left_on="gold_project_id",
-        right_on="gold_id",
-    )
-
-    ## add prefix for gold
-    temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id
-    temp1_df.gold_id = "gold:" + temp1_df.gold_id
-
-    if len(result_cols) > 0:
-        return temp1_df[result_cols]
-    else:
-        return temp1_df[data_objects.columns]
-
-
-def make_jgi_fastq_dataframe(fastq_table, project_table, result_cols=[]):
-    ## subset data
-    project_table_splice = project_table[["gold_id"]].copy()
-
-    ## copy tables
-    fastq_df = fastq_table.copy()
-
-    ## add prefixes for fastq file id
-    fastq_df.file_id = "jgi:" + fastq_df.file_id
-
-    ## inner join to projects
-    temp1_df = pds.merge(
-        fastq_df,
-        project_table_splice,
-        how="inner",
-        left_on="gold_project_id",
-        right_on="gold_id",
-    )
-
-    ## add prefix for gold
-    temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id
-    temp1_df.gold_id = "gold:" + temp1_df.gold_id
-
-    if len(result_cols) > 0:
-        return temp1_df[result_cols]
-    else:
-        return temp1_df[fastq_df.columns]
-
-
-def make_dataframe_from_spec_file(data_spec_file, nrows=None):
-    def make_df_from_file(data_source, nrows):
-        file_type = data_source["file_type"]
-        fname = data_source["file_name"]
-
-        if "file_archive_name" in data_source.keys():
-            farchive = data_source["file_archive_name"]
-            df = make_dataframe(
-                fname, file_archive_name=farchive, file_type=file_type, nrows=nrows
-            )
-        else:
-            df = make_dataframe(fname, file_type=file_type, nrows=nrows)
-
-        return df
-
-    def make_df(source, source_type="file_name"):
-        name = source[0]
-        data = source[1]
-        data_source = source[1]["data_source"]
-
-        if source_type not in data_source.keys():
-            return None
-
-        ## get data from file
-        if "file_name" in data_source.keys():
-            df = make_df_from_file(data_source, nrows=nrows)
-
-        ## add extra columns
-        if "append_columns" in data.keys():
-            for col in data["append_columns"]:
-                df[col["name"]] = col["value"]
-
-        ## filter rows by specific values
-        if "filters" in data.keys():
-            for fltr in data["filters"]:
-                if "include" in fltr:
-                    df = df[
-                        df[fltr["include"]["field"]].isin(fltr["include"]["values"])
-                    ]
-                elif "exclude" in fltr:
-                    df = df[
-                        ~df[fltr["exclude"]["field"]].isin(fltr["exclude"]["values"])
-                    ]
-                else:
-                    df = df[df[fltr["field"]].isin(fltr["values"])]
-
-        ## select a subset of the columns
-        if "subset_cols" in data.keys():
-            df = df[data["subset_cols"]]
-
-        ## rename columns
-        if "rename_slots" in data.keys():
-            for slot in data["rename_slots"]:
-                df.rename(columns={slot["old_name"]: slot["new_name"]}, inplace=True)
-
-        ## add 'nmdc_record_id' as a primary key
-        if "id_key" in data.keys():
-            df["nmdc_record_id"] = df[data["id_key"]]
-            df["nmdc_record_id"] = df["nmdc_record_id"].astype(
-                str
-            ) # ensure all keys are strings
-        else:
-            df.index.name = "nmdc_record_id" # rename the current index
-            df.reset_index(inplace=True) # turn index into a column
-            df["nmdc_record_id"] = df["nmdc_record_id"].astype(
-                str
-            ) # ensure all keys are strings
-
-        return df
-
-    with open(data_spec_file, "r") as input_file:
-        # spec = DottedDict(yaml.load(input_file, Loader=Loader))
-        spec = yaml.load(input_file, Loader=Loader)
-
-    Data_source = namedtuple("Data_source", "data name")
-
-    dataframes = []
-    for source in spec["data_sources"].items():
-        df = make_df(source)
-        ds = Data_source(df, source[0])
-        dataframes.append(ds)
-        print(source[0], len(df))
-
-    merged_df = merge_dataframes(dataframes)
-    return merged_df
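
For context on what this deletion removes: the module's top-level entry point was make_dataframe_from_spec_file, which loads a YAML spec of data_sources, builds one dataframe per source with make_dataframe, and melts everything into a single long-format (EAV) table via merge_dataframes. The sketch below is an illustrative reconstruction based only on the deleted code above; the spec keys mirror the ones make_df() reads (data_source.file_name, file_type, subset_cols, id_key), but the file names and values are hypothetical, and the import only resolves against nmdc-runtime 2.10.0 or earlier, where nmdc_runtime.lib still exists.

    # Illustrative sketch, not code from the NMDC repository.
    # Assumes nmdc-runtime <= 2.10.0 (nmdc_runtime.lib.nmdc_dataframes present)
    # and pandas < 2.0, since the deleted code still calls DataFrame.append.
    import yaml

    from nmdc_runtime.lib.nmdc_dataframes import make_dataframe_from_spec_file

    # Hypothetical spec with a single TSV data source.
    spec = {
        "data_sources": {
            "study_table": {
                "data_source": {"file_name": "study.tsv", "file_type": "tsv"},
                "subset_cols": ["study_id", "gold_id", "contact_id"],
                "id_key": "study_id",  # becomes the nmdc_record_id column
            }
        }
    }

    with open("etl_spec.yaml", "w") as f:
        yaml.dump(spec, f)

    # Result is one row per (record, attribute) pair, with columns
    # nmdc_data_source, nmdc_record_id, attribute, value.
    merged_df = make_dataframe_from_spec_file("etl_spec.yaml", nrows=100)
    print(merged_df.head())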