nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/lib/nmdc_dataframes.py (deleted)
@@ -1,829 +0,0 @@
- ## author: Bill Duncan
- ## summary: Contains methods for creating dataframes needed for NMDC ETL pipeline.
-
- ## system level modules
- import pandas as pds
- import jsonasobj
- import json
- import zipfile
- import yaml
- from pandas.core.dtypes.missing import notnull
- from yaml import CLoader as Loader, CDumper as Dumper
- from dotted_dict import DottedDict
- from collections import namedtuple
-
-
- def make_dataframe(
-     file_name,
-     subset_cols=[],
-     exclude_cols=[],
-     nrows=None,
-     lowercase_col_names=True,
-     replace_spaces=True,
-     replace_hash=True,
-     strip_spaces=True,
-     comment_str=None,
-     file_type="tsv",
-     delimiter="\t",
-     sheet_name=0,
-     file_archive_name="",
- ):
-     """
-     Builds a pandas dataframe from the designated file.
-
-     Args:
-         file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name.
-         subset_cols: Specifies a specific of subset of columns to be included in the dataframe.
-         exclude_cols: Specifies a specific of subset of columns to be excluded from the dataframe.
-         nrows: Specifies the number of rows to be returned in the dataframe (useful for testing).
-         lowercase_col_names: If true, the column names are converted to lower case.
-         replace_spaces: If true, spaces in column names are replaced with underscores.
-         replace_hash: If true, hashes ('#') in column names are replaced with empty strings.
-         strip_spaces: If true, extra surrounding spaces are stripped from the column names.
-         comment_str: Specifies the string that is used for comments with the data.
-         file_type: Speicfies the type of file. Current acceptable file types are tsv, csv, and excel. Note that when using excel, you may need to specify a sheet name.
-         delimiter: Specifies the delimiter character used between fields.
-         sheet_name: If the files is an Excel spreadsheet, this parameter specifies a particular sheet.
-         archive_name: If the file_name is contained in an zip or file, this is the name of archive file.
-     Returns:
-         Pandas dataframe
-     """
-     ## normalize paramaters for use with pandas
-     if len(subset_cols) < 1:
-         subset_cols = None
-     if len(exclude_cols) < 1:
-         exclude_cols = None
-
-     ## check if file is contained in an archive
-     file_archive = None
-     if len(file_archive_name) > 1:
-         file_archive = zipfile.ZipFile(file_archive_name, "r")
-
-     ## load data from file
-     if "tsv" == file_type.lower() or "csv" == file_type.lower():
-         if None != file_archive:
-             df = pds.read_csv(
-                 file_archive.open(file_name),
-                 sep=delimiter,
-                 nrows=nrows,
-                 comment=comment_str,
-             )
-         else:
-             df = pds.read_csv(
-                 file_name, sep=delimiter, nrows=nrows, comment=comment_str
-             )
-     elif "excel" == file_type.lower():
-         if None != file_archive:
-             df = pds.read_excel(
-                 file_archive.open(file_name),
-                 sheet_name=sheet_name,
-                 nrows=nrows,
-                 comment=comment_str,
-                 engine="openpyxl",
-             )
-         else:
-             df = pds.read_excel(
-                 file_name,
-                 sheet_name=sheet_name,
-                 nrows=nrows,
-                 comment=comment_str,
-                 engine="openpyxl",
-             )
-     elif "multi-sheet-excel" == file_type.lower():
-         if None != file_archive:
-             df = pds.concat(
-                 pds.read_excel(
-                     file_archive.open(file_name),
-                     sheet_name=None,
-                     index_col=None,
-                     nrows=nrows,
-                     comment=comment_str,
-                     engine="openpyxl",
-                 )
-             )
-         else:
-             df = pds.concat(
-                 pds.read_excel(
-                     file_name,
-                     sheet_name=None,
-                     index_col=None,
-                     nrows=nrows,
-                     comment=comment_str,
-                     engine="openpyxl",
-                 )
-             )
-
-     ## clean column names
-     df = clean_dataframe_column_names(
-         df, lowercase_col_names, replace_spaces, replace_hash, strip_spaces
-     )
-
-     ## create subset of columns
-     ## note: since column names are case sensitive, this needs to happen after cleaning column names
-     if subset_cols:
-         df = df[subset_cols]
-
-     ## return dataframe
-     return df
-
-
- def clean_dataframe_column_names(
-     df,
-     lowercase_col_names=True,
-     replace_spaces=True,
-     replace_hash=True,
-     strip_spaces=True,
- ):
-     """
-     Changes the column names of a dataframe into a standard format. The default settings change the column names to:
-     - lower case
-     - replace spaces with underscores
-     - replace hash ('#') with empty string
-     Args:
-         df: The dataframe whose columns will be cleaned.
-         lowercase_col_names: If true, the column names are converted to lower case.
-         replace_spaces: If true, spaces in column names are replaced with underscores.
-         replace_hash: If true, hashes ('#') in column names are replaced with empty strings.
-         strip_spaces: If true, extra surrounding spaces are stripped from the column names.
-     Returns:
-         Pandas dataframe
-     """
-
-     ## clean column names
-     if lowercase_col_names:
-         df.columns = [c.strip().lower() for c in df.columns]
-
-     if replace_spaces:
-         df.columns = [c.replace(" ", "_") for c in df.columns]
-
-     if replace_hash:
-         df.columns = [c.replace("#", "") for c in df.columns]
-
-     if strip_spaces:
-         df.columns = [c.strip() for c in df.columns]
-
-     return df
-
-
- def merge_dataframes(dataframes: list, data_source_names=[]):
-     merged_df = pds.DataFrame(
-         columns=["nmdc_data_source", "nmdc_record_id", "attribute", "value"]
-     )
-
-     for idx, df in enumerate(dataframes):
-         if "pandas.core.frame.DataFrame" == type(df):
-             data_source_name = data_source_names[idx]
-             data = df
-         else:
-             data_source_name = df.name
-             data = df.data
-
-         ## convert data into an EAV structure
-         eavdf = data.melt(id_vars=["nmdc_record_id"], var_name="attribute")
-         eavdf["nmdc_data_source"] = data_source_name
-         # print(data_source_name, len(eavdf))
-
-         merged_df = merged_df.append(eavdf, ignore_index=True)
-
-     return merged_df
-
-
- def make_dataframe_dictionary(
-     file_name,
-     subset_cols=[],
-     exclude_cols=[],
-     nrows=None,
-     lowercase_col_names=True,
-     replace_spaces=True,
-     file_type="tsv",
-     delimiter="\t",
-     sheet_name=0,
-     file_archive_name="",
- ):
-     """
-     Builds a dictionary based on the structure of the pandas dataframe generated from the designated file.
-     The dictionary is oriented for records.
-     E.g.:
-     [
-         {
-             'col1': 1,
-             'col2': 0.5
-         },
-         {
-             'col1': 2,
-             'col2': 0.75
-         }
-     ]
-
-     Essentially, this function is a shortcut for calling make_dataframe() and then transforming the result into a dictionary.
-     E.g.:
-         df = make_dataframe(file_name)
-         dictdf = dictdf = df.to_dict(orient="records")
-
-
-     Args:
-         file_name: The name of the file containing the data for the dataframe. If the file is not in the same directory, then specify the path as part of the file name.
-         subset_cols: Specifies a specific of subset of columns to be included in the dataframe.
-         exclude_cols: Specifies a specific of subset of columns to be excluded from the dataframe.
-         nrows: Specifies the number of rows to be returned in the dataframe (useful for testing).
-         lowercase_col_names: If true, the column names are converted to lower case.
-         replace_spaces: If true, spaces in column names are replaced with spaces.
-         file_type: Speicfies the type of file. Current acceptable file types are tsv, csv, and excel. Note that when using excel, you may need to specify a sheet name.
-         delimiter: Specifies the delimiter character used between fields.
-         sheet_name: If the files is an Excel spreadsheet, this parameter specifies a particular sheet.
-         archive_name: If the file_name is contained in an zip or file, this is the name of archive file.
-     Returns:
-         Dictionary built from a Pandas dataframe.
-     """
-     df = make_dataframe(
-         file_name,
-         subset_cols=[],
-         exclude_cols=[],
-         nrows=None,
-         lowercase_col_names=True,
-         replace_spaces=True,
-         file_type="tsv",
-         delimiter=delimiter,
-         sheet_name=sheet_name,
-         file_archive_name=file_archive_name,
-     )
-     return df.to_dict(orient="records")
-
-
- def make_collection_date(year_val, month_val, day_val, hour_val="", minute_val=""):
-     def pad_value(val, pad_len=2):
-         s = str(val)
-         return s.zfill(pad_len)
-
-     return_val = ""
-     year_val = year_val.strip()
-     month_val = month_val.strip()
-     day_val = day_val.strip()
-     hour_val = hour_val.strip()
-     minute_val = minute_val.strip()
-     return_val = ""
-
-     ## if a year isn't provided simply return the empty string
-     if len(year_val) < 1:
-         return ""
-     else:
-         return_val = pad_value(year_val, 4)
-
-     if len(month_val) > 0:
-         return_val = return_val + "-" + pad_value(month_val)
-
-     ## we only days that have months assocated with them
-     if (len(month_val) > 0) and (len(day_val) > 0):
-         return_val = return_val + "-" + pad_value(day_val)
-
-     ## we only want times with months and days associated with them
-     if (len(month_val) > 0) and (len(day_val) > 0):
-         if (len(hour_val) > 0) and (len(minute_val) > 0):
-             return_val = return_val + "T" + pad_value(hour_val) + ":" + minute_val
-         elif len(hour_val) > 0:
-             return_val = (
-                 return_val + "T" + pad_value(hour_val) + "00"
-             ) # case for when no minute val is given
-
-     return return_val
-
-
- def make_lat_lon(latitude, longitude):
-     # latitude = "" if pds.isnull(latitude) else str(latitude).strip().replace('\n', '')
-     # longitude = "" if pds.isnull(longitude) else str(longitude).strip().replace('\n', '')
-     latitude = None if pds.isnull(latitude) else float(latitude)
-     longitude = None if pds.isnull(longitude) else float(longitude)
-
-     if (not (latitude is None)) and (not (longitude is None)):
-         return f"{latitude} {longitude}".strip()
-     else:
-         return None
-
-
- def make_study_dataframe(study_table, contact_table, proposals_table, result_cols=[]):
-     ## subset dataframes
-     contact_table_splice = contact_table[
-         ["contact_id", "principal_investigator_name"]
-     ].copy()
-     proposals_table_splice = proposals_table[["gold_study", "doi"]].copy()
-
-     ## make sure the contact ids are strings with the ".0" removed from the end (i.e., the strings aren't floats)
-     study_table["contact_id"] = (
-         study_table["contact_id"].astype(str).replace("\.0", "", regex=True)
-     )
-     contact_table_splice["contact_id"] = (
-         contact_table_splice["contact_id"].astype(str).replace("\.0", "", regex=True)
-     )
-     # print(study_table[['contact_id', 'principal_investigator_name']].head())
-
-     ## left join data from contact
-     temp1_df = pds.merge(study_table, contact_table_splice, how="left", on="contact_id")
-
-     ## left join data from proposals
-     temp2_df = pds.merge(
-         temp1_df,
-         proposals_table_splice,
-         how="left",
-         left_on="gold_id",
-         right_on="gold_study",
-     )
-
-     ## add prefix
-     temp2_df.gold_id = "gold:" + temp2_df.gold_id
-     temp2_df.gold_study = "gold:" + temp2_df.gold_study
-
-     if len(result_cols) > 0:
-         return temp2_df[result_cols]
-     else:
-         return temp2_df
-
-
- def make_project_dataframe(
-     project_table,
-     study_table,
-     contact_table,
-     data_object_table,
-     project_biosample_table=None,
-     biosample_table=None,
-     result_cols=[],
- ):
-     ## subset data
-     study_table_splice = study_table[["study_id", "gold_id"]].copy()
-     contact_table_splice = contact_table[
-         ["contact_id", "principal_investigator_name"]
-     ].copy()
-
-     ## rename study.gold_id to study_gold_id
-     study_table_splice.rename(columns={"gold_id": "study_gold_id"}, inplace=True)
-
-     ## inner join on study (project must be part of study)
-     temp1_df = pds.merge(
-         project_table,
-         study_table_splice,
-         how="inner",
-         left_on="master_study_id",
-         right_on="study_id",
-     )
-
-     ## left join contact data
-     temp2_df = pds.merge(
-         temp1_df,
-         contact_table_splice,
-         how="left",
-         left_on="pi_id",
-         right_on="contact_id",
-     )
-
-     ## add prefix
-     temp2_df.gold_id = "gold:" + temp2_df.gold_id
-     temp2_df.study_gold_id = "gold:" + temp2_df.study_gold_id
-
-     ## if present join data objects as output of project
-     if not (data_object_table is None):
-         ## make copy and add prefix
-         data_object_table = data_object_table.copy()
-         data_object_table.gold_project_id = data_object_table.gold_project_id.map(
-             lambda x: x if "gold:" == x[0:5] else "gold:" + x
-         )
-
-         ## create a group concat for all file ids in the data objects
-         groups = data_object_table.groupby("gold_project_id")["file_id"]
-         output_files = (
-             pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x))))
-             .drop_duplicates()
-             .reset_index()
-         )
-         output_files.rename(columns={"file_id": "output_file_ids"}, inplace=True)
-         output_files["output_file_ids"] = output_files["output_file_ids"].astype(str)
-
-         ## left join output files for projects
-         temp2_df = pds.merge(
-             temp2_df,
-             output_files,
-             how="left",
-             left_on="gold_id",
-             right_on="gold_project_id",
-         )
-
-     ## if present join biosamples as inputs to project
-     if (not (project_biosample_table is None)) and (not (biosample_table is None)):
-         ## make local copies & rename column
-         project_biosample_table = project_biosample_table.copy()
-         biosample_table = biosample_table[["biosample_id", "gold_id"]].copy()
-         biosample_table.rename(columns={"gold_id": "biosample_gold_id"}, inplace=True)
-
-         ## add prefix
-         biosample_table["biosample_gold_id"] = biosample_table["biosample_gold_id"].map(
-             lambda x: x if "gold:" == x[0:5] else "gold:" + x
-         )
-
-         ## join project biosamples to biosamples
-         input_samples = pds.merge(
-             project_biosample_table, biosample_table, how="inner", on="biosample_id"
-         )
-         # require input samples (i.e., inner join)
-         temp2_df = pds.merge(temp2_df, input_samples, how="inner", on="project_id")
-
-     if len(result_cols) > 0:
-         return temp2_df[result_cols]
-     else:
-         return temp2_df
-
-
- def make_biosample_dataframe(
-     biosample_table,
-     soil_package_table,
-     water_package_table,
-     project_biosample_table,
-     project_table,
-     study_table,
-     result_cols=[],
- ):
-     def make_collection_date_from_row(row):
-         def _format_date_part_value(val):
-             if pds.isnull(val):
-                 return ""
-
-             if type("") == type(val):
-                 if "." in val:
-                     return val[0 : val.find(".")].strip()
-                 else:
-                     return val.strip()
-             else:
-                 return str(int(val)).strip()
-
-         year_val = _format_date_part_value(row["sample_collection_year"])
-         month_val = _format_date_part_value(row["sample_collection_month"])
-         day_val = _format_date_part_value(row["sample_collection_day"])
-         hour_val = _format_date_part_value(row["sample_collection_hour"])
-         minute_val = _format_date_part_value(row["sample_collection_minute"])
-
-         return make_collection_date(year_val, month_val, day_val, hour_val, minute_val)
-
-     ## subset data
-     project_biosample_table_splice = project_biosample_table[
-         ["biosample_id", "project_id"]
-     ].copy()
-     project_table_splice = project_table[
-         ["project_id", "gold_id", "master_study_id"]
-     ].copy()
-     study_table_splice = study_table[["study_id", "gold_id"]].copy()
-
-     ## add prefix
-     project_table_splice.gold_id = "gold:" + project_table_splice.gold_id
-     study_table_splice.gold_id = "gold:" + study_table_splice.gold_id
-
-     ## rename columns
-     project_table_splice.rename(columns={"gold_id": "project_gold_id"}, inplace=True)
-     study_table_splice.rename(columns={"gold_id": "study_gold_id"}, inplace=True)
-
-     ## inner join projects and studies
-     project_table_splice = pds.merge(
-         project_table_splice,
-         study_table_splice,
-         how="inner",
-         left_on="master_study_id",
-         right_on="study_id",
-     )
-
-     ## drop biosample rows that don't have required fields
-     biosample_table = biosample_table[biosample_table["env_broad_scale"].notnull()]
-     biosample_table = biosample_table[biosample_table["env_local_scale"].notnull()]
-     biosample_table = biosample_table[biosample_table["env_medium"].notnull()]
-
-     ## left join package tables to biosample table
-     temp0_df = pds.merge(
-         biosample_table, soil_package_table, how="left", on="soil_package_id"
-     )
-     temp0_df = pds.merge(
-         temp0_df, water_package_table, how="left", on="water_package_id"
-     )
-
-     ## inner join on project_biosample and project; i.e., biosamples must be linked to project
-     temp1_df = pds.merge(
-         temp0_df, project_biosample_table_splice, how="inner", on="biosample_id"
-     )
-     temp2_df = pds.merge(temp1_df, project_table_splice, how="inner", on="project_id")
-
-     ## add collection date and lat_lon columns
-     temp2_df["collection_date"] = temp2_df.apply(
-         lambda row: make_collection_date_from_row(row), axis=1
-     )
-     temp2_df["lat_lon"] = temp2_df.apply(
-         lambda row: make_lat_lon(row.latitude, row.longitude), axis=1
-     )
-
-     ## convert latitude and longitute columns to floats
-     temp2_df["latitude"] = temp2_df["latitude"].map(
-         lambda x: None if pds.isnull(x) else float(x)
-     )
-     temp2_df["longitude"] = temp2_df["longitude"].map(
-         lambda x: None if pds.isnull(x) else float(x)
-     )
-
-     ## add gold prefix
-     temp2_df["gold_id"] = "gold:" + temp2_df["gold_id"]
-
-     ## biosample might belong to more than one project; so do the equivalent of a group_cat
-     ## see: https://queirozf.com/entries/pandas-dataframe-groupby-examples
-     ## see: https://stackoverflow.com/questions/18138693/replicating-group-concat-for-pandas-dataframe
-     groups = (
-         temp2_df.groupby("biosample_id")["project_gold_id"]
-         .apply(lambda pid: ",".join(filter(None, pid)))
-         .reset_index()
-     )
-     groups.rename(columns={"project_gold_id": "project_gold_ids"}, inplace=True)
-
-     # join concat groups to dataframe
-     temp3_df = pds.merge(temp2_df, groups, how="left", on="biosample_id")
-
-     ## A biosample may belong to multiple projects
-     # E.g. see biosample_id 247352 with gold_id "Gb0247352", belongs to projects 467278, 467306
-     ## So, remove uneeded columns & drop dups
-     temp3_df.drop(columns=["project_gold_id"], inplace=True)
-     temp3_df.drop(columns=["project_id"], inplace=True)
-     temp3_df.drop_duplicates(inplace=True)
-
-     ## for 'env_broad_scale', 'env_local_scale', 'env_medium' fields change 'ENVO_' to 'ENVO:'
-     # temp3_df['env_broad_scale'] = temp3_df
-     for idx in temp3_df.index:
-         if pds.notnull(temp3_df.loc[idx, "env_broad_scale"]):
-             temp3_df.loc[idx, "env_broad_scale"] = str(
-                 temp3_df.loc[idx, "env_broad_scale"]
-             ).replace("_", ":", 1)
-         if pds.notnull(temp3_df.loc[idx, "env_local_scale"]):
-             temp3_df.loc[idx, "env_local_scale"] = str(
-                 temp3_df.loc[idx, "env_local_scale"]
-             ).replace("_", ":", 1)
-         if pds.notnull(temp3_df.loc[idx, "env_medium"]):
-             temp3_df.loc[idx, "env_medium"] = str(
-                 temp3_df.loc[idx, "env_medium"]
-             ).replace("_", ":", 1)
-
-     if len(result_cols) > 0:
-         return temp3_df[result_cols]
-     else:
-         return temp3_df
-
-
- def make_jgi_emsl_dataframe(jgi_emsl_table, study_table, result_cols=[]):
-     ## subset data
-     study_table_splice = study_table[["study_id", "gold_id"]].copy()
-
-     ## inner join jgi-emsl data on study (must be part of study)
-     temp1_df = pds.merge(
-         jgi_emsl_table,
-         study_table_splice,
-         how="inner",
-         left_on="gold_study_id",
-         right_on="gold_id",
-     )
-
-     ## add prefix
-     temp1_df.gold_id = "gold:" + temp1_df.gold_id
-     temp1_df.gold_study_id = "gold:" + temp1_df.gold_study_id
-
-     if len(result_cols) > 0:
-         return temp1_df[result_cols]
-     else:
-         return temp1_df
-
-
- def make_emsl_dataframe(
-     emsl_table, jgi_emsl_table, study_table, emsl_biosample_table, result_cols=[]
- ):
-     ## subset data
-     study_table_splice = study_table[["study_id", "gold_id"]].copy()
-     jgi_emsl_table_splice = jgi_emsl_table[["gold_study_id", "emsl_proposal_id"]].copy()
-     biosample_slice = emsl_biosample_table[["dataset_id", "biosample_gold_id"]].copy()
-     biosample_slice["biosample_gold_id"] = (
-         "gold:" + biosample_slice["biosample_gold_id"]
-     ) # add prefix
-
-     ## inner join jgi-emsl data on study (must be part of study)
-     temp1_df = pds.merge(
-         jgi_emsl_table_splice,
-         study_table_splice,
-         how="inner",
-         left_on="gold_study_id",
-         right_on="gold_id",
-     )
-
-     ## inner join emsl data on jgi-emsl proposal ids
-     temp2_df = pds.merge(emsl_table, temp1_df, how="inner", on="emsl_proposal_id")
-
-     ## add data obect id column
-     temp2_df["data_object_id"] = "output_"
-     temp2_df["data_object_id"] = temp2_df["data_object_id"] + temp2_df[
-         "dataset_id"
-     ].map(
-         str
-     ) # build data object id
-
-     ## add data object name column
-     temp2_df["data_object_name"] = "output: "
-     temp2_df["data_object_name"] = temp2_df["data_object_name"] + temp2_df[
-         "dataset_name"
-     ].map(
-         str
-     ) # build data object id
-
-     ## group concat & join the biosample ids that are inputs to the omics process
-     ## With filter function as None, the function defaults to Identity function,
-     ## and each element in random_list is checked if it's true or not.
-     ## see https://www.programiz.com/python-programming/methods/built-in/filter
-     groups = biosample_slice.groupby("dataset_id")["biosample_gold_id"]
-     input_biosamples = (
-         pds.DataFrame(groups.apply(lambda x: ",".join(filter(None, x))))
-         .drop_duplicates()
-         .reset_index()
-     )
-
-     input_biosamples.reset_index(inplace=True) # make dataset_id a column
-     input_biosamples.rename(
-         columns={"biosample_gold_id": "biosample_gold_ids"}, inplace=True
-     ) # change column name
-
-     input_biosamples["biosample_gold_ids"] = input_biosamples[
-         "biosample_gold_ids"
-     ].astype(
-         str
-     ) # make sure biosample_ids are strings
-
-     temp2_df = pds.merge(temp2_df, input_biosamples, how="left", on="dataset_id")
-
-     ## add "emsl:TBD" id for missing biosamples
-     temp2_df["biosample_gold_ids"] = temp2_df["biosample_gold_ids"].map(
-         lambda x: "emsl:TBD" if pds.isnull(x) else x
-     )
-
-     ## add prefix
-     temp2_df.gold_id = "gold:" + temp2_df.gold_id
-     temp2_df.gold_study_id = "gold:" + temp2_df.gold_study_id
-     temp2_df.dataset_id = "emsl:" + temp2_df.dataset_id
-     temp2_df.data_object_id = "emsl:" + temp2_df.data_object_id
-
-     ## replace NaNs with None
-     temp2_df = temp2_df.where(pds.notnull(temp2_df), None)
-
-     ## drop duplicates
-     temp2_df.drop_duplicates(inplace=True)
-
-     if len(result_cols) > 0:
-         return temp2_df[result_cols]
-     else:
-         return temp2_df
-
-
- def make_data_objects_dataframe(
-     faa_table, fna_table, fastq_table, project_table, result_cols=[]
- ):
-     ## subset data
-     project_table_splice = project_table[["gold_id"]].copy()
-
-     ## copy tables
-     faa_df = faa_table.copy()
-     fna_df = fna_table.copy()
-     fastq_df = fastq_table.copy()
-
-     ## add prefixes for faa, fna, and fastq files
-     faa_df.file_id = "nmdc:" + faa_df.file_id
-     fna_df.file_id = "nmdc:" + fna_df.file_id
-     fastq_df.file_id = "jgi:" + fastq_df.file_id
-
-     ## merge tables
-     data_objects = pds.concat([faa_df, fna_df, fastq_df], axis=0)
-
-     ## inner joing data objects (e.g., faa, fna, fasq) to projects
-     temp1_df = pds.merge(
-         data_objects,
-         project_table_splice,
-         how="inner",
-         left_on="gold_project_id",
-         right_on="gold_id",
-     )
-
-     ## add prefix for gold
-     temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id
-     temp1_df.gold_id = "gold:" + temp1_df.gold_id
-
-     if len(result_cols) > 0:
-         return temp1_df[result_cols]
-     else:
-         return temp1_df[data_objects.columns]
-
-
- def make_jgi_fastq_dataframe(fastq_table, project_table, result_cols=[]):
-     ## subset data
-     project_table_splice = project_table[["gold_id"]].copy()
-
-     ## copy tables
-     fastq_df = fastq_table.copy()
-
-     ## add prefixes for fastq file id
-     fastq_df.file_id = "jgi:" + fastq_df.file_id
-
-     ## inner join to projects
-     temp1_df = pds.merge(
-         fastq_df,
-         project_table_splice,
-         how="inner",
-         left_on="gold_project_id",
-         right_on="gold_id",
-     )
-
-     ## add prefix for gold
-     temp1_df.gold_project_id = "gold:" + temp1_df.gold_project_id
-     temp1_df.gold_id = "gold:" + temp1_df.gold_id
-
-     if len(result_cols) > 0:
-         return temp1_df[result_cols]
-     else:
-         return temp1_df[fastq_df.columns]
-
-
- def make_dataframe_from_spec_file(data_spec_file, nrows=None):
-     def make_df_from_file(data_source, nrows):
-         file_type = data_source["file_type"]
-         fname = data_source["file_name"]
-
-         if "file_archive_name" in data_source.keys():
-             farchive = data_source["file_archive_name"]
-             df = make_dataframe(
-                 fname, file_archive_name=farchive, file_type=file_type, nrows=nrows
-             )
-         else:
-             df = make_dataframe(fname, file_type=file_type, nrows=nrows)
-
-         return df
-
-     def make_df(source, source_type="file_name"):
-         name = source[0]
-         data = source[1]
-         data_source = source[1]["data_source"]
-
-         if source_type not in data_source.keys():
-             return None
-
-         ## get data from file
-         if "file_name" in data_source.keys():
-             df = make_df_from_file(data_source, nrows=nrows)
-
-         ## add extra columns
-         if "append_columns" in data.keys():
-             for col in data["append_columns"]:
-                 df[col["name"]] = col["value"]
-
-         ## filter rows by specific values
-         if "filters" in data.keys():
-             for fltr in data["filters"]:
-                 if "include" in fltr:
-                     df = df[
-                         df[fltr["include"]["field"]].isin(fltr["include"]["values"])
-                     ]
-                 elif "exclude" in fltr:
-                     df = df[
-                         ~df[fltr["exclude"]["field"]].isin(fltr["exclude"]["values"])
-                     ]
-                 else:
-                     df = df[df[fltr["field"]].isin(fltr["values"])]
-
-         ## select a subset of the columns
-         if "subset_cols" in data.keys():
-             df = df[data["subset_cols"]]
-
-         ## rename columns
-         if "rename_slots" in data.keys():
-             for slot in data["rename_slots"]:
-                 df.rename(columns={slot["old_name"]: slot["new_name"]}, inplace=True)
-
-         ## add 'nmdc_record_id' as a primary key
-         if "id_key" in data.keys():
-             df["nmdc_record_id"] = df[data["id_key"]]
-             df["nmdc_record_id"] = df["nmdc_record_id"].astype(
-                 str
-             ) # ensure all keys are strings
-         else:
-             df.index.name = "nmdc_record_id" # rename the current index
-             df.reset_index(inplace=True) # turn index into a column
-             df["nmdc_record_id"] = df["nmdc_record_id"].astype(
-                 str
-             ) # ensure all keys are strings
-
-         return df
-
-     with open(data_spec_file, "r") as input_file:
-         # spec = DottedDict(yaml.load(input_file, Loader=Loader))
-         spec = yaml.load(input_file, Loader=Loader)
-
-     Data_source = namedtuple("Data_source", "data name")
-
-     dataframes = []
-     for source in spec["data_sources"].items():
-         df = make_df(source)
-         ds = Data_source(df, source[0])
-         dataframes.append(ds)
-         print(source[0], len(df))
-
-     merged_df = merge_dataframes(dataframes)
-     return merged_df
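For orientation, the sketch below shows roughly how the helpers deleted above were driven when they still shipped in the 2.6.0 wheel. It is illustrative only, not part of either package version: the input file names ("studies.tsv", "etl_data_sources.yaml") are hypothetical, and the nmdc_runtime.lib.nmdc_dataframes module no longer exists in 2.12.0.

    # Illustrative usage of the removed ETL dataframe helpers (run against the 2.6.0 wheel).
    from nmdc_runtime.lib.nmdc_dataframes import (
        make_dataframe,
        make_dataframe_from_spec_file,
    )

    # Load a TSV into a pandas dataframe with cleaned column names; "studies.tsv" is a made-up path.
    df = make_dataframe("studies.tsv", file_type="tsv", delimiter="\t", nrows=10)

    # Record-oriented dicts, the same shape make_dataframe_dictionary() returned.
    records = df.to_dict(orient="records")

    # Build the merged EAV-style dataframe from a YAML spec of data sources (hypothetical file name).
    merged = make_dataframe_from_spec_file("etl_data_sources.yaml")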