cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,673 @@
1
+ #!python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Fri Sep 20 21:14:40 2024
5
+
6
+ @author: m102324
7
+ """
8
+
9
+ import sys
10
+ import pandas as pd
11
+ import numpy as np
12
+ import logging
13
+ import argparse
14
+ from sklearn.impute import KNNImputer
15
+ #from sklearn.experimental import enable_iterative_imputer
16
+ #from sklearn.impute import IterativeImputer
17
+
18
+ from impyute.cs.fast_knn import fast_knn
19
+ from impyute.cs.em import em
20
+ from impyute.ops.util import toy_df,insert_na
21
+ from impyute.cs.random import random_impute
22
+ from impyute.cs.buck_iterative import buck_iterative
23
+ from impyute.ts.moving_window import moving_window
24
+ from missingpy import MissForest
25
+
26
+ #use pip to install fancyimpute
27
+ from fancyimpute import NuclearNormMinimization, SoftImpute, BiScaler
28
+
29
+ from cpgmodule._version import __version__
30
+
31
+
32
+ __author__ = "Liguo Wang"
33
+ __copyright__ = "Copyleft"
34
+ __credits__ = []
35
+ __license__ = "MIT"
36
+ __maintainer__ = "Liguo Wang"
37
+ __email__ = "wang.liguo@mayo.edu"
38
+ __status__ = "Development"
39
+
40
+ logging.basicConfig(
41
+ format="%(asctime)s [%(levelname)s] %(message)s",
42
+ datefmt='%Y-%m-%d %I:%M:%S', level=logging.DEBUG)
43
+
44
def read_df(infile):
    """Load a delimited text file into a pandas DataFrame.

    The first column becomes the row index; the field separator is
    auto-detected (``sep=None`` with the python engine), so both CSV and
    TSV inputs are accepted.
    """
    logging.info("Read input file \"%s\" as a data frame ..." % infile)
    return pd.read_csv(infile, index_col=0, sep=None, engine='python')
48
+
49
def _log_missing(fname, df):
    """Log the number of missing values in *df*, attributed to file *fname*."""
    logging.info("File \"%s\" contains %d missing values ..." %
                 (fname, df.isna().sum().sum()))


def _load_input(fname):
    """Read *fname* into a dataframe and log its missing-value count."""
    df = read_df(fname)
    _log_missing(fname, df)
    return df


def _save_output(df, fname, decimal):
    """Round *df* to *decimal* places, write it as TSV (NAs rendered as
    "NaN") and log the remaining missing-value count."""
    df = df.round(decimal)
    df.to_csv(fname, sep="\t", na_rep="NaN")
    _log_missing(fname, df)


def _fill_by_stat(df, axis, stat, word):
    """Fill NAs with a per-column (axis=0) or per-row (axis=1) statistic.

    *stat* is the name of a pandas reducer ('mean', 'median', 'min' or
    'max'); *word* is the plural noun used in the log message.
    """
    if axis == 0:
        logging.info("Replace missing values with COLUMN %s ..." % word)
        return df.fillna(getattr(df, stat)())
    elif axis == 1:
        logging.info("Replace missing values with ROW %s ..." % word)
        flipped = df.T
        return flipped.fillna(getattr(flipped, stat)()).T
    # Unreachable in practice: argparse restricts --axis to {0, 1}.
    # (The original FillMean branch was missing this else-guard and logged
    # "Unknown parameter." unconditionally.)
    logging.info("Unknown parameter.")
    return df


def _add_io(p, with_input=True):
    """Attach the positional input/output dataframe arguments to parser *p*."""
    if with_input:
        p.add_argument(
            'input', type=str, metavar='input_df',
            help="Input data frame.")
    p.add_argument(
        'output', type=str, metavar='out_df',
        help="Output data frame.")


def _add_decimal(p):
    """Attach the shared '--decimal' rounding option to parser *p*."""
    p.add_argument(
        '--decimal', type=int, default=5,
        help="Number of decimal places to round each column to. "
             "default: %(default)s")


def nafiller():
    """
    Generic and Specific Methods for DNA Methylation Missing Value Imputation.

    Command-line entry point: builds the argument parser, dispatches on the
    sub-command found in sys.argv[1], runs the selected imputation method and
    writes the (rounded) result to the requested output file.
    """

    general_help = "DNA methylation data imputation package"

    # Sub-commands and their one-line descriptions.
    # NOTE(review): 'KNN2' is documented here but no sub-parser or
    # implementation exists for it below — confirm whether it should be
    # added or dropped from the help text.
    commands = {
        'DropNA': "Drop rows or columns with NAs.",
        'FillValue': "Replace missing values with a given value.",
        'FillMean': "Impute missing values with row-wise or column-wise means..",
        'FillMedian': "Impute missing values with row-wise or column-wise medians.",
        'FillMin': "Impute missing values with row-wise or column-wise minimum values.",
        'FillMax': "Impute missing values with row-wise or column-wise maximum values",
        'FillRand': "Impute missing values with randomly selected values from "
                    "the same row or column.",
        'FillRef': "Impute missing values using values from an external "
                   "reference dataset.",
        'MW': "Interpolate the missing values with moving window.",
        'KNN': "Impute missing values using scikit-learn's KNNImputer function. "
               "Note: slow for large datasets.",
        'KNN2': "Impute missing values using KNN2",
        'fKNN': "Impute missing values using Impyute's fast KNN (fKNN) method.",
        'EM': "Impute missing values using the Expectation Maximization (EM) "
              "algorithm.",
        'Buck': "Impute missing values using Buck's method.",
        'NNM': "Impute missing values using Nuclear Norm Minimization (NNM).",
        'SoftImpute': "Impute missing values by iterative soft thresholding of SVD decompositions.",
        'RF': "Impute missing values using Random Forest (RF) prediction.",
        'ToyDf': "Generate a toy dataframe with specified missing values for testing.",
        'InsertNA': "Insert n missing values into an exist dataframe.",
    }

    parser = argparse.ArgumentParser(
        description=general_help, epilog='',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-v', '--version', action='version', version='%s %s' %
        ('nafiller', __version__)
    )

    sub_parsers = parser.add_subparsers(help='Sub-command description:')

    # ---- sub-parser definitions -----------------------------------------
    p = sub_parsers.add_parser('DropNA', help=commands['DropNA'])
    _add_io(p)
    p.add_argument(
        '--axis', type=int, choices=range(2), default=0,
        help="0 : drop rows with any missing values, 1 : drop columns with "
             "missing values. Default: 0")
    _add_decimal(p)

    p = sub_parsers.add_parser('FillValue', help=commands['FillValue'])
    _add_io(p)
    p.add_argument(
        '--score', type=float, default=0.0,
        help="The value uesd to fill all NAs.")
    _add_decimal(p)

    # FillMean/FillMedian/FillMin/FillMax/FillRand share the same shape and
    # differ only in the --axis help text.
    for name, axis_help in (
            ('FillMean',
             "0 means column, 1 means row. Default: fill NAs with row means"),
            ('FillMedian',
             "0 means column, 1 means row. Default: fill NAs with row medians"),
            ('FillMin',
             "0 means column, 1 means row. Default: fill NAs with the minimum value of the rows."),
            ('FillMax',
             "0 means column, 1 means row. Default: fill NAs with the maximum value of the rows."),
            ('FillRand',
             "0 means column, 1 means row. Default: fill NAs with values randomly selected from rows.")):
        p = sub_parsers.add_parser(name, help=commands[name])
        _add_io(p)
        p.add_argument(
            '--axis', type=int, choices=range(2), default=1, help=axis_help)
        _add_decimal(p)

    p = sub_parsers.add_parser('FillRef', help=commands['FillRef'])
    _add_io(p)
    p.add_argument(
        '--ref', type=str,
        help="File name of the external reference.")
    _add_decimal(p)

    p = sub_parsers.add_parser('MW', help=commands['MW'])
    _add_io(p)
    p.add_argument(
        '--nindex', type=int, choices=[0, -1, None], default=None,
        help="Null index. Index of the null value inside the moving average "
             "window. See impyute documentation for details. "
             "default: %(default)s")
    p.add_argument(
        '--wsize', type=int, default=5,
        help="Size of the moving average window/area of values being used "
             "for each local imputation. This number includes the missing "
             "value. default: %(default)s")
    p.add_argument(
        '--errors', type=str, choices=["raise", "coerce", "ignore"],
        default='coerce',
        help="Errors will occur with the indexing of the windows - for "
             "example if there is a nan at data[x][0] and `nindex` is set to "
             "-1 or there is a nan at data[x][-1] and `nindex` is set to 0. "
             "`'raise' will raise an error, `coerce` will try again using an "
             "nindex set to the middle and `ignore` will just leave it as a "
             "nan default: %(default)s")
    p.add_argument(
        '--func', type=str, choices=["mean", "median"],
        default='mean',
        help="Function to summerzie values within the moving window. "
             "default: %(default)s")
    p.add_argument(
        '--axis', type=int, choices=range(2), default=1,
        help="0 means column, 1 means row. Default: fill missing value "
             "with windows moving on rows.")
    _add_decimal(p)

    p = sub_parsers.add_parser('KNN', help=commands['KNN'])
    _add_io(p)
    p.add_argument(
        '-k', '--neighbours', type=int,
        help="Number of neighboring samples to use for imputation. If k is "
             "None, k = sqrt(n) where n is the \"total number of samples\".")
    p.add_argument(
        '-w', '--weightfunction', type=str,
        choices=['uniform', 'distance'],
        default='distance',
        help="Weight function used in predictionaction.")
    _add_decimal(p)

    p = sub_parsers.add_parser('fKNN', help=commands['fKNN'])
    _add_io(p)
    p.add_argument(
        '-k', '--neighbours', type=int,
        help="Number of neighboring samples to use for imputation. If k is "
             "None, k = sqrt(n) where n is the \"total number of samples\".")
    p.add_argument(
        '--eps', type=float, default=0,
        help="Refer to the docs for [`scipy.spatial.KDTree.query`]. Must be "
             "non-negative float number.")
    _add_decimal(p)

    # EM/Buck/NNM/SoftImpute/RF take only input/output/--decimal.
    for name in ('EM', 'Buck', 'NNM', 'SoftImpute', 'RF'):
        p = sub_parsers.add_parser(name, help=commands[name])
        _add_io(p)
        _add_decimal(p)

    p = sub_parsers.add_parser('ToyDf', help=commands['ToyDf'])
    p.add_argument(
        'output', type=str, metavar='out_df',
        help="Name of the output data frame.")
    p.add_argument(
        '--nrow', type=int, default=10,
        help="Number of rows. default: %(default)s")
    p.add_argument(
        '--ncol', type=int, default=10,
        help="Number of columns. default: %(default)s")
    p.add_argument(
        '--nmiss', type=float, default=5,
        help="Number of missing values ingested into the dataframe. default: %(default)s")
    p.add_argument(
        '--seed', type=int, default=123,
        help="Seed used to initialize a pseudorandom number generator. default: %(default)s")
    p.add_argument(
        '--prefix', type=str, default='s',
        help="Prefix of the column names, a series numbers will be appended "
             "to the prefix. If this is set to None, as np.ndarray rather "
             "than pd.dataframe will be returned. default: %(default)s")
    p.add_argument(
        '--min', type=float, default=0.0,
        help="The minimum value. default: %(default)s")
    p.add_argument(
        '--max', type=float, default=1.0,
        help="The maximum value. default: %(default)s")
    _add_decimal(p)

    p = sub_parsers.add_parser('InsertNA', help=commands['InsertNA'])
    _add_io(p)
    p.add_argument(
        '--nmiss', type=int,
        help="Number of missing values ingested into the dataframe.")
    p.add_argument(
        '--seed', type=int, default=123,
        help="Seed used to initialize a pseudorandom number generator. default: %(default)s")
    _add_decimal(p)

    # ---- dispatch --------------------------------------------------------
    axis_name = {0: 'columns', 1: 'rows'}   # used by the 'MW' log message
    args = parser.parse_args()
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(0)

    command = sys.argv[1].lower()
    if command == 'dropna':
        input_df = _load_input(args.input)
        # For DataFrame.dropna, axis=0 removes rows and axis=1 removes columns.
        drop_name = {0: 'rows', 1: 'columns'}
        logging.info("Remove %s with any missing values ..." % drop_name[args.axis])
        _save_output(input_df.dropna(axis=args.axis, how='any'),
                     args.output, args.decimal)
    elif command == 'fillvalue':
        input_df = _load_input(args.input)
        logging.info("Replace missing values with \"%f\" ..." % args.score)
        _save_output(input_df.fillna(args.score), args.output, args.decimal)
    elif command == 'fillmean':
        input_df = _load_input(args.input)
        _save_output(_fill_by_stat(input_df, args.axis, 'mean', 'means'),
                     args.output, args.decimal)
    elif command == 'fillmedian':
        input_df = _load_input(args.input)
        _save_output(_fill_by_stat(input_df, args.axis, 'median', 'medians'),
                     args.output, args.decimal)
    elif command == 'fillmin':
        input_df = _load_input(args.input)
        _save_output(_fill_by_stat(input_df, args.axis, 'min', 'minimums'),
                     args.output, args.decimal)
    elif command == 'fillmax':
        input_df = _load_input(args.input)
        _save_output(_fill_by_stat(input_df, args.axis, 'max', 'maximums'),
                     args.output, args.decimal)
    elif command == 'fillrand':
        input_df = _load_input(args.input)
        if args.axis == 0:
            logging.info("Replace missing values with values randomly selected from the same column ...")
            output_df = random_impute(input_df)
        else:
            logging.info("Replace missing values with values randomly selected from the same row ...")
            output_df = random_impute(input_df.T).T
        _save_output(output_df, args.output, args.decimal)
    elif command == 'fillref':
        input_df = _load_input(args.input)
        input_df.index = input_df.index.map(str)
        # Build {probe_id: beta} from the two-column reference file.
        # Lines with a non-numeric or missing second field are skipped
        # (the original crashed with IndexError on blank lines and leaked
        # the file handle).
        external_ref = {}
        logging.info("Read external reference file \"%s\" ..." % args.ref)
        with open(args.ref, 'r') as ref_file:
            for line in ref_file:
                fields = line.strip().split()
                try:
                    external_ref[fields[0]] = float(fields[1])
                except (IndexError, ValueError):
                    pass
        logging.info("Replace missing values with values from external reference ...")
        for probe_id, betas in input_df.iterrows():
            if betas.isnull().values.any() and probe_id in external_ref:
                input_df.loc[probe_id, :] = betas.fillna(external_ref[probe_id])
        _save_output(input_df, args.output, args.decimal)
    elif command == 'mw':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using moving window on %s ..." % axis_name[args.axis])
        summarizer = getattr(np, args.func)   # np.mean or np.median
        if args.axis == 1:
            output_df = moving_window(
                input_df, nindex=args.nindex, wsize=args.wsize,
                errors=args.errors, func=summarizer)
        else:
            # Transpose so the window moves along columns, then restore.
            output_df = moving_window(
                input_df.T, nindex=args.nindex, wsize=args.wsize,
                errors=args.errors, func=summarizer).T
        _save_output(output_df, args.output, args.decimal)
    elif command == 'knn':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using scikit-learn's KNNImputer function ...")
        # Default k = sqrt(number of samples) when not given explicitly.
        nb = args.neighbours if args.neighbours is not None \
            else int(input_df.shape[1] ** 0.5)
        imputer = KNNImputer(n_neighbors=nb, weights=args.weightfunction)
        output_df = pd.DataFrame(imputer.fit_transform(input_df),
                                 index=input_df.index,
                                 columns=input_df.columns)
        _save_output(output_df, args.output, args.decimal)
    elif command == 'fknn':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using Impyute's fast KNN (fKNN) method ...")
        nb = args.neighbours if args.neighbours is not None \
            else int(input_df.shape[1] ** 0.5)
        output_df = fast_knn(input_df, k=nb, eps=args.eps, p=2,
                             distance_upper_bound=np.inf, leafsize=10)
        _save_output(output_df, args.output, args.decimal)
    elif command == 'em':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using EM algorithem ...")
        _save_output(em(input_df), args.output, args.decimal)
    elif command == 'buck':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using Buck's method ...")
        _save_output(buck_iterative(input_df), args.output, args.decimal)
    elif command == 'nnm':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using nuclear norm minimization ...")
        filled = NuclearNormMinimization().fit_transform(input_df.to_numpy())
        _save_output(pd.DataFrame(filled, index=input_df.index,
                                  columns=input_df.columns),
                     args.output, args.decimal)
    elif command == 'softimpute':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using SoftImpute ...")
        filled = SoftImpute().fit_transform(input_df.to_numpy())
        _save_output(pd.DataFrame(filled, index=input_df.index,
                                  columns=input_df.columns),
                     args.output, args.decimal)
    elif command == 'rf':
        input_df = _load_input(args.input)
        logging.info("Replace missing values using Random Forest ...")
        filled = MissForest().fit_transform(input_df.to_numpy())
        _save_output(pd.DataFrame(filled, index=input_df.index,
                                  columns=input_df.columns),
                     args.output, args.decimal)
    elif command == 'toydf':
        logging.info("Generate toy dataframe ...")
        output_df = toy_df(n_rows=args.nrow, n_cols=args.ncol,
                           missingness=args.nmiss, sample_prefix=args.prefix,
                           min_val=args.min, max_val=args.max,
                           rand_seed=args.seed)
        _save_output(output_df, args.output, args.decimal)
    elif command == 'insertna':
        input_df = _load_input(args.input)
        # BUG FIX: the original logged args.na, which does not exist
        # (AttributeError at runtime); the option is named --nmiss.
        logging.info("Insert %d NAs into dataframe ..." % args.nmiss)
        _save_output(insert_na(df=input_df, n_miss=args.nmiss, seed=args.seed),
                     args.output, args.decimal)
    else:
        print("Unknown command!")
        parser.print_help(sys.stderr)
        sys.exit(0)
672
# Allow this module to be executed directly as a command-line script.
if __name__=='__main__':
    nafiller()