cpgtools 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Files changed (94)
  1. cpgmodule/_version.py +1 -0
  2. cpgmodule/utils.py +35 -0
  3. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
  4. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
  5. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +1 -2
  6. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
  7. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
  8. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
  9. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
  10. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
  11. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
  12. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
  13. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
  14. cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
  15. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
  16. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
  17. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
  18. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
  19. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
  20. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
  21. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
  22. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
  23. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
  24. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
  25. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
  26. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
  27. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
  28. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
  29. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
  30. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
  31. {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +1 -1
  32. cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
  33. cpgtools-2.0.2.dist-info/LICENSE +19 -0
  34. cpgtools-2.0.2.dist-info/METADATA +76 -0
  35. cpgtools-2.0.2.dist-info/RECORD +82 -0
  36. {cpgtools-2.0.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
  37. cpgtools-2.0.2.dist-info/top_level.txt +3 -0
  38. impyute/__init__.py +3 -0
  39. impyute/contrib/__init__.py +7 -0
  40. impyute/contrib/compare.py +69 -0
  41. impyute/contrib/count_missing.py +30 -0
  42. impyute/contrib/describe.py +63 -0
  43. impyute/cs/__init__.py +11 -0
  44. impyute/cs/buck_iterative.py +82 -0
  45. impyute/cs/central_tendency.py +84 -0
  46. impyute/cs/em.py +52 -0
  47. impyute/cs/fast_knn.py +130 -0
  48. impyute/cs/random.py +27 -0
  49. impyute/dataset/__init__.py +6 -0
  50. impyute/dataset/base.py +137 -0
  51. impyute/dataset/corrupt.py +55 -0
  52. impyute/deletion/__init__.py +5 -0
  53. impyute/deletion/complete_case.py +21 -0
  54. impyute/ops/__init__.py +12 -0
  55. impyute/ops/error.py +9 -0
  56. impyute/ops/inverse_distance_weighting.py +31 -0
  57. impyute/ops/matrix.py +47 -0
  58. impyute/ops/testing.py +20 -0
  59. impyute/ops/util.py +76 -0
  60. impyute/ops/wrapper.py +179 -0
  61. impyute/ts/__init__.py +6 -0
  62. impyute/ts/locf.py +57 -0
  63. impyute/ts/moving_window.py +128 -0
  64. missingpy/__init__.py +4 -0
  65. missingpy/knnimpute.py +328 -0
  66. missingpy/missforest.py +556 -0
  67. missingpy/pairwise_external.py +315 -0
  68. missingpy/tests/__init__.py +0 -0
  69. missingpy/tests/test_knnimpute.py +605 -0
  70. missingpy/tests/test_missforest.py +409 -0
  71. missingpy/utils.py +124 -0
  72. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  73. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  74. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  75. cpgmodule/data/GA_Bohlin.pkl +0 -0
  76. cpgmodule/data/GA_Haftorn.pkl +0 -0
  77. cpgmodule/data/GA_Knight.pkl +0 -0
  78. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  79. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  80. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  81. cpgmodule/data/GA_Mayne.pkl +0 -0
  82. cpgmodule/data/Hannum.pkl +0 -0
  83. cpgmodule/data/Horvath_2013.pkl +0 -0
  84. cpgmodule/data/Horvath_2018.pkl +0 -0
  85. cpgmodule/data/Levine.pkl +0 -0
  86. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  87. cpgmodule/data/Ped_McEwen.pkl +0 -0
  88. cpgmodule/data/Ped_Wu.pkl +0 -0
  89. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  90. cpgmodule/data/Zhang_EN.pkl +0 -0
  91. cpgtools-2.0.0.dist-info/LICENSE.txt +0 -674
  92. cpgtools-2.0.0.dist-info/METADATA +0 -28
  93. cpgtools-2.0.0.dist-info/RECORD +0 -64
  94. cpgtools-2.0.0.dist-info/top_level.txt +0 -2
@@ -0,0 +1,604 @@
+ #!python
+ # -*- coding: utf-8 -*-
+ """
+ Created on Fri Sep 20 21:14:40 2024
+
+ @author: m102324
+ """
+
+ import sys
+ import logging
+ import argparse
+ import pandas as pd
+ import numpy as np
+ from sklearn.impute import KNNImputer
+ #from sklearn.experimental import enable_iterative_imputer
+ #from sklearn.impute import IterativeImputer
+
+ from impyute.cs.fast_knn import fast_knn
+ from impyute.cs.em import em
+ from impyute.ops.util import toy_df, insert_na
+ from impyute.cs.random import random_impute
+ from impyute.cs.buck_iterative import buck_iterative
+ from missingpy import MissForest
+
+ # use pip to install fancyimpute
+ from fancyimpute import NuclearNormMinimization, SoftImpute, BiScaler
+
+ from cpgmodule._version import __version__
+
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "MIT"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+ logging.basicConfig(
+     format="%(asctime)s [%(levelname)s] %(message)s",
+     datefmt='%Y-%m-%d %I:%M:%S', level=logging.DEBUG)
+
+
+ def read_df(infile):
+     logging.info("Read input file \"%s\" as a data frame ..." % infile)
+     df = pd.read_csv(infile, index_col=0, sep=None, engine='python')
+     return df
+
+
+ def nafiller():
+     """
+     Generic and specific methods for DNA methylation missing-value imputation.
+     """
+
+     general_help = "DNA methylation data imputation package"
+
+     # sub-commands and their help messages.
+     commands = {
+         'DropNA': "Drop rows or columns with NAs.",
+         'FillValue': "Replace missing values with a given value.",
+         'FillMean': "Impute missing values with row-wise or column-wise means.",
+         'FillMedian': "Impute missing values with row-wise or column-wise medians.",
+         'FillMin': "Impute missing values with row-wise or column-wise minimum values.",
+         'FillMax': "Impute missing values with row-wise or column-wise maximum values.",
+         'FillRand': "Impute missing values with randomly selected values from the same row or column.",
+         'FillRef': "Impute missing values using values from an external reference dataset.",
+         'KNN': "Impute missing values using scikit-learn's KNNImputer. Note: slow for large datasets.",
+         'fKNN': "Impute missing values using Impyute's fast KNN (fKNN) method.",
+         'EM': "Impute missing values using the Expectation-Maximization (EM) algorithm.",
+         'Buck': "Impute missing values using Buck's method.",
+         'NNM': "Impute missing values using Nuclear Norm Minimization (NNM).",
+         'SoftImpute': "Impute missing values by iterative soft thresholding of SVD decompositions.",
+         'RF': "Impute missing values using Random Forest (RF) prediction.",
+         'ToyDf': "Generate a toy dataframe with specified missing values for testing.",
+         'InsertNA': "Insert n missing values into an existing dataframe.",
+     }
+     # create the top-level parser
+     parser = argparse.ArgumentParser(
+         description=general_help, epilog='',
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+     parser.add_argument(
+         '-v', '--version', action='version',
+         version='%s %s' % ('nafiller', __version__)
+     )
+
+     # create the sub-parsers
+     sub_parsers = parser.add_subparsers(help='Sub-command description:')
+
+     DropNA_parser = sub_parsers.add_parser('DropNA', help=commands['DropNA'])
+     FillValue_parser = sub_parsers.add_parser('FillValue', help=commands['FillValue'])
+     FillMean_parser = sub_parsers.add_parser('FillMean', help=commands['FillMean'])
+     FillMedian_parser = sub_parsers.add_parser('FillMedian', help=commands['FillMedian'])
+     FillMin_parser = sub_parsers.add_parser('FillMin', help=commands['FillMin'])
+     FillMax_parser = sub_parsers.add_parser('FillMax', help=commands['FillMax'])
+     FillRand_parser = sub_parsers.add_parser('FillRand', help=commands['FillRand'])
+     FillRef_parser = sub_parsers.add_parser('FillRef', help=commands['FillRef'])
+     KNN_parser = sub_parsers.add_parser('KNN', help=commands['KNN'])
+     fKNN_parser = sub_parsers.add_parser('fKNN', help=commands['fKNN'])
+     EM_parser = sub_parsers.add_parser('EM', help=commands['EM'])
+     Buck_parser = sub_parsers.add_parser('Buck', help=commands['Buck'])
+     NNM_parser = sub_parsers.add_parser('NNM', help=commands['NNM'])
+     SoftImpute_parser = sub_parsers.add_parser('SoftImpute', help=commands['SoftImpute'])
+     RF_parser = sub_parsers.add_parser('RF', help=commands['RF'])
+     ToyDf_parser = sub_parsers.add_parser('ToyDf', help=commands['ToyDf'])
+     InsertNA_parser = sub_parsers.add_parser('InsertNA', help=commands['InsertNA'])
+
+     DropNA_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     DropNA_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     DropNA_parser.add_argument(
+         '-a', '--axis', type=int, choices=range(2), default=0,
+         help="0: drop rows with any missing values; 1: drop columns with any missing values. Default: 0")
+     DropNA_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     FillValue_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     FillValue_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     FillValue_parser.add_argument(
+         '-s', '--score', type=float, default=0.0,
+         help="The value used to fill all NAs.")
+     FillValue_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     FillMean_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     FillMean_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     FillMean_parser.add_argument(
+         '-a', '--axis', type=int, choices=range(2), default=1,
+         help="0 means column, 1 means row. Default: fill NAs with row means.")
+     FillMean_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     FillMedian_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     FillMedian_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     FillMedian_parser.add_argument(
+         '-a', '--axis', type=int, choices=range(2), default=1,
+         help="0 means column, 1 means row. Default: fill NAs with row medians.")
+     FillMedian_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     FillMin_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     FillMin_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     FillMin_parser.add_argument(
+         '-a', '--axis', type=int, choices=range(2), default=1,
+         help="0 means column, 1 means row. Default: fill NAs with the minimum value of the rows.")
+     FillMin_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     FillMax_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     FillMax_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     FillMax_parser.add_argument(
+         '-a', '--axis', type=int, choices=range(2), default=1,
+         help="0 means column, 1 means row. Default: fill NAs with the maximum value of the rows.")
+     FillMax_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     FillRand_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     FillRand_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     FillRand_parser.add_argument(
+         '-a', '--axis', type=int, choices=range(2), default=1,
+         help="0 means column, 1 means row. Default: fill NAs with values randomly selected from rows.")
+     FillRand_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     FillRef_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     FillRef_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     FillRef_parser.add_argument(
+         '-r', '--ref', type=str,
+         help="File name of the external reference.")
+     FillRef_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     KNN_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     KNN_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     KNN_parser.add_argument(
+         '-k', '--neighbours', type=int,
+         help="Number of neighboring samples to use for imputation. If k is None, k = sqrt(n) where n is the total number of samples.")
+     KNN_parser.add_argument(
+         '-w', '--weightfunction', type=str,
+         choices=['uniform', 'distance'], default='distance',
+         help="Weight function used in prediction.")
+     KNN_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     fKNN_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     fKNN_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     fKNN_parser.add_argument(
+         '-k', '--neighbours', type=int,
+         help="Number of neighboring samples to use for imputation. If k is None, k = sqrt(n) where n is the total number of samples.")
+     fKNN_parser.add_argument(
+         '--eps', type=float, default=0,
+         help="Refer to the docs of scipy.spatial.KDTree.query. Must be a non-negative float.")
+     fKNN_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     EM_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     EM_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     EM_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     Buck_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     Buck_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     Buck_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     NNM_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     NNM_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     NNM_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     SoftImpute_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     SoftImpute_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     SoftImpute_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     RF_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     RF_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     RF_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     ToyDf_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Name of the output data frame.")
+     ToyDf_parser.add_argument(
+         '-r', '--nrow', type=int, default=10,
+         help="Number of rows. default: %(default)s")
+     ToyDf_parser.add_argument(
+         '-c', '--ncol', type=int, default=10,
+         help="Number of columns. default: %(default)s")
+     ToyDf_parser.add_argument(
+         '--na', type=int, default=5,
+         help="Number of missing values inserted into the dataframe. default: %(default)s")
+     ToyDf_parser.add_argument(
+         '-s', '--seed', type=int, default=123,
+         help="Seed used to initialize the pseudorandom number generator. default: %(default)s")
+     ToyDf_parser.add_argument(
+         '--prefix', type=str, default='s',
+         help="Prefix of the column names; a series of numbers will be appended to the prefix. default: %(default)s")
+     ToyDf_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     InsertNA_parser.add_argument(
+         'input', type=str, metavar='input_df',
+         help="Input data frame.")
+     InsertNA_parser.add_argument(
+         'output', type=str, metavar='out_df',
+         help="Output data frame.")
+     InsertNA_parser.add_argument(
+         '--na', type=int,
+         help="Number of missing values inserted into the dataframe.")
+     InsertNA_parser.add_argument(
+         '-s', '--seed', type=int, default=123,
+         help="Seed used to initialize the pseudorandom number generator. default: %(default)s")
+     InsertNA_parser.add_argument(
+         '--decimal', type=int, default=5,
+         help="Number of decimal places to round each column to. default: %(default)s")
+
+     axis_name = {0: 'columns', 1: 'rows'}
+     args = parser.parse_args()
+     if len(sys.argv) == 1:
+         parser.print_help(sys.stderr)
+         sys.exit(0)
+     elif len(sys.argv) >= 2:
+         command = sys.argv[1]
+         if command.lower() == 'dropna':
+             input_df = read_df(args.input)
+             # reset: for DropNA, axis=0 drops rows and axis=1 drops columns
+             axis_name = {0: 'rows', 1: 'columns'}
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Remove %s with any missing values ..." % axis_name[args.axis])
+             output_df = input_df.dropna(axis=args.axis, how='any').round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fillvalue':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values with \"%f\" ..." % args.score)
+             output_df = input_df.fillna(args.score).round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fillmean':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             if args.axis == 0:
+                 logging.info("Replace missing values with COLUMN means ...")
+                 output_df = input_df.fillna(input_df.mean())
+             elif args.axis == 1:
+                 logging.info("Replace missing values with ROW means ...")
+                 input_df = input_df.T
+                 output_df = input_df.fillna(input_df.mean()).T
+             else:
+                 logging.info("Unknown parameter.")
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fillmedian':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             if args.axis == 0:
+                 logging.info("Replace missing values with COLUMN medians ...")
+                 output_df = input_df.fillna(input_df.median())
+             elif args.axis == 1:
+                 logging.info("Replace missing values with ROW medians ...")
+                 input_df = input_df.T
+                 output_df = input_df.fillna(input_df.median()).T
+             else:
+                 logging.info("Unknown parameter.")
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fillmin':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             if args.axis == 0:
+                 logging.info("Replace missing values with COLUMN minimums ...")
+                 output_df = input_df.fillna(input_df.min())
+             elif args.axis == 1:
+                 logging.info("Replace missing values with ROW minimums ...")
+                 input_df = input_df.T
+                 output_df = input_df.fillna(input_df.min()).T
+             else:
+                 logging.info("Unknown parameter.")
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fillmax':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             if args.axis == 0:
+                 logging.info("Replace missing values with COLUMN maximums ...")
+                 output_df = input_df.fillna(input_df.max())
+             elif args.axis == 1:
+                 logging.info("Replace missing values with ROW maximums ...")
+                 input_df = input_df.T
+                 output_df = input_df.fillna(input_df.max()).T
+             else:
+                 logging.info("Unknown parameter.")
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fillrand':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             if args.axis == 0:
+                 logging.info("Replace missing values with values randomly selected from the same column ...")
+                 output_df = random_impute(input_df)
+             elif args.axis == 1:
+                 logging.info("Replace missing values with values randomly selected from the same row ...")
+                 input_df = input_df.T
+                 output_df = random_impute(input_df).T
+             else:
+                 logging.info("Unknown parameter.")
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fillref':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+
+             input_df.index = input_df.index.map(str)
+             external_ref = {}
+             logging.info("Read external reference file \"%s\" ..." % args.ref)
+             for l in open(args.ref, 'r'):
+                 f = l.strip().split()
+                 try:
+                     external_ref[f[0]] = float(f[1])
+                 except ValueError:
+                     pass
+             logging.info("Replace missing values with values from external reference ...")
+             for ID, betas in input_df.iterrows():
+                 if betas.isnull().values.any() and ID in external_ref:
+                     input_df.loc[ID, :] = betas.fillna(external_ref[ID])
+             output_df = input_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'knn':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values using scikit-learn's KNNImputer ...")
+             if args.neighbours is None:
+                 nb = int(input_df.shape[1]**0.5)
+             else:
+                 nb = args.neighbours
+             imputer = KNNImputer(n_neighbors=nb, weights=args.weightfunction)
+             after = imputer.fit_transform(input_df)
+             output_df = pd.DataFrame(after, index=input_df.index,
+                                      columns=input_df.columns)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'fknn':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values using Impyute's fast KNN (fKNN) method ...")
+             if args.neighbours is None:
+                 nb = int(input_df.shape[1]**0.5)
+             else:
+                 nb = args.neighbours
+             output_df = fast_knn(input_df, k=nb, eps=args.eps, p=2,
+                                  distance_upper_bound=np.inf, leafsize=10)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'em':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values using the EM algorithm ...")
+             output_df = em(input_df)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'buck':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values using Buck's method ...")
+             output_df = buck_iterative(input_df)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'nnm':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values using nuclear norm minimization ...")
+             X_filled = NuclearNormMinimization().fit_transform(input_df.to_numpy())
+             output_df = pd.DataFrame(X_filled, index=input_df.index, columns=input_df.columns)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'softimpute':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values using SoftImpute ...")
+             X_filled = SoftImpute().fit_transform(input_df.to_numpy())
+             output_df = pd.DataFrame(X_filled, index=input_df.index, columns=input_df.columns)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'rf':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Replace missing values using Random Forest ...")
+             imputer = MissForest()
+             X_filled = imputer.fit_transform(input_df.to_numpy())
+             output_df = pd.DataFrame(X_filled, index=input_df.index, columns=input_df.columns)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'toydf':
+             logging.info("Generate toy dataframe ...")
+             output_df = toy_df(nrow=args.nrow, ncol=args.ncol,
+                                n_miss=args.na, sample_prefix=args.prefix,
+                                seed=args.seed)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         elif command.lower() == 'insertna':
+             input_df = read_df(args.input)
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.input, input_df.isna().sum().sum()))
+             logging.info("Insert %d NAs into dataframe ..." % args.na)
+             output_df = insert_na(df=input_df, n_miss=args.na, seed=args.seed)
+             output_df = output_df.round(args.decimal)
+             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+             logging.info("File \"%s\" contains %d missing values ..." %
+                          (args.output, output_df.isna().sum().sum()))
+         else:
+             print("Unknown command!")
+             parser.print_help(sys.stderr)
+             sys.exit(0)
+
+
+ if __name__ == '__main__':
+     nafiller()
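
For context, the KNN sub-command above reduces to the following minimal sketch (the file names, the tab delimiter, and the k value are illustrative assumptions; the script itself auto-detects the delimiter via sep=None):

    import pandas as pd
    from sklearn.impute import KNNImputer

    # Beta-value matrix: CpG probes as rows, samples as columns (hypothetical file).
    df = pd.read_csv("betas.tsv", index_col=0, sep="\t")

    # As in the script, k defaults to sqrt(number of samples) when not supplied.
    k = int(df.shape[1] ** 0.5)
    imputer = KNNImputer(n_neighbors=k, weights="distance")
    imputed = pd.DataFrame(imputer.fit_transform(df), index=df.index, columns=df.columns)
    imputed.round(5).to_csv("betas_imputed.tsv", sep="\t", na_rep="NaN")

Invoked through the packaged script, the equivalent call would be beta_imputation.py KNN input.tsv output.tsv -k 5 (file names hypothetical).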
@@ -26,6 +26,7 @@ import sys,os
  import collections
  import subprocess
  import numpy as np
+ from cpgmodule._version import __version__
  from optparse import OptionParser
  from cpgmodule import ireader
  from cpgmodule.utils import *
@@ -36,7 +37,6 @@ __author__ = "Liguo Wang"
  __copyright__ = "Copyleft"
  __credits__ = []
  __license__ = "GPL"
- __version__="2.0.0"
  __maintainer__ = "Liguo Wang"
  __email__ = "wang.liguo@mayo.edu"
  __status__ = "Development"
@@ -18,6 +18,7 @@ import sys,os
  import collections
  import numpy as np
  from scipy import stats
+ from cpgmodule._version import __version__
  from optparse import OptionParser
  from cpgmodule import ireader
  from cpgmodule.utils import *
@@ -26,7 +27,6 @@ __author__ = "Liguo Wang"
  __copyright__ = "Copyleft"
  __credits__ = []
  __license__ = "GPL"
- __version__="2.0.0"
  __maintainer__ = "Liguo Wang"
  __email__ = "wang.liguo@mayo.edu"
  __status__ = "Development"
@@ -21,6 +21,7 @@ import collections
  import subprocess
  import numpy as np
  from optparse import OptionParser
+ from cpgmodule._version import __version__
  from cpgmodule import ireader
  from cpgmodule.utils import *
  from cpgmodule import BED
@@ -29,7 +30,6 @@ __author__ = "Liguo Wang"
  __copyright__ = "Copyleft"
  __credits__ = []
  __license__ = "GPL"
- __version__="2.0.0"
  __maintainer__ = "Liguo Wang"
  __email__ = "wang.liguo@mayo.edu"
  __status__ = "Development"
@@ -25,6 +25,7 @@ import collections
  import subprocess
  import numpy as np
  from optparse import OptionParser
+ from cpgmodule._version import __version__
  from cpgmodule import ireader
  from cpgmodule.utils import *
  from cpgmodule import BED
@@ -33,7 +34,6 @@ __author__ = "Liguo Wang"
  __copyright__ = "Copyleft"
  __credits__ = []
  __license__ = "GPL"
- __version__="2.0.0"
  __maintainer__ = "Liguo Wang"
  __email__ = "wang.liguo@mayo.edu"
  __status__ = "Development"
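
These hunks all apply the same refactor: each script drops its hard-coded __version__="2.0.0" assignment and imports the version from the new single-source module instead. Judging from the wheel version and the +1 line recorded for cpgmodule/_version.py in the file list above, that module presumably contains just:

    __version__ = "2.0.2"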