cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,170 @@
1
+ import sys
2
+ from bx.intervals import *
3
+ import numpy as np
4
+ from cpgmodule import ireader
5
+
6
def getBasalDomains(bedfile, up = 5000, down = 1000, printit = False):
    '''
    Define each gene's basal regulatory domain.

    bedfile: BED file with at least 6 columns, one gene one TSS (could use the
        canonical (longest) isoform, or merge all isoforms into a super
        transcript). Lines are obtained from ireader.reader(bedfile).
    up: size of extension to upstream of TSS
    down: size of extension to downstream of TSS
    printit: if True, write each basal domain to stdout as a 6-column,
        tab-separated line (chrom, start, end, symbol, '0', strand).

    Returns a dict mapping chromosome name to an IntervalTree that holds one
    Interval per accepted gene (value = gene symbol, strand = gene strand).

    Lines starting with '#', 'track' or 'browser' are ignored. Records that
    cannot be parsed, have start > end, or carry a strand other than '+'/'-'
    are reported on stderr and skipped.
    '''
    basal_ranges = {}

    for l in ireader.reader(bedfile):
        # Skip comments and UCSC header lines.
        if l.startswith(('#', 'track', 'browser')):
            continue

        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            # IndexError: fewer than 6 columns; ValueError: non-integer coordinates.
            print ("Invalid BED record (need at least 6 columns with integer start and end). Skip: " + l, file=sys.stderr)
            continue

        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        # The tree is created before the strand check (as before), so every
        # chromosome seen in a parsable record has an entry in the result.
        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()

        if strand == '+':
            # TSS is the first base of the record (1-based: start + 1).
            tss = start + 1
            basal_st = tss - up
            basal_end = tss + down
        elif strand == '-':
            # On the minus strand the TSS is the record's end; up/down swap sides.
            tss = end
            basal_st = tss - down
            basal_end = tss + up
        else:
            # Without a strand the upstream direction is undefined; previously this
            # left basal_st/basal_end unset (or stale from the previous record).
            print ("Strand must be '+' or '-'. Skip: " + l, file=sys.stderr)
            continue

        # Coordinates cannot run off the beginning of the chromosome.
        basal_st = max(0, basal_st)
        basal_ranges[chrom].insert_interval(Interval(basal_st, basal_end, strand = strand, value = symbol))

        if printit:
            print('\t'.join([str(i) for i in (chrom, basal_st, basal_end, symbol, '0', strand)]), file = sys.stdout)
    return basal_ranges
53
+
54
def geteExtendedDomains(basal_ranges, bedfile, up = 5000, down = 1000, ext=1000000, printit = False):
    '''
    Define each gene's extended regulatory domain.

    basal_ranges: dict of chromosome name -> IntervalTree of basal domains, as
        returned by getBasalDomains().
    bedfile: BED file with at least 6 columns, one gene one TSS (could use the
        canonical (longest) isoform, or merge all isoforms into a super
        transcript). Lines are obtained from ireader.reader(bedfile).
    up: size of the basal domain upstream of TSS
    down: size of the basal domain downstream of TSS
    ext: maximum size of extension in one direction (default 1000Kb)
    printit: if True, write each domain to stdout as a 12-column, tab-separated
        line: chrom, domain start, domain end, symbol, '0', strand, basal start,
        basal end, '255,0,0', 1, domain length, 0.

    Two step process:
    1) Each gene is assigned a basal regulatory domain of a minimum distance
       upstream and downstream of the TSS (regardless of other nearby genes).
    2) The gene regulatory domain is extended in both directions to the nearest
       gene's basal domain but no more than the maximum extension in one
       direction. The domain always contains the gene's own basal domain.

    Returns a dict mapping chromosome name to an IntervalTree that holds one
    Interval per accepted gene (value = gene symbol, strand = gene strand).

    Lines starting with '#', 'track' or 'browser' are ignored. Records that
    cannot be parsed, have start > end, or carry a strand other than '+'/'-'
    are reported on stderr and skipped.
    '''
    domain_ranges = {}  # gene's regulatory domain range, keyed by chromosome

    for l in ireader.reader(bedfile):
        # Skip comments and UCSC header lines.
        if l.startswith(('#', 'track', 'browser')):
            continue

        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            # IndexError: fewer than 6 columns; ValueError: non-integer coordinates.
            # The record must be skipped here, otherwise the code below would run
            # on fields left over from the previous line.
            print ("Invalid BED record (need at least 6 columns with integer start and end). Skip: " + l, file=sys.stderr)
            continue

        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue

        if strand == '+':
            # TSS is the first base of the record (1-based: start + 1).
            tss = start + 1
            basal_st = tss - up
            basal_end = tss + down
        elif strand == '-':
            # On the minus strand the TSS is the record's end; up/down swap sides.
            tss = end
            basal_st = tss - down
            basal_end = tss + up
        else:
            # Without a strand the TSS and the upstream direction are undefined.
            print ("Strand must be '+' or '-'. Skip: " + l, file=sys.stderr)
            continue

        # Keep every coordinate on the chromosome. Clamping basal_st mirrors
        # getBasalDomains() and prevents the "domain must contain the basal
        # region" rule below from pulling the domain start below zero.
        basal_st = max(0, basal_st)
        extension_st = max(0, tss - ext)
        extension_end = tss + ext

        # A chromosome without any basal domain simply has no neighbours that
        # could truncate the extension.
        chrom_basal = basal_ranges.get(chrom)
        if chrom_basal is not None:
            # Upstream side: stop at the end of the closest basal domain lying
            # between the maximum extension and our own basal start.
            for o in chrom_basal.find(extension_st, basal_st):
                if o.end > extension_st:
                    extension_st = o.end

            # Downstream side: stop at the start of the closest basal domain
            # lying between our own basal end and the maximum extension.
            for o in chrom_basal.find(basal_end, extension_end):
                if o.start < extension_end:
                    extension_end = o.start

        # A neighbour overlapping our basal domain must not shrink the domain
        # below the basal region itself.
        extension_st = min(extension_st, basal_st)
        extension_end = max(extension_end, basal_end)

        # Create the per-chromosome tree on first use and ALWAYS insert; the
        # first gene of each chromosome used to be dropped from the result.
        if chrom not in domain_ranges:
            domain_ranges[chrom] = IntervalTree()
        domain_ranges[chrom].insert_interval(Interval(extension_st, extension_end, strand = strand, value=symbol))

        if printit:
            print('\t'.join([str(i) for i in (chrom, extension_st, extension_end, symbol, '0', strand, basal_st, basal_end, '255,0,0', 1, extension_end - extension_st, 0)]), file = sys.stdout)

    return domain_ranges
162
+
163
if __name__=='__main__':
    # Command-line entry point: print the extended regulatory domain of every
    # gene in the BED file given as the first argument.
    if len(sys.argv) < 2:
        print("Usage: " + sys.argv[0] + " <genes.bed>", file=sys.stderr)
        sys.exit(1)
    # Step 1: basal domains (not printed); step 2: extended domains (printed).
    tmp = getBasalDomains(sys.argv[1], printit = False)
    geteExtendedDomains(basal_ranges = tmp, bedfile = sys.argv[1], printit=True)
166
+
167
+
168
+
169
+
170
+