phylokrr-dev 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,252 @@
1
+
2
+ import numpy as np
3
+ from phylokrr.treeio import parseNewickTree, is_leaf
4
+
5
def split_leaves(sorted_leaves, folds):
    """
    Split an ordered collection of leaves into contiguous k-fold blocks.

    Fold ``i`` holds out the ``i``-th block of ``len(sorted_leaves) // folds``
    consecutive leaves. The last fold additionally absorbs the
    ``len(sorted_leaves) % folds`` trailing leaves, so every leaf is held
    out exactly once.

    Parameters
    ----------
    sorted_leaves : array-like
        leaves in the order in which they should be blocked; converted
        with ``np.asarray`` so plain sequences are accepted as well

    folds : int
        number of folds

    Returns
    -------
    list
        one ``[train_leaves, test_leaves]`` pair of arrays per fold, both
        in the original leaf order
    """
    leaves = np.asarray(sorted_leaves)
    n = len(leaves)
    fold_size = n // folds

    myfolds = []
    for i in range(folds):
        start = i * fold_size
        # the last fold takes the remainder instead of dropping it
        stop = n if i == folds - 1 else start + fold_size

        test_mask = np.zeros(n, dtype=bool)
        test_mask[start:stop] = True

        # boolean masks keep both parts in a deterministic (input) order
        myfolds.append([leaves[~test_mask], leaves[test_mask]])

    return myfolds
18
+
19
def tree_postorder(root):
    """
    Collect every node reachable from ``root`` using an explicit stack.

    Nodes are recorded in the order they are popped: a parent always
    comes before its children, and the right child is recorded before
    the left one. Consuming the returned list from its end (for example
    with repeated ``pop()``) therefore visits children before parents.

    Parameters
    ----------
    root : node
        object exposing ``left`` and ``right`` attributes and accepted
        by ``is_leaf``

    Returns
    -------
    list
        all visited nodes, root first
    """
    visited = []
    pending = [root]

    while pending:
        node = pending.pop()
        visited.append(node)

        if not is_leaf(node):
            # right is pushed last, hence popped (and recorded) first
            pending.extend((node.left, node.right))

    return visited
36
+
37
+ # tree = "((4,5),(6,7));"
38
def alpha_weighting(tree, alpha=1/4):
    """
    Weight the leaves of a phylogenetic tree based on the number of
    descendants of each clade, using an alpha parameter.

    The tree is processed in two passes:

    1. bottom-up, the ``branch_length`` attribute of every node is
       overwritten with the number of leaves below it (1 for a leaf);
    2. top-down, starting with a weight of 1 at the root, the weight of
       an internal node is split between its children: the right child
       receives the fraction ``(n_right / (n_left + n_right)) ** alpha``
       and the left child receives the remainder.

    Since a hash table is used to store the weights, the order of the
    leaves is not guaranteed to be the same as in the input tree.

    Parameters
    ----------
    tree : str
        tree handed to ``parseNewickTree``, e.g. "((4,5),(6,7));"

    alpha : float
        exponent applied to the right-hand clade proportion

    Returns
    -------
    dict
        leaf name -> weight
    """
    _, root = parseNewickTree(tree)

    # pass 1: walk the node list backwards (children before parents)
    # and store clade sizes in branch_length
    for node in reversed(tree_postorder(root)):
        if is_leaf(node):
            node.branch_length = 1
        else:
            node.branch_length = (
                node.left.branch_length + node.right.branch_length
            )

    # pass 2: push the weight down from the root
    weights = {}
    stack = [(root, 1)]
    while stack:
        node, mass = stack.pop()

        if is_leaf(node):
            # only leaf weights are reported
            weights[node.name] = mass
            continue

        n_left = node.left.branch_length
        n_right = node.right.branch_length

        right_share = (n_right / (n_left + n_right)) ** alpha
        left_share = 1 - right_share

        stack.append((node.left, left_share * mass))
        stack.append((node.right, right_share * mass))

    return weights
85
+
86
def sample_mChoice(P, rng, Fz):
    """
    Return a set of indices sampled according to P, without replacement,
    and with size Fz.

    ``P`` is updated in place: the probability of every sampled index is
    set to zero and the remaining entries are renormalised, so that
    consecutive calls on the same array never return an index twice.

    Parameters
    ----------
    P : np.ndarray
        1-D float array of sampling probabilities; modified in place

    rng : random generator
        object exposing a numpy-style ``choice(a, size, p, replace)``

    Fz : int
        number of indices to draw

    Returns
    -------
    set
        the sampled indices as Python ints
    """
    S_idx = rng.choice(len(P), size=Fz, p=P, replace=False)
    P[S_idx] = 0

    total = np.sum(P)
    # once every index with non-zero probability has been drawn there is
    # nothing left to renormalise; dividing by zero would fill P with NaN
    if total > 0:
        P /= total

    return set(S_idx.tolist())
96
+
97
def split_data_idx(num_test, P, rng):  # type: ignore
    """
    Draw a weighted test set and pair it with its complement.

    ``num_test`` positions of ``P`` are drawn through ``sample_mChoice``;
    every other position of ``P`` goes to the train list.

    Returns
    -------
    list
        test indices

    list
        train indices
    """
    test_idx = sample_mChoice(P, rng, num_test)
    train_idx = set(range(len(P))) - test_idx
    return list(test_idx), list(train_idx)
105
+
106
def getT2(P, rng, n, folds = 5):
    """
    Build the test-index set of every fold.

    The first ``folds - 1`` sets hold ``n // folds`` indices each, drawn
    with ``sample_mChoice``. The last set gathers every position
    ``i < n`` whose entry in ``P`` is still different from zero after
    those draws.

    Returns
    -------
    list
        a list of sets, one per fold
    """
    fold_size = n // folds

    # O(k * n)
    T = [sample_mChoice(P, rng, fold_size) for _ in range(folds - 1)]

    # whatever has not been drawn yet forms the last fold
    T.append({i for i in range(n) if P[i] != 0.0})

    return T
125
+
126
+
127
def weighted_sample(P, rng, folds=5):  # type: ignore
    """
    Partition the positions of ``P`` into weighted cross-validation folds.

    The test indices of each fold come from ``getT2``; the train indices
    of a fold are all the remaining positions.

    Parameters
    ----------
    P : array-like
        sampling probability of every position; left untouched

    rng : random generator
        forwarded to ``getT2``

    folds : int
        number of folds

    Returns
    -------
    list
        one ``[train_idx, test_idx]`` pair of lists per fold
    """
    # the sampling helpers zero and renormalise the array they receive;
    # work on a private float copy so the caller's weights survive
    weights = np.array(P, dtype=float)

    n = len(weights)
    all_idx = set(range(n))

    return [
        [list(all_idx - test_idx), list(test_idx)]
        for test_idx in getT2(weights, rng, n, folds)
    ]
140
+
141
+ # region: Testing
142
+
143
+ # tree_file = "test_tree_v5.txt"
144
+ # with open(tree_file, 'r') as f:
145
+ # tree = f.read().strip()
146
+
147
+ # from phylokrr.treeio import get_vcv
148
+
149
+ # rng = np.random.default_rng(seed=12038)
150
+ # V, leaves = get_vcv(tree, process = "OU", sigma2 = 1, alpha = 1)
151
+
152
+ # # P = np.linalg.inv(V) @ np.ones(len(leaves))
153
+ # # P /= np.sum(P)
154
+
155
+
156
+
157
+
158
+ # A = alpha_weighting(tree, alpha = 1e2)
159
+ # P = np.array([A[leaf] for leaf in leaves])
160
+ # # print(A)
161
+ # print(P)
162
+
163
+ # n = len(leaves)
164
+ # test_idx, train_idx = split_data_idx(num_test=int(n*0.25), P=P, rng=rng)
165
+
166
+ # P_train = P[train_idx]
167
+ # myfolds = weighted_sample(P_train, rng, folds=5)
168
+
169
+ # # for a,b in myfolds:
170
+ # # print(a,"-------",b)
171
+ # # print(P[list(myfolds[3][1])])
172
+
173
+ # test_leaves = [leaves[i] for i in test_idx]
174
+ # with open("test_leaves.txt", 'w') as f:
175
+ # f.write(",".join(test_leaves) + "\n")
176
+ # # print("Test Leaves: ", test_leaves)
177
+
178
+ # # # # # myfolds
179
+ # with open("folds.txt", 'w') as f:
180
+ # # with open("folds_random.txt", 'w') as f:
181
+ # for train_idx_f, test_idx_f in myfolds:
182
+
183
+ # train_leaves_f = []
184
+ # for idx in train_idx_f:
185
+ # train_leaves_f.append(leaves[train_idx[idx]])
186
+
187
+ # test_leaves_f = []
188
+ # for idx in test_idx_f:
189
+ # test_leaves_f.append(leaves[train_idx[idx]])
190
+
191
+ # f.write(",".join(train_leaves_f) + "\n")
192
+ # f.write(",".join(test_leaves_f) + "\n")
193
+
194
+ # endregion
195
+
196
+
197
+ # region: Old code
198
+
199
+ # def updateP(S, P):
200
+ # # mark those taken as 0
201
+ # P[S != 0] = 0
202
+ # # P[S] = 0
203
+ # # adjust the probabilities
204
+ # P /= np.sum(P)
205
+ # # print("Updated P: ", P)
206
+
207
+
208
+ # def multi_sample(P, rng, Fz):
209
+ # # P = P.copy()
210
+ # # S_idx = rng.choice(len(P), size=Fz, p=P, replace=False)
211
+ # # S = np.zeros(len(P), dtype=int)
212
+ # # S[S_idx] = 1
213
+
214
+ # S = rng.multinomial(n=Fz, pvals=P, size=1)[0]
215
+ # updateP(S, P)
216
+ # NZ = sum(S != 0)
217
+ # while Fz > NZ:
218
+ # Si = rng.multinomial(n=Fz - NZ, pvals=P, size=1)[0]
219
+ # # Just update with the recently taken
220
+ # # samples, not the whole S
221
+ # updateP(Si, P)
222
+ # S += Si
223
+ # NZ = sum(S != 0)
224
+
225
+ # return S
226
+
227
+ # def getT(P, rng, n, folds = 5):
228
+ # # P = P.copy()
229
+ # T = []
230
+ # Fz = n//folds
231
+ # # O(k * n)
232
+ # for _ in range(folds - 1):
233
+ # S = multi_sample2(P, rng, Fz)
234
+ # T.append(S)
235
+
236
+ # S = np.zeros(n, dtype=int)
237
+ # S[P != 0] = 1
238
+ # T.append(S)
239
+ # return T
240
+
241
+ # def split_sample(t):
242
+ # test_idx = []
243
+ # train_idx = []
244
+ # for k, v in enumerate(t):
245
+ # if v != 0:
246
+ # test_idx.append(k)
247
+ # else:
248
+ # train_idx.append(k)
249
+
250
+ # return test_idx, train_idx
251
+
252
+ # endregion
@@ -0,0 +1,256 @@
1
+ import sys
2
+ import time
3
+ from collections import deque
4
+
5
+ import numpy as np
6
+
7
+ from phylokrr_dev.alpha_beta_weighting import weighted_sample
8
+ from phylokrr_dev.utils import progressbar
9
+
10
+
11
def sample_hyperParams(params, seed, sample, verbose):
    """
    Random search for hyperparameter tuning using k-fold cross-validation

    Draws ``sample`` random combinations from the grid in ``params`` and
    keeps only the distinct ones. De-duplication happens regardless of
    ``verbose``, so the returned grid does not depend on a logging flag.

    Parameters
    ----------
    params : dict
        hyperparameters to sample from; each value is a 1-D collection
        of numeric candidates

    seed : int
        random seed (seeds numpy's global random state)

    sample : int
        number of samples

    verbose : bool
        print number of unique hyperparameters

    Returns
    -------
    np.ndarray
        distinct sampled hyperparameters, one combination per row,
        sorted as returned by ``np.unique(..., axis=0)``

    dict
        index of hyperparameters (name -> column)

    list
        names of hyperparameters, in column order
    """

    np.random.seed(seed=seed)

    # make random choice from the grid of hyperparameters
    P_n = list(params)
    P_r = np.zeros((sample, len(P_n)))  # random hyperparameters

    P_index = {}
    for col, name in enumerate(P_n):
        P_r[:, col] = np.random.choice(params[name], sample)
        P_index[name] = col

    # repeated combinations only cost extra model fits; drop them
    P_r = np.unique(P_r, axis=0)

    if verbose:
        print("Number of unique hyperparameters: ", P_r.shape[0])

    return P_r, P_index, P_n
57
+
58
def pick_init_index(P_index, P_r, seed):
    """
    Pick the initial index for the hyperparameters

    When a regularisation column is present (key "lam", or else
    "lambda"), the row with the largest value in that column is chosen.
    Otherwise a random row is drawn.

    Parameters
    ----------
    P_index : dict
        hyperparameter name -> column of ``P_r``

    P_r : np.ndarray
        sampled hyperparameters, one combination per row

    seed : int
        random seed (seeds numpy's global random state)

    Returns
    -------
    int
        row index into ``P_r``
    """

    np.random.seed(seed=seed)

    # "lam" takes precedence over "lambda" when both are present
    lam_idx = P_index.get("lam", P_index.get("lambda"))

    # compare against None: column 0 is a valid index but is falsy
    if lam_idx is not None:
        return np.argmax(P_r[:, lam_idx])

    return np.random.randint(P_r.shape[0])
80
+
81
def init_model(X_train, y_train, vcv_train, model,
               param_init, i):
    """
    Initialize the model.

    This initialization is used at the beginning of every fold. The
    model is fitted once with the initial hyperparameters while
    ``copy_R_inv`` is False; afterwards ``copy_R_inv`` is switched to
    True for the remaining fits of the fold.

    NOTE(review): judging from the original comments, ``copy_R_inv``
    controls whether the inverse of the correlation matrix is recomputed
    (False) or reused (True), which would let it be calculated only once
    per fold. Confirm against the model class.

    ``warm_start`` is set to False both before and after the fit, so no
    previous solution is reused as a starting point.

    Parameters
    ----------
    X_train, y_train, vcv_train
        forwarded unchanged to ``model.fit``

    model : estimator
        object exposing ``set_params``, ``fit`` and the ``copy_R_inv``
        and ``warm_start`` attributes

    param_init : dict
        hyperparameters applied with ``model.set_params`` before fitting

    i : int
        fold index; currently unused
    """
    # for the initial model, R inv should be calculated
    model.copy_R_inv = False
    model.warm_start = False

    # set the initial hyperparameters
    model.set_params(**param_init)

    # perf_counter is monotonic, unlike the wall clock, so the reported
    # interval cannot be distorted by system clock adjustments
    time_start = time.perf_counter()
    model.fit(X_train, y_train, vcv_train)
    elapsed = time.perf_counter() - time_start
    print(f"Initial model fit time: {elapsed:.2f} seconds")

    # reset in case fit changed it
    model.warm_start = False

    # for the next iteration R_inv is used
    model.copy_R_inv = True
121
+
122
+
123
def sort_P_r(P_r, P_index):
    """
    Sort the sampled hyperparameters by decreasing lambda.

    The lambda column is looked up under the key "lambda", or else
    "lam". When neither key is present the array is returned unchanged.

    Parameters
    ----------
    P_r : np.ndarray
        sampled hyperparameters, one combination per row

    P_index : dict
        hyperparameter name -> column of ``P_r``

    Returns
    -------
    np.ndarray
        rows of ``P_r`` ordered from the largest to the smallest lambda
    """
    lam_index = P_index.get("lambda", P_index.get("lam"))

    # compare against None: column 0 is a valid index but is falsy
    if lam_index is None:
        return P_r

    order = np.argsort(P_r[:, lam_index])[::-1]
    return P_r[order, :]
138
+
139
+
140
def k_fold_cv_weigthed(X, y, vcv, model, W,
                       params,
                       folds = 3,
                       sample = 100,
                       verbose = True,
                       seed = 12038):
    """
    Random-search hyperparameter tuning with weighted k-fold
    cross-validation.

    Hyperparameter combinations are sampled from ``params``, the folds
    are drawn with ``weighted_sample`` according to ``W``, and every
    combination is fitted and scored on every fold. The combination with
    the lowest size-weighted mean score across folds is returned.

    Parameters
    ----------
    X : np.ndarray
        2-D array of predictors, one row per observation

    y : np.ndarray
        response, indexed like the rows of ``X``

    vcv : np.ndarray or None
        correlation structure between observations; anything that is not
        an ``np.ndarray`` is treated as missing

    model : estimator
        object exposing ``set_params``, ``fit``, ``score`` and the
        ``add_extraK``, ``copy_R_inv``, ``warm_start`` and ``verbose``
        attributes; ``verbose`` is set to False and is not restored

    W : array-like
        sampling weights forwarded to ``weighted_sample``; left
        untouched. Presumably one probability per row of ``X`` summing
        to one -- confirm with the caller

    params : dict
        hyperparameters to sample from

    folds : int
        number of folds

    sample : int
        number of hyperparameter combinations to draw

    verbose : bool
        print the number of unique combinations and the best CV score

    seed : int
        random seed

    Returns
    -------
    dict
        hyperparameter name -> value of the best combination

    Raises
    ------
    ValueError
        when ``model.add_extraK`` is set but ``vcv`` is not an array
    """

    model.verbose = False
    np.random.seed(seed=seed)
    rng = np.random.RandomState(seed=seed)

    P_r, P_index, P_n = sample_hyperParams(params, seed, sample, verbose)

    # if lambda is present, then sort by lambda
    P_r = sort_P_r(P_r, P_index)

    # a random index if lambda is not present.
    # otherwise (most typical), the index with the highest lambda
    init_idx = pick_init_index(P_index, P_r, seed)
    param_init = dict(zip(P_n, P_r[init_idx, :]))

    n, _ = X.shape
    is_vcv = isinstance(vcv, np.ndarray)

    # the fold sampler consumes its weights in place: give it a copy
    myfolds = weighted_sample(np.array(W, dtype=float), rng, folds=folds)

    # row index of P_r -> sum over folds of (fold score * fold size) / n
    out = {}

    # O(n^3*folds*sample)
    for i, (train_idx, test_idx) in enumerate(myfolds):
        print("Fold: ", i)
        X_train, X_test = X[train_idx, :], X[test_idx, :]
        y_train, y_test = y[train_idx], y[test_idx]
        n_test = len(y_test)

        vcv_train = None
        if is_vcv and model.add_extraK:
            model.Rnn = vcv[train_idx, :][:, train_idx]
            model.Rzn = vcv[test_idx, :][:, train_idx]

        elif is_vcv:
            vcv_train = vcv[train_idx, :][:, train_idx]

        elif model.add_extraK:
            raise ValueError("Missing correlation structure at Cross validation")

        # first fit of the fold, done with the initial hyperparameters;
        # init_model also flips the flags used by the remaining fits
        init_model(X_train, y_train, vcv_train,
                   model, param_init, i)

        # scoring is done without a test correlation structure
        init_err = model.score(X_test, y_test, None) * n_test
        out[init_idx] = out.get(init_idx, 0) + (init_err / n)

        # enumerate keeps the row counter in step with the rows even when
        # the already-fitted initial row is skipped
        rows = progressbar(P_r, prefix = "Testing hyperparameters: ", size=60)
        for j, p_j in enumerate(rows):

            if j == init_idx:
                continue

            model.set_params(**dict(zip(P_n, p_j)))
            model.fit(X_train, y_train, vcv_train)

            tmp_err = model.score(X_test, y_test, None) * n_test
            out[j] = out.get(j, 0) + (tmp_err / n)

    # O(sample * log sample)
    best_pi, best_err = sorted(out.items(), key=lambda kv: kv[1], reverse=False)[0]
    best_ = dict(zip(P_n, P_r[best_pi, :]))

    if verbose:
        print("CV score: ", best_err)

    # This let model fit with another correlation matrix
    model.copy_R_inv = False
    # let the model start with new alpha
    model.warm_start = False

    return best_