PyPI - phylokrr-dev - Versions diffs - 0.2.0__py3-none-any.whl - Mend

phylokrr-dev 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

phylokrr_dev/__init__.py +0 -0
phylokrr_dev/alpha_beta_weighting.py +252 -0
phylokrr_dev/block_cross_validation.py +256 -0
phylokrr_dev/phyloKPQL.py +920 -0
phylokrr_dev/utils.py +23 -0
phylokrr_dev-0.2.0.dist-info/METADATA +9 -0
phylokrr_dev-0.2.0.dist-info/RECORD +9 -0
phylokrr_dev-0.2.0.dist-info/WHEEL +5 -0
phylokrr_dev-0.2.0.dist-info/top_level.txt +1 -0

phylokrr_dev/__init__.py ADDED Viewed

File without changes

phylokrr_dev/alpha_beta_weighting.py ADDED Viewed

@@ -0,0 +1,252 @@
+import numpy as np
+from phylokrr.treeio import parseNewickTree, is_leaf
def split_leaves(sorted_leaves, folds):
    """
    Split an ordered sequence of leaves into contiguous train/test folds.

    Fold ``i`` uses the ``i``-th contiguous slice of ``sorted_leaves`` as its
    test set and all remaining leaves, in their original order, as its
    training set.  When the number of leaves is not a multiple of ``folds``
    the last fold absorbs the remainder, so every leaf is part of exactly
    one test set.

    Parameters
    ----------
    sorted_leaves : array-like
        Leaves in the order used to form the contiguous blocks.
    folds : int
        Number of folds.

    Returns
    -------
    list
        One ``[train_leaves, test_leaves]`` pair of NumPy arrays per fold.
    """
    leaves = np.asarray(sorted_leaves)
    n = len(leaves)
    fold_size = n // folds
    positions = np.arange(n)
    myfolds = []
    for i in range(folds):
        start = i * fold_size
        # the last fold runs to the end so the n % folds trailing leaves
        # are not left out of every test set
        stop = n if i == folds - 1 else start + fold_size
        in_test = (positions >= start) & (positions < stop)
        myfolds.append([leaves[~in_test], leaves[in_test]])
    return myfolds
def tree_postorder(root):
    """
    List every node reachable from ``root`` using an explicit stack.

    Each node is emitted before any of its descendants, and the right
    child's subtree is emitted before the left child's.  Consuming the
    returned list from the end (``list.pop()``) therefore visits children
    before their parent, which is how the list is used as a post-order.

    Parameters
    ----------
    root : node
        Node exposing ``left`` and ``right`` children unless ``is_leaf``
        reports it as a leaf.

    Returns
    -------
    list
        All visited nodes, root first.
    """
    visited = []
    pending = [root]
    while pending:
        node = pending.pop()
        visited.append(node)
        if not is_leaf(node):
            # left goes on the stack first, so right is popped first
            pending.extend((node.left, node.right))
    return visited
+# tree = "((4,5),(6,7));"
def alpha_weighting(tree, alpha=1/4):
    """
    Weight the leaves of a binary phylogenetic tree from the number of
    descendants on each side of every split, tempered by ``alpha``.

    Starting with a mass of 1 at the root, each internal node hands
    ``(n_right / (n_left + n_right)) ** alpha`` of its mass to the right
    child and the complement to the left child, where ``n_left`` and
    ``n_right`` are the leaf counts below each child.

    Parameters
    ----------
    tree : str
        Tree description handed to ``parseNewickTree``
        (e.g. ``"((4,5),(6,7));"``).
    alpha : float
        Exponent applied to the right-hand share at every split.

    Returns
    -------
    dict
        Leaf name -> weight.  Entries are inserted in traversal order,
        which is not necessarily the order of the leaves in ``tree``.
    """
    _, root = parseNewickTree(tree)

    # First pass, children before parents: ``branch_length`` is reused to
    # hold the number of leaves found below each node.
    for node in reversed(tree_postorder(root)):
        if is_leaf(node):
            node.branch_length = 1
        else:
            node.branch_length = (
                node.left.branch_length + node.right.branch_length
            )

    # Second pass, root towards the leaves: split the mass at every node.
    weights = {}
    stack = [(root, 1)]
    while stack:
        node, mass = stack.pop()
        if is_leaf(node):
            # only leaves keep a final weight
            weights[node.name] = mass
            continue
        n_left = node.left.branch_length
        n_right = node.right.branch_length
        share_right = (n_right / (n_left + n_right)) ** alpha
        share_left = 1 - share_right
        stack.append((node.left, share_left * mass))
        stack.append((node.right, share_right * mass))
    return weights
def sample_mChoice(P, rng, Fz):
    """
    Draw ``Fz`` distinct indices according to ``P`` and remove them from ``P``.

    ``P`` is modified in place: the drawn positions are set to zero and the
    remaining weights are rescaled to sum to one, so that a following call
    samples only from what is left.

    Parameters
    ----------
    P : np.ndarray
        Float probability vector; updated in place.
    rng : random generator
        Object exposing ``choice(a, size=, p=, replace=)``.
    Fz : int
        Number of indices to draw without replacement.

    Returns
    -------
    set
        The drawn indices.
    """
    S_idx = rng.choice(len(P), size=Fz, p=P, replace=False)
    P[S_idx] = 0
    remaining = np.sum(P)
    # Once every weighted index has been drawn there is nothing left to
    # rescale; dividing by zero would turn the whole vector into NaN.
    if remaining > 0:
        P /= remaining
    return set(S_idx)
def split_data_idx(num_test, P, rng): # type: ignore
    """
    Draw a weighted test set and return the test and train indices.

    ``num_test`` indices are sampled through ``sample_mChoice`` (which
    updates ``P`` in place); every other position of ``P`` goes to the
    training list.

    Returns
    -------
    tuple
        ``(test_indices, train_indices)`` as two lists, test first.
    """
    held_out = sample_mChoice(P, rng, num_test)
    kept = set(range(len(P))) - held_out
    return list(held_out), list(kept)
def getT2(P, rng, n, folds = 5):
    """
    Partition the indices ``0 .. n-1`` into ``folds`` test sets.

    The first ``folds - 1`` sets each hold ``n // folds`` indices drawn
    without replacement through ``sample_mChoice`` (``P`` is consumed in
    place).  The last set receives every index that was not drawn.

    Parameters
    ----------
    P : np.ndarray
        Sampling weights; modified in place by ``sample_mChoice``.
    rng : random generator
        Forwarded to ``sample_mChoice``.
    n : int
        Number of indices to partition (callers pass ``len(P)``).
    folds : int
        Number of sets to produce.

    Returns
    -------
    list
        A list of ``folds`` sets of indices.
    """
    Fz = n // folds
    # O(k * n)
    T = [sample_mChoice(P, rng, Fz) for _ in range(folds - 1)]
    # The last fold is "whatever has not been drawn".  Deriving it from the
    # drawn sets, instead of testing P[i] != 0, keeps indices whose weight
    # was already zero on entry from being left out of every fold.
    drawn = set().union(*T)
    T.append(set(range(n)) - drawn)
    return T
def weighted_sample(P, rng, folds=5): # type: ignore
    """
    Build weighted cross-validation folds over the positions of ``P``.

    The test sets come from ``getT2``; for each of them the training part
    is every other index.

    Returns
    -------
    list
        One ``[train_indices, test_indices]`` pair of lists per fold.
    """
    n = len(P)
    everything = set(range(n))
    return [
        [list(everything - held_out), list(held_out)]
        for held_out in getT2(P, rng, n, folds)
    ]
+# region: Testing
+# tree_file = "test_tree_v5.txt"
+# with open(tree_file, 'r') as f:
+#     tree = f.read().strip()
+# from phylokrr.treeio import get_vcv
+# rng = np.random.default_rng(seed=12038)
+# V, leaves = get_vcv(tree,  process = "OU", sigma2 = 1, alpha = 1)
+# # P = np.linalg.inv(V) @ np.ones(len(leaves))
+# # P /= np.sum(P)
+# A = alpha_weighting(tree, alpha = 1e2)
+# P = np.array([A[leaf] for leaf in leaves])
+# # print(A)
+# print(P)
+# n = len(leaves)
+# test_idx, train_idx = split_data_idx(num_test=int(n*0.25), P=P, rng=rng)
+# P_train = P[train_idx]
+# myfolds = weighted_sample(P_train, rng, folds=5)
+# # for a,b in myfolds:
+# #     print(a,"-------",b)
+# # print(P[list(myfolds[3][1])])
+# test_leaves = [leaves[i] for i in test_idx]
+# with open("test_leaves.txt", 'w') as f:
+#     f.write(",".join(test_leaves) + "\n")
+# # print("Test Leaves: ", test_leaves)
+# # # # # myfolds
+# with open("folds.txt", 'w') as f:
+# # with open("folds_random.txt", 'w') as f:
+#     for train_idx_f, test_idx_f in myfolds:
+#         train_leaves_f = []
+#         for idx in train_idx_f:
+#             train_leaves_f.append(leaves[train_idx[idx]])
+#         test_leaves_f = []
+#         for idx in test_idx_f:
+#             test_leaves_f.append(leaves[train_idx[idx]])
+#         f.write(",".join(train_leaves_f) + "\n")
+#         f.write(",".join(test_leaves_f) + "\n")
+# endregion
+# region: Old code
+# def updateP(S, P):
+#     # mark those taken as 0
+#     P[S != 0] = 0
+#     # P[S] = 0
+#     # adjust the probabilities
+#     P /= np.sum(P)
+#     # print("Updated P: ", P)
+# def multi_sample(P, rng, Fz):
+#     # P = P.copy()
+#     # S_idx = rng.choice(len(P), size=Fz, p=P, replace=False)
+#     # S = np.zeros(len(P), dtype=int)
+#     # S[S_idx] = 1
+#     S = rng.multinomial(n=Fz, pvals=P, size=1)[0]
+#     updateP(S, P)
+#     NZ = sum(S != 0)
+#     while Fz > NZ:
+#         Si = rng.multinomial(n=Fz - NZ, pvals=P, size=1)[0]
+#         # Just update with the recently taken
+#         # samples, not the whole S
+#         updateP(Si, P)
+#         S += Si
+#         NZ = sum(S != 0)
+#     return S
+# def getT(P, rng, n, folds = 5):
+#     # P = P.copy()
+#     T = []
+#     Fz = n//folds
+#     # O(k * n)
+#     for _ in range(folds - 1):
+#         S = multi_sample2(P, rng, Fz)
+#         T.append(S)
+#     S = np.zeros(n, dtype=int)
+#     S[P != 0] = 1
+#     T.append(S)
+#     return T
+# def split_sample(t):
+#     test_idx = []
+#     train_idx = []
+#     for k, v in enumerate(t):
+#         if v != 0:
+#             test_idx.append(k)
+#         else:
+#             train_idx.append(k)
+#     return test_idx, train_idx
+# endregion

phylokrr_dev/block_cross_validation.py ADDED Viewed

@@ -0,0 +1,256 @@
+import sys
+import time
+from collections import deque
+import numpy as np
+from phylokrr_dev.alpha_beta_weighting import weighted_sample
+from phylokrr_dev.utils import progressbar
def sample_hyperParams(params, seed, sample, verbose):
    """
    Draw random hyperparameter combinations from a grid.

    For every hyperparameter, ``sample`` values are drawn with replacement
    from its candidate list; row ``r`` of the result is one combination.
    Repeated combinations are removed, so the result may have fewer than
    ``sample`` rows, and rows come back in the sorted order produced by
    ``np.unique``.  The outcome no longer depends on ``verbose``.

    Parameters
    ----------
    params : dict
        Hyperparameter name -> sequence of numeric candidate values.
    seed : int
        Seed applied to NumPy's global random state before sampling.
    sample : int
        Number of combinations to draw before de-duplication.
    verbose : bool
        Print the number of unique combinations.

    Returns
    -------
    np.ndarray
        Unique sampled combinations, one per row.
    dict
        Hyperparameter name -> column index in the array.
    list
        Hyperparameter names in column order.
    """
    np.random.seed(seed=seed)
    # make random choice from the grid of hyperparameters
    P_n = list(params)
    P_r = np.zeros((sample, len(P_n))) # random hyperparameters
    P_index = {}
    for n, k in enumerate(P_n):
        P_r[:, n] = np.random.choice(params[k], sample)
        P_index[k] = n
    # De-duplicate unconditionally: this used to happen only as a side
    # effect of the verbose report, which made the returned array change
    # with a logging flag.
    P_r = np.unique(P_r, axis=0)
    if verbose:
        print("Number of unique hyperparameters: ", P_r.shape[0])
    return P_r, P_index, P_n
def pick_init_index(P_index, P_r, seed):
    """
    Pick the row of ``P_r`` used as the initial hyperparameter combination.

    When a ``"lam"`` or ``"lambda"`` column exists (``"lam"`` wins if both
    are present), the row with the largest value in that column is chosen.
    Otherwise a row is picked at random.

    Parameters
    ----------
    P_index : dict
        Hyperparameter name -> column index in ``P_r``.
    P_r : np.ndarray
        Sampled hyperparameter combinations, one per row.
    seed : int
        Seed applied to NumPy's global random state.

    Returns
    -------
    int
        Row index into ``P_r``.
    """
    np.random.seed(seed=seed)
    # get the lambda index
    lam_idx = P_index.get("lam", P_index.get("lambda"))
    # Compare against None: column 0 is a valid position and must not be
    # mistaken for "no lambda column".
    if lam_idx is not None:
        return np.argmax(P_r[:, lam_idx])
    return np.random.randint(P_r.shape[0])
def init_model(X_train, y_train, vcv_train, model,
                    param_init, i):
    """
    Fit ``model`` once at the start of a fold with the initial
    hyperparameters.

    The function clears ``model.copy_R_inv`` and ``model.warm_start``,
    applies ``param_init`` through ``model.set_params``, fits the model,
    prints the elapsed fit time, and finally sets ``model.copy_R_inv`` to
    ``True`` and ``model.warm_start`` to ``False``.

    NOTE(review): going by the original comments, ``copy_R_inv = False``
    makes the fit compute the inverse of the correlation matrix, and
    ``copy_R_inv = True`` lets later fits in the same fold reuse it; the
    model class is not visible here, so confirm against it.  Warm starting
    from a previous fold is currently disabled: ``warm_start`` is always
    ``False``.

    Parameters
    ----------
    X_train, y_train, vcv_train
        Forwarded unchanged to ``model.fit``.
    model : object
        Estimator exposing ``set_params`` and ``fit``.
    param_init : dict
        Hyperparameters applied before the fit.
    i : int
        Fold index; currently unused.
    """
    # for the initial model, R inv should be calculated
    model.copy_R_inv = False
    model.warm_start = False
    # set the initial hyperparameters
    model.set_params(**param_init)
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments
    time_start = time.perf_counter()
    model.fit(X_train, y_train, vcv_train)
    elapsed = time.perf_counter() - time_start
    print(f"Initial model fit time: {elapsed:.2f} seconds")
    # reset after the fit in case it changed the flag
    model.warm_start = False
    # for the next iteration R_inv is used
    model.copy_R_inv = True
def sort_P_r(P_r, P_index):
    """
    Order the hyperparameter combinations by decreasing lambda.

    The column is looked up under ``"lambda"`` first and ``"lam"`` second.
    When neither name is present, ``P_r`` is returned unchanged.

    Parameters
    ----------
    P_r : np.ndarray
        Hyperparameter combinations, one per row.
    P_index : dict
        Hyperparameter name -> column index in ``P_r``.

    Returns
    -------
    np.ndarray
        Rows of ``P_r`` sorted from the largest to the smallest lambda.
    """
    lam_index = P_index.get("lambda", P_index.get("lam"))
    # Compare against None: a lambda stored in column 0 must still be sorted.
    if lam_index is None:
        return P_r
    order = np.argsort(P_r[:, lam_index])[::-1]
    return P_r[order]
def k_fold_cv_weigthed(X, y, vcv, model, W,
                       params,
                       folds = 3,
                       sample = 100,
                       verbose = True,
                       seed = 12038):
    """
    Random-search hyperparameter tuning with weighted k-fold
    cross-validation.

    Hyperparameter combinations are drawn with ``sample_hyperParams`` and
    the folds with ``weighted_sample`` using the weights ``W``.  In every
    fold the model is first fitted with an initial combination through
    ``init_model``, then with every other combination.  For each fit,
    ``model.score(X_test, y_test, None) * len(y_test)`` is recorded; per
    combination these values are summed over the folds and divided by the
    number of rows of ``X``.  The combination with the smallest total is
    returned.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix, one row per observation.
    y : np.ndarray
        Response values, indexed like the rows of ``X``.
    vcv : np.ndarray or other
        Correlation structure.  Only used when it is a NumPy array; it is
        then sliced per fold.
    model : object
        Estimator exposing ``set_params``, ``fit``, ``score`` and the
        attributes ``add_extraK``, ``verbose``, ``copy_R_inv`` and
        ``warm_start``.
    W : array-like
        Sampling weights, one per row of ``X``.  A copy is used, so the
        caller's array is left untouched.
    params : dict
        Hyperparameter name -> candidate values.
    folds : int
        Number of cross-validation folds.
    sample : int
        Number of hyperparameter combinations to draw.
    verbose : bool
        Print the number of unique combinations and the best CV score.
    seed : int
        Seed for hyperparameter sampling and fold construction.

    Returns
    -------
    dict
        Hyperparameter name -> value for the best combination.

    Raises
    ------
    ValueError
        If ``model.add_extraK`` is set but ``vcv`` is not a NumPy array.
    """
    model.verbose = False
    np.random.seed(seed=seed)
    rng = np.random.RandomState(seed=seed)

    P_r, P_index, P_n = sample_hyperParams(params, seed, sample, verbose)
    # if lambda is present, then sort by lambda
    P_r = sort_P_r(P_r, P_index)
    # a random index if lambda is not present.
    # otherwise (most typical), the index with the highest lambda
    init_idx = pick_init_index(P_index, P_r, seed)
    param_init = dict(zip(P_n, P_r[init_idx,:]))
    n, _ = X.shape

    # Neither check depends on the fold, so do them once, before any fit.
    is_vcv = isinstance(vcv, np.ndarray)
    if model.add_extraK and not is_vcv:
        raise ValueError("Missing correlation structure at Cross validation")

    # linked list to store all the errors
    all_errors = deque([])
    # weighted_sample zeroes and rescales its weight vector while drawing;
    # hand it a float copy so the caller's W is not modified.
    myfolds = weighted_sample(np.array(W, dtype=float), rng, folds=folds)

    # O(n^3*folds*sample)
    for i, (train_idx, test_idx) in enumerate(myfolds):
        print("Fold: ", i)
        X_train, X_test = X[train_idx, :], X[test_idx, :]
        y_train, y_test = y[train_idx], y[test_idx]

        vcv_train = None
        if is_vcv and model.add_extraK:
            model.Rnn = vcv[train_idx,:][:,train_idx]
            model.Rzn = vcv[test_idx,:][:,train_idx]
        elif is_vcv:
            vcv_train = vcv[train_idx,:][:,train_idx]

        # First fit of the fold, with the initial hyperparameters.
        init_model(X_train, y_train, vcv_train,
                   model, param_init, i)
        init_err = model.score(X_test, y_test, None)*len(y_test)
        all_errors.append([init_idx, init_err])

        # enumerate keeps j in step with the rows of P_r even when the
        # initial row is skipped; a hand-maintained counter stalled on
        # init_idx and skipped every remaining combination.
        candidates = progressbar(P_r, prefix = "Testing hyperparameters: ", size=60)
        for j, p_j in enumerate(candidates):
            if j == init_idx:
                # already scored by the initial fit above
                continue
            model.set_params(**dict(zip(P_n, p_j)))
            model.fit(X_train, y_train, vcv_train)
            tmp_err = model.score(X_test, y_test, None)*len(y_test)
            all_errors.append([j, tmp_err])

    # O(sample*folds)
    out = {}
    for pi, err in all_errors:
        out[pi] = out.get(pi, 0) + (err/n)

    # smallest accumulated score wins; ties go to the first one recorded
    best_pi, best_err = min(out.items(), key=lambda kv: kv[1])
    best_ = dict(zip(P_n, P_r[best_pi,:]))
    if verbose:
        print("CV score: ", best_err)

    # This let model fit with another correlation matrix
    model.copy_R_inv = False
    # let the model start with new alpha
    model.warm_start = False
    return best_