sequenzo-0.1.31-cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. _sequenzo_fastcluster.cpython-310-darwin.so +0 -0
  2. sequenzo/__init__.py +349 -0
  3. sequenzo/big_data/__init__.py +12 -0
  4. sequenzo/big_data/clara/__init__.py +26 -0
  5. sequenzo/big_data/clara/clara.py +476 -0
  6. sequenzo/big_data/clara/utils/__init__.py +27 -0
  7. sequenzo/big_data/clara/utils/aggregatecases.py +92 -0
  8. sequenzo/big_data/clara/utils/davies_bouldin.py +91 -0
  9. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-310-darwin.so +0 -0
  10. sequenzo/big_data/clara/utils/wfcmdd.py +205 -0
  11. sequenzo/big_data/clara/visualization.py +88 -0
  12. sequenzo/clustering/KMedoids.py +178 -0
  13. sequenzo/clustering/__init__.py +30 -0
  14. sequenzo/clustering/clustering_c_code.cpython-310-darwin.so +0 -0
  15. sequenzo/clustering/hierarchical_clustering.py +1256 -0
  16. sequenzo/clustering/sequenzo_fastcluster/fastcluster.py +495 -0
  17. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster.cpp +1877 -0
  18. sequenzo/clustering/sequenzo_fastcluster/src/fastcluster_python.cpp +1264 -0
  19. sequenzo/clustering/src/KMedoid.cpp +263 -0
  20. sequenzo/clustering/src/PAM.cpp +237 -0
  21. sequenzo/clustering/src/PAMonce.cpp +265 -0
  22. sequenzo/clustering/src/cluster_quality.cpp +496 -0
  23. sequenzo/clustering/src/cluster_quality.h +128 -0
  24. sequenzo/clustering/src/cluster_quality_backup.cpp +570 -0
  25. sequenzo/clustering/src/module.cpp +228 -0
  26. sequenzo/clustering/src/weightedinertia.cpp +111 -0
  27. sequenzo/clustering/utils/__init__.py +27 -0
  28. sequenzo/clustering/utils/disscenter.py +122 -0
  29. sequenzo/data_preprocessing/__init__.py +22 -0
  30. sequenzo/data_preprocessing/helpers.py +303 -0
  31. sequenzo/datasets/__init__.py +41 -0
  32. sequenzo/datasets/biofam.csv +2001 -0
  33. sequenzo/datasets/biofam_child_domain.csv +2001 -0
  34. sequenzo/datasets/biofam_left_domain.csv +2001 -0
  35. sequenzo/datasets/biofam_married_domain.csv +2001 -0
  36. sequenzo/datasets/chinese_colonial_territories.csv +12 -0
  37. sequenzo/datasets/country_co2_emissions.csv +194 -0
  38. sequenzo/datasets/country_co2_emissions_global_deciles.csv +195 -0
  39. sequenzo/datasets/country_co2_emissions_global_quintiles.csv +195 -0
  40. sequenzo/datasets/country_co2_emissions_local_deciles.csv +195 -0
  41. sequenzo/datasets/country_co2_emissions_local_quintiles.csv +195 -0
  42. sequenzo/datasets/country_gdp_per_capita.csv +194 -0
  43. sequenzo/datasets/dyadic_children.csv +61 -0
  44. sequenzo/datasets/dyadic_parents.csv +61 -0
  45. sequenzo/datasets/mvad.csv +713 -0
  46. sequenzo/datasets/pairfam_activity_by_month.csv +1028 -0
  47. sequenzo/datasets/pairfam_activity_by_year.csv +1028 -0
  48. sequenzo/datasets/pairfam_family_by_month.csv +1028 -0
  49. sequenzo/datasets/pairfam_family_by_year.csv +1028 -0
  50. sequenzo/datasets/political_science_aid_shock.csv +166 -0
  51. sequenzo/datasets/political_science_donor_fragmentation.csv +157 -0
  52. sequenzo/define_sequence_data.py +1400 -0
  53. sequenzo/dissimilarity_measures/__init__.py +31 -0
  54. sequenzo/dissimilarity_measures/c_code.cpython-310-darwin.so +0 -0
  55. sequenzo/dissimilarity_measures/get_distance_matrix.py +762 -0
  56. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +246 -0
  57. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +148 -0
  58. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +114 -0
  59. sequenzo/dissimilarity_measures/src/LCPspellDistance.cpp +215 -0
  60. sequenzo/dissimilarity_measures/src/OMdistance.cpp +247 -0
  61. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +281 -0
  62. sequenzo/dissimilarity_measures/src/__init__.py +0 -0
  63. sequenzo/dissimilarity_measures/src/dist2matrix.cpp +63 -0
  64. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  65. sequenzo/dissimilarity_measures/src/module.cpp +40 -0
  66. sequenzo/dissimilarity_measures/src/setup.py +30 -0
  67. sequenzo/dissimilarity_measures/src/utils.h +25 -0
  68. sequenzo/dissimilarity_measures/src/xsimd/.github/cmake-test/main.cpp +6 -0
  69. sequenzo/dissimilarity_measures/src/xsimd/benchmark/main.cpp +159 -0
  70. sequenzo/dissimilarity_measures/src/xsimd/benchmark/xsimd_benchmark.hpp +565 -0
  71. sequenzo/dissimilarity_measures/src/xsimd/docs/source/conf.py +37 -0
  72. sequenzo/dissimilarity_measures/src/xsimd/examples/mandelbrot.cpp +330 -0
  73. sequenzo/dissimilarity_measures/src/xsimd/examples/pico_bench.hpp +246 -0
  74. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +266 -0
  75. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +112 -0
  76. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +323 -0
  77. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +218 -0
  78. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +2583 -0
  79. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +880 -0
  80. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_rounding.hpp +72 -0
  81. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  82. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +978 -0
  83. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +1924 -0
  84. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +1144 -0
  85. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +656 -0
  86. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp +28 -0
  87. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +244 -0
  88. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512er.hpp +20 -0
  89. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +2650 -0
  90. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp +20 -0
  91. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp +20 -0
  92. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +77 -0
  93. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +131 -0
  94. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +20 -0
  95. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +20 -0
  96. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp +20 -0
  97. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +24 -0
  98. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +77 -0
  99. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +393 -0
  100. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +788 -0
  101. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +93 -0
  102. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp +46 -0
  103. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +97 -0
  104. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +92 -0
  105. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp +17 -0
  106. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +142 -0
  107. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +3142 -0
  108. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +1543 -0
  109. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +1513 -0
  110. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +1260 -0
  111. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +2024 -0
  112. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +67 -0
  113. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +339 -0
  114. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp +44 -0
  115. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +186 -0
  116. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +1155 -0
  117. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  118. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +1780 -0
  119. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +240 -0
  120. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +484 -0
  121. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +269 -0
  122. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +27 -0
  123. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp +719 -0
  124. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp +349 -0
  125. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/memory/xsimd_alignment.hpp +91 -0
  126. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +55 -0
  127. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +2765 -0
  128. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx2_register.hpp +44 -0
  129. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp +51 -0
  130. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp +51 -0
  131. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp +51 -0
  132. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp +51 -0
  133. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +77 -0
  134. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp +51 -0
  135. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp +51 -0
  136. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi2_register.hpp +51 -0
  137. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp +51 -0
  138. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp +54 -0
  139. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp +53 -0
  140. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avx_register.hpp +64 -0
  141. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp +44 -0
  142. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +1524 -0
  143. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +300 -0
  144. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_common_arch.hpp +47 -0
  145. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_emulated_register.hpp +80 -0
  146. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp +50 -0
  147. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp +50 -0
  148. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp +50 -0
  149. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_fma4_register.hpp +50 -0
  150. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp +55 -0
  151. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon64_register.hpp +55 -0
  152. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_neon_register.hpp +154 -0
  153. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_register.hpp +94 -0
  154. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +506 -0
  155. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +59 -0
  156. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse3_register.hpp +49 -0
  157. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp +48 -0
  158. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp +48 -0
  159. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp +48 -0
  160. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_sve_register.hpp +156 -0
  161. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +337 -0
  162. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_utils.hpp +536 -0
  163. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  164. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +59 -0
  165. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +75 -0
  166. sequenzo/dissimilarity_measures/src/xsimd/test/architectures/dummy.cpp +7 -0
  167. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set.cpp +13 -0
  168. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean.cpp +24 -0
  169. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp +25 -0
  170. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp +28 -0
  171. sequenzo/dissimilarity_measures/src/xsimd/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp +25 -0
  172. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_abstract_batches.cpp +7 -0
  173. sequenzo/dissimilarity_measures/src/xsimd/test/doc/manipulating_parametric_batches.cpp +8 -0
  174. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum.hpp +31 -0
  175. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_avx2.cpp +3 -0
  176. sequenzo/dissimilarity_measures/src/xsimd/test/doc/sum_sse2.cpp +3 -0
  177. sequenzo/dissimilarity_measures/src/xsimd/test/doc/writing_vectorized_code.cpp +11 -0
  178. sequenzo/dissimilarity_measures/src/xsimd/test/main.cpp +31 -0
  179. sequenzo/dissimilarity_measures/src/xsimd/test/test_api.cpp +230 -0
  180. sequenzo/dissimilarity_measures/src/xsimd/test/test_arch.cpp +217 -0
  181. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +183 -0
  182. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +1049 -0
  183. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +508 -0
  184. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +409 -0
  185. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +712 -0
  186. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_constant.cpp +286 -0
  187. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_float.cpp +141 -0
  188. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +365 -0
  189. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +308 -0
  190. sequenzo/dissimilarity_measures/src/xsimd/test/test_bitwise_cast.cpp +222 -0
  191. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_exponential.cpp +226 -0
  192. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_hyperbolic.cpp +183 -0
  193. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_power.cpp +265 -0
  194. sequenzo/dissimilarity_measures/src/xsimd/test/test_complex_trigonometric.cpp +236 -0
  195. sequenzo/dissimilarity_measures/src/xsimd/test/test_conversion.cpp +248 -0
  196. sequenzo/dissimilarity_measures/src/xsimd/test/test_custom_default_arch.cpp +28 -0
  197. sequenzo/dissimilarity_measures/src/xsimd/test/test_error_gamma.cpp +170 -0
  198. sequenzo/dissimilarity_measures/src/xsimd/test/test_explicit_batch_instantiation.cpp +32 -0
  199. sequenzo/dissimilarity_measures/src/xsimd/test/test_exponential.cpp +202 -0
  200. sequenzo/dissimilarity_measures/src/xsimd/test/test_extract_pair.cpp +92 -0
  201. sequenzo/dissimilarity_measures/src/xsimd/test/test_fp_manipulation.cpp +77 -0
  202. sequenzo/dissimilarity_measures/src/xsimd/test/test_gnu_source.cpp +30 -0
  203. sequenzo/dissimilarity_measures/src/xsimd/test/test_hyperbolic.cpp +167 -0
  204. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +304 -0
  205. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +61 -0
  206. sequenzo/dissimilarity_measures/src/xsimd/test/test_poly_evaluation.cpp +64 -0
  207. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +184 -0
  208. sequenzo/dissimilarity_measures/src/xsimd/test/test_rounding.cpp +199 -0
  209. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +101 -0
  210. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +760 -0
  211. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.cpp +4 -0
  212. sequenzo/dissimilarity_measures/src/xsimd/test/test_sum.hpp +34 -0
  213. sequenzo/dissimilarity_measures/src/xsimd/test/test_traits.cpp +172 -0
  214. sequenzo/dissimilarity_measures/src/xsimd/test/test_trigonometric.cpp +208 -0
  215. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +611 -0
  216. sequenzo/dissimilarity_measures/src/xsimd/test/test_wasm/test_wasm_playwright.py +123 -0
  217. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +1460 -0
  218. sequenzo/dissimilarity_measures/utils/__init__.py +16 -0
  219. sequenzo/dissimilarity_measures/utils/get_LCP_length_for_2_seq.py +44 -0
  220. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-310-darwin.so +0 -0
  221. sequenzo/dissimilarity_measures/utils/seqconc.cpython-310-darwin.so +0 -0
  222. sequenzo/dissimilarity_measures/utils/seqdss.cpython-310-darwin.so +0 -0
  223. sequenzo/dissimilarity_measures/utils/seqdur.cpython-310-darwin.so +0 -0
  224. sequenzo/dissimilarity_measures/utils/seqlength.cpython-310-darwin.so +0 -0
  225. sequenzo/multidomain/__init__.py +23 -0
  226. sequenzo/multidomain/association_between_domains.py +311 -0
  227. sequenzo/multidomain/cat.py +597 -0
  228. sequenzo/multidomain/combt.py +519 -0
  229. sequenzo/multidomain/dat.py +81 -0
  230. sequenzo/multidomain/idcd.py +139 -0
  231. sequenzo/multidomain/linked_polyad.py +292 -0
  232. sequenzo/openmp_setup.py +233 -0
  233. sequenzo/prefix_tree/__init__.py +62 -0
  234. sequenzo/prefix_tree/hub.py +114 -0
  235. sequenzo/prefix_tree/individual_level_indicators.py +1321 -0
  236. sequenzo/prefix_tree/spell_individual_level_indicators.py +580 -0
  237. sequenzo/prefix_tree/spell_level_indicators.py +297 -0
  238. sequenzo/prefix_tree/system_level_indicators.py +544 -0
  239. sequenzo/prefix_tree/utils.py +54 -0
  240. sequenzo/seqhmm/__init__.py +95 -0
  241. sequenzo/seqhmm/advanced_optimization.py +305 -0
  242. sequenzo/seqhmm/bootstrap.py +411 -0
  243. sequenzo/seqhmm/build_hmm.py +142 -0
  244. sequenzo/seqhmm/build_mhmm.py +136 -0
  245. sequenzo/seqhmm/build_nhmm.py +121 -0
  246. sequenzo/seqhmm/fit_mhmm.py +62 -0
  247. sequenzo/seqhmm/fit_model.py +61 -0
  248. sequenzo/seqhmm/fit_nhmm.py +76 -0
  249. sequenzo/seqhmm/formulas.py +289 -0
  250. sequenzo/seqhmm/forward_backward_nhmm.py +276 -0
  251. sequenzo/seqhmm/gradients_nhmm.py +306 -0
  252. sequenzo/seqhmm/hmm.py +291 -0
  253. sequenzo/seqhmm/mhmm.py +314 -0
  254. sequenzo/seqhmm/model_comparison.py +238 -0
  255. sequenzo/seqhmm/multichannel_em.py +282 -0
  256. sequenzo/seqhmm/multichannel_utils.py +138 -0
  257. sequenzo/seqhmm/nhmm.py +270 -0
  258. sequenzo/seqhmm/nhmm_utils.py +191 -0
  259. sequenzo/seqhmm/predict.py +137 -0
  260. sequenzo/seqhmm/predict_mhmm.py +142 -0
  261. sequenzo/seqhmm/simulate.py +878 -0
  262. sequenzo/seqhmm/utils.py +218 -0
  263. sequenzo/seqhmm/visualization.py +910 -0
  264. sequenzo/sequence_characteristics/__init__.py +40 -0
  265. sequenzo/sequence_characteristics/complexity_index.py +49 -0
  266. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +220 -0
  267. sequenzo/sequence_characteristics/plot_characteristics.py +593 -0
  268. sequenzo/sequence_characteristics/simple_characteristics.py +311 -0
  269. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +39 -0
  270. sequenzo/sequence_characteristics/turbulence.py +155 -0
  271. sequenzo/sequence_characteristics/variance_of_spell_durations.py +86 -0
  272. sequenzo/sequence_characteristics/within_sequence_entropy.py +43 -0
  273. sequenzo/suffix_tree/__init__.py +66 -0
  274. sequenzo/suffix_tree/hub.py +114 -0
  275. sequenzo/suffix_tree/individual_level_indicators.py +1679 -0
  276. sequenzo/suffix_tree/spell_individual_level_indicators.py +493 -0
  277. sequenzo/suffix_tree/spell_level_indicators.py +248 -0
  278. sequenzo/suffix_tree/system_level_indicators.py +535 -0
  279. sequenzo/suffix_tree/utils.py +56 -0
  280. sequenzo/version_check.py +283 -0
  281. sequenzo/visualization/__init__.py +29 -0
  282. sequenzo/visualization/plot_mean_time.py +222 -0
  283. sequenzo/visualization/plot_modal_state.py +276 -0
  284. sequenzo/visualization/plot_most_frequent_sequences.py +147 -0
  285. sequenzo/visualization/plot_relative_frequency.py +405 -0
  286. sequenzo/visualization/plot_sequence_index.py +1175 -0
  287. sequenzo/visualization/plot_single_medoid.py +153 -0
  288. sequenzo/visualization/plot_state_distribution.py +651 -0
  289. sequenzo/visualization/plot_transition_matrix.py +190 -0
  290. sequenzo/visualization/utils/__init__.py +23 -0
  291. sequenzo/visualization/utils/utils.py +310 -0
  292. sequenzo/with_event_history_analysis/__init__.py +35 -0
  293. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  294. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  295. sequenzo-0.1.31.dist-info/METADATA +286 -0
  296. sequenzo-0.1.31.dist-info/RECORD +299 -0
  297. sequenzo-0.1.31.dist-info/WHEEL +5 -0
  298. sequenzo-0.1.31.dist-info/licenses/LICENSE +28 -0
  299. sequenzo-0.1.31.dist-info/top_level.txt +2 -0
@@ -0,0 +1,476 @@
+ """
+ @Author : 李欣怡
+ @File : clara.py
+ @Time : 2024/12/27 12:04
+ @Desc :
+ """
+
+ import gc
+ import os
+ from contextlib import redirect_stdout
+ import warnings
+
+ from joblib import Parallel, delayed
+
+ # from Tutorials.test import result
+ from sequenzo.clustering.sequenzo_fastcluster.fastcluster import linkage
+ from scipy.special import comb
+ from itertools import product
+
+ from sequenzo.big_data.clara.utils.aggregatecases import *
+ from sequenzo.big_data.clara.utils.davies_bouldin import *
+ from sequenzo.clustering.KMedoids import *
+ from sequenzo.big_data.clara.utils.get_weighted_diss import *
+
+ from sequenzo.define_sequence_data import SequenceData
+ from sequenzo.dissimilarity_measures.get_distance_matrix import get_distance_matrix
+
+
+ def adjustedRandIndex(x, y=None):
+     if isinstance(x, np.ndarray):
+         x = np.array(x)
+         y = np.array(y)
+         if len(x) != len(y):
+             raise ValueError("Arguments must be vectors of the same length")
+
+         tab = pd.crosstab(x, y)
+     else:
+         tab = x
+
+     if tab.shape == (1, 1):
+         return 1
+
+     # Compute the four components of the ARI: a, b, c, d
+     a = np.sum(comb(tab.to_numpy(), 2))  # number of pairs within each cell of the table
+     b = np.sum(comb(np.sum(tab.to_numpy(), axis=1), 2)) - a
+     c = np.sum(comb(np.sum(tab.to_numpy(), axis=0), 2)) - a
+     d = comb(np.sum(tab.to_numpy()), 2) - a - b - c
+
+     ARI = (a - (a + b) * (a + c) / (a + b + c + d)) / ((a + b + a + c) / 2 - (a + b) * (a + c) / (a + b + c + d))
+     return ARI
+
+
+ def jaccardCoef(tab):
+     if tab.shape == (1, 1):
+         return 1
+
+     # Compute the intersection term (n11) and the marginal terms (n01 and n10)
+     n11 = np.sum(tab.to_numpy() ** 2)  # sum of squared cell counts
+     n01 = np.sum(np.sum(tab.to_numpy(), axis=0) ** 2)  # sum of squared column sums
+     n10 = np.sum(np.sum(tab.to_numpy(), axis=1) ** 2)  # sum of squared row sums
+
+     return n11 / (n01 + n10 - n11)
+
+
+ def clara(seqdata, R=100, kvals=None, sample_size=None, method="crisp", dist_args=None,
+           criteria=["distance"], stability=False, max_dist=None):
+
+     # ==================
+     # Parameter checking
+     # ==================
+     if kvals is None:
+         kvals = range(2, 11)
+
+     if sample_size is None:
+         sample_size = 40 + 2 * max(kvals)
+
+     print("[>] Starting generalized CLARA for sequence analysis.")
+
+     # Check for input data type (should be a sequence object)
+     if not isinstance(seqdata, SequenceData):
+         raise ValueError("[!] 'seqdata' should be SequenceData, check the input format.")
+
+     if max(kvals) > sample_size:
+         raise ValueError("[!] More clusters than the size of the sample requested.")
+
+     allmethods = ["crisp"]
+     if method.lower() not in [m.lower() for m in allmethods]:
+         raise ValueError(f"[!] Unknown method {method}. Please specify one of the following: {', '.join(allmethods)}")
+
+     if method.lower() == "representativeness" and max_dist is None:
+         raise ValueError("[!] You need to set max_dist when using the representativeness method.")
+
+     allcriteria = ["distance", "db", "xb", "pbm", "ams"]
+     criteria = [c.lower() for c in criteria]
+     if not all(c in allcriteria for c in criteria):
+         raise ValueError(
+             f"[!] Unknown criteria among {', '.join(criteria)}. Please specify at least one among {', '.join(allcriteria)}.")
+
+     if dist_args is None:
+         raise ValueError("[!] You need to set the 'dist_args' for get_distance_matrix function.")
+
+     print(f"[>] Using {method} clustering optimizing the following criterion: {', '.join(criteria)}.")
+
+     # FIXME : Add coherence check between method and criteria
+
+     # ===========
+     # Aggregation
+     # ===========
+     number_seq = len(seqdata.seqdata)
+     print(f" - Aggregating {number_seq} sequences...")
+
+     ac = DataFrameAggregator().aggregate(seqdata.seqdata)
+     agseqdata = seqdata.seqdata.iloc[ac['aggIndex'], :]
+     # agseqdata.attrs['weights'] = None
+     ac['probs'] = ac['aggWeights'] / number_seq
+     print(f" - OK ({len(ac['aggWeights'])} unique cases).")
+
+     # Memory cleanup before parallel computation
+     gc.collect()
+     print("[>] Starting iterations...")
+
+     def calc_pam_iter(circle, agseqdata, sample_size, kvals, ac):
+         # Sampling with replacement allows the process to proceed normally
+         # even when the sample size exceeds the dataset size, as samples can be repeatedly drawn.
+         mysample = np.random.choice(len(agseqdata), size=sample_size, p=ac['probs'], replace=True)
+         mysample = pd.DataFrame({'id': mysample})
+
+         # Re-aggregate!
+         ac2 = DataFrameAggregator().aggregate(mysample)
+         data_subset = agseqdata.iloc[mysample.iloc[ac2['aggIndex'], 0], :]
+
+         with open(os.devnull, 'w') as fnull:
+             with redirect_stdout(fnull):
+                 states = np.arange(1, len(seqdata.states) + 1).tolist()
+                 data_subset = SequenceData(data_subset,
+                                            time=seqdata.time,
+                                            states=states)
+                 dist_args['seqdata'] = data_subset
+                 diss = get_distance_matrix(opts=dist_args)
+
+         diss = diss.values
+         _diss = diss.copy()
+         _diss = get_weighted_diss(_diss, ac2['aggWeights'])
+         hc = linkage(_diss, method='ward')
+         del _diss
+
+         # For each number of clusters
+         allclust = []
+
+         for k in kvals:
+             # Weighted PAM clustering on subsample
+             # TODO : hc already encodes the chosen centers, so why is clusterid initialized with -1?
+             # Reusing the existing assignment would also avoid out-of-bounds access on -1 when the if branch is not entered.
+             clustering = KMedoids(diss=diss, k=k, cluster_only=True, initialclust=hc, weights=ac2['aggWeights'], verbose=False)
+             medoids = mysample.iloc[ac2['aggIndex'][np.unique(clustering)], :]
+             medoids = medoids.to_numpy().flatten()
+
+             del clustering
+
+             # ======================================================
+             # Compute Distances Between All Sequences and the Medoids
+             # ======================================================
+             refseq = [list(range(0, len(agseqdata))), medoids.tolist()]
+             with open(os.devnull, 'w') as fnull:
+                 with redirect_stdout(fnull):
+                     states = np.arange(1, len(seqdata.states) + 1).tolist()
+                     agseqdata = SequenceData(agseqdata,
+                                              time=seqdata.time,
+                                              states=states)
+                     dist_args['seqdata'] = agseqdata
+                     dist_args['refseq'] = refseq
+                     diss2 = get_distance_matrix(opts=dist_args)
+                     del dist_args['refseq']
+                     agseqdata = agseqdata.seqdata  # Restore the original DataFrame
+
+             # The two smallest distances are used for the silhouette width
+             # and the other criteria
+             diss2 = diss2.to_numpy()
+             alphabeta = np.array([np.sort(row)[:2] for row in diss2])
+             sil = (alphabeta[:, 1] - alphabeta[:, 0]) / np.maximum(alphabeta[:, 1], alphabeta[:, 0])
+
+             # Allocate to clusters
+             memb = np.argmin(diss2, axis=1)  # Each data point is assigned to its nearest medoid
+
+             mean_diss = np.sum(alphabeta[:, 0] * ac['probs'])
+
+             warnings.filterwarnings('ignore', category=RuntimeWarning)  # Ignore division-by-zero warnings
+             db = davies_bouldin_internal(diss=diss2, clustering=memb, medoids=medoids, weights=ac['aggWeights'])['db']
+             warnings.resetwarnings()
+             pbm = ((1 / len(medoids)) * (np.max(diss2[medoids]) / mean_diss)) ** 2
+             ams = np.sum(sil * ac['probs'])
+
+             distmed = diss2[medoids, :]
+             distmed_flat = distmed[np.triu_indices_from(distmed, k=1)]  # Take the upper triangular part
+             minsep = np.min(distmed_flat)
+
+             xb = mean_diss / minsep
+
+             del alphabeta
+             del sil
+             del diss2
+             del distmed
+             del minsep
+
+             allclust.append({
+                 'mean_diss': mean_diss,
+                 'db': db,
+                 'pbm': pbm,
+                 'ams': ams,
+                 'xb': xb,
+                 'clustering': memb,
+                 'medoids': medoids
+             })
+
+         del diss
+         gc.collect()
+
+         return allclust
+
+     # Compute in parallel using joblib
+     # Example structure of `results`:
+     # results[0] = all iter1's = [{k=2's}, {k=3's}, ... , {k=10's}]
+     # results[1] = all iter2's = [{k=2's}, {k=3's}, ... , {k=10's}]
+     results = Parallel(n_jobs=-1)(
+         delayed(calc_pam_iter)(circle=i, agseqdata=agseqdata, sample_size=sample_size, kvals=kvals, ac=ac) for i in range(R))
+     # results = []
+     # for i in range(R):
+     #     res = calc_pam_iter(circle=i,
+     #                         agseqdata=agseqdata,
+     #                         sample_size=sample_size,
+     #                         kvals=kvals,
+     #                         ac=ac)
+     #     results.append(res)
+
+     print(" - Done.")
+     print("[>] Aggregating iterations for each k value...")
+
+     # Example structure of the aggregated output:
+     # data[0] = all k=2's = [{when iter1, k=2's}, {when iter2, k=2's}, ... , {when iter100, k=2's}]
+     # data[1] = all k=3's = [{when iter1, k=3's}, {when iter2, k=3's}, ... , {when iter100, k=3's}]
+     collected_data = [[] for _ in kvals]
+     for iter_result in results:
+         k = 0
+         for item in iter_result:
+             collected_data[k].append(item)
+             k += 1
+
+     kvalscriteria = list(product(range(len(kvals)), criteria))
+     kret = []
+     for item in kvalscriteria:
+         k = item[0]
+         _criteria = item[1]
+
+         mean_all_diss = [d['mean_diss'] for d in collected_data[k]]
+         db_all = [d['db'] for d in collected_data[k]]
+         pbm_all = [d['pbm'] for d in collected_data[k]]
+         ams_all = [d['ams'] for d in collected_data[k]]
+         xb_all = [d['xb'] for d in collected_data[k]]
+         clustering_all_diss = [d['clustering'] for d in collected_data[k]]
+         med_all_diss = [d['medoids'] for d in collected_data[k]]
+
+         # Find best clustering
+         objective = {
+             "distance": mean_all_diss,
+             "pbm": pbm_all,
+             "db": db_all,
+             "ams": ams_all,
+             "xb": xb_all
+         }
+         objective = objective[_criteria]
+         best = np.argmax(objective) if _criteria in ["ams", "pbm"] else np.argmin(objective)
+
+         # Compute clustering stability of the best partition
+         if stability:
+             def process_task(j, clustering_all_diss, ac, best):
+                 df = pd.DataFrame({
+                     'clustering_j': clustering_all_diss[j],  # The j-th clustering
+                     'clustering_best': clustering_all_diss[best],  # The best clustering
+                     'aggWeights': ac['aggWeights']
+                 })
+                 tab = df.groupby(['clustering_j', 'clustering_best'])['aggWeights'].sum().unstack(fill_value=0)
+
+                 val = [adjustedRandIndex(tab), jaccardCoef(tab)]
+                 return val
+
+             arilist = []
+
+             if method in ["noise", "fuzzy"]:
+                 for j in range(R):
+                     val = process_task(j, clustering_all_diss, ac, best)
+                     arilist.append(val)
+             else:
+                 arilist = Parallel(n_jobs=-1)(
+                     delayed(process_task)(j, clustering_all_diss, ac, best) for j in range(R))
+
+             arimatrix = np.vstack(arilist)
+             arimatrix = pd.DataFrame(arimatrix, columns=["ARI", "JC"])
+             ari08 = np.sum(arimatrix.iloc[:, 0] >= 0.8)
+             jc08 = np.sum(arimatrix.iloc[:, 1] >= 0.8)
+
+         else:
+             arimatrix = np.nan
+             ari08 = np.nan
+             jc08 = np.nan
+
+         _clustering = clustering_all_diss[best]
+
+         disagclust = np.full(seqdata.seqdata.shape[0], -1)
+         for i, index in enumerate(ac["disaggIndex"]):
+             disagclust[i] = _clustering[index] + 1  # 1-based index for clusters
+
+         evol_diss = np.maximum.accumulate(objective) if _criteria in ["ams", "pbm"] else np.minimum.accumulate(objective)
+
+         # Store the best solution and evaluations of the others
+         bestcluster = {
+             "medoids": ac["aggIndex"][med_all_diss[best]],
+             "clustering": disagclust,
+             "evol_diss": evol_diss,
+             "iter_objective": objective,
+             "objective": objective[best],
+             "iteration": best,
+             "arimatrix": arimatrix,
+             "criteria": _criteria,
+             "method": method,
+             "avg_dist": mean_all_diss[best],
+             "pbm": pbm_all[best],
+             "db": db_all[best],
+             "xb": xb_all[best],
+             "ams": ams_all[best],
+             "ari08": ari08,
+             "jc08": jc08,
+             "R": R,
+             "k": k
+         }
+
+         # Store computed cluster quality
+         kresult = {
+             "k": k + 2,
+             "criteria": criteria,
+             "stats": [bestcluster["avg_dist"], bestcluster["pbm"], bestcluster["db"], bestcluster["xb"],
+                       bestcluster["ams"], bestcluster["ari08"], bestcluster["jc08"], best],
+             "bestcluster": bestcluster
+         }
+
+         kret.append(kresult)
+
+     def claraObj(kretlines, method, kvals, kret, seqdata):
+         clustering = np.full((seqdata.seqdata.shape[0], len(kvals)), -1)
+         clustering = pd.DataFrame(clustering)
+         clustering.columns = [f"Cluster {val}" for val in kvals]
+         clustering.index = seqdata.ids
+
+         ret = {
+             "kvals": kvals,
+             "clara": {},
+             "clustering": clustering,
+             "stats": np.full((len(kvals), 8), -1, dtype=float)
+         }
+
+         for i in kretlines:
+             k = kret[i]['k'] - 2  # start from 0, not 2
+             ret['stats'][k, :] = np.array(kret[i]['stats'])
+             ret['clara'][k] = kret[i]['bestcluster']
+
+             ret['clustering'].iloc[:, k] = kret[i]['bestcluster']['clustering']
+
+         ret['stats'] = pd.DataFrame(ret['stats'],
+                                     columns=["Avg dist", "PBM", "DB", "XB", "AMS", "ARI>0.8", "JC>0.8", "Best iter"])
+         ret['stats'].insert(0, "Number of Clusters", [f"Cluster {k}" for k in kvals])
+         ret['stats']["k_num"] = kvals
+
+         return ret
+
+     if len(criteria) > 1:
+         ret = {
+             'param': {
+                 'criteria': criteria,
+                 'pam_combine': False,
+                 'all_criterias': criteria,
+                 'kvals': kvals,
+                 'method': method,
+                 'stability': stability
+             }
+         }
+
+         for meth in criteria:
+             indices = np.where(np.array([tup[1] for tup in kvalscriteria]) == meth)[0]
+             ret[meth] = claraObj(kretlines=indices, method=method, kvals=kvals, kret=kret, seqdata=seqdata)
+
+         allstats = {}
+
+         for meth in criteria:
+             stats = pd.DataFrame(ret[meth]['stats'])
+             stats['criteria'] = meth
+
+             allstats[meth] = stats
+
+         ret['allstats'] = pd.concat(allstats.values(), ignore_index=False)
+     else:
+
+         ret = claraObj(kretlines=range(len(kvalscriteria)), method=method, kvals=kvals, kret=kret, seqdata=seqdata)
+
+     print(" - Done.")
+
+     return ret
+
+
+ if __name__ == '__main__':
+     from sequenzo import *  # Social sequence analysis
+     import pandas as pd  # Import necessary packages
+
+     # TODO : the membership matrix returned by clara needs to be transposed, because plot_sequence_index's id_group_df parameter expects cluster ids as rows and sequence ids as columns
+
+     # ===============================
+     # Sohee
+     # ===============================
+     # df = pd.read_csv('D:/college/research/QiQi/sequenzo/data_and_output/orignal data/sohee/sequence_data.csv')
+     # time_list = list(df.columns)[1:133]
+     # states = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+     # # states = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+     # labels = ['FT+WC', 'FT+BC', 'PT+WC', 'PT+BC', 'U', 'OLF']
+     # sequence_data = SequenceData(df, time=time_list, time_type="age", states=states, labels=labels, id_col="PID")
+
+     # om.to_csv("D:/college/research/QiQi/sequenzo/files/sequenzo_Sohee_string_OM_TRATE.csv", index=True)
+
+     # ===============================
+     # kass
+     # ===============================
+     # df = pd.read_csv('D:/college/research/QiQi/sequenzo/files/orignal data/kass/wide_civil_final_df.csv')
+     # time_list = list(df.columns)[1:]
+     # states = ['Extensive Warfare', 'Limited Violence', 'No Violence', 'Pervasive Warfare', 'Prolonged Warfare',
+     #           'Serious Violence', 'Serious Warfare', 'Sporadic Violence', 'Technological Warfare', 'Total Warfare']
+     # sequence_data = SequenceData(df, time=time_list, time_type="year", states=states, id_col="COUNTRY")
+
+     # ===============================
+     # CO2
+     # ===============================
+     # df = pd.read_csv("D:/country_co2_emissions_missing.csv")
+     # time = list(df.columns)[1:]
+     # states = ['Very Low', 'Low', 'Middle', 'High', 'Very High']
+     # sequence_data = SequenceData(df, time_type="age", time=time, id_col="country", states=states)
+
+     # ===============================
+     # detailed
+     # ===============================
+     # df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/sampled_data_sets/detailed_data/sampled_1000_data.csv")
+     # time = list(df.columns)[4:]
+     # states = ['data', 'data & intensive math', 'hardware', 'research', 'software', 'software & hardware', 'support & test']
+     # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']],
+     #                              time=time, id_col="worker_id", states=states)
+
+     # ===============================
+     # broad
+     # ===============================
+     # df = pd.read_csv("D:/college/research/QiQi/sequenzo/data_and_output/sampled_data_sets/broad_data/sampled_1000_data.csv")
+     # time = list(df.columns)[4:]
+     # states = ['Non-computing', 'Non-technical computing', 'Technical computing']
+     # sequence_data = SequenceData(df[['worker_id', 'C1', 'C2', 'C3', 'C4', 'C5']],
+     #                              time_type="age", time=time, id_col="worker_id", states=states)
+
+     df = pd.read_csv("/Users/xinyi/Projects/sequenzo/sequenzo/data_and_output/orignal data/not_real_detailed_data/synthetic_detailed_U5_N1000.csv")
+     _time = list(df.columns)[2:]
+     states = ["Data", "Data science", "Hardware", "Research", "Software", "Support & test", "Systems & infrastructure"]
+     df = df[['id', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']]
+     sequence_data = SequenceData(df, time=_time, id_col="id", states=states)
+
+     result = clara(sequence_data,
+                    R=250,
+                    sample_size=500,
+                    kvals=range(2, 6),
+                    criteria=['distance'],
+                    dist_args={"method": "OM", "sm": "CONSTANT", "indel": 1},
+                    stability=True)
+
+     # print(result)
+     print(result['stats'])
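Reading note (not part of the packaged file above). The adjustedRandIndex helper near the top of clara.py implements the usual pair-counting form of the adjusted Rand index. Writing the contingency table between the two clusterings as n_{ij} with grand total n, the quantities a, b, c, d in the code correspond to

a = \sum_{i,j} \binom{n_{ij}}{2}, \qquad
b = \sum_{i} \binom{n_{i\cdot}}{2} - a, \qquad
c = \sum_{j} \binom{n_{\cdot j}}{2} - a, \qquad
d = \binom{n}{2} - a - b - c,

\mathrm{ARI} = \frac{a - \dfrac{(a+b)(a+c)}{a+b+c+d}}{\dfrac{(a+b)+(a+c)}{2} - \dfrac{(a+b)(a+c)}{a+b+c+d}} .

The companion jaccardCoef helper works on the same table and returns n_{11} / (n_{01} + n_{10} - n_{11}) with n_{11} = \sum_{i,j} n_{ij}^2, n_{01} = \sum_{j} n_{\cdot j}^2 and n_{10} = \sum_{i} n_{i\cdot}^2, exactly as written in the code.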
@@ -0,0 +1,27 @@
+ """
+ @Author : 李欣怡
+ @File : __init__.py
+ @Time : 2025/2/28 00:30
+ @Desc :
+ """
+ from .aggregatecases import *
+ from .davies_bouldin import *
+ from .wfcmdd import *
+ from sequenzo.clustering.KMedoids import KMedoids
+
+
+ def _import_c_code():
+     """Lazily import the c_code module to avoid circular dependencies during installation"""
+     try:
+         from sequenzo.clustering import clustering_c_code
+         return clustering_c_code
+     except ImportError:
+         # If the C extension cannot be imported, return None
+         print(
+             "Warning: The C++ extension (c_code) could not be imported. Please ensure the extension module is compiled correctly.")
+         return None
+
+
+ __all__ = [
+     'KMedoids'
+ ]
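Reading note (not part of the packaged file above). The _import_c_code helper deliberately returns None instead of raising when the compiled clustering_c_code extension is missing. A minimal sketch of how such a guard is typically consumed; pairwise_sum and the some_fast_sum attribute are hypothetical and only illustrate the fallback pattern:

# Illustrative sketch only; not part of this package.
from sequenzo.big_data.clara.utils import _import_c_code

def pairwise_sum(values):
    # Prefer the compiled extension when it imported cleanly, otherwise fall back to pure Python.
    c_code = _import_c_code()  # module object, or None if the extension is unavailable
    fast = getattr(c_code, "some_fast_sum", None) if c_code is not None else None  # hypothetical symbol
    if fast is not None:
        return fast(values)
    return sum(values)  # pure-Python fallback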
@@ -0,0 +1,92 @@
+ """
+ @Author : 李欣怡
+ @File : aggregatecases.py
+ @Time : 2024/12/27 10:12
+ @Desc :
+ """
+ import pandas as pd
+ import numpy as np
+
+
+ class WcAggregateCases:
+     def aggregate(self, x, weights=None, **kwargs):
+         """
+         The appropriate aggregation method is invoked dynamically depending on the type of x
+         """
+         method_name = f"aggregate_{type(x).__name__}"
+         method = getattr(self, method_name, None)
+
+         if method is None:
+             raise NotImplementedError(f"No aggregation method for type {type(x).__name__}")
+
+         return method(x, weights, **kwargs)
+
+
+ class WcAggregateCasesInternal:
+     def aggregate(self, x, weights=None):
+         x = pd.DataFrame(x)
+         lx = len(x)
+
+         if weights is None:
+             weights = np.ones(lx)
+
+         ids = x.apply(lambda row: "@@@WC_SEP@@".join(row.astype(str)), axis=1)
+
+         mcorr = [np.nan] * lx
+
+         def _compute_weight_each_group_and_sum(group):
+             first_element = group.iloc[0]
+
+             for idx in group:
+                 mcorr[idx] = first_element
+             weighted_sum = np.sum(weights[group])
+             return [first_element, weighted_sum]
+
+         df = pd.DataFrame({
+             'index': range(0, lx),
+             'id': ids
+         })
+
+         grouped = df.groupby('id')['index'].apply(_compute_weight_each_group_and_sum)
+
+         agg_df = pd.DataFrame(grouped.tolist(), columns=['aggIndex', 'aggWeights'])
+
+         aggIndex = agg_df['aggIndex']
+         mcorr2 = [aggIndex[aggIndex == val].index[0] if val in aggIndex.values else -1 for val in mcorr]
+
+         ret = {
+             "aggIndex": agg_df['aggIndex'].values,
+             "aggWeights": agg_df['aggWeights'].values,
+             "disaggIndex": mcorr2,
+             "disaggWeights": weights
+         }
+
+         return ret
+
+
+ class DataFrameAggregator(WcAggregateCases):
+     def aggregate_DataFrame(self, x, weights=None, **kwargs):
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ class MatrixAggregator(WcAggregateCases):
+     def aggregate_ndarray(self, x, weights=None, **kwargs):
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ class StsListAggregator(WcAggregateCases):
+     def aggregate_stslist(self, x, weights=None, weighted=True, **kwargs):
+         if weights is None and weighted:
+             weights = getattr(x, "weights", None)
+         internal = WcAggregateCasesInternal()
+         return internal.aggregate(x, weights)
+
+
+ # Print function (for output)
+ def print_wcAggregateCases(result):
+     print(f"Number of disaggregated cases: {len(result['disaggWeights'])}")
+     print(f"Number of aggregated cases: {len(result['aggWeights'])}")
+     print(f"Average aggregated cases: {len(result['disaggWeights']) / len(result['aggWeights'])}")
+     print(f"Average (weighted) aggregation: {np.mean(result['aggWeights'])}")
@@ -0,0 +1,91 @@
+ """
+ @Author : 李欣怡
+ @File : davies_bouldin.py
+ @Time : 2024/12/27 17:56
+ @Desc :
+     :param
+         diss : 2D numpy array, the distance matrix
+         clustering : 1D numpy array, cluster membership of each data point (for example, at initialization every point forms its own cluster, so each point belongs to itself)
+         medoids : 1D numpy array, indices of the cluster medoids
+ """
+ import numpy as np
+
+
+ def davies_bouldin_internal(diss, clustering, medoids, p=1, weights=None, medoidclust=False):
+     # If weights are not provided, use uniform weights
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     list_diam = np.zeros(len(medoids))
+
+     # Calculate the diameter for each medoid
+     for i in range(len(medoids)):
+         medi = medoids[i] if medoidclust else i
+         cond = (clustering == medi)
+
+         # Calculate the diameter (weighted distance)
+         list_diam[i] = (np.sum(weights[cond] * diss[cond, i] ** p) / np.sum(weights[cond])) ** (1 / p)
+
+     maximum = np.zeros(len(medoids))
+
+     # Calculate the maximum ratio for each medoid
+     for i in range(len(medoids)):
+         # Calculate the distance to other medoids
+         maximum2 = (list_diam[i] + list_diam) / diss[medoids[i], :]
+
+         # Take the maximum of the valid (finite) values
+         # ensure values for "same" medoids
+         maximum[i] = np.max(maximum2[np.isfinite(maximum2)])
+
+     # Calculate the final Davies-Bouldin index (average of maximum values)
+     final_db = np.mean(maximum)
+
+     return {'db': final_db, 'per_cluster': maximum}
+
+
+ def fuzzy_davies_bouldin_internal(diss, memb, medoids, weights=None):
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     # Defined but never used in the R version; it is overwritten with another value below
+     list_diam = np.zeros(len(medoids))
+
+     # Defined but never used in the R version
+     # n = np.sum(weights)
+
+     mw = memb * weights[:, None]
+     list_diam = np.sum(mw * diss, axis=0) / np.sum(mw, axis=0)
+
+     # Initialize an array to store the maximum ratio for each cluster
+     maximum = np.zeros(len(medoids))
+
+     # For each cluster, compute its ratio against every other cluster
+     for i in range(len(medoids)):
+         maximum2 = (list_diam[i] + list_diam) / diss[medoids[i], :]
+
+         maximum[i] = np.max(maximum2[np.isfinite(maximum2)])
+
+     final_db = np.mean(maximum)
+
+     return {'db': final_db, 'per_cluster': maximum}
+
+
+ def adjpbm_internal(diss, clustering, medoids, p=1, weights=None, medoidclust=False):
+     if weights is None:
+         weights = np.ones(diss.shape[0])
+
+     # Calculate internal distance
+     internaldist = [
+         (sum(weights[clustering == (medoids[i] if medoidclust else i)] * diss[
+             clustering == (medoids[i] if medoidclust else i), i] ** p) /
+          sum(weights[clustering == (medoids[i] if medoidclust else i)])) ** (1 / p)
+         for i in range(len(medoids))
+     ]
+
+     # Calculate the minimum separation distance between medoids
+     separation = np.nanmin(diss[medoids, :][:, medoids])
+
+     # Calculate the PBM-style index value
+     pbm = (1 / len(medoids)) * (separation / np.sum(internaldist))
+
+     return pbm
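Reading note (not part of the packaged file above). With member weights w_x, medoids m_i and power parameter p, davies_bouldin_internal computes a weighted variant of the Davies-Bouldin index of the form

S_i = \left( \frac{\sum_{x \in C_i} w_x \, d(x, m_i)^p}{\sum_{x \in C_i} w_x} \right)^{1/p},
\qquad
\mathrm{DB} = \frac{1}{k} \sum_{i=1}^{k} \max_{j \neq i} \frac{S_i + S_j}{d(m_i, m_j)},

where the maximum over j ≠ i is obtained in the code by discarding the non-finite ratio produced when d(m_i, m_i) = 0.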