PyPI - sciv - Versions diffs - 0.0.88__py3-none-any.whl → 0.0.89__py3-none-any.whl - Mend

sciv 0.0.88__py3-none-any.whl → 0.0.89__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

sciv/tool/_algorithm_.py CHANGED Viewed

@@ -902,73 +902,97 @@ def overlap_sum(regions: AnnData, variants: dict, trait_info: DataFrame) -> AnnD
     """
     # Unique feature set
-    label_all = list(regions.var.index)
+    label_all = regions.var.index.tolist()
     # Peak number
     label_all_size: int = len(label_all)
-    # trait/disease information
-    trait_names: list = list(trait_info["id"])
+    # 预先把 peaks 的 index 做成 dict，O(1) 查找
+    label2idx = {lb: i for i, lb in enumerate(label_all)}
-    matrix = np.zeros((label_all_size, len(trait_names)))
+    trait_names = trait_info["id"].tolist()
+    n_trait = len(trait_names)
+    # 提前分配稀疏矩阵，按列填充，最后一次性转成 csc 再转 csr，省内存且快
+    row_indices, col_indices, data_vals = [], [], []
-    regions_df = regions.var.copy()
+    # 检查列存在性一次完成
+    required = {"chr", "start", "end"}
-    regions_columns: list = list(regions_df.columns)
-    if "chr" not in regions_columns or "start" not in regions_columns or "end" not in regions_columns:
+    if not required.issubset(regions.var.columns):
         ul.log(__name__).error(
-            f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
-            f"`end`. (It is recommended to use the `read_sc_atac` method.)"
+            f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
+            f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
         )
         raise ValueError(
-            f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
-            f"`end`. (It is recommended to use the `read_sc_atac` method.)"
+            f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
+            f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
         )
-    regions_df = regions_df.rename_axis("index")
-    regions_df = regions_df.reset_index()
-    # sort
-    regions_df = regions_df.sort_values(["chr", "start", "end"])[["index", "chr", "start", "end"]]
+    regions_df = (
+        regions.var
+        .reset_index()
+        .loc[:, ["index", "chr", "start", "end"]]
+        .sort_values(["chr", "start", "end"])
+    )
-    ul.log(__name__).info(f"Obtain peak-trait/disease matrix. (overlap variant information)")
-    for trait_name in tqdm(trait_names):
+    ul.log(__name__).info("Obtain peak-trait/disease matrix. (overlap variant information)")
+    # 外层循环按 trait 并行可再加速，这里先保持单循环
+    for col_idx, trait_name in enumerate(tqdm(trait_names)):
         variant: AnnData = variants[trait_name]
-        index: int = trait_names.index(trait_name)
+        overlap_df: DataFrame = _overlap_(regions_df, variant.obs)
-        # handle overlap data
-        overlap_info: DataFrame = _overlap_(regions_df, variant.obs)
-        if overlap_info.shape[0] == 0:
+        if overlap_df.empty:
             continue
-        overlap_info.rename({"index": "label"}, axis="columns", inplace=True)
-        overlap_info.reset_index(inplace=True)
-        overlap_info["region_id"] = (
-            overlap_info["chr"].astype(str)
-            + ":" + overlap_info["start"].astype(str) + "-" + overlap_info["end"].astype(str)
-        )
-        # get region
-        region_info = overlap_info.groupby("region_id", as_index=False)["label"].first()
-        region_info.index = region_info["label"].astype(str)
-        label: list = list(region_info["label"])
-        # Mutation information with repetitive features
-        label_size: int = len(label)
+        # 直接拿到 label->variant_id 的列表，省掉 groupby
+        overlap_df = overlap_df.rename(columns={"index": "label"})
+        # 把 label 映射到行号
+        overlap_df = overlap_df[overlap_df["label"].isin(label2idx)]
-        for j in range(label_size):
+        if overlap_df.empty:
+            continue
-            # Determine whether the features after overlap exist, In other words, whether there is overlap in this feature
-            if label[j] in label_all:
-                # get the index of label
-                label_index = label_all.index(label[j])
-                overlap_info_region = overlap_info[overlap_info["label"] == label[j]]
-                # sum value
-                overlap_variant = variant[list(overlap_info_region["variant_id"]), :]
-                matrix[label_index, index] = overlap_variant.X.sum(axis=0)
+        # 一次性求和：先按 label 分组，把 variant_id 收集成列表
+        label_var_ids = (
+            overlap_df
+            .groupby("label")["variant_id"]
+            .apply(list)
+            .reset_index()
+        )
-    overlap_adata = AnnData(to_sparse(matrix), var=trait_info, obs=regions.var)
+        # 遍历每个 label，一次性切片求和
+        for _, row in label_var_ids.iterrows():
+            label = row["label"]
+            row_idx = label2idx[label]
+            var_ids = row["variant_id"]
+            # 切片一次求和，避免逐行切片
+            matrix_sum = variant[var_ids, :].X.sum(axis=0)
+            if np.isscalar(matrix_sum):
+                matrix_sum = np.asarray(matrix_sum).reshape(1)
+            # 收集非零值
+            if matrix_sum.size == 1:
+                val = float(matrix_sum)
+                if val != 0:
+                    row_indices.append(row_idx)
+                    col_indices.append(col_idx)
+                    data_vals.append(val)
+            else:
+                for t_idx, v in enumerate(matrix_sum):
+                    if v != 0:
+                        row_indices.append(row_idx)
+                        col_indices.append(col_idx + t_idx)
+                        data_vals.append(float(v))
+    # 构建稀疏矩阵，再转 csr
+    overlap_sparse = sparse.csc_matrix(
+        (data_vals, (row_indices, col_indices)),
+        shape=(label_all_size, n_trait),
+        dtype=np.float32
+    ).tocsr()
+    overlap_adata = AnnData(overlap_sparse, var=trait_info, obs=regions.var)
     overlap_adata.uns["is_overlap"] = True
     return overlap_adata

sciv/tool/_random_walk_.py CHANGED Viewed

@@ -201,6 +201,7 @@ def random_walk(
     if device == 'cpu' or (device == 'auto' and not availability):
         sample_count = seed_cell_weight.shape[1]
+        # 使用 joblib.Parallel 并指定 backend='threading' 保证顺序与输入一致
         results = Parallel(n_jobs=n_jobs)(
             delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
             for i in tqdm(range(sample_count))
@@ -208,7 +209,19 @@ def random_walk(
         return np.column_stack(results)
     elif device == 'gpu' or (device == 'auto' and availability):
-        return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
+        try:
+            return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
+        except RuntimeError as e:
+            ul.log(__name__).warning(f"GPU failed to run, try to switch to CPU running.\n {e}")
+            sample_count = seed_cell_weight.shape[1]
+            results = Parallel(n_jobs=n_jobs)(
+                delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
+                for i in tqdm(range(sample_count))
+            )
+            return np.column_stack(results)
     else:
         ul.log(__name__).error(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
         raise ValueError(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
@@ -677,7 +690,7 @@ class RandomWalk:
                 seed_cell_matrix_en[:, i] = seed_cell_en_value / (1 if seed_cell_en_value.sum() == 0 else seed_cell_en_value.sum())
         # Parallel processing of all traits and real-time display of progress
-        Parallel(n_jobs=-1, backend="threading")(
+        Parallel(n_jobs=self.n_jobs)(
             delayed(_process_single_trait)(i) for i in tqdm(self.trait_range, desc="Obtain progress of seed cells with weights")
         )
@@ -774,6 +787,7 @@ class RandomWalk:
         score = self._random_walk_core_(seed_cell_data, weight=weight)
+        ul.log(__name__).info("Normalize the results")
         cell_value = self.scale_norm(score)
         if _layer_label_ == "trs":
@@ -904,8 +918,8 @@ class RandomWalk:
         trs_score = to_dense(self.trs_adata.X if label == "run_en" else self.trs_adata.layers[_trs_layer_label_], is_array=True)
         # Initialize enriched container
-        trait_cell_enrichment = np.zeros(self.trs_adata.shape)
-        trait_cell_credible = np.zeros(self.trs_adata.shape)
+        trait_cell_enrichment = np.zeros(self.trs_adata.shape).astype(int)
+        trait_cell_credible = np.zeros(self.trs_adata.shape).astype(np.float32)
         ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-random walk)")
         # Random walk
@@ -916,8 +930,9 @@ class RandomWalk:
         )
         ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-score)")
-        for i in tqdm(self.trait_range):
+        # Process each trait in parallel
+        def _process_trait(i):
             # Random walk
             cell_value = cell_value_data[:, i]
@@ -940,7 +955,10 @@ class RandomWalk:
             trait_cell_enrichment[:, i][cell_value_credible > self.credible_threshold] = 1
             trait_cell_credible[:, i] = cell_value_credible
-        self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment.astype(int))
+        # Process each trait in parallel
+        Parallel(n_jobs=self.n_jobs)(delayed(_process_trait)(i) for i in tqdm(self.trait_range))
+        self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment)
         if not self.is_simple:
             self.trs_adata.layers[f"credible_{_layer_label_}"] = to_sparse(trait_cell_credible)

{sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sciv
-Version: 0.0.88
+Version: 0.0.89
 Summary: Unveiling the pivotal cell types involved in variant function regulation at a single-cell resolution
 Project-URL: github, https://github.com/YuZhengM/sciv
 Author-email: Zheng-Min Yu <yuzmbio@163.com>

{sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/RECORD RENAMED Viewed

@@ -27,13 +27,13 @@ sciv/preprocessing/_scanpy_.py,sha256=flC10W5YPgJE5Ccxt5hP8qW3Q3cOi8DZ36Z1j9D3oN
 sciv/preprocessing/_scvi_.py,sha256=ZIDkQ_4deYmzSMiAbu5C3j_jMMl7hBTFLCBXHCNj3B4,10332
 sciv/preprocessing/_snapatac_.py,sha256=Dq8CHF7Psl3CQszaEokQYO56Oe2uzyWOy_cGlaOywfc,27798
 sciv/tool/__init__.py,sha256=WXzHkWt6RgBC3qqD-98nR5wQmt6oC850ox_VpMrapSU,2468
-sciv/tool/_algorithm_.py,sha256=fRzf7fjHF2O2beFEGBZX5qOF7nAKSdgHqHCUDZJ_3BE,49302
+sciv/tool/_algorithm_.py,sha256=-6YbvZEH95tvOicf2-bkvpmzdyOe5yAjelL7HjPaXF8,49952
 sciv/tool/_matrix_.py,sha256=O1EAhA9wxh06P_eOxEBesK7kO7IExKlhH6uJzGh1HBM,24322
-sciv/tool/_random_walk_.py,sha256=BKFjj5z1XI7Fzsr9Qsnzequ-oR5bMgK_WHf7KcpoSPU,47510
+sciv/tool/_random_walk_.py,sha256=27P1tjQtpjm44ZPE5CGEn3kb-PF-LSF2QsXw10jugSU,48304
 sciv/util/__init__.py,sha256=nOxZ8if27X7AUJ6hZwTwxOJwIBJb0obWlHjqCzjg_Gc,1964
 sciv/util/_constant_.py,sha256=w0wKQd8guLd1ZTW24_5aECrWsIWDiNQmEpLsWlHar1A,3000
 sciv/util/_core_.py,sha256=ZD2uSnEBHVu0i9TmXWzri_3bXZzYKnIZk818gW3zadE,14751
-sciv-0.0.88.dist-info/METADATA,sha256=V0-eiddGBywMSRVwQP8uPWs3Fq7OFpwpO4GYWkr_hqw,3465
-sciv-0.0.88.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-sciv-0.0.88.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
-sciv-0.0.88.dist-info/RECORD,,
+sciv-0.0.89.dist-info/METADATA,sha256=-YZF4BLit8qmdZ_66DBLffciYokeFFWurTA0sEZi3IU,3465
+sciv-0.0.89.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sciv-0.0.89.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
+sciv-0.0.89.dist-info/RECORD,,

{sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/WHEEL RENAMED Viewed

File without changes

{sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

sciv 0.0.88__py3-none-any.whl → 0.0.89__py3-none-any.whl

sciv 0.0.88__py3-none-any.whl → 0.0.89__py3-none-any.whl