PyPI - sciv - Versions diffs - 0.0.88__py3-none-any.whl → 0.0.90__py3-none-any.whl - Mend

sciv 0.0.88__py3-none-any.whl → 0.0.90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

sciv/tool/_algorithm_.py CHANGED Viewed

@@ -814,7 +814,9 @@ def _overlap_(regions_sort: DataFrame, variants: DataFrame) -> DataFrame:
     variants_overlap_info_list: list = []
-    for index, chr_a, start, end in zip(regions_sort["index"], regions_sort["chr"], regions_sort["start"],
+    for index, chr_a, start, end in zip(regions_sort["index"],
+                                        regions_sort["chr"],
+                                        regions_sort["start"],
                                         regions_sort["end"]):
         # judge chr
@@ -902,73 +904,89 @@ def overlap_sum(regions: AnnData, variants: dict, trait_info: DataFrame) -> AnnD
     """
     # Unique feature set
-    label_all = list(regions.var.index)
+    label_all = regions.var.index.tolist()
     # Peak number
     label_all_size: int = len(label_all)
-    # trait/disease information
-    trait_names: list = list(trait_info["id"])
+    # Pre-build a dict of peak indices for O(1) lookup
+    label2idx = {lb: i for i, lb in enumerate(label_all)}
-    matrix = np.zeros((label_all_size, len(trait_names)))
+    trait_names = trait_info["id"].tolist()
+    n_trait = len(trait_names)
+    # Pre-allocate sparse matrix, fill column by column, then convert to csc and then csr for efficiency
+    row_indices, col_indices, data_vals = [], [], []
-    regions_df = regions.var.copy()
+    # Check column existence once
+    required = {"chr", "start", "end"}
-    regions_columns: list = list(regions_df.columns)
-    if "chr" not in regions_columns or "start" not in regions_columns or "end" not in regions_columns:
+    if not required.issubset(regions.var.columns):
         ul.log(__name__).error(
-            f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
-            f"`end`. (It is recommended to use the `read_sc_atac` method.)"
+            f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
+            f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
         )
         raise ValueError(
-            f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
-            f"`end`. (It is recommended to use the `read_sc_atac` method.)"
+            f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
+            f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
         )
-    regions_df = regions_df.rename_axis("index")
-    regions_df = regions_df.reset_index()
-    # sort
-    regions_df = regions_df.sort_values(["chr", "start", "end"])[["index", "chr", "start", "end"]]
+    regions_df = (
+        regions.var
+        .reset_index()
+        .loc[:, ["index", "chr", "start", "end"]]
+        .sort_values(["chr", "start", "end"])
+    )
-    ul.log(__name__).info(f"Obtain peak-trait/disease matrix. (overlap variant information)")
-    for trait_name in tqdm(trait_names):
+    ul.log(__name__).info("Obtain peak-trait/disease matrix. (overlap variant information)")
+    # The outer loop can be further accelerated by parallelizing over traits; here we keep it single-threaded for now.
+    for col_idx, trait_name in enumerate(tqdm(trait_names)):
         variant: AnnData = variants[trait_name]
-        index: int = trait_names.index(trait_name)
-        # handle overlap data
-        overlap_info: DataFrame = _overlap_(regions_df, variant.obs)
+        overlap_df: DataFrame = _overlap_(regions_df, variant.obs)
-        if overlap_info.shape[0] == 0:
+        if overlap_df.empty:
             continue
-        overlap_info.rename({"index": "label"}, axis="columns", inplace=True)
-        overlap_info.reset_index(inplace=True)
-        overlap_info["region_id"] = (
-            overlap_info["chr"].astype(str)
-            + ":" + overlap_info["start"].astype(str) + "-" + overlap_info["end"].astype(str)
+        # Sum at once: first group by label and collect variant_id into a list
+        label_var_ids = (
+            overlap_df
+            .groupby("index")["variant_id"]
+            .apply(list)
+            .reset_index()
         )
-        # get region
-        region_info = overlap_info.groupby("region_id", as_index=False)["label"].first()
-        region_info.index = region_info["label"].astype(str)
-        label: list = list(region_info["label"])
-        # Mutation information with repetitive features
-        label_size: int = len(label)
-        for j in range(label_size):
-            # Determine whether the features after overlap exist, In other words, whether there is overlap in this feature
-            if label[j] in label_all:
-                # get the index of label
-                label_index = label_all.index(label[j])
-                overlap_info_region = overlap_info[overlap_info["label"] == label[j]]
-                # sum value
-                overlap_variant = variant[list(overlap_info_region["variant_id"]), :]
-                matrix[label_index, index] = overlap_variant.X.sum(axis=0)
-    overlap_adata = AnnData(to_sparse(matrix), var=trait_info, obs=regions.var)
+        # Traverse each label, sum once for each variant_id list
+        for _, row in label_var_ids.iterrows():
+            label = row["index"]
+            row_idx = label2idx[label]
+            var_ids = row["variant_id"]
+            # Sum once for all variant_ids in the list, avoiding row-by-row slicing
+            matrix_sum = variant[var_ids, :].X.sum(axis=0)
+            if np.isscalar(matrix_sum):
+                matrix_sum = np.asarray(matrix_sum).reshape(1)
+            # Collect non-zero values
+            if matrix_sum.size == 1:
+                val = float(matrix_sum)
+                if val != 0:
+                    row_indices.append(row_idx)
+                    col_indices.append(col_idx)
+                    data_vals.append(val)
+            else:
+                for t_idx, v in enumerate(matrix_sum):
+                    if v != 0:
+                        row_indices.append(row_idx)
+                        col_indices.append(col_idx + t_idx)
+                        data_vals.append(float(v))
+    # Build sparse matrix, then convert to csr format
+    overlap_sparse = sparse.csc_matrix(
+        (data_vals, (row_indices, col_indices)),
+        shape=(label_all_size, n_trait),
+        dtype=np.float32
+    ).tocsr()
+    overlap_adata = AnnData(overlap_sparse, var=trait_info, obs=regions.var)
     overlap_adata.uns["is_overlap"] = True
     return overlap_adata

sciv/tool/_random_walk_.py CHANGED Viewed

@@ -208,7 +208,19 @@ def random_walk(
         return np.column_stack(results)
     elif device == 'gpu' or (device == 'auto' and availability):
-        return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
+        try:
+            return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
+        except RuntimeError as e:
+            ul.log(__name__).warning(f"GPU failed to run, try to switch to CPU running.\n {e}")
+            sample_count = seed_cell_weight.shape[1]
+            results = Parallel(n_jobs=n_jobs)(
+                delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
+                for i in tqdm(range(sample_count))
+            )
+            return np.column_stack(results)
     else:
         ul.log(__name__).error(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
         raise ValueError(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
@@ -677,7 +689,7 @@ class RandomWalk:
                 seed_cell_matrix_en[:, i] = seed_cell_en_value / (1 if seed_cell_en_value.sum() == 0 else seed_cell_en_value.sum())
         # Parallel processing of all traits and real-time display of progress
-        Parallel(n_jobs=-1, backend="threading")(
+        Parallel(n_jobs=self.n_jobs, backend='threading')(
             delayed(_process_single_trait)(i) for i in tqdm(self.trait_range, desc="Obtain progress of seed cells with weights")
         )
@@ -774,6 +786,7 @@ class RandomWalk:
         score = self._random_walk_core_(seed_cell_data, weight=weight)
+        ul.log(__name__).info("Normalize the results")
         cell_value = self.scale_norm(score)
         if _layer_label_ == "trs":
@@ -904,8 +917,8 @@ class RandomWalk:
         trs_score = to_dense(self.trs_adata.X if label == "run_en" else self.trs_adata.layers[_trs_layer_label_], is_array=True)
         # Initialize enriched container
-        trait_cell_enrichment = np.zeros(self.trs_adata.shape)
-        trait_cell_credible = np.zeros(self.trs_adata.shape)
+        trait_cell_enrichment = np.zeros(self.trs_adata.shape).astype(int)
+        trait_cell_credible = np.zeros(self.trs_adata.shape).astype(np.float32)
         ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-random walk)")
         # Random walk
@@ -916,8 +929,9 @@ class RandomWalk:
         )
         ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-score)")
-        for i in tqdm(self.trait_range):
+        # Process each trait in parallel
+        def _process_trait(i):
             # Random walk
             cell_value = cell_value_data[:, i]
@@ -940,7 +954,10 @@ class RandomWalk:
             trait_cell_enrichment[:, i][cell_value_credible > self.credible_threshold] = 1
             trait_cell_credible[:, i] = cell_value_credible
-        self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment.astype(int))
+        # Process each trait in parallel, backend='threading' can effectively prevent the read-only parameter issue caused by copying in loky multi-process mode
+        Parallel(n_jobs=self.n_jobs, backend='threading')(delayed(_process_trait)(i) for i in tqdm(self.trait_range))
+        self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment)
         if not self.is_simple:
             self.trs_adata.layers[f"credible_{_layer_label_}"] = to_sparse(trait_cell_credible)

{sciv-0.0.88.dist-info → sciv-0.0.90.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sciv
-Version: 0.0.88
+Version: 0.0.90
 Summary: Unveiling the pivotal cell types involved in variant function regulation at a single-cell resolution
 Project-URL: github, https://github.com/YuZhengM/sciv
 Author-email: Zheng-Min Yu <yuzmbio@163.com>

{sciv-0.0.88.dist-info → sciv-0.0.90.dist-info}/RECORD RENAMED Viewed

@@ -27,13 +27,13 @@ sciv/preprocessing/_scanpy_.py,sha256=flC10W5YPgJE5Ccxt5hP8qW3Q3cOi8DZ36Z1j9D3oN
 sciv/preprocessing/_scvi_.py,sha256=ZIDkQ_4deYmzSMiAbu5C3j_jMMl7hBTFLCBXHCNj3B4,10332
 sciv/preprocessing/_snapatac_.py,sha256=Dq8CHF7Psl3CQszaEokQYO56Oe2uzyWOy_cGlaOywfc,27798
 sciv/tool/__init__.py,sha256=WXzHkWt6RgBC3qqD-98nR5wQmt6oC850ox_VpMrapSU,2468
-sciv/tool/_algorithm_.py,sha256=fRzf7fjHF2O2beFEGBZX5qOF7nAKSdgHqHCUDZJ_3BE,49302
+sciv/tool/_algorithm_.py,sha256=yTImeGMWK6Y2gxygR90bqRF1vkvU857l5M9A_Q_VoZI,49839
 sciv/tool/_matrix_.py,sha256=O1EAhA9wxh06P_eOxEBesK7kO7IExKlhH6uJzGh1HBM,24322
-sciv/tool/_random_walk_.py,sha256=BKFjj5z1XI7Fzsr9Qsnzequ-oR5bMgK_WHf7KcpoSPU,47510
+sciv/tool/_random_walk_.py,sha256=lnMiJyDuqDB75l9IkFLr7bIgKYhWxlBdIcExSErpQNw,48374
 sciv/util/__init__.py,sha256=nOxZ8if27X7AUJ6hZwTwxOJwIBJb0obWlHjqCzjg_Gc,1964
 sciv/util/_constant_.py,sha256=w0wKQd8guLd1ZTW24_5aECrWsIWDiNQmEpLsWlHar1A,3000
 sciv/util/_core_.py,sha256=ZD2uSnEBHVu0i9TmXWzri_3bXZzYKnIZk818gW3zadE,14751
-sciv-0.0.88.dist-info/METADATA,sha256=V0-eiddGBywMSRVwQP8uPWs3Fq7OFpwpO4GYWkr_hqw,3465
-sciv-0.0.88.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-sciv-0.0.88.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
-sciv-0.0.88.dist-info/RECORD,,
+sciv-0.0.90.dist-info/METADATA,sha256=0yOyKQwEYMys8J6KodKgk2FORBN2SZQhRmiTF5je2FI,3465
+sciv-0.0.90.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sciv-0.0.90.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
+sciv-0.0.90.dist-info/RECORD,,

{sciv-0.0.88.dist-info → sciv-0.0.90.dist-info}/WHEEL RENAMED Viewed

File without changes

{sciv-0.0.88.dist-info → sciv-0.0.90.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

sciv 0.0.88__py3-none-any.whl → 0.0.90__py3-none-any.whl

sciv 0.0.88__py3-none-any.whl → 0.0.90__py3-none-any.whl