sciv 0.0.88__py3-none-any.whl → 0.0.89__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sciv/tool/_algorithm_.py +71 -47
- sciv/tool/_random_walk_.py +24 -6
- {sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/METADATA +1 -1
- {sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/RECORD +6 -6
- {sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/WHEEL +0 -0
- {sciv-0.0.88.dist-info → sciv-0.0.89.dist-info}/licenses/LICENSE +0 -0
sciv/tool/_algorithm_.py
CHANGED
|
@@ -902,73 +902,97 @@ def overlap_sum(regions: AnnData, variants: dict, trait_info: DataFrame) -> AnnD
|
|
|
902
902
|
"""
|
|
903
903
|
|
|
904
904
|
# Unique feature set
|
|
905
|
-
label_all =
|
|
905
|
+
label_all = regions.var.index.tolist()
|
|
906
906
|
# Peak number
|
|
907
907
|
label_all_size: int = len(label_all)
|
|
908
908
|
|
|
909
|
-
#
|
|
910
|
-
|
|
909
|
+
# 预先把 peaks 的 index 做成 dict,O(1) 查找
|
|
910
|
+
label2idx = {lb: i for i, lb in enumerate(label_all)}
|
|
911
911
|
|
|
912
|
-
|
|
912
|
+
trait_names = trait_info["id"].tolist()
|
|
913
|
+
n_trait = len(trait_names)
|
|
914
|
+
# 提前分配稀疏矩阵,按列填充,最后一次性转成 csc 再转 csr,省内存且快
|
|
915
|
+
row_indices, col_indices, data_vals = [], [], []
|
|
913
916
|
|
|
914
|
-
|
|
917
|
+
# 检查列存在性一次完成
|
|
918
|
+
required = {"chr", "start", "end"}
|
|
915
919
|
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
if "chr" not in regions_columns or "start" not in regions_columns or "end" not in regions_columns:
|
|
920
|
+
if not required.issubset(regions.var.columns):
|
|
919
921
|
ul.log(__name__).error(
|
|
920
|
-
f"The peaks information {
|
|
921
|
-
f"`end`. (It is recommended to use the `read_sc_atac` method.)"
|
|
922
|
+
f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
|
|
923
|
+
f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
|
|
922
924
|
)
|
|
923
925
|
raise ValueError(
|
|
924
|
-
f"The peaks information {
|
|
925
|
-
f"`end`. (It is recommended to use the `read_sc_atac` method.)"
|
|
926
|
+
f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
|
|
927
|
+
f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
|
|
926
928
|
)
|
|
927
929
|
|
|
928
|
-
regions_df =
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
930
|
+
regions_df = (
|
|
931
|
+
regions.var
|
|
932
|
+
.reset_index()
|
|
933
|
+
.loc[:, ["index", "chr", "start", "end"]]
|
|
934
|
+
.sort_values(["chr", "start", "end"])
|
|
935
|
+
)
|
|
932
936
|
|
|
933
|
-
ul.log(__name__).info(
|
|
934
|
-
for trait_name in tqdm(trait_names):
|
|
937
|
+
ul.log(__name__).info("Obtain peak-trait/disease matrix. (overlap variant information)")
|
|
935
938
|
|
|
939
|
+
# 外层循环按 trait 并行可再加速,这里先保持单循环
|
|
940
|
+
for col_idx, trait_name in enumerate(tqdm(trait_names)):
|
|
936
941
|
variant: AnnData = variants[trait_name]
|
|
937
|
-
|
|
942
|
+
overlap_df: DataFrame = _overlap_(regions_df, variant.obs)
|
|
938
943
|
|
|
939
|
-
|
|
940
|
-
overlap_info: DataFrame = _overlap_(regions_df, variant.obs)
|
|
941
|
-
|
|
942
|
-
if overlap_info.shape[0] == 0:
|
|
944
|
+
if overlap_df.empty:
|
|
943
945
|
continue
|
|
944
946
|
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
+ ":" + overlap_info["start"].astype(str) + "-" + overlap_info["end"].astype(str)
|
|
950
|
-
)
|
|
951
|
-
|
|
952
|
-
# get region
|
|
953
|
-
region_info = overlap_info.groupby("region_id", as_index=False)["label"].first()
|
|
954
|
-
region_info.index = region_info["label"].astype(str)
|
|
955
|
-
label: list = list(region_info["label"])
|
|
956
|
-
|
|
957
|
-
# Mutation information with repetitive features
|
|
958
|
-
label_size: int = len(label)
|
|
947
|
+
# 直接拿到 label->variant_id 的列表,省掉 groupby
|
|
948
|
+
overlap_df = overlap_df.rename(columns={"index": "label"})
|
|
949
|
+
# 把 label 映射到行号
|
|
950
|
+
overlap_df = overlap_df[overlap_df["label"].isin(label2idx)]
|
|
959
951
|
|
|
960
|
-
|
|
952
|
+
if overlap_df.empty:
|
|
953
|
+
continue
|
|
961
954
|
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
matrix[label_index, index] = overlap_variant.X.sum(axis=0)
|
|
955
|
+
# 一次性求和:先按 label 分组,把 variant_id 收集成列表
|
|
956
|
+
label_var_ids = (
|
|
957
|
+
overlap_df
|
|
958
|
+
.groupby("label")["variant_id"]
|
|
959
|
+
.apply(list)
|
|
960
|
+
.reset_index()
|
|
961
|
+
)
|
|
970
962
|
|
|
971
|
-
|
|
963
|
+
# 遍历每个 label,一次性切片求和
|
|
964
|
+
for _, row in label_var_ids.iterrows():
|
|
965
|
+
label = row["label"]
|
|
966
|
+
row_idx = label2idx[label]
|
|
967
|
+
var_ids = row["variant_id"]
|
|
968
|
+
# 切片一次求和,避免逐行切片
|
|
969
|
+
matrix_sum = variant[var_ids, :].X.sum(axis=0)
|
|
970
|
+
|
|
971
|
+
if np.isscalar(matrix_sum):
|
|
972
|
+
matrix_sum = np.asarray(matrix_sum).reshape(1)
|
|
973
|
+
|
|
974
|
+
# 收集非零值
|
|
975
|
+
if matrix_sum.size == 1:
|
|
976
|
+
val = float(matrix_sum)
|
|
977
|
+
if val != 0:
|
|
978
|
+
row_indices.append(row_idx)
|
|
979
|
+
col_indices.append(col_idx)
|
|
980
|
+
data_vals.append(val)
|
|
981
|
+
else:
|
|
982
|
+
for t_idx, v in enumerate(matrix_sum):
|
|
983
|
+
if v != 0:
|
|
984
|
+
row_indices.append(row_idx)
|
|
985
|
+
col_indices.append(col_idx + t_idx)
|
|
986
|
+
data_vals.append(float(v))
|
|
987
|
+
|
|
988
|
+
# 构建稀疏矩阵,再转 csr
|
|
989
|
+
overlap_sparse = sparse.csc_matrix(
|
|
990
|
+
(data_vals, (row_indices, col_indices)),
|
|
991
|
+
shape=(label_all_size, n_trait),
|
|
992
|
+
dtype=np.float32
|
|
993
|
+
).tocsr()
|
|
994
|
+
|
|
995
|
+
overlap_adata = AnnData(overlap_sparse, var=trait_info, obs=regions.var)
|
|
972
996
|
overlap_adata.uns["is_overlap"] = True
|
|
973
997
|
return overlap_adata
|
|
974
998
|
|
sciv/tool/_random_walk_.py
CHANGED
|
@@ -201,6 +201,7 @@ def random_walk(
|
|
|
201
201
|
if device == 'cpu' or (device == 'auto' and not availability):
|
|
202
202
|
sample_count = seed_cell_weight.shape[1]
|
|
203
203
|
|
|
204
|
+
# 使用 joblib.Parallel 并指定 backend='threading' 保证顺序与输入一致
|
|
204
205
|
results = Parallel(n_jobs=n_jobs)(
|
|
205
206
|
delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
|
|
206
207
|
for i in tqdm(range(sample_count))
|
|
@@ -208,7 +209,19 @@ def random_walk(
|
|
|
208
209
|
|
|
209
210
|
return np.column_stack(results)
|
|
210
211
|
elif device == 'gpu' or (device == 'auto' and availability):
|
|
211
|
-
|
|
212
|
+
|
|
213
|
+
try:
|
|
214
|
+
return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
|
|
215
|
+
except RuntimeError as e:
|
|
216
|
+
ul.log(__name__).warning(f"GPU failed to run, try to switch to CPU running.\n {e}")
|
|
217
|
+
sample_count = seed_cell_weight.shape[1]
|
|
218
|
+
|
|
219
|
+
results = Parallel(n_jobs=n_jobs)(
|
|
220
|
+
delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
|
|
221
|
+
for i in tqdm(range(sample_count))
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
return np.column_stack(results)
|
|
212
225
|
else:
|
|
213
226
|
ul.log(__name__).error(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
|
|
214
227
|
raise ValueError(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
|
|
@@ -677,7 +690,7 @@ class RandomWalk:
|
|
|
677
690
|
seed_cell_matrix_en[:, i] = seed_cell_en_value / (1 if seed_cell_en_value.sum() == 0 else seed_cell_en_value.sum())
|
|
678
691
|
|
|
679
692
|
# Parallel processing of all traits and real-time display of progress
|
|
680
|
-
Parallel(n_jobs
|
|
693
|
+
Parallel(n_jobs=self.n_jobs)(
|
|
681
694
|
delayed(_process_single_trait)(i) for i in tqdm(self.trait_range, desc="Obtain progress of seed cells with weights")
|
|
682
695
|
)
|
|
683
696
|
|
|
@@ -774,6 +787,7 @@ class RandomWalk:
|
|
|
774
787
|
|
|
775
788
|
score = self._random_walk_core_(seed_cell_data, weight=weight)
|
|
776
789
|
|
|
790
|
+
ul.log(__name__).info("Normalize the results")
|
|
777
791
|
cell_value = self.scale_norm(score)
|
|
778
792
|
|
|
779
793
|
if _layer_label_ == "trs":
|
|
@@ -904,8 +918,8 @@ class RandomWalk:
|
|
|
904
918
|
trs_score = to_dense(self.trs_adata.X if label == "run_en" else self.trs_adata.layers[_trs_layer_label_], is_array=True)
|
|
905
919
|
|
|
906
920
|
# Initialize enriched container
|
|
907
|
-
trait_cell_enrichment = np.zeros(self.trs_adata.shape)
|
|
908
|
-
trait_cell_credible = np.zeros(self.trs_adata.shape)
|
|
921
|
+
trait_cell_enrichment = np.zeros(self.trs_adata.shape).astype(int)
|
|
922
|
+
trait_cell_credible = np.zeros(self.trs_adata.shape).astype(np.float32)
|
|
909
923
|
|
|
910
924
|
ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-random walk)")
|
|
911
925
|
# Random walk
|
|
@@ -916,8 +930,9 @@ class RandomWalk:
|
|
|
916
930
|
)
|
|
917
931
|
|
|
918
932
|
ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-score)")
|
|
919
|
-
for i in tqdm(self.trait_range):
|
|
920
933
|
|
|
934
|
+
# Process each trait in parallel
|
|
935
|
+
def _process_trait(i):
|
|
921
936
|
# Random walk
|
|
922
937
|
cell_value = cell_value_data[:, i]
|
|
923
938
|
|
|
@@ -940,7 +955,10 @@ class RandomWalk:
|
|
|
940
955
|
trait_cell_enrichment[:, i][cell_value_credible > self.credible_threshold] = 1
|
|
941
956
|
trait_cell_credible[:, i] = cell_value_credible
|
|
942
957
|
|
|
943
|
-
|
|
958
|
+
# Process each trait in parallel
|
|
959
|
+
Parallel(n_jobs=self.n_jobs)(delayed(_process_trait)(i) for i in tqdm(self.trait_range))
|
|
960
|
+
|
|
961
|
+
self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment)
|
|
944
962
|
|
|
945
963
|
if not self.is_simple:
|
|
946
964
|
self.trs_adata.layers[f"credible_{_layer_label_}"] = to_sparse(trait_cell_credible)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sciv
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.89
|
|
4
4
|
Summary: Unveiling the pivotal cell types involved in variant function regulation at a single-cell resolution
|
|
5
5
|
Project-URL: github, https://github.com/YuZhengM/sciv
|
|
6
6
|
Author-email: Zheng-Min Yu <yuzmbio@163.com>
|
|
@@ -27,13 +27,13 @@ sciv/preprocessing/_scanpy_.py,sha256=flC10W5YPgJE5Ccxt5hP8qW3Q3cOi8DZ36Z1j9D3oN
|
|
|
27
27
|
sciv/preprocessing/_scvi_.py,sha256=ZIDkQ_4deYmzSMiAbu5C3j_jMMl7hBTFLCBXHCNj3B4,10332
|
|
28
28
|
sciv/preprocessing/_snapatac_.py,sha256=Dq8CHF7Psl3CQszaEokQYO56Oe2uzyWOy_cGlaOywfc,27798
|
|
29
29
|
sciv/tool/__init__.py,sha256=WXzHkWt6RgBC3qqD-98nR5wQmt6oC850ox_VpMrapSU,2468
|
|
30
|
-
sciv/tool/_algorithm_.py,sha256
|
|
30
|
+
sciv/tool/_algorithm_.py,sha256=-6YbvZEH95tvOicf2-bkvpmzdyOe5yAjelL7HjPaXF8,49952
|
|
31
31
|
sciv/tool/_matrix_.py,sha256=O1EAhA9wxh06P_eOxEBesK7kO7IExKlhH6uJzGh1HBM,24322
|
|
32
|
-
sciv/tool/_random_walk_.py,sha256=
|
|
32
|
+
sciv/tool/_random_walk_.py,sha256=27P1tjQtpjm44ZPE5CGEn3kb-PF-LSF2QsXw10jugSU,48304
|
|
33
33
|
sciv/util/__init__.py,sha256=nOxZ8if27X7AUJ6hZwTwxOJwIBJb0obWlHjqCzjg_Gc,1964
|
|
34
34
|
sciv/util/_constant_.py,sha256=w0wKQd8guLd1ZTW24_5aECrWsIWDiNQmEpLsWlHar1A,3000
|
|
35
35
|
sciv/util/_core_.py,sha256=ZD2uSnEBHVu0i9TmXWzri_3bXZzYKnIZk818gW3zadE,14751
|
|
36
|
-
sciv-0.0.
|
|
37
|
-
sciv-0.0.
|
|
38
|
-
sciv-0.0.
|
|
39
|
-
sciv-0.0.
|
|
36
|
+
sciv-0.0.89.dist-info/METADATA,sha256=-YZF4BLit8qmdZ_66DBLffciYokeFFWurTA0sEZi3IU,3465
|
|
37
|
+
sciv-0.0.89.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
38
|
+
sciv-0.0.89.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
|
|
39
|
+
sciv-0.0.89.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|