sciv 0.0.88__py3-none-any.whl → 0.0.89__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sciv/tool/_algorithm_.py CHANGED
@@ -902,73 +902,97 @@ def overlap_sum(regions: AnnData, variants: dict, trait_info: DataFrame) -> AnnD
902
902
  """
903
903
 
904
904
  # Unique feature set
905
- label_all = list(regions.var.index)
905
+ label_all = regions.var.index.tolist()
906
906
  # Peak number
907
907
  label_all_size: int = len(label_all)
908
908
 
909
- # trait/disease information
910
- trait_names: list = list(trait_info["id"])
909
+ # 预先把 peaks 的 index 做成 dict,O(1) 查找
910
+ label2idx = {lb: i for i, lb in enumerate(label_all)}
911
911
 
912
- matrix = np.zeros((label_all_size, len(trait_names)))
912
+ trait_names = trait_info["id"].tolist()
913
+ n_trait = len(trait_names)
914
+ # Collect (row, col, value) triplets while looping over traits; the sparse matrix is built once after the loop (CSC, then converted to CSR)
915
+ row_indices, col_indices, data_vals = [], [], []
913
916
 
914
- regions_df = regions.var.copy()
917
+ # 检查列存在性一次完成
918
+ required = {"chr", "start", "end"}
915
919
 
916
- regions_columns: list = list(regions_df.columns)
917
-
918
- if "chr" not in regions_columns or "start" not in regions_columns or "end" not in regions_columns:
920
+ if not required.issubset(regions.var.columns):
919
921
  ul.log(__name__).error(
920
- f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
921
- f"`end`. (It is recommended to use the `read_sc_atac` method.)"
922
+ f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
923
+ f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
922
924
  )
923
925
  raise ValueError(
924
- f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
925
- f"`end`. (It is recommended to use the `read_sc_atac` method.)"
926
+ f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
927
+ f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
926
928
  )
927
929
 
928
- regions_df = regions_df.rename_axis("index")
929
- regions_df = regions_df.reset_index()
930
- # sort
931
- regions_df = regions_df.sort_values(["chr", "start", "end"])[["index", "chr", "start", "end"]]
930
+ regions_df = (
931
+ regions.var
932
+ .reset_index()
933
+ .loc[:, ["index", "chr", "start", "end"]]
934
+ .sort_values(["chr", "start", "end"])
935
+ )
932
936
 
933
- ul.log(__name__).info(f"Obtain peak-trait/disease matrix. (overlap variant information)")
934
- for trait_name in tqdm(trait_names):
937
+ ul.log(__name__).info("Obtain peak-trait/disease matrix. (overlap variant information)")
935
938
 
939
+ # 外层循环按 trait 并行可再加速,这里先保持单循环
940
+ for col_idx, trait_name in enumerate(tqdm(trait_names)):
936
941
  variant: AnnData = variants[trait_name]
937
- index: int = trait_names.index(trait_name)
942
+ overlap_df: DataFrame = _overlap_(regions_df, variant.obs)
938
943
 
939
- # handle overlap data
940
- overlap_info: DataFrame = _overlap_(regions_df, variant.obs)
941
-
942
- if overlap_info.shape[0] == 0:
944
+ if overlap_df.empty:
943
945
  continue
944
946
 
945
- overlap_info.rename({"index": "label"}, axis="columns", inplace=True)
946
- overlap_info.reset_index(inplace=True)
947
- overlap_info["region_id"] = (
948
- overlap_info["chr"].astype(str)
949
- + ":" + overlap_info["start"].astype(str) + "-" + overlap_info["end"].astype(str)
950
- )
951
-
952
- # get region
953
- region_info = overlap_info.groupby("region_id", as_index=False)["label"].first()
954
- region_info.index = region_info["label"].astype(str)
955
- label: list = list(region_info["label"])
956
-
957
- # Mutation information with repetitive features
958
- label_size: int = len(label)
947
+ # Rename the peak column to `label`; variant ids are grouped per label further below
948
+ overlap_df = overlap_df.rename(columns={"index": "label"})
949
+ # Keep only overlaps whose label is a known peak (i.e. present in label2idx)
950
+ overlap_df = overlap_df[overlap_df["label"].isin(label2idx)]
959
951
 
960
- for j in range(label_size):
952
+ if overlap_df.empty:
953
+ continue
961
954
 
962
- # Determine whether the features after overlap exist, In other words, whether there is overlap in this feature
963
- if label[j] in label_all:
964
- # get the index of label
965
- label_index = label_all.index(label[j])
966
- overlap_info_region = overlap_info[overlap_info["label"] == label[j]]
967
- # sum value
968
- overlap_variant = variant[list(overlap_info_region["variant_id"]), :]
969
- matrix[label_index, index] = overlap_variant.X.sum(axis=0)
955
+ # 一次性求和:先按 label 分组,把 variant_id 收集成列表
956
+ label_var_ids = (
957
+ overlap_df
958
+ .groupby("label")["variant_id"]
959
+ .apply(list)
960
+ .reset_index()
961
+ )
970
962
 
971
- overlap_adata = AnnData(to_sparse(matrix), var=trait_info, obs=regions.var)
963
+ # 遍历每个 label,一次性切片求和
964
+ for _, row in label_var_ids.iterrows():
965
+ label = row["label"]
966
+ row_idx = label2idx[label]
967
+ var_ids = row["variant_id"]
968
+ # 切片一次求和,避免逐行切片
969
+ matrix_sum = variant[var_ids, :].X.sum(axis=0)
970
+
971
+ if np.isscalar(matrix_sum):
972
+ matrix_sum = np.asarray(matrix_sum).reshape(1)
973
+
974
+ # 收集非零值
975
+ if matrix_sum.size == 1:
976
+ val = float(matrix_sum)
977
+ if val != 0:
978
+ row_indices.append(row_idx)
979
+ col_indices.append(col_idx)
980
+ data_vals.append(val)
981
+ else:
982
+ for t_idx, v in enumerate(matrix_sum):
983
+ if v != 0:
984
+ row_indices.append(row_idx)
985
+ col_indices.append(col_idx + t_idx)
986
+ data_vals.append(float(v))
987
+
988
+ # 构建稀疏矩阵,再转 csr
989
+ overlap_sparse = sparse.csc_matrix(
990
+ (data_vals, (row_indices, col_indices)),
991
+ shape=(label_all_size, n_trait),
992
+ dtype=np.float32
993
+ ).tocsr()
994
+
995
+ overlap_adata = AnnData(overlap_sparse, var=trait_info, obs=regions.var)
972
996
  overlap_adata.uns["is_overlap"] = True
973
997
  return overlap_adata
974
998
 
@@ -201,6 +201,7 @@ def random_walk(
201
201
  if device == 'cpu' or (device == 'auto' and not availability):
202
202
  sample_count = seed_cell_weight.shape[1]
203
203
 
204
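+ # joblib.Parallel returns results in the same order as the submitted tasks, so column i matches sample i (no backend is passed here; joblib's default is used)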
204
205
  results = Parallel(n_jobs=n_jobs)(
205
206
  delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
206
207
  for i in tqdm(range(sample_count))
@@ -208,7 +209,19 @@ def random_walk(
208
209
 
209
210
  return np.column_stack(results)
210
211
  elif device == 'gpu' or (device == 'auto' and availability):
211
- return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
212
+
213
+ try:
214
+ return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
215
+ except RuntimeError as e:
216
+ ul.log(__name__).warning(f"GPU failed to run, try to switch to CPU running.\n {e}")
217
+ sample_count = seed_cell_weight.shape[1]
218
+
219
+ results = Parallel(n_jobs=n_jobs)(
220
+ delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
221
+ for i in tqdm(range(sample_count))
222
+ )
223
+
224
+ return np.column_stack(results)
212
225
  else:
213
226
  ul.log(__name__).error(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
214
227
  raise ValueError(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
@@ -677,7 +690,7 @@ class RandomWalk:
677
690
  seed_cell_matrix_en[:, i] = seed_cell_en_value / (1 if seed_cell_en_value.sum() == 0 else seed_cell_en_value.sum())
678
691
 
679
692
  # Parallel processing of all traits and real-time display of progress
680
- Parallel(n_jobs=-1, backend="threading")(
693
+ Parallel(n_jobs=self.n_jobs)(
681
694
  delayed(_process_single_trait)(i) for i in tqdm(self.trait_range, desc="Obtain progress of seed cells with weights")
682
695
  )
683
696
 
@@ -774,6 +787,7 @@ class RandomWalk:
774
787
 
775
788
  score = self._random_walk_core_(seed_cell_data, weight=weight)
776
789
 
790
+ ul.log(__name__).info("Normalize the results")
777
791
  cell_value = self.scale_norm(score)
778
792
 
779
793
  if _layer_label_ == "trs":
@@ -904,8 +918,8 @@ class RandomWalk:
904
918
  trs_score = to_dense(self.trs_adata.X if label == "run_en" else self.trs_adata.layers[_trs_layer_label_], is_array=True)
905
919
 
906
920
  # Initialize enriched container
907
- trait_cell_enrichment = np.zeros(self.trs_adata.shape)
908
- trait_cell_credible = np.zeros(self.trs_adata.shape)
921
+ trait_cell_enrichment = np.zeros(self.trs_adata.shape).astype(int)
922
+ trait_cell_credible = np.zeros(self.trs_adata.shape).astype(np.float32)
909
923
 
910
924
  ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-random walk)")
911
925
  # Random walk
@@ -916,8 +930,9 @@ class RandomWalk:
916
930
  )
917
931
 
918
932
  ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-score)")
919
- for i in tqdm(self.trait_range):
920
933
 
934
+ # Process each trait in parallel
935
+ def _process_trait(i):
921
936
  # Random walk
922
937
  cell_value = cell_value_data[:, i]
923
938
 
@@ -940,7 +955,10 @@ class RandomWalk:
940
955
  trait_cell_enrichment[:, i][cell_value_credible > self.credible_threshold] = 1
941
956
  trait_cell_credible[:, i] = cell_value_credible
942
957
 
943
- self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment.astype(int))
958
+ # Process each trait in parallel
959
+ Parallel(n_jobs=self.n_jobs)(delayed(_process_trait)(i) for i in tqdm(self.trait_range))
960
+
961
+ self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment)
944
962
 
945
963
  if not self.is_simple:
946
964
  self.trs_adata.layers[f"credible_{_layer_label_}"] = to_sparse(trait_cell_credible)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sciv
3
- Version: 0.0.88
3
+ Version: 0.0.89
4
4
  Summary: Unveiling the pivotal cell types involved in variant function regulation at a single-cell resolution
5
5
  Project-URL: github, https://github.com/YuZhengM/sciv
6
6
  Author-email: Zheng-Min Yu <yuzmbio@163.com>
@@ -27,13 +27,13 @@ sciv/preprocessing/_scanpy_.py,sha256=flC10W5YPgJE5Ccxt5hP8qW3Q3cOi8DZ36Z1j9D3oN
27
27
  sciv/preprocessing/_scvi_.py,sha256=ZIDkQ_4deYmzSMiAbu5C3j_jMMl7hBTFLCBXHCNj3B4,10332
28
28
  sciv/preprocessing/_snapatac_.py,sha256=Dq8CHF7Psl3CQszaEokQYO56Oe2uzyWOy_cGlaOywfc,27798
29
29
  sciv/tool/__init__.py,sha256=WXzHkWt6RgBC3qqD-98nR5wQmt6oC850ox_VpMrapSU,2468
30
- sciv/tool/_algorithm_.py,sha256=fRzf7fjHF2O2beFEGBZX5qOF7nAKSdgHqHCUDZJ_3BE,49302
30
+ sciv/tool/_algorithm_.py,sha256=-6YbvZEH95tvOicf2-bkvpmzdyOe5yAjelL7HjPaXF8,49952
31
31
  sciv/tool/_matrix_.py,sha256=O1EAhA9wxh06P_eOxEBesK7kO7IExKlhH6uJzGh1HBM,24322
32
- sciv/tool/_random_walk_.py,sha256=BKFjj5z1XI7Fzsr9Qsnzequ-oR5bMgK_WHf7KcpoSPU,47510
32
+ sciv/tool/_random_walk_.py,sha256=27P1tjQtpjm44ZPE5CGEn3kb-PF-LSF2QsXw10jugSU,48304
33
33
  sciv/util/__init__.py,sha256=nOxZ8if27X7AUJ6hZwTwxOJwIBJb0obWlHjqCzjg_Gc,1964
34
34
  sciv/util/_constant_.py,sha256=w0wKQd8guLd1ZTW24_5aECrWsIWDiNQmEpLsWlHar1A,3000
35
35
  sciv/util/_core_.py,sha256=ZD2uSnEBHVu0i9TmXWzri_3bXZzYKnIZk818gW3zadE,14751
36
- sciv-0.0.88.dist-info/METADATA,sha256=V0-eiddGBywMSRVwQP8uPWs3Fq7OFpwpO4GYWkr_hqw,3465
37
- sciv-0.0.88.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
- sciv-0.0.88.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
39
- sciv-0.0.88.dist-info/RECORD,,
36
+ sciv-0.0.89.dist-info/METADATA,sha256=-YZF4BLit8qmdZ_66DBLffciYokeFFWurTA0sEZi3IU,3465
37
+ sciv-0.0.89.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
+ sciv-0.0.89.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
39
+ sciv-0.0.89.dist-info/RECORD,,
File without changes