sciv 0.0.88__py3-none-any.whl → 0.0.90__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
sciv/tool/_algorithm_.py CHANGED
@@ -814,7 +814,9 @@ def _overlap_(regions_sort: DataFrame, variants: DataFrame) -> DataFrame:
814
814
 
815
815
  variants_overlap_info_list: list = []
816
816
 
817
- for index, chr_a, start, end in zip(regions_sort["index"], regions_sort["chr"], regions_sort["start"],
817
+ for index, chr_a, start, end in zip(regions_sort["index"],
818
+ regions_sort["chr"],
819
+ regions_sort["start"],
818
820
  regions_sort["end"]):
819
821
 
820
822
  # judge chr
@@ -902,73 +904,89 @@ def overlap_sum(regions: AnnData, variants: dict, trait_info: DataFrame) -> AnnD
902
904
  """
903
905
 
904
906
  # Unique feature set
905
- label_all = list(regions.var.index)
907
+ label_all = regions.var.index.tolist()
906
908
  # Peak number
907
909
  label_all_size: int = len(label_all)
908
910
 
909
- # trait/disease information
910
- trait_names: list = list(trait_info["id"])
911
+ # Pre-build a dict of peak indices for O(1) lookup
912
+ label2idx = {lb: i for i, lb in enumerate(label_all)}
911
913
 
912
- matrix = np.zeros((label_all_size, len(trait_names)))
914
+ trait_names = trait_info["id"].tolist()
915
+ n_trait = len(trait_names)
916
+ # Pre-allocate sparse matrix, fill column by column, then convert to csc and then csr for efficiency
917
+ row_indices, col_indices, data_vals = [], [], []
913
918
 
914
- regions_df = regions.var.copy()
919
+ # Check column existence once
920
+ required = {"chr", "start", "end"}
915
921
 
916
- regions_columns: list = list(regions_df.columns)
917
-
918
- if "chr" not in regions_columns or "start" not in regions_columns or "end" not in regions_columns:
922
+ if not required.issubset(regions.var.columns):
919
923
  ul.log(__name__).error(
920
- f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
921
- f"`end`. (It is recommended to use the `read_sc_atac` method.)"
924
+ f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
925
+ f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
922
926
  )
923
927
  raise ValueError(
924
- f"The peaks information {regions_columns} in data `adata` must include three columns: `chr`, `start` and "
925
- f"`end`. (It is recommended to use the `read_sc_atac` method.)"
928
+ f"The peaks information {regions.var.columns} in data `adata` must include three columns: `chr`, `start` "
929
+ f"and `end`. (It is recommended to use the `read_sc_atac` method.)"
926
930
  )
927
931
 
928
- regions_df = regions_df.rename_axis("index")
929
- regions_df = regions_df.reset_index()
930
- # sort
931
- regions_df = regions_df.sort_values(["chr", "start", "end"])[["index", "chr", "start", "end"]]
932
+ regions_df = (
933
+ regions.var
934
+ .reset_index()
935
+ .loc[:, ["index", "chr", "start", "end"]]
936
+ .sort_values(["chr", "start", "end"])
937
+ )
932
938
 
933
- ul.log(__name__).info(f"Obtain peak-trait/disease matrix. (overlap variant information)")
934
- for trait_name in tqdm(trait_names):
939
+ ul.log(__name__).info("Obtain peak-trait/disease matrix. (overlap variant information)")
935
940
 
941
+ # The outer loop can be further accelerated by parallelizing over traits; here we keep it single-threaded for now.
942
+ for col_idx, trait_name in enumerate(tqdm(trait_names)):
936
943
  variant: AnnData = variants[trait_name]
937
- index: int = trait_names.index(trait_name)
938
-
939
- # handle overlap data
940
- overlap_info: DataFrame = _overlap_(regions_df, variant.obs)
944
+ overlap_df: DataFrame = _overlap_(regions_df, variant.obs)
941
945
 
942
- if overlap_info.shape[0] == 0:
946
+ if overlap_df.empty:
943
947
  continue
944
948
 
945
- overlap_info.rename({"index": "label"}, axis="columns", inplace=True)
946
- overlap_info.reset_index(inplace=True)
947
- overlap_info["region_id"] = (
948
- overlap_info["chr"].astype(str)
949
- + ":" + overlap_info["start"].astype(str) + "-" + overlap_info["end"].astype(str)
949
+ # Sum at once: first group by label and collect variant_id into a list
950
+ label_var_ids = (
951
+ overlap_df
952
+ .groupby("index")["variant_id"]
953
+ .apply(list)
954
+ .reset_index()
950
955
  )
951
956
 
952
- # get region
953
- region_info = overlap_info.groupby("region_id", as_index=False)["label"].first()
954
- region_info.index = region_info["label"].astype(str)
955
- label: list = list(region_info["label"])
956
-
957
- # Mutation information with repetitive features
958
- label_size: int = len(label)
959
-
960
- for j in range(label_size):
961
-
962
- # Determine whether the features after overlap exist, In other words, whether there is overlap in this feature
963
- if label[j] in label_all:
964
- # get the index of label
965
- label_index = label_all.index(label[j])
966
- overlap_info_region = overlap_info[overlap_info["label"] == label[j]]
967
- # sum value
968
- overlap_variant = variant[list(overlap_info_region["variant_id"]), :]
969
- matrix[label_index, index] = overlap_variant.X.sum(axis=0)
970
-
971
- overlap_adata = AnnData(to_sparse(matrix), var=trait_info, obs=regions.var)
957
+ # Traverse each label, sum once for each variant_id list
958
+ for _, row in label_var_ids.iterrows():
959
+ label = row["index"]
960
+ row_idx = label2idx[label]
961
+ var_ids = row["variant_id"]
962
+ # Sum once for all variant_ids in the list, avoiding row-by-row slicing
963
+ matrix_sum = variant[var_ids, :].X.sum(axis=0)
964
+
965
+ if np.isscalar(matrix_sum):
966
+ matrix_sum = np.asarray(matrix_sum).reshape(1)
967
+
968
+ # Collect non-zero values
969
+ if matrix_sum.size == 1:
970
+ val = float(matrix_sum)
971
+ if val != 0:
972
+ row_indices.append(row_idx)
973
+ col_indices.append(col_idx)
974
+ data_vals.append(val)
975
+ else:
976
+ for t_idx, v in enumerate(matrix_sum):
977
+ if v != 0:
978
+ row_indices.append(row_idx)
979
+ col_indices.append(col_idx + t_idx)
980
+ data_vals.append(float(v))
981
+
982
+ # Build sparse matrix, then convert to csr format
983
+ overlap_sparse = sparse.csc_matrix(
984
+ (data_vals, (row_indices, col_indices)),
985
+ shape=(label_all_size, n_trait),
986
+ dtype=np.float32
987
+ ).tocsr()
988
+
989
+ overlap_adata = AnnData(overlap_sparse, var=trait_info, obs=regions.var)
972
990
  overlap_adata.uns["is_overlap"] = True
973
991
  return overlap_adata
974
992
 
@@ -208,7 +208,19 @@ def random_walk(
208
208
 
209
209
  return np.column_stack(results)
210
210
  elif device == 'gpu' or (device == 'auto' and availability):
211
- return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
211
+
212
+ try:
213
+ return _random_walk_gpu_(seed_cell_weight, weight, gamma, epsilon, p, device='gpu')
214
+ except RuntimeError as e:
215
+ ul.log(__name__).warning(f"GPU failed to run, try to switch to CPU running.\n {e}")
216
+ sample_count = seed_cell_weight.shape[1]
217
+
218
+ results = Parallel(n_jobs=n_jobs)(
219
+ delayed(_random_walk_cpu_)(seed_cell_weight[:, i], weight, gamma, epsilon, p)
220
+ for i in tqdm(range(sample_count))
221
+ )
222
+
223
+ return np.column_stack(results)
212
224
  else:
213
225
  ul.log(__name__).error(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
214
226
  raise ValueError(f'The `device` ({device}) is not supported. Only supports "cpu", "gpu", and "auto" values.')
@@ -677,7 +689,7 @@ class RandomWalk:
677
689
  seed_cell_matrix_en[:, i] = seed_cell_en_value / (1 if seed_cell_en_value.sum() == 0 else seed_cell_en_value.sum())
678
690
 
679
691
  # Parallel processing of all traits and real-time display of progress
680
- Parallel(n_jobs=-1, backend="threading")(
692
+ Parallel(n_jobs=self.n_jobs, backend='threading')(
681
693
  delayed(_process_single_trait)(i) for i in tqdm(self.trait_range, desc="Obtain progress of seed cells with weights")
682
694
  )
683
695
 
@@ -774,6 +786,7 @@ class RandomWalk:
774
786
 
775
787
  score = self._random_walk_core_(seed_cell_data, weight=weight)
776
788
 
789
+ ul.log(__name__).info("Normalize the results")
777
790
  cell_value = self.scale_norm(score)
778
791
 
779
792
  if _layer_label_ == "trs":
@@ -904,8 +917,8 @@ class RandomWalk:
904
917
  trs_score = to_dense(self.trs_adata.X if label == "run_en" else self.trs_adata.layers[_trs_layer_label_], is_array=True)
905
918
 
906
919
  # Initialize enriched container
907
- trait_cell_enrichment = np.zeros(self.trs_adata.shape)
908
- trait_cell_credible = np.zeros(self.trs_adata.shape)
920
+ trait_cell_enrichment = np.zeros(self.trs_adata.shape).astype(int)
921
+ trait_cell_credible = np.zeros(self.trs_adata.shape).astype(np.float32)
909
922
 
910
923
  ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-random walk)")
911
924
  # Random walk
@@ -916,8 +929,9 @@ class RandomWalk:
916
929
  )
917
930
 
918
931
  ul.log(__name__).info(f"Calculate {len(self.trait_list)} traits/diseases for process `{label}`. (Enrichment-score)")
919
- for i in tqdm(self.trait_range):
920
932
 
933
+ # Process each trait in parallel
934
+ def _process_trait(i):
921
935
  # Random walk
922
936
  cell_value = cell_value_data[:, i]
923
937
 
@@ -940,7 +954,10 @@ class RandomWalk:
940
954
  trait_cell_enrichment[:, i][cell_value_credible > self.credible_threshold] = 1
941
955
  trait_cell_credible[:, i] = cell_value_credible
942
956
 
943
- self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment.astype(int))
957
+ # Process each trait in parallel, backend='threading' can effectively prevent the read-only parameter issue caused by copying in loky multi-process mode
958
+ Parallel(n_jobs=self.n_jobs, backend='threading')(delayed(_process_trait)(i) for i in tqdm(self.trait_range))
959
+
960
+ self.trs_adata.layers[_layer_label_] = to_sparse(trait_cell_enrichment)
944
961
 
945
962
  if not self.is_simple:
946
963
  self.trs_adata.layers[f"credible_{_layer_label_}"] = to_sparse(trait_cell_credible)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sciv
3
- Version: 0.0.88
3
+ Version: 0.0.90
4
4
  Summary: Unveiling the pivotal cell types involved in variant function regulation at a single-cell resolution
5
5
  Project-URL: github, https://github.com/YuZhengM/sciv
6
6
  Author-email: Zheng-Min Yu <yuzmbio@163.com>
@@ -27,13 +27,13 @@ sciv/preprocessing/_scanpy_.py,sha256=flC10W5YPgJE5Ccxt5hP8qW3Q3cOi8DZ36Z1j9D3oN
27
27
  sciv/preprocessing/_scvi_.py,sha256=ZIDkQ_4deYmzSMiAbu5C3j_jMMl7hBTFLCBXHCNj3B4,10332
28
28
  sciv/preprocessing/_snapatac_.py,sha256=Dq8CHF7Psl3CQszaEokQYO56Oe2uzyWOy_cGlaOywfc,27798
29
29
  sciv/tool/__init__.py,sha256=WXzHkWt6RgBC3qqD-98nR5wQmt6oC850ox_VpMrapSU,2468
30
- sciv/tool/_algorithm_.py,sha256=fRzf7fjHF2O2beFEGBZX5qOF7nAKSdgHqHCUDZJ_3BE,49302
30
+ sciv/tool/_algorithm_.py,sha256=yTImeGMWK6Y2gxygR90bqRF1vkvU857l5M9A_Q_VoZI,49839
31
31
  sciv/tool/_matrix_.py,sha256=O1EAhA9wxh06P_eOxEBesK7kO7IExKlhH6uJzGh1HBM,24322
32
- sciv/tool/_random_walk_.py,sha256=BKFjj5z1XI7Fzsr9Qsnzequ-oR5bMgK_WHf7KcpoSPU,47510
32
+ sciv/tool/_random_walk_.py,sha256=lnMiJyDuqDB75l9IkFLr7bIgKYhWxlBdIcExSErpQNw,48374
33
33
  sciv/util/__init__.py,sha256=nOxZ8if27X7AUJ6hZwTwxOJwIBJb0obWlHjqCzjg_Gc,1964
34
34
  sciv/util/_constant_.py,sha256=w0wKQd8guLd1ZTW24_5aECrWsIWDiNQmEpLsWlHar1A,3000
35
35
  sciv/util/_core_.py,sha256=ZD2uSnEBHVu0i9TmXWzri_3bXZzYKnIZk818gW3zadE,14751
36
- sciv-0.0.88.dist-info/METADATA,sha256=V0-eiddGBywMSRVwQP8uPWs3Fq7OFpwpO4GYWkr_hqw,3465
37
- sciv-0.0.88.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
- sciv-0.0.88.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
39
- sciv-0.0.88.dist-info/RECORD,,
36
+ sciv-0.0.90.dist-info/METADATA,sha256=0yOyKQwEYMys8J6KodKgk2FORBN2SZQhRmiTF5je2FI,3465
37
+ sciv-0.0.90.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
+ sciv-0.0.90.dist-info/licenses/LICENSE,sha256=4UvHVf3qCOZjHLs4LkYz8u96XRpXnZrpTKrkUQPs5_A,1075
39
+ sciv-0.0.90.dist-info/RECORD,,
File without changes