cap-anndata 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
cap_anndata/__init__.py CHANGED
@@ -1,6 +1,10 @@
1
1
  from .backed_df import CapAnnDataDF
2
2
  from .backed_uns import CapAnnDataUns
3
3
  from .cap_anndata import CapAnnData
4
+ from .reader import (
5
+ read_directly,
6
+ read_h5ad,
7
+ )
4
8
 
5
9
 
6
10
  __all__ = ["CapAnnData"]
cap_anndata/backed_df.py CHANGED
@@ -1,8 +1,11 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
- from typing import List
3
+ from typing import List, Any, Union
4
4
  import logging
5
5
 
6
+ from pandas._typing import Self
7
+ from pandas.core.generic import bool_t
8
+
6
9
  logger = logging.getLogger(__name__)
7
10
 
8
11
 
@@ -13,7 +16,8 @@ class CapAnnDataDF(pd.DataFrame):
13
16
  The main feature of the class is handling <column-order> attribute
14
17
  which must be a copy of h5py.Group attribute
15
18
  """
16
- _metadata = ['column_order']
19
+
20
+ _metadata = ["column_order"]
17
21
 
18
22
  def rename_column(self, old_name: str, new_name: str) -> None:
19
23
  i = np.where(self.column_order == old_name)[0]
@@ -31,10 +35,35 @@ class CapAnnDataDF(pd.DataFrame):
31
35
  return super().__setitem__(key, value)
32
36
 
33
37
  @classmethod
34
- def from_df(cls, df: pd.DataFrame, column_order: List[str] = None):
38
+ def from_df(cls, df: pd.DataFrame, column_order: List[str] = None) -> Self:
35
39
  if column_order is None:
36
40
  column_order = df.columns.to_numpy()
37
41
 
38
42
  new_inst = cls(df)
39
43
  new_inst.column_order = column_order
40
44
  return new_inst
45
+
46
+ def join(self, other: Any, **kwargs) -> Self:
47
+ result = super().join(other=other, **kwargs)
48
+ if isinstance(other, CapAnnDataDF):
49
+ new_columns = [
50
+ col for col in other.column_order if col not in self.column_order
51
+ ]
52
+ else:
53
+ new_columns = [col for col in other.columns if col not in self.column_order]
54
+ column_order = np.append(self.column_order, new_columns)
55
+ return self.from_df(result, column_order=column_order)
56
+
57
+ def merge(self, right, **kwargs) -> Self:
58
+ result = super().merge(right=right, **kwargs)
59
+ if isinstance(right, CapAnnDataDF):
60
+ new_columns = [
61
+ col for col in right.column_order if col not in self.column_order
62
+ ]
63
+ else:
64
+ new_columns = [col for col in right.columns if col not in self.column_order]
65
+ column_order = np.append(self.column_order, new_columns)
66
+ return self.from_df(result, column_order=column_order)
67
+
68
+ def copy(self, deep: Union[bool_t, None] = True) -> Self:
69
+ return self.from_df(super().copy(deep=deep), column_order=self.column_order)
@@ -1,13 +1,13 @@
1
1
  import logging
2
- import contextlib
3
2
  import anndata as ad
3
+ import numpy as np
4
4
  import h5py
5
5
  from typing import List, Union, Dict, Tuple, Final
6
6
  from anndata._io.specs import read_elem, write_elem
7
- from dataclasses import dataclass
8
7
 
9
8
  from cap_anndata import CapAnnDataDF, CapAnnDataUns
10
9
 
10
+
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
  X_NOTATION = Union[h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset]
@@ -16,26 +16,11 @@ OBSM_NOTATION = Dict[str, X_NOTATION]
16
16
  NotLinkedObject: Final = "__NotLinkedObject"
17
17
 
18
18
 
19
- @dataclass
20
- class RawLayer:
21
- var: CapAnnDataDF = None
22
- X: X_NOTATION = None
23
-
24
- @property
25
- def shape(self) -> Tuple[int, int]:
26
- return self.X.shape if self.X is not None else None
27
-
28
-
29
- class CapAnnData:
30
- def __init__(self, h5_file: h5py.File) -> None:
31
- self._file: h5py.File = h5_file
32
- self.obs: CapAnnDataDF = None
33
- self.var: CapAnnDataDF = None
19
+ class BaseLayerMatrixAndDf:
20
+ def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
21
+ self._file = file
22
+ self._path_to_content = path_to_content
34
23
  self._X: X_NOTATION = None
35
- self._obsm: OBSM_NOTATION = None
36
- self._uns: CapAnnDataUns = None
37
- self._raw: RawLayer = None
38
- self._shape: Tuple[int, int] = None
39
24
 
40
25
  @property
41
26
  def X(self) -> X_NOTATION:
@@ -43,45 +28,48 @@ class CapAnnData:
43
28
  self._link_x()
44
29
  return self._X
45
30
 
46
- @property
47
- def obsm(self) -> OBSM_NOTATION:
48
- if self._obsm is None:
49
- self._link_obsm()
50
- return self._obsm
51
-
52
- @property
53
- def raw(self) -> RawLayer:
54
- if self._raw is None:
55
- self._link_raw_x()
56
- return self._raw
31
+ def _link_x(self) -> None:
32
+ x = self._file[self._path_to_content + "X"]
33
+ if isinstance(x, h5py.Dataset):
34
+ # dense X
35
+ self._X = x
36
+ else:
37
+ # sparse dataset
38
+ self._X = ad.experimental.sparse_dataset(x)
57
39
 
58
40
  @property
59
- def uns(self) -> CapAnnDataUns:
60
- if self._uns is None:
61
- self._uns = CapAnnDataUns({k: NotLinkedObject for k in self._file["uns"].keys()})
62
- return self._uns
63
-
64
- def read_obs(self, columns: List[str] = None) -> None:
65
- self.obs = self._read_df(self._file["obs"], columns=columns)
41
+ def shape(self) -> Tuple[int, int]:
42
+ if self.X is not None:
43
+ shape = tuple(map(int, self.X.shape))
44
+ else:
45
+ shape = None
46
+ return shape
47
+
48
+ def _lazy_df_load(self, key: str) -> CapAnnDataDF:
49
+ df = CapAnnDataDF()
50
+ attribute = self._path_to_content + key
51
+ column_order = self._read_attr(self._file[attribute], "column-order")
52
+ df.column_order = column_order
53
+ if df.column_order.dtype != object:
54
+ # empty DataFrame will have column_order as float64
55
+ # which leads to failure in overwrite method
56
+ df.column_order = df.column_order.astype(object)
57
+ return df
66
58
 
67
- def read_var(self, columns: List[str] = None, raw: bool = False) -> None:
68
- if raw:
69
- # Check if raw exists first
70
- if "raw" not in self._file.keys():
71
- logger.warning("Can't read raw.var since raw layer doesn't exist!")
72
- return
59
+ @staticmethod
60
+ def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
61
+ attrs = dict(obj.attrs)
62
+ if attr_name not in attrs.keys():
63
+ raise KeyError(f"The {attr_name} doesn't exist!")
64
+ return attrs[attr_name]
73
65
 
74
- if self._raw is None:
75
- self._raw = RawLayer()
76
- self._link_raw_x()
66
+ def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
67
+ group_path = self._path_to_content + key
68
+ if group_path not in self._file.keys():
69
+ raise ValueError(f"The group {group_path} doesn't exist in the file!")
77
70
 
78
- key = "raw/var"
79
- self._raw.var = self._read_df(self._file[key], columns=columns)
80
- else:
81
- key = "var"
82
- self.var = self._read_df(self._file[key], columns=columns)
71
+ h5_group = self._file[group_path]
83
72
 
84
- def _read_df(self, h5_group: h5py.Group, columns: List[str]) -> CapAnnDataDF:
85
73
  column_order = self._read_attr(h5_group, "column-order")
86
74
 
87
75
  if columns is None:
@@ -91,31 +79,143 @@ class CapAnnData:
91
79
  cols_to_read = [c for c in columns if c in column_order]
92
80
  df = CapAnnDataDF()
93
81
  df.column_order = column_order
94
-
95
82
  index_col = self._read_attr(h5_group, "_index")
96
83
  df.index = read_elem(h5_group[index_col])
97
84
 
98
85
  for col in cols_to_read:
99
86
  df[col] = read_elem(h5_group[col])
87
+
100
88
  if df.column_order.dtype != object:
101
89
  # empty DataFrame will have column_order as float64
102
90
  # which leads to failure in overwrite method
103
91
  df.column_order = df.column_order.astype(object)
104
92
  return df
105
93
 
106
- @staticmethod
107
- def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
108
- attrs = dict(obj.attrs)
109
- if attr_name not in attrs.keys():
110
- raise KeyError(f"The {attr_name} doesn't exist!")
111
- return attrs[attr_name]
94
+ def _write_elem_lzf(self, dest_key: str, elem: any) -> None:
95
+ write_elem(self._file, dest_key, elem, dataset_kwargs={"compression": "lzf"})
96
+
97
+ def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
98
+ if not isinstance(cap_df, CapAnnDataDF):
99
+ raise TypeError(
100
+ f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
101
+ )
102
+
103
+ if axis not in [0, 1]:
104
+ raise ValueError("The axis should be either 0 or 1!")
105
+
106
+ if cap_df.shape[0] != self.shape[axis]:
107
+ items = "cells" if axis == 0 else "genes"
108
+ raise ValueError(
109
+ f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
110
+ "AnnData object!"
111
+ )
112
+
113
+
114
+ class RawLayer(BaseLayerMatrixAndDf):
115
+ def __init__(self, h5_file: h5py.File):
116
+ super().__init__(h5_file, path_to_content="/raw/")
117
+ self._var: CapAnnDataDF = None
118
+
119
+ @property
120
+ def var(self) -> CapAnnDataDF:
121
+ if self._var is None:
122
+ self._var = self._lazy_df_load("var")
123
+ return self._var
124
+
125
+ @var.setter
126
+ def var(self, cap_df: CapAnnDataDF) -> None:
127
+ self._validate_cap_df(cap_df, axis=1)
128
+ self._var = cap_df
129
+
130
+ def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
131
+ df = self._read_df(key="var", columns=columns)
132
+ if self.var.empty or reset:
133
+ self._var = df
134
+ else:
135
+ for col in df.columns:
136
+ self._var[col] = df[col]
137
+
138
+
139
+ class CapAnnData(BaseLayerMatrixAndDf):
140
+ def __init__(self, h5_file: h5py.File) -> None:
141
+ super().__init__(h5_file, path_to_content="/")
142
+ self._file: h5py.File = h5_file
143
+ self._obs: CapAnnDataDF = None
144
+ self._var: CapAnnDataDF = None
145
+ self._X: X_NOTATION = None
146
+ self._obsm: OBSM_NOTATION = None
147
+ self._uns: CapAnnDataUns = None
148
+ self._raw: RawLayer = None
149
+ self._shape: Tuple[int, int] = None
150
+
151
+ @property
152
+ def obs(self) -> CapAnnDataDF:
153
+ if self._obs is None:
154
+ self._obs = self._lazy_df_load("obs")
155
+ return self._obs
156
+
157
+ @obs.setter
158
+ def obs(self, cap_df: CapAnnDataDF) -> None:
159
+ self._validate_cap_df(cap_df, axis=0)
160
+ self._obs = cap_df
161
+
162
+ @property
163
+ def var(self) -> CapAnnDataDF:
164
+ if self._var is None:
165
+ self._var = self._lazy_df_load("var")
166
+ return self._var
167
+
168
+ @var.setter
169
+ def var(self, cap_df: CapAnnDataDF) -> None:
170
+ self._validate_cap_df(cap_df, axis=1)
171
+ self._var = cap_df
172
+
173
+ @property
174
+ def obsm(self) -> OBSM_NOTATION:
175
+ if self._obsm is None:
176
+ self._link_obsm()
177
+ return self._obsm
178
+
179
+ @property
180
+ def raw(self) -> RawLayer:
181
+ if self._raw is None:
182
+ if "raw" not in self._file.keys():
183
+ logger.warning("Can't read raw.var since raw layer doesn't exist!")
184
+ return
185
+
186
+ self._raw = RawLayer(self._file)
187
+ return self._raw
188
+
189
+ @property
190
+ def uns(self) -> CapAnnDataUns:
191
+ if self._uns is None:
192
+ self._uns = CapAnnDataUns(
193
+ {k: NotLinkedObject for k in self._file["uns"].keys()}
194
+ )
195
+ return self._uns
196
+
197
+ def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
198
+ df = self._read_df("obs", columns=columns)
199
+ if self.obs.empty or reset:
200
+ self._obs = df
201
+ else:
202
+ for col in df.columns:
203
+ self._obs[col] = df[col]
204
+
205
+ def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
206
+ df = self._read_df("var", columns=columns)
207
+ if self.var.empty or reset:
208
+ self._var = df
209
+ else:
210
+ for col in df.columns:
211
+ self._var[col] = df[col]
112
212
 
113
213
  def overwrite(self, fields: List[str] = None) -> None:
114
214
  field_to_entity = {
115
215
  "obs": self.obs,
116
216
  "var": self.var,
117
217
  "raw.var": self.raw.var if self.raw is not None else None,
118
- "uns": self.uns
218
+ "uns": self.uns,
119
219
  }
120
220
 
121
221
  if fields is None:
@@ -124,7 +224,9 @@ class CapAnnData:
124
224
  for f in fields:
125
225
  if f not in field_to_entity.keys():
126
226
  raise KeyError(
127
- f"The field {f} is not supported! The list of suported fields are equal to supported attributes of the CapAnnData class: obs, var, raw.var and uns.")
227
+ f"The field {f} is not supported! The list of supported fields are equal to supported "
228
+ f"attributes of the CapAnnData class: obs, var, raw.var and uns."
229
+ )
128
230
 
129
231
  for key in ["obs", "var", "raw.var"]:
130
232
  if key in fields:
@@ -132,11 +234,17 @@ class CapAnnData:
132
234
  if entity is None:
133
235
  continue
134
236
 
135
- key = key.replace(".", '/') if key == "raw.var" else key
237
+ key = key.replace(".", "/") if key == "raw.var" else key
136
238
 
137
239
  for col in entity.columns:
138
240
  self._write_elem_lzf(f"{key}/{col}", entity[col].values)
139
- self._file[key].attrs['column-order'] = entity.column_order
241
+
242
+ column_order = entity.column_order
243
+ if (
244
+ column_order.size == 0
245
+ ): # Refs https://github.com/cellannotation/cap-anndata/issues/6
246
+ column_order = np.array([], dtype=np.float64)
247
+ self._file[key].attrs["column-order"] = column_order
140
248
 
141
249
  if "uns" in fields:
142
250
  for key in self.uns.keys():
@@ -156,32 +264,6 @@ class CapAnnData:
156
264
  sourse = self._file[f"uns/{key}"]
157
265
  self.uns[key] = read_elem(sourse)
158
266
 
159
- @property
160
- def shape(self) -> tuple[int, int]:
161
- return self.X.shape
162
-
163
- def _link_x(self) -> None:
164
- x = self._file["X"]
165
- if isinstance(x, h5py.Dataset):
166
- # dense X
167
- self._X = x
168
- else:
169
- # sparse dataset
170
- self._X = ad.experimental.sparse_dataset(x)
171
-
172
- def _link_raw_x(self) -> None:
173
- if "raw" in self._file.keys():
174
- if self._raw is None:
175
- self._raw = RawLayer()
176
-
177
- raw_x = self._file["raw/X"]
178
- if isinstance(raw_x, h5py.Dataset):
179
- # dense X
180
- self._raw.X = raw_x
181
- else:
182
- # sparse dataset
183
- self._raw.X = ad.experimental.sparse_dataset(raw_x)
184
-
185
267
  def _link_obsm(self) -> None:
186
268
  self._obsm = {}
187
269
  if "obsm" in self._file.keys():
@@ -198,27 +280,8 @@ class CapAnnData:
198
280
  def obsm_keys(self) -> List[str]:
199
281
  return list(self.obsm.keys())
200
282
 
201
- def _write_elem_lzf(self, dest_key: str, elem: any) -> None:
202
- write_elem(self._file, dest_key, elem, dataset_kwargs={"compression": "lzf"})
283
+ def obs_keys(self) -> List[str]:
284
+ return self.obs.column_order.tolist()
203
285
 
204
- @staticmethod
205
- @contextlib.contextmanager
206
- def read_anndata_file(file_path, backed='r'):
207
- """The method to read anndata file using original AnnData package"""
208
- logger.debug(f"Read file {file_path} in backed mode = {backed}...")
209
-
210
- adata = None
211
- try:
212
- adata = ad.read_h5ad(file_path, backed=backed)
213
- logger.debug(f"Successfully read anndata file path {file_path}")
214
- yield adata
215
-
216
- except Exception as error:
217
- logger.error(f"Error during read anndata file at path: {file_path}, error = {error}!")
218
- raise error
219
-
220
- finally:
221
- if adata is not None:
222
- if adata.isbacked:
223
- adata.file.close()
224
- logger.debug("AnnData closed!")
286
+ def var_keys(self) -> List[str]:
287
+ return self.var.column_order.tolist()
cap_anndata/reader.py ADDED
@@ -0,0 +1,44 @@
1
+ import logging
2
+ import contextlib
3
+ import h5py
4
+
5
+ from cap_anndata import CapAnnData
6
+
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ @contextlib.contextmanager
12
+ def read_h5ad(file_path: str, edit: bool = False):
13
+ """
14
+ This is the main read method for CapAnnData.
15
+ Must be used in 'with' context.
16
+ """
17
+ mode = "r+" if edit else "r"
18
+ logger.debug(f"Read file {file_path} mode={mode} in context...")
19
+
20
+ try:
21
+ file = h5py.File(file_path, mode)
22
+ cap_adata = CapAnnData(file)
23
+ logger.debug(f"Successfully read anndata file path {file_path}")
24
+ yield cap_adata
25
+
26
+ except Exception as error:
27
+ logger.error(f"Error during read anndata file at path: {file_path}, error = {error}!")
28
+ raise error
29
+
30
+ finally:
31
+ file.close()
32
+ logger.debug("AnnData closed!")
33
+
34
+
35
+ def read_directly(file_path: str, edit: bool = False) -> CapAnnData:
36
+ """
37
+ Must be used only in specific cases.
38
+ User is responsible to close the h5py file when the work with CapAnnData instance done.
39
+ """
40
+ mode = "r+" if edit else "r"
41
+ logger.debug(f"Read file {file_path} mode={mode} directly...")
42
+ file = h5py.File(file_path, mode)
43
+ cap_adata = CapAnnData(file)
44
+ return cap_adata
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cap_anndata
3
- Version: 0.1.1
4
- Summary: Partial read of AnnData files for low-memory operations with large datasets.
3
+ Version: 0.2.1
4
+ Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
5
5
  Home-page: https://github.com/cellannotation/cap-anndata
6
6
  Author: R. Mukhin, A. Isaev
7
7
  Author-email: roman@ebookapplications.com
@@ -15,8 +15,10 @@ License-File: LICENSE
15
15
  Requires-Dist: numpy >=1.26.3
16
16
  Requires-Dist: pandas >=2.2.0
17
17
  Requires-Dist: anndata >=0.10.5
18
+ Requires-Dist: h5py >=3.5.0
18
19
  Provides-Extra: dev
19
20
  Requires-Dist: pytest >=8.0.0 ; extra == 'dev'
21
+ Requires-Dist: setuptools ~=69.1.1 ; extra == 'dev'
20
22
 
21
23
  # CAP-AnnData: Enhanced Partial I/O for AnnData Files
22
24
 
@@ -25,41 +27,65 @@ CAP-AnnData enriches the AnnData ecosystem by offering tailored functionalities
25
27
 
26
28
  ## Getting Started
27
29
 
30
+ ### Installation
31
+ Install CAP-AnnData via pip:
32
+
33
+ ```commandline
34
+ pip install -U cap-anndata
35
+ ```
36
+
28
37
  ### Running Tests
29
- Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests in `test/unit_test.py`.
38
+ Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests via `pytest` from the root of the repo.
39
+
40
+ ```commandline
41
+ pip install pytest
42
+ pytest test
43
+ ```
30
44
 
31
45
  Make sure Python 3.9 or newer is used, along with all requirements specified in requirements.txt
32
46
 
33
47
  ## How-TO:
34
48
 
35
- #### 1. Read AnnData File Dataframes
49
+ #### 1. Access AnnData File DataFrames
36
50
 
37
51
  ##### Basic Reading
38
52
  By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.
39
53
 
40
54
  ```python
41
- import h5py
42
- from cap_anndata import CapAnnData
55
+ from cap_anndata import read_h5ad
43
56
 
44
57
  file_path = "your_data.h5ad"
45
- with h5py.File(file_path, 'r') as file:
46
- cap_adata = CapAnnData(file)
47
-
58
+ with read_h5ad(file_path=file_path, edit=False) as cap_adata:
59
+ # Get the list of all obs columns in AnnData file
60
+ cap_adata.obs_keys() # ['a', 'b', 'c']
48
61
  # Read all columns of 'obs'
49
62
  cap_adata.read_obs()
63
+ # Get the list of columns of DataFrame in memory
64
+ cap_adata.obs.columns # ['a', 'b', 'c']
50
65
 
66
+ # Get the list of all var columns in AnnData file
67
+ cap_adata.var_keys() # ['d', 'e', 'f']
51
68
  # Read specific columns of 'var'
52
- cap_adata.read_var(columns=['gene_expression', 'dispersion'])
53
-
54
- # Read all columns of raw.var
55
- cap_adata.read_var(raw=True)
69
+ cap_adata.read_var(columns=['d'])
70
+ cap_adata.var.columns # ['d']
71
+ # Read additional column
72
+ cap_adata.read_var(columns=['e'])
73
+ cap_adata.var.columns # ['d', 'e']
74
+
75
+ # Read column and reset the in-memory DataFrame before that
76
+ cap_adata.read_var(columns=['f'], reset=True)
77
+ cap_adata.var.columns # ['f']
78
+
79
+ # Read no columns of raw.var (only the index)
80
+ cap_adata.raw.read_var(columns=[])
56
81
  ```
57
82
 
58
- ##### Non-existing columns
83
+ ##### Difference between `obs_keys()` and `obs.columns`
84
+ `obs_keys()` returns the list of columns in the on-disk AnnData file, while `obs.columns` returns the list of columns in the in-memory DataFrame. The two lists may differ if you read only specific columns. If you modify the in-memory DataFrame, `obs_keys()` will reflect the changes. It is recommended to check `obs_keys()` before calling `overwrite()` to avoid damaging the AnnData file.
59
85
 
60
- If a column doesn't exist in the file, no error will be raised but the column will be missing in the resulting Dataframe. So, the list of columns saying more like "try to read this columns from the file". It is needed because we there is no way yet to check if the column exists before the read.
86
+ If a column doesn't exist in the file, no error will be raised, but the column will be missing from the resulting DataFrame. In other words, the list of columns means "try to read these columns from the file". This is needed because there is no way yet to check whether a column exists before the read. The same behavior applies to `var_keys()` and `var.columns`.
61
87
 
62
- #### 2. Modify the AnnData File Dataframes In-Place
88
+ #### 2. Modify the AnnData File DataFrames In-Place
63
89
 
64
90
  You can directly modify the dataframe by adding, renaming, or removing columns.
65
91
 
@@ -68,13 +94,14 @@ You can directly modify the dataframe by adding, renaming, or removing columns.
68
94
  cap_adata.obs['new_col'] = [value1, value2, value3]
69
95
 
70
96
  # Rename a column
71
- cap_adata.rename_column('old_col_name', 'new_col_name')
97
+ cap_adata.obs.rename_column('old_col_name', 'new_col_name')
72
98
 
73
99
  # Remove a column
74
- cap_adata.remove_column('col_to_remove')
100
+ cap_adata.obs.remove_column('col_to_remove')
75
101
  ```
76
102
 
77
103
  After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.
104
+ Note: `read_h5ad` must be called with the `edit=True` argument to open the `.h5ad` file in `r+` mode.
78
105
 
79
106
  ```python
80
107
  # overwrite all values which were read
@@ -84,7 +111,7 @@ cap_adata.overwrite()
84
111
  cap_adata.overwrite(['obs', 'var'])
85
112
  ```
86
113
 
87
- The full list of supported fields: `X`, `raw.X`, `obs`, `var`, `raw.var`, `obsm`, `uns`.
114
+ The full list of supported fields: `obs`, `var`, `raw.var`, `obsm`, `uns`.
88
115
 
89
116
  #### 3. How to Read Few Columns but Overwrite One in a Dataframe
90
117
 
@@ -100,14 +127,19 @@ cap_adata.obs.drop(columns='sample', inplace=True)
100
127
 
101
128
  # Overwrite changes
102
129
  cap_adata.overwrite(['obs'])
130
+
131
+ # NOTE that the line
132
+ # cap_adata.read_obs(columns=['sample'], reset=True)
133
+ # Will override in-memory changes with values from the AnnData file
103
134
  ```
104
135
 
105
136
  #### 4. How to work with X and raw.X
106
137
 
107
- The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
138
+ The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
139
+ The X object will be returned as the `h5py.Dataset` or `AnnData.experimental.sparse_dataset`.
108
140
 
109
141
  ```python
110
- with h5py.File(path) as file:
142
+ with read_h5ad(file_path=file_path, edit=False) as cap_adata:
111
143
  # self.X is None here
112
144
  cap_adata = CapAnnData(file)
113
145
 
@@ -135,13 +167,13 @@ s_ = np.s_[mask, :5]
135
167
 
136
168
  #### 5. How to handle obsm embedding matrices
137
169
 
138
- By the default the CapAnnData will not read the embeddings matrix. The link to the h5py objects will be created upon the first call of the `.obsm` property. Alike the AnnData package the call like `cap_adata.obsm["X_tsne"]` will not return the in-memory matrix but will return the backed version instead. We can get the information about the name and shape of the embeddings without taking the whole matrixes in the memory!
170
+ By default, CapAnnData will not read the embedding matrices.
171
+ The link to the h5py objects will be created upon the first call of the `.obsm` property.
172
+ Alike the AnnData package the call like `cap_adata.obsm["X_tsne"]` will not return the in-memory matrix but will return the backed version instead.
173
+ It is possible to get the name and shape of each embedding without loading the whole matrix into memory.
139
174
 
140
175
  ```python
141
- with h5py.File(path) as file:
142
- # initialization
143
- cap_adata = CapAnnData(file)
144
-
176
+ with read_h5ad(file_path=file_path, edit=False) as cap_adata:
145
177
  # will return the list of strings
146
178
  obsm_keys = cap_adata.obsm_keys()
147
179
 
@@ -158,10 +190,7 @@ with h5py.File(path) as file:
158
190
  The `CapAnnData` class will lazily link the uns section upon the first call but ***WILL NOT*** read it into memory. Instead, a dictionary of pairs `{'key': "__NotLinkedObject"}` will be created. This allows getting the list of keys before the actual read. To read the uns section into memory, the `.read_uns(keys)` method must be called.
159
191
 
160
192
  ```python
161
- with h5py.File(path) as file:
162
- # initialization
163
- cap_adata = CapAnnData(file)
164
-
193
+ with read_h5ad(file_path=file_path, edit=True) as cap_adata:
165
194
  # will return the keys() object
166
195
  keys = cap_adata.uns.keys()
167
196
 
@@ -197,3 +226,28 @@ To save `uns` changes the method `CapAnnData.overwrite()` must be called.
197
226
  cap_adata.overwrite() # all in-memory fields will be overwritten
198
227
  cap_adata.overwrite(["uns"]) # overwrite the uns section only
199
228
  ```
229
+
230
+ #### 7. Join and Merge DataFrames
231
+
232
+ Cap-AnnData provides enhanced methods for joining and merging dataframes, preserving column order and data integrity.
233
+
234
+ ```python
235
+ from cap_anndata import CapAnnDataDF
236
+ import pandas as pd
237
+
238
+ data1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
239
+ data2 = pd.DataFrame({'D': [7, 8, 9], 'E': [10, 11, 12]})
240
+ cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=['A', 'B', 'C'])
241
+
242
+ cap_df = cap_anndata_df1.join(data2, how='left')
243
+
244
+ cap_df.columns # ['A', 'B', 'D', 'E']
245
+ cap_df.column_order # ['A', 'B', 'C', 'D', 'E']
246
+
247
+ data3 = pd.DataFrame({'A': [2, 3, 4], 'D': [10, 11, 12]})
248
+ cap_df = cap_anndata_df1.merge(data3, on='A')
249
+
250
+ cap_df.columns # ['A', 'B', 'D']
251
+ cap_df.column_order # ['A', 'B', 'C', 'D']
252
+ cap_df.shape # (2, 3)
253
+ ```
@@ -0,0 +1,10 @@
1
+ cap_anndata/__init__.py,sha256=l9lvFpcMsQksp8_dI-fjUgrImoMdztbu3jVSdmxNPmA,205
2
+ cap_anndata/backed_df.py,sha256=06wZwEjszFQ8lkvy6-GgD_SD14idu9857RtlfMQiBjE,2691
3
+ cap_anndata/backed_uns.py,sha256=Tfxoz3RgcgENf4SvxFOox9w048K2QmBTh1VbAf4yqVI,854
4
+ cap_anndata/cap_anndata.py,sha256=nv5f7A9jyK_rZ2kx54XvnX-V65MFlE3CYQC-n_zBhB8,10097
5
+ cap_anndata/reader.py,sha256=kg9xoS_S0gY6WpsHE8PwGMa14VXh9Ibqjw4bwoerYsE,1267
6
+ cap_anndata-0.2.1.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
7
+ cap_anndata-0.2.1.dist-info/METADATA,sha256=KuFmqvbkQ4O61na6ifNlekuL6t2NU2NxrSV3npXUfMg,9569
8
+ cap_anndata-0.2.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
9
+ cap_anndata-0.2.1.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
10
+ cap_anndata-0.2.1.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- cap_anndata/__init__.py,sha256=4Vex9i79uTgNQZo_yiEuNc0KoLXPs5Awv87KWmDbxzM,143
2
- cap_anndata/backed_df.py,sha256=Ce74WHzXhebYRORx7yjVJD02XCcF5j1SxvmbTIpwzCA,1418
3
- cap_anndata/backed_uns.py,sha256=Tfxoz3RgcgENf4SvxFOox9w048K2QmBTh1VbAf4yqVI,854
4
- cap_anndata/cap_anndata.py,sha256=4mpsJjEgrmJGSVaY0cYp_mM6CTquI9NC-oYXbZxUjH4,7815
5
- cap_anndata-0.1.1.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
6
- cap_anndata-0.1.1.dist-info/METADATA,sha256=7xIKTN7cO4jbenuUTXEXJcJUCFJZwwZPN7VvE5Fp7EE,7123
7
- cap_anndata-0.1.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
8
- cap_anndata-0.1.1.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
9
- cap_anndata-0.1.1.dist-info/RECORD,,