cap-anndata 0.1.0__tar.gz → 0.2.0__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/PKG-INFO +82 -30
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/README.md +80 -28
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/cap_anndata/__init__.py +4 -0
- cap_anndata-0.2.0/cap_anndata/backed_df.py +69 -0
- cap_anndata-0.2.0/cap_anndata/cap_anndata.py +287 -0
- cap_anndata-0.2.0/cap_anndata/reader.py +44 -0
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/cap_anndata.egg-info/PKG-INFO +82 -30
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/cap_anndata.egg-info/SOURCES.txt +3 -1
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/setup.py +3 -3
- cap_anndata-0.2.0/test/test_backed_df.py +81 -0
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/test/test_backed_uns.py +36 -36
- cap_anndata-0.2.0/test/test_cap_anndata.py +431 -0
- cap_anndata-0.2.0/test/test_reader.py +22 -0
- cap_anndata-0.1.0/cap_anndata/backed_df.py +0 -40
- cap_anndata-0.1.0/cap_anndata/cap_anndata.py +0 -221
- cap_anndata-0.1.0/test/test_backed_df.py +0 -58
- cap_anndata-0.1.0/test/test_cap_anndata.py +0 -291
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/LICENSE +0 -0
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/cap_anndata/backed_uns.py +0 -0
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/cap_anndata.egg-info/dependency_links.txt +0 -0
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/cap_anndata.egg-info/requires.txt +0 -0
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/cap_anndata.egg-info/top_level.txt +0 -0
- {cap_anndata-0.1.0 → cap_anndata-0.2.0}/setup.cfg +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cap_anndata
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary: Partial read of AnnData files for low-memory operations with large datasets.
|
3
|
+
Version: 0.2.0
|
4
|
+
Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
|
5
5
|
Home-page: https://github.com/cellannotation/cap-anndata
|
6
6
|
Author: R. Mukhin, A. Isaev
|
7
7
|
Author-email: roman@ebookapplications.com
|
@@ -25,41 +25,65 @@ CAP-AnnData enriches the AnnData ecosystem by offering tailored functionalities
|
|
25
25
|
|
26
26
|
## Getting Started
|
27
27
|
|
28
|
+
### Installation
|
29
|
+
Install CAP-AnnData via pip:
|
30
|
+
|
31
|
+
```commandline
|
32
|
+
pip install -U cap-anndata
|
33
|
+
```
|
34
|
+
|
28
35
|
### Running Tests
|
29
|
-
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests
|
36
|
+
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests via `pytest` from the root of the repo.
|
37
|
+
|
38
|
+
```commandline
|
39
|
+
pip install pytest
|
40
|
+
pytest test
|
41
|
+
```
|
30
42
|
|
31
43
|
Make sure Python 3.9 or newer is used, along with all requirements specified in requirements.txt
|
32
44
|
|
33
45
|
## How-TO:
|
34
46
|
|
35
|
-
#### 1.
|
47
|
+
#### 1. Access AnnData File DataFrames
|
36
48
|
|
37
49
|
##### Basic Reading
|
38
50
|
By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.
|
39
51
|
|
40
52
|
```python
|
41
|
-
import
|
42
|
-
from cap_anndata import CapAnnData
|
53
|
+
from cap_anndata import read_h5ad
|
43
54
|
|
44
55
|
file_path = "your_data.h5ad"
|
45
|
-
with
|
46
|
-
|
47
|
-
|
56
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
57
|
+
# Get the list of all obs columns in AnnData file
|
58
|
+
cap_adata.obs_keys() # ['a', 'b', 'c']
|
48
59
|
# Read all columns of 'obs'
|
49
60
|
cap_adata.read_obs()
|
61
|
+
# Get the list of columns of DataFrame in memory
|
62
|
+
cap_adata.obs.columns # ['a', 'b', 'c']
|
50
63
|
|
64
|
+
# Get the list of all var columns in AnnData file
|
65
|
+
cap_adata.var_keys() # ['d', 'e', 'f']
|
51
66
|
# Read specific columns of 'var'
|
52
|
-
cap_adata.read_var(columns=['
|
53
|
-
|
54
|
-
# Read
|
55
|
-
cap_adata.read_var(
|
67
|
+
cap_adata.read_var(columns=['d'])
|
68
|
+
cap_adata.var.columns # ['d']
|
69
|
+
# Read additional column
|
70
|
+
cap_adata.read_var(columns=['e'])
|
71
|
+
cap_adata.var.columns # ['d', 'e']
|
72
|
+
|
73
|
+
# Read column and reset the in-memory DataFrame before that
|
74
|
+
cap_adata.read_var(columns=['f'], reset=True)
|
75
|
+
cap_adata.var.columns # ['f']
|
76
|
+
|
77
|
+
# Read no columns of raw.var (only the index)
|
78
|
+
cap_adata.raw.read_var(columns=[])
|
56
79
|
```
|
57
80
|
|
58
|
-
#####
|
81
|
+
##### Difference between `obs_keys()` and `obs.columns`
|
82
|
+
`obs_keys()` returns the list of columns in the on-disk AnnData file, while `obs.columns` returns the list of columns in the in-memory DataFrame. The two lists may differ if you read only specific columns. If you modify the in-memory DataFrame, `obs_keys()` will reflect the changes. It is recommended to check `obs_keys()` before calling `overwrite()` to avoid damaging the AnnData file.
|
59
83
|
|
60
|
-
If a column doesn't exist in the file, no error will be raised but the column will be missing in the resulting
|
84
|
+
If a column doesn't exist in the file, no error will be raised but the column will be missing in the resulting DataFrame. So, the list of columns is more like a request: "try to read these columns from the file". This is needed because there is no way yet to check whether a column exists before the read. The same behavior applies to `var_keys()` and `var.columns`.
|
61
85
|
|
62
|
-
#### 2. Modify the AnnData File
|
86
|
+
#### 2. Modify the AnnData File DataFrames In-Place
|
63
87
|
|
64
88
|
You can directly modify the dataframe by adding, renaming, or removing columns.
|
65
89
|
|
@@ -68,13 +92,14 @@ You can directly modify the dataframe by adding, renaming, or removing columns.
|
|
68
92
|
cap_adata.obs['new_col'] = [value1, value2, value3]
|
69
93
|
|
70
94
|
# Rename a column
|
71
|
-
cap_adata.rename_column('old_col_name', 'new_col_name')
|
95
|
+
cap_adata.obs.rename_column('old_col_name', 'new_col_name')
|
72
96
|
|
73
97
|
# Remove a column
|
74
|
-
cap_adata.remove_column('col_to_remove')
|
98
|
+
cap_adata.obs.remove_column('col_to_remove')
|
75
99
|
```
|
76
100
|
|
77
101
|
After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.
|
102
|
+
Note: `read_h5ad` must be called with `edit=True` argument to open `.h5ad` file in `r+` mode.
|
78
103
|
|
79
104
|
```python
|
80
105
|
# overwrite all values which were read
|
@@ -84,7 +109,7 @@ cap_adata.overwrite()
|
|
84
109
|
cap_adata.overwrite(['obs', 'var'])
|
85
110
|
```
|
86
111
|
|
87
|
-
The full list of supported fields: `
|
112
|
+
The full list of supported fields: `obs`, `var`, `raw.var`, `obsm`, `uns`.
|
88
113
|
|
89
114
|
#### 3. How to Read Few Columns but Overwrite One in a Dataframe
|
90
115
|
|
@@ -100,14 +125,19 @@ cap_adata.obs.drop(columns='sample', inplace=True)
|
|
100
125
|
|
101
126
|
# Overwrite changes
|
102
127
|
cap_adata.overwrite(['obs'])
|
128
|
+
|
129
|
+
# NOTE that the line
|
130
|
+
# cap_adata.read_obs(columns=['sample'], reset=True)
|
131
|
+
# Will override in-memory changes with values from the AnnData file
|
103
132
|
```
|
104
133
|
|
105
134
|
#### 4. How to work with X and raw.X
|
106
135
|
|
107
|
-
The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
|
136
|
+
The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
|
137
|
+
The X object will be returned as the `h5py.Dataset` or `AnnData.experimental.sparse_dataset`.
|
108
138
|
|
109
139
|
```python
|
110
|
-
with
|
140
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
111
141
|
# self.X is None here
|
112
142
|
cap_adata = CapAnnData(file)
|
113
143
|
|
@@ -135,13 +165,13 @@ s_ = np.s_[mask, :5]
|
|
135
165
|
|
136
166
|
#### 5. How to handle obsm embeddings matrices
|
137
167
|
|
138
|
-
By the default the CapAnnData will not read the embeddings matrix.
|
168
|
+
By the default the CapAnnData will not read the embeddings matrix.
|
169
|
+
The link to the h5py objects will be created upon the first call of the `.obsm` property.
|
170
|
+
As in the AnnData package, a call like `cap_adata.obsm["X_tsne"]` will not return the in-memory matrix but will return the backed version instead.
|
171
|
+
It is possible to get the information about the name and shape of the embeddings without taking the whole matrix in the memory.
|
139
172
|
|
140
173
|
```python
|
141
|
-
with
|
142
|
-
# initialization
|
143
|
-
cap_adata = CapAnnData(file)
|
144
|
-
|
174
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
145
175
|
# will return the list of strings
|
146
176
|
obsm_keys = cap_adata.obsm_keys()
|
147
177
|
|
@@ -158,10 +188,7 @@ with h5py.File(path) as file:
|
|
158
188
|
The `CapAnnData` class will lazily link the uns section upon the first call but ***WILL NOT*** read it into memory. Instead, a dictionary of pairs `{'key': "__NotLinkedObject"}` will be created. This allows getting the list of keys before the actual read. To read the uns section into memory, the `.read_uns(keys)` method must be called.
|
159
189
|
|
160
190
|
```python
|
161
|
-
with
|
162
|
-
# initialization
|
163
|
-
cap_adata = CapAnnData(file)
|
164
|
-
|
191
|
+
with read_h5ad(file_path=file_path, edit=True) as cap_adata:
|
165
192
|
# will return the keys() object
|
166
193
|
keys = cap_adata.uns.keys()
|
167
194
|
|
@@ -197,3 +224,28 @@ To save `uns` changes the method `CapAnnData.overwrite()` must be called.
|
|
197
224
|
cap_adata.overwrite() # all in-memory fields will be overwritten
|
198
225
|
cap_adata.overwrite(["uns"]) # overwrite the uns section only
|
199
226
|
```
|
227
|
+
|
228
|
+
#### 7. Join and Merge DataFrames
|
229
|
+
|
230
|
+
Cap-AnnData provides enhanced methods for joining and merging dataframes, preserving column order and data integrity
|
231
|
+
|
232
|
+
```python
|
233
|
+
from cap_anndata import CapAnnDataDF
|
234
|
+
import pandas as pd
|
235
|
+
|
236
|
+
data1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
237
|
+
data2 = pd.DataFrame({'D': [7, 8, 9], 'E': [10, 11, 12]})
|
238
|
+
cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=['A', 'B', 'C'])
|
239
|
+
|
240
|
+
cap_df = cap_anndata_df1.join(data2, how='left')
|
241
|
+
|
242
|
+
cap_df.columns # ['A', 'B', 'D', 'E']
|
243
|
+
cap_df.column_order # ['A', 'B', 'C', 'D', 'E']
|
244
|
+
|
245
|
+
data3 = pd.DataFrame({'A': [2, 3, 4], 'D': [10, 11, 12]})
|
246
|
+
cap_df = cap_anndata_df1.merge(data3, on='A')
|
247
|
+
|
248
|
+
cap_df.columns # ['A', 'B', 'D']
|
249
|
+
cap_df.column_order # ['A', 'B', 'C', 'D']
|
250
|
+
cap_df.shape # (2, 3)
|
251
|
+
```
|
@@ -5,41 +5,65 @@ CAP-AnnData enriches the AnnData ecosystem by offering tailored functionalities
|
|
5
5
|
|
6
6
|
## Getting Started
|
7
7
|
|
8
|
+
### Installation
|
9
|
+
Install CAP-AnnData via pip:
|
10
|
+
|
11
|
+
```commandline
|
12
|
+
pip install -U cap-anndata
|
13
|
+
```
|
14
|
+
|
8
15
|
### Running Tests
|
9
|
-
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests
|
16
|
+
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests via `pytest` from the root of the repo.
|
17
|
+
|
18
|
+
```commandline
|
19
|
+
pip install pytest
|
20
|
+
pytest test
|
21
|
+
```
|
10
22
|
|
11
23
|
Make sure Python 3.9 or newer is used, along with all requirements specified in requirements.txt
|
12
24
|
|
13
25
|
## How-TO:
|
14
26
|
|
15
|
-
#### 1.
|
27
|
+
#### 1. Access AnnData File DataFrames
|
16
28
|
|
17
29
|
##### Basic Reading
|
18
30
|
By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.
|
19
31
|
|
20
32
|
```python
|
21
|
-
import
|
22
|
-
from cap_anndata import CapAnnData
|
33
|
+
from cap_anndata import read_h5ad
|
23
34
|
|
24
35
|
file_path = "your_data.h5ad"
|
25
|
-
with
|
26
|
-
|
27
|
-
|
36
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
37
|
+
# Get the list of all obs columns in AnnData file
|
38
|
+
cap_adata.obs_keys() # ['a', 'b', 'c']
|
28
39
|
# Read all columns of 'obs'
|
29
40
|
cap_adata.read_obs()
|
41
|
+
# Get the list of columns of DataFrame in memory
|
42
|
+
cap_adata.obs.columns # ['a', 'b', 'c']
|
30
43
|
|
44
|
+
# Get the list of all var columns in AnnData file
|
45
|
+
cap_adata.var_keys() # ['d', 'e', 'f']
|
31
46
|
# Read specific columns of 'var'
|
32
|
-
cap_adata.read_var(columns=['
|
33
|
-
|
34
|
-
# Read
|
35
|
-
cap_adata.read_var(
|
47
|
+
cap_adata.read_var(columns=['d'])
|
48
|
+
cap_adata.var.columns # ['d']
|
49
|
+
# Read additional column
|
50
|
+
cap_adata.read_var(columns=['e'])
|
51
|
+
cap_adata.var.columns # ['d', 'e']
|
52
|
+
|
53
|
+
# Read column and reset the in-memory DataFrame before that
|
54
|
+
cap_adata.read_var(columns=['f'], reset=True)
|
55
|
+
cap_adata.var.columns # ['f']
|
56
|
+
|
57
|
+
# Read no columns of raw.var (only the index)
|
58
|
+
cap_adata.raw.read_var(columns=[])
|
36
59
|
```
|
37
60
|
|
38
|
-
#####
|
61
|
+
##### Difference between `obs_keys()` and `obs.columns`
|
62
|
+
`obs_keys()` returns the list of columns in the on-disk AnnData file, while `obs.columns` returns the list of columns in the in-memory DataFrame. The two lists may differ if you read only specific columns. If you modify the in-memory DataFrame, `obs_keys()` will reflect the changes. It is recommended to check `obs_keys()` before calling `overwrite()` to avoid damaging the AnnData file.
|
39
63
|
|
40
|
-
If a column doesn't exist in the file, no error will be raised but the column will be missing in the resulting
|
64
|
+
If a column doesn't exist in the file, no error will be raised but the column will be missing in the resulting DataFrame. So, the list of columns is more like a request: "try to read these columns from the file". This is needed because there is no way yet to check whether a column exists before the read. The same behavior applies to `var_keys()` and `var.columns`.
|
41
65
|
|
42
|
-
#### 2. Modify the AnnData File
|
66
|
+
#### 2. Modify the AnnData File DataFrames In-Place
|
43
67
|
|
44
68
|
You can directly modify the dataframe by adding, renaming, or removing columns.
|
45
69
|
|
@@ -48,13 +72,14 @@ You can directly modify the dataframe by adding, renaming, or removing columns.
|
|
48
72
|
cap_adata.obs['new_col'] = [value1, value2, value3]
|
49
73
|
|
50
74
|
# Rename a column
|
51
|
-
cap_adata.rename_column('old_col_name', 'new_col_name')
|
75
|
+
cap_adata.obs.rename_column('old_col_name', 'new_col_name')
|
52
76
|
|
53
77
|
# Remove a column
|
54
|
-
cap_adata.remove_column('col_to_remove')
|
78
|
+
cap_adata.obs.remove_column('col_to_remove')
|
55
79
|
```
|
56
80
|
|
57
81
|
After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.
|
82
|
+
Note: `read_h5ad` must be called with `edit=True` argument to open `.h5ad` file in `r+` mode.
|
58
83
|
|
59
84
|
```python
|
60
85
|
# overwrite all values which were read
|
@@ -64,7 +89,7 @@ cap_adata.overwrite()
|
|
64
89
|
cap_adata.overwrite(['obs', 'var'])
|
65
90
|
```
|
66
91
|
|
67
|
-
The full list of supported fields: `
|
92
|
+
The full list of supported fields: `obs`, `var`, `raw.var`, `obsm`, `uns`.
|
68
93
|
|
69
94
|
#### 3. How to Read Few Columns but Overwrite One in a Dataframe
|
70
95
|
|
@@ -80,14 +105,19 @@ cap_adata.obs.drop(columns='sample', inplace=True)
|
|
80
105
|
|
81
106
|
# Overwrite changes
|
82
107
|
cap_adata.overwrite(['obs'])
|
108
|
+
|
109
|
+
# NOTE that the line
|
110
|
+
# cap_adata.read_obs(columns=['sample'], reset=True)
|
111
|
+
# Will override in-memory changes with values from the AnnData file
|
83
112
|
```
|
84
113
|
|
85
114
|
#### 4. How to work with X and raw.X
|
86
115
|
|
87
|
-
The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
|
116
|
+
The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
|
117
|
+
The X object will be returned as the `h5py.Dataset` or `AnnData.experimental.sparse_dataset`.
|
88
118
|
|
89
119
|
```python
|
90
|
-
with
|
120
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
91
121
|
# self.X is None here
|
92
122
|
cap_adata = CapAnnData(file)
|
93
123
|
|
@@ -115,13 +145,13 @@ s_ = np.s_[mask, :5]
|
|
115
145
|
|
116
146
|
#### 5. How to handle obsm embeddings matrices
|
117
147
|
|
118
|
-
By the default the CapAnnData will not read the embeddings matrix.
|
148
|
+
By the default the CapAnnData will not read the embeddings matrix.
|
149
|
+
The link to the h5py objects will be created upon the first call of the `.obsm` property.
|
150
|
+
As in the AnnData package, a call like `cap_adata.obsm["X_tsne"]` will not return the in-memory matrix but will return the backed version instead.
|
151
|
+
It is possible to get the information about the name and shape of the embeddings without taking the whole matrix in the memory.
|
119
152
|
|
120
153
|
```python
|
121
|
-
with
|
122
|
-
# initialization
|
123
|
-
cap_adata = CapAnnData(file)
|
124
|
-
|
154
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
125
155
|
# will return the list of strings
|
126
156
|
obsm_keys = cap_adata.obsm_keys()
|
127
157
|
|
@@ -138,10 +168,7 @@ with h5py.File(path) as file:
|
|
138
168
|
The `CapAnnData` class will lazily link the uns section upon the first call but ***WILL NOT*** read it into memory. Instead, a dictionary of pairs `{'key': "__NotLinkedObject"}` will be created. This allows getting the list of keys before the actual read. To read the uns section into memory, the `.read_uns(keys)` method must be called.
|
139
169
|
|
140
170
|
```python
|
141
|
-
with
|
142
|
-
# initialization
|
143
|
-
cap_adata = CapAnnData(file)
|
144
|
-
|
171
|
+
with read_h5ad(file_path=file_path, edit=True) as cap_adata:
|
145
172
|
# will return the keys() object
|
146
173
|
keys = cap_adata.uns.keys()
|
147
174
|
|
@@ -177,3 +204,28 @@ To save `uns` changes the method `CapAnnData.overwrite()` must be called.
|
|
177
204
|
cap_adata.overwrite() # all in-memory fields will be overwritten
|
178
205
|
cap_adata.overwrite(["uns"]) # overwrite the uns section only
|
179
206
|
```
|
207
|
+
|
208
|
+
#### 7. Join and Merge DataFrames
|
209
|
+
|
210
|
+
Cap-AnnData provides enhanced methods for joining and merging dataframes, preserving column order and data integrity
|
211
|
+
|
212
|
+
```python
|
213
|
+
from cap_anndata import CapAnnDataDF
|
214
|
+
import pandas as pd
|
215
|
+
|
216
|
+
data1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
217
|
+
data2 = pd.DataFrame({'D': [7, 8, 9], 'E': [10, 11, 12]})
|
218
|
+
cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=['A', 'B', 'C'])
|
219
|
+
|
220
|
+
cap_df = cap_anndata_df1.join(data2, how='left')
|
221
|
+
|
222
|
+
cap_df.columns # ['A', 'B', 'D', 'E']
|
223
|
+
cap_df.column_order # ['A', 'B', 'C', 'D', 'E']
|
224
|
+
|
225
|
+
data3 = pd.DataFrame({'A': [2, 3, 4], 'D': [10, 11, 12]})
|
226
|
+
cap_df = cap_anndata_df1.merge(data3, on='A')
|
227
|
+
|
228
|
+
cap_df.columns # ['A', 'B', 'D']
|
229
|
+
cap_df.column_order # ['A', 'B', 'C', 'D']
|
230
|
+
cap_df.shape # (2, 3)
|
231
|
+
```
|
@@ -0,0 +1,69 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import numpy as np
|
3
|
+
from typing import List, Any
|
4
|
+
import logging
|
5
|
+
|
6
|
+
from pandas._typing import Self
|
7
|
+
from pandas.core.generic import bool_t
|
8
|
+
|
9
|
+
# Logger for this module, looked up by the module's own name.
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
class CapAnnDataDF(pd.DataFrame):
    """
    The class to expand the pandas DataFrame behaviour to support partial
    reading and writing of AnnData obs and var (raw.var) fields.
    The main feature of the class is handling <column-order> attribute
    which must be a copy of h5py.Group attribute.

    ``column_order`` may list more names than the frame has columns. It is
    stored as given by ``from_df`` (list or array); every method that changes
    it rebinds the attribute to a new object-dtype NumPy array instead of
    writing into the existing container, so the order is never shared with a
    copy or with the caller's input, and long names are never truncated by a
    fixed-width string dtype.
    """

    _metadata = ["column_order"]

    def _column_order_array(self) -> np.ndarray:
        """Return a fresh object-dtype copy of ``column_order``."""
        return np.array(self.column_order, dtype=object)

    def _column_order_extended_by(self, other: Any) -> np.ndarray:
        """
        Return ``column_order`` followed by the columns of ``other`` that are
        not tracked yet.

        For a ``CapAnnDataDF`` the candidates are ``other.column_order``,
        otherwise ``other.columns``.
        """
        order = self._column_order_array()
        known = set(order)
        if isinstance(other, CapAnnDataDF):
            candidates = other.column_order
        else:
            candidates = other.columns
        extra = [col for col in candidates if col not in known]
        return np.concatenate([order, np.array(extra, dtype=object)])

    def rename_column(self, old_name: str, new_name: str) -> None:
        """
        Rename a column in ``column_order`` and in the frame itself.

        The order is updated even when ``old_name`` is not a column of the
        in-memory frame.
        """
        order = self._column_order_array()
        order[order == old_name] = new_name
        # Rebind instead of mutating in place: the previous container may be
        # a list or may be referenced from elsewhere.
        self.column_order = order
        self.rename(columns={old_name: new_name}, inplace=True)

    def remove_column(self, col_name: str) -> None:
        """
        Remove a column from ``column_order`` and drop it from the frame.

        Raises:
            KeyError: raised by ``drop`` if ``col_name`` is not a column of
                the in-memory frame (the order has been updated by then).
        """
        order = self._column_order_array()
        self.column_order = order[order != col_name]
        self.drop(columns=[col_name], inplace=True)

    def __setitem__(self, key, value) -> None:
        """
        Assign like ``DataFrame.__setitem__`` and track new columns.

        The order is updated after the assignment succeeded, by appending
        every frame column that is missing from ``column_order``. Working
        from the resulting columns (rather than from ``key``) keeps
        boolean-mask and multi-column keys from leaking into the order.
        """
        super().__setitem__(key, value)
        order = self._column_order_array()
        known = set(order)
        missing = [col for col in self.columns if col not in known]
        if missing:
            self.column_order = np.concatenate(
                [order, np.array(missing, dtype=object)]
            )

    @classmethod
    def from_df(cls, df: pd.DataFrame, column_order: List[str] = None) -> "Self":
        """
        Build an instance from ``df`` with the given ``column_order``.

        When ``column_order`` is None the frame's own columns are used.
        Otherwise the passed object is stored as is.
        """
        if column_order is None:
            column_order = df.columns.to_numpy()

        new_inst = cls(df)
        new_inst.column_order = column_order
        return new_inst

    def join(self, other: Any, **kwargs) -> "Self":
        """
        Join like ``DataFrame.join`` and extend ``column_order`` with the
        columns contributed by ``other``.
        """
        result = super().join(other=other, **kwargs)
        return self.from_df(
            result, column_order=self._column_order_extended_by(other)
        )

    def merge(self, right, **kwargs) -> "Self":
        """
        Merge like ``DataFrame.merge`` and extend ``column_order`` with the
        columns contributed by ``right``.
        """
        result = super().merge(right=right, **kwargs)
        return self.from_df(
            result, column_order=self._column_order_extended_by(right)
        )

    # The annotations are quoted so that they are not evaluated when the
    # class is defined: ``X | None`` on a plain type fails on Python 3.9.
    def copy(self, deep: "bool_t | None" = True) -> "Self":
        """
        Copy the frame together with an independent copy of ``column_order``.
        """
        order = self.column_order
        order_copy = list(order) if isinstance(order, list) else np.array(order)
        return self.from_df(super().copy(deep=deep), column_order=order_copy)
|