pymaftools 0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymaftools-0.1/LICENSE +21 -0
- pymaftools-0.1/PKG-INFO +91 -0
- pymaftools-0.1/README.md +77 -0
- pymaftools-0.1/pymaftools/__init__.py +3 -0
- pymaftools-0.1/pymaftools/maf_plots.py +180 -0
- pymaftools-0.1/pymaftools/maf_utils.py +158 -0
- pymaftools-0.1/pymaftools.egg-info/PKG-INFO +91 -0
- pymaftools-0.1/pymaftools.egg-info/SOURCES.txt +11 -0
- pymaftools-0.1/pymaftools.egg-info/dependency_links.txt +1 -0
- pymaftools-0.1/pymaftools.egg-info/requires.txt +4 -0
- pymaftools-0.1/pymaftools.egg-info/top_level.txt +1 -0
- pymaftools-0.1/setup.cfg +4 -0
- pymaftools-0.1/setup.py +25 -0
pymaftools-0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 xu62u4u6
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pymaftools-0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pymaftools
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: pymaftools is a Python package for handling and analyzing Mutation Annotation Format (MAF) files. It provides utilities for data manipulation and visualization, including classes for MAF parsing and oncoplot generation.
|
|
5
|
+
Home-page: https://github.com/xu62u4u6/pymaftools
|
|
6
|
+
Author: xu62u4u6
|
|
7
|
+
Author-email: 199928ltyos@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# pymaftools
|
|
17
|
+
|
|
18
|
+
`pymaftools` is a Python package designed to handle and analyze MAF (Mutation Annotation Format) files. It provides utilities for working with mutation data, including the `MAF` and `PivotTable` classes for data manipulation, and functions for visualizing mutation data with oncoplots.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- **MAF Class**: A utility to load, parse, and manipulate MAF files.
|
|
23
|
+
- **PivotTable Class**: A custom pivot table implementation for summarizing mutation frequencies and sorting genes and samples.
|
|
24
|
+
- **Oncoplot Visualization**: Generate oncoplot visualizations with mutation data and frequencies.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
### Using pip (from PyPI)
|
|
29
|
+
You can install the `pymaftools` package directly from PyPI using pip:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install pymaftools
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Using GitHub (for the latest version)
|
|
36
|
+
To install directly from GitHub (if you want the latest changes):
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install git+https://github.com/xu62u4u6/pymaftools.git
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Importing the Package
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from pymaftools.maf_utils import MAF, PivotTable
|
|
49
|
+
from pymaftools.maf_plots import create_oncoplot
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Example
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
# Load the MAF file
|
|
56
|
+
all_case_maf = MAF.read_maf("path_to_maf_file.maf", case_ID="case_1")
|
|
57
|
+
|
|
58
|
+
# Filter to keep only nonsynonymous mutations
|
|
59
|
+
filtered_all_case_maf = all_case_maf.filter_maf(MAF.nonsynonymous_types)
|
|
60
|
+
|
|
61
|
+
# Convert to pivot table (genes x samples table, mutation classification as values)
|
|
62
|
+
pivot_table = filtered_all_case_maf.to_pivot_table()
|
|
63
|
+
|
|
64
|
+
# Calculate mutation frequencies
|
|
65
|
+
pivot_table = pivot_table.add_freq()
|
|
66
|
+
|
|
67
|
+
# Sort the pivot table (by gene frequency and sample mutation count)
|
|
68
|
+
sorted_pivot_table = (pivot_table
|
|
69
|
+
.sort_genes_by_freq()
|
|
70
|
+
.sort_samples_by_mutations()
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Generate an oncoplot to show the top 50 genes with the highest mutation frequencies
|
|
74
|
+
create_oncoplot(sorted_pivot_table.top(50),
|
|
75
|
+
figsize=(26, 15),
|
|
76
|
+
ax_main_range=(0, 28),
|
|
77
|
+
ax_freq_range=(28, 29),
|
|
78
|
+
ax_legend_range=(29, 31),
|
|
79
|
+
mutation_counts=True)
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Requirements
|
|
84
|
+
Python 3.x
|
|
85
|
+
pandas, numpy, matplotlib, seaborn
|
|
86
|
+
### License
|
|
87
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
88
|
+
|
|
89
|
+
### Author
|
|
90
|
+
xu62u4u6
|
|
91
|
+
|
pymaftools-0.1/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
|
|
2
|
+
# pymaftools
|
|
3
|
+
|
|
4
|
+
`pymaftools` is a Python package designed to handle and analyze MAF (Mutation Annotation Format) files. It provides utilities for working with mutation data, including the `MAF` and `PivotTable` classes for data manipulation, and functions for visualizing mutation data with oncoplots.
|
|
5
|
+
|
|
6
|
+
## Features
|
|
7
|
+
|
|
8
|
+
- **MAF Class**: A utility to load, parse, and manipulate MAF files.
|
|
9
|
+
- **PivotTable Class**: A custom pivot table implementation for summarizing mutation frequencies and sorting genes and samples.
|
|
10
|
+
- **Oncoplot Visualization**: Generate oncoplot visualizations with mutation data and frequencies.
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
### Using pip (from PyPI)
|
|
15
|
+
You can install the `pymaftools` package directly from PyPI using pip:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install pymaftools
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### Using GitHub (for the latest version)
|
|
22
|
+
To install directly from GitHub (if you want the latest changes):
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install git+https://github.com/xu62u4u6/pymaftools.git
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
### Importing the Package
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from pymaftools.maf_utils import MAF, PivotTable
|
|
35
|
+
from pymaftools.maf_plots import create_oncoplot
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Example
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
# Load the MAF file
|
|
42
|
+
all_case_maf = MAF.read_maf("path_to_maf_file.maf", case_ID="case_1")
|
|
43
|
+
|
|
44
|
+
# Filter to keep only nonsynonymous mutations
|
|
45
|
+
filtered_all_case_maf = all_case_maf.filter_maf(MAF.nonsynonymous_types)
|
|
46
|
+
|
|
47
|
+
# Convert to pivot table (genes x samples table, mutation classification as values)
|
|
48
|
+
pivot_table = filtered_all_case_maf.to_pivot_table()
|
|
49
|
+
|
|
50
|
+
# Calculate mutation frequencies
|
|
51
|
+
pivot_table = pivot_table.add_freq()
|
|
52
|
+
|
|
53
|
+
# Sort the pivot table (by gene frequency and sample mutation count)
|
|
54
|
+
sorted_pivot_table = (pivot_table
|
|
55
|
+
.sort_genes_by_freq()
|
|
56
|
+
.sort_samples_by_mutations()
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Generate an oncoplot to show the top 50 genes with the highest mutation frequencies
|
|
60
|
+
create_oncoplot(sorted_pivot_table.top(50),
|
|
61
|
+
figsize=(26, 15),
|
|
62
|
+
ax_main_range=(0, 28),
|
|
63
|
+
ax_freq_range=(28, 29),
|
|
64
|
+
ax_legend_range=(29, 31),
|
|
65
|
+
mutation_counts=True)
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Requirements
|
|
70
|
+
Python 3.x
|
|
71
|
+
pandas, numpy, matplotlib, seaborn
|
|
72
|
+
### License
|
|
73
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
74
|
+
|
|
75
|
+
### Author
|
|
76
|
+
xu62u4u6
|
|
77
|
+
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import os
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
import seaborn as sns
|
|
6
|
+
import matplotlib.colors as mcolors
|
|
7
|
+
from matplotlib.patches import Rectangle
|
|
8
|
+
from typing import Union
|
|
9
|
+
|
|
10
|
+
# Names of the MAF columns listed here: gene symbol, start/end position and
# the reference / tumor alleles.
# NOTE(review): nothing in the visible part of this module reads this
# constant -- confirm whether it is still needed.
target_col = [
    "Hugo_Symbol",
    "Start_Position", "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def create_oncoplot(pivot_table,
                    color_map=None,
                    mutation_counts: Union[bool, pd.Series] = True,
                    figsize=(18, 16),
                    wspace=0.5,
                    hspace=0.01,
                    freq_columns=None,
                    ax_main_range=(0, 24),
                    ax_freq_range=(24, 28),
                    ax_legend_range=(29, 31),
                    square=False,
                    show_frame=False,
                    bar_annot_fontsize=7):
    """Draw an oncoplot: main heatmap, frequency heatmap, legend and optional bar chart.

    The figure is laid out on a 2-row x 32-column ``GridSpec``; the three
    ``*_range`` arguments are ``(start, stop)`` column slices of that grid.

    Parameters
    ----------
    pivot_table
        Table passed to ``plot_heatmap``. It must expose ``gene_metadata``
        (containing ``freq_columns``) and, when ``mutation_counts`` is
        ``True``, ``sample_metadata.mutations_count``.
    color_map : dict, optional
        Maps cell values to colors. A falsy value selects the built-in map.
    mutation_counts : bool, pd.Series or array-like, optional
        ``True`` reads the counts from ``pivot_table.sample_metadata``;
        ``False`` or ``None`` omits the bar chart; any other value is handed
        to ``plot_bar`` as the per-sample counts.
    figsize, wspace, hspace
        Forwarded to ``plt.figure`` / ``plt.GridSpec``.
    freq_columns : list of str, optional
        Columns of ``pivot_table.gene_metadata`` shown in the frequency
        heatmap. Defaults to ``["freq"]``.
    ax_main_range, ax_freq_range, ax_legend_range : tuple of int
        Grid column ranges of the main heatmap (and bar chart), the frequency
        heatmap and the legend.
    square, show_frame
        Forwarded to the heatmap helpers.
    bar_annot_fontsize
        Font size of the value labels drawn by ``plot_bar``.
    """
    freq_columns = ["freq"] if freq_columns is None else list(freq_columns)

    heatmap_data = pivot_table
    freq_data = pivot_table.gene_metadata[freq_columns].values

    # Default color mapping.
    color_map = color_map or {
        'False': '#FFFFFF',              # white (no mutation)
        'Missense_Mutation': 'gray',     # light gray
        'Frame_Shift_Ins': '#FF4500',    # darker red
        'Frame_Shift_Del': '#4682B4',    # darker blue
        'In_Frame_Ins': '#FF707A',       # light red
        'In_Frame_Del': '#ADD8E6',       # light blue
        'Nonsense_Mutation': '#90EE90',  # low-saturation green
        'Splice_Site': '#CB704D',        # low-saturation brown
        'Multi_Hit': '#000000',          # black (multiple hits)
        "Silent": "#eeeeee",
        "3'UTR": "#bbbbcc",
        "5'UTR": "#bbbbcc",
        "IGR": "#bbbbcc",
        "Intron": "#bbbbcc",
        "RNA": "#bbbbcc",
    }

    fig = plt.figure(figsize=figsize)
    gs = plt.GridSpec(2, 32, height_ratios=[1, 12], wspace=wspace, hspace=hspace)

    # Resolve the boolean switch first. Comparing with ``== True`` would be
    # evaluated element-wise for a Series/array and make the ``if`` raise
    # "truth value is ambiguous".
    if isinstance(mutation_counts, (bool, np.bool_)):
        if mutation_counts:
            mutation_counts = pivot_table.sample_metadata.mutations_count.values
        else:
            mutation_counts = None

    if mutation_counts is not None:
        # The bar chart sits in the top row, above the main heatmap.
        ax_bar = fig.add_subplot(gs[0, ax_main_range[0]:ax_main_range[1]])
        plot_bar(ax_bar, mutation_counts, fontsize=bar_annot_fontsize)

    ax_main = fig.add_subplot(gs[1, ax_main_range[0]:ax_main_range[1]])        # main heatmap
    ax_freq = fig.add_subplot(gs[1, ax_freq_range[0]:ax_freq_range[1]])        # frequency heatmap
    ax_legend = fig.add_subplot(gs[1, ax_legend_range[0]:ax_legend_range[1]])  # legend

    plot_heatmap(ax_main, heatmap_data, color_map, square=square, show_frame=show_frame)
    plot_freq(ax_freq, freq_data, freq_columns, square=square)
    plot_legend(ax_legend, color_map)

    ax_main.set_xlabel("Mutations")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def plot_bar(ax_bar, mutation_counts, fontsize=6, tmb_divisor=40):
    """Draw one bar per sample showing ``mutation_counts / tmb_divisor``.

    Parameters
    ----------
    ax_bar
        Axes to draw on.
    mutation_counts
        Per-sample counts; any array-like (list, ndarray, Series) is accepted.
    fontsize
        Font size of the numeric label drawn above each bar.
    tmb_divisor
        Value the counts are divided by before plotting. The default of 40
        matches the previously hard-coded constant.
        NOTE(review): presumably the covered region size in Mb -- confirm.
    """
    # Coerce once so plain Python lists work as well as arrays/Series.
    counts = np.asarray(mutation_counts, dtype=float)
    x = np.arange(len(counts))
    width = 0.95

    # Create bars (a zero count simply yields a zero-height bar).
    tmbs = counts / tmb_divisor
    ax_bar.bar(x, tmbs, width=width, color='gray', edgecolor='white')

    # Set x-axis limits to exactly match the heatmap.
    # The -0.5 ensures the bars align perfectly with heatmap cells.
    ax_bar.set_xlim(-0.5, len(counts) - 0.5)

    # Add a numeric label above each bar.
    # NOTE(review): the +2 offset is in data units of the y axis -- confirm it
    # is appropriate for the typical value range.
    for i, tmb in enumerate(tmbs):
        ax_bar.text(i, tmb + 2, f"{tmb:.1f}", ha='center', fontsize=fontsize)

    # Hide the frame and ticks of the bar chart, keeping only the left spine.
    ax_bar.spines['top'].set_visible(False)
    ax_bar.spines['right'].set_visible(False)
    ax_bar.spines['left'].set_visible(True)
    ax_bar.spines['bottom'].set_visible(False)
    ax_bar.set_xticks([])
    ax_bar.set_xlabel('TMB')
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def plot_heatmap(ax_main, heatmap_data, color_map, linecolor="white", square=True, show_frame=False):
    """Draw the main mutation heatmap on ``ax_main``.

    A seaborn heatmap provides the grid and tick labels; every cell is then
    covered by a rectangle filled with the color that ``color_map`` assigns to
    the cell value (``'#FFFFFF'`` for values missing from the map).

    Parameters
    ----------
    ax_main
        Axes to draw on.
    heatmap_data
        DataFrame-like table whose cell values are looked up in ``color_map``.
    color_map : dict
        Maps cell values to colors.
    linecolor
        Color of the cell borders.
    square
        Forwarded to ``sns.heatmap``.
    show_frame
        When true, draw a light-gray frame around every group of three
        consecutive columns.
    """
    # Map each cell value to its color.
    def color_encode(val):
        return color_map.get(val, '#FFFFFF')

    # ``DataFrame.map`` was added in pandas 2.1; older releases only provide
    # the equivalent ``applymap``.
    if hasattr(pd.DataFrame, "map"):
        data_matrix = heatmap_data.map(color_encode)
    else:
        data_matrix = heatmap_data.applymap(color_encode)

    # Base heatmap: white/grey marks whether a value is present.
    sns.heatmap(
        heatmap_data.notna(),
        cmap=['white', 'grey'],
        cbar=False,
        linewidths=1,
        linecolor=linecolor,
        ax=ax_main,
        square=square
    )
    ax_main.set_yticklabels(ax_main.get_yticklabels(), rotation=0)

    # Paint the cells. Converting to an array once avoids a per-cell ``.iloc``
    # lookup; iteration order is row by row, as before.
    for (row, col), face in np.ndenumerate(data_matrix.to_numpy()):
        ax_main.add_patch(Rectangle(
            (col, row), 1, 1,
            fill=True,
            facecolor=face,
            edgecolor=linecolor,
            lw=1
        ))

    # Add a light frame around every three samples.
    if show_frame:
        for i in range(0, heatmap_data.shape[1], 3):
            rect = Rectangle((i, -0.5), 3, heatmap_data.shape[0] + 1,
                             linewidth=1, edgecolor='lightgray', facecolor='none')
            ax_main.add_patch(rect)
|
|
147
|
+
|
|
148
|
+
def plot_freq(ax_freq, freq_data, freq_columns, square=True, show_frame=True):
    """Draw the frequency heatmap with each cell annotated by its value.

    ``freq_data`` is indexed as a 2-D array (rows x ``freq_columns``).
    ``show_frame`` is accepted but not used by this function.
    """
    peak = freq_data.max()

    # Frequency heatmap without a colorbar; the color scale runs from 0 to the
    # largest value present in the data.
    sns.heatmap(
        freq_data,
        cmap='Blues',
        linewidths=0.5,
        ax=ax_freq,
        cbar=False,
        vmin=0,
        vmax=peak,
        alpha=0.8,
        square=square,
    )

    # Hide the default tick labels on both axes.
    ax_freq.set_xticks([])
    ax_freq.set_yticks([])

    # Annotate every cell: black text on low values, white text on high ones.
    cutoff = 0.6 * peak
    for (row, col), value in np.ndenumerate(freq_data):
        text_color = 'black' if value < cutoff else 'white'
        ax_freq.text(col + 0.5, row + 0.5, f"{value:.2f}",
                     va='center', ha='center', color=text_color)

    ax_freq.set_title('Frequency', pad=20)
    # One centered tick per frequency column, labelled vertically.
    ax_freq.set_xticks(np.arange(len(freq_columns)) + 0.5)
    ax_freq.set_xticklabels(freq_columns, rotation=90)
|
|
175
|
+
|
|
176
|
+
def plot_legend(ax_legend, color_map):
    """Fill ``ax_legend`` with one colored legend entry per ``color_map`` item."""
    swatches = []
    for variant, swatch_color in color_map.items():
        swatches.append(Rectangle((0, 0), 1, 1, color=swatch_color, label=variant))

    ax_legend.legend(
        handles=swatches,
        title="Variant Types",
        loc='center',
        fontsize='small',
        frameon=False,
    )
    # Only the legend should be visible, not the axes themselves.
    ax_legend.axis('off')
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
class PivotTable(pd.DataFrame):
    """DataFrame of mutation calls with per-gene and per-sample metadata.

    Rows are genes and columns are samples: ``gene_metadata`` is indexed by
    the table's index and ``sample_metadata`` by its columns. Cells equal to
    ``False`` count as "not mutated"; anything else counts as a mutation.

    The transforming methods (``add_freq``, ``sort_genes_by_freq``,
    ``sort_samples_by_mutations``, ``top``) return a new table and leave the
    receiver, including its metadata frames, untouched.
    """

    # Attributes pandas carries over to derived objects. They are propagated
    # by reference, so methods here copy them before modifying anything.
    _metadata = ["gene_metadata", "sample_metadata"]

    def __init__(self, data=None, mutations_count: pd.Series = None, *args, **kwargs):
        """Build the table and start with empty metadata frames.

        ``mutations_count`` is accepted for backward compatibility but is not
        used by the constructor. Remaining arguments go to ``pd.DataFrame``.
        """
        super().__init__(data, *args, **kwargs)
        self.gene_metadata = pd.DataFrame(index=self.index)
        self.sample_metadata = pd.DataFrame(index=self.columns)

    @property
    def _constructor(self):
        # Make pandas operations return PivotTable instead of DataFrame.
        return PivotTable

    @staticmethod
    def _attach(table, gene_metadata, sample_metadata) -> "PivotTable":
        """Return ``table`` as a PivotTable carrying the given metadata frames."""
        if not isinstance(table, PivotTable):
            table = PivotTable(table)
        table.gene_metadata = gene_metadata
        table.sample_metadata = sample_metadata
        return table

    @staticmethod
    def calculate_frequency(df: pd.DataFrame) -> pd.Series:
        """Return, per row, the fraction of columns whose value is not ``False``."""
        return (df != False).sum(axis=1) / df.shape[1]

    def add_freq(self, groups: dict = None) -> "PivotTable":
        """Return a copy with mutation frequencies added to ``gene_metadata``.

        Parameters
        ----------
        groups : dict, optional
            ``{group_name: subset_of_pivot_table}``. Each entry adds a
            ``"<group_name>_freq"`` column computed from that subset. The
            overall frequency is always stored in the ``"freq"`` column.
        """
        groups = {} if groups is None else groups
        result = self._attach(self.copy(),
                              self.gene_metadata.copy(),
                              self.sample_metadata.copy())
        # Each column is aligned to gene_metadata by gene label on assignment.
        for group, subset in groups.items():
            result.gene_metadata[f"{group}_freq"] = PivotTable.calculate_frequency(subset)
        result.gene_metadata["freq"] = PivotTable.calculate_frequency(result)
        return result

    def sort_genes_by_freq(self, by="freq", ascending=False):
        """Return a copy whose rows (and ``gene_metadata``) are sorted by ``by``.

        ``by`` must be a column of ``gene_metadata`` (see ``add_freq``).
        """
        sorted_index = self.gene_metadata.sort_values(by=by, ascending=ascending).index
        return self._attach(self.loc[sorted_index],
                            self.gene_metadata.loc[sorted_index],
                            self.sample_metadata.copy())

    def sort_samples_by_mutations(self, top: int = 10):
        """Return a copy whose columns are ordered by their mutation pattern.

        For each sample the mutated/not-mutated flags of the first ``top`` rows
        are read as a binary number (first row = most significant bit); samples
        are sorted by that number, largest first. The number is stored in
        ``sample_metadata["mutations_weight"]``.
        """
        def binary_sort_key(column: pd.Series) -> int:
            # Binary column -> "0"/"1" string -> integer.
            binary_str = "".join(column.astype(int).astype(str))
            # An empty table has no bits; treat it as weight 0.
            return int(binary_str, 2) if binary_str else 0

        binary_pivot_table = self != False
        mutations_weight = binary_pivot_table.head(top).apply(binary_sort_key, axis=0)
        sorted_samples = mutations_weight.sort_values(ascending=False).index

        # Work on a private copy so the receiver's metadata is not modified.
        sample_metadata = self.sample_metadata.copy()
        sample_metadata["mutations_weight"] = mutations_weight

        return self._attach(self.loc[:, sorted_samples],
                            self.gene_metadata.copy(),
                            sample_metadata.loc[sorted_samples, :])

    def top(self, n_top=50):
        """Return a copy restricted to the first ``n_top`` rows (genes)."""
        return self._attach(self.head(n_top).copy(),
                            self.gene_metadata.head(n_top).copy(),
                            self.sample_metadata.copy())
|
|
70
|
+
|
|
71
|
+
class MAF(pd.DataFrame):
    """DataFrame subclass holding the rows of a MAF (Mutation Annotation Format) file."""

    # Columns joined with "|" to form the row index in ``read_maf``.
    index_col = [
        "Hugo_Symbol",
        "Start_Position",
        "End_Position",
        "Reference_Allele",
        "Tumor_Seq_Allele1",
        "Tumor_Seq_Allele2"
    ]

    # GDC MAF file fields:
    # https://docs.gdc.cancer.gov/Encyclopedia/pages/Mutation_Annotation_Format_TCGAv2/

    # Variant_Classification values kept by ``read_maf``.
    vaild_variant_classfication = [
        "Frame_Shift_Del",
        "Frame_Shift_Ins",
        "In_Frame_Del",
        "In_Frame_Ins",
        "Missense_Mutation",
        "Nonsense_Mutation",
        "Silent",
        "Splice_Site",
        "Translation_Start_Site",
        "Nonstop_Mutation",
        "3'UTR",
        "3'Flank",
        "5'UTR",
        "5'Flank",
        "IGR",
        "Intron",
        "RNA",
        "Targeted_Region"
    ]
    # Correctly spelled alias; the misspelled name is kept for existing callers.
    valid_variant_classification = vaild_variant_classfication

    nonsynonymous_types = [
        "Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins",
        "Missense_Mutation", "Nonsense_Mutation", "Splice_Site",
        "Translation_Start_Site", "Nonstop_Mutation"
    ]

    @classmethod
    def read_maf(cls, maf_path, case_ID, preffix="", suffix=""):
        """Read a tab-separated MAF file into a ``MAF`` table.

        Parameters
        ----------
        maf_path
            Path or file-like object passed to ``pandas.read_csv``. The first
            line is skipped.
            NOTE(review): presumably the ``#version`` header line -- confirm
            for files that have no such line.
        case_ID
            Identifier stored for every row in the ``case_ID`` column.
        preffix, suffix
            Strings placed before / after ``case_ID``.

        Returns
        -------
        MAF
            Rows whose ``Variant_Classification`` is one of
            ``vaild_variant_classfication``, indexed by the ``index_col``
            values joined with ``"|"``.
        """
        maf = cls(pd.read_csv(maf_path, skiprows=1, sep="\t"))
        maf["case_ID"] = f"{preffix}{case_ID}{suffix}"

        # Build the "a|b|c|..." row key column-wise instead of with a
        # per-row apply. ``to_numpy()`` drops the Series name so the index is
        # unnamed and cannot clash with the "Hugo_Symbol" column later on.
        key_parts = [maf[col].astype(str) for col in cls.index_col]
        maf.index = key_parts[0].str.cat(key_parts[1:], sep="|", na_rep="nan").to_numpy()

        return maf.filter_maf(cls.vaild_variant_classfication)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def _constructor(self):
        # Make sure objects returned by pandas operations are MAF instances.
        return MAF

    def filter_maf(self, mutation_types):
        """Return the rows whose ``Variant_Classification`` is in ``mutation_types``."""
        return self[self.Variant_Classification.isin(mutation_types)]

    @staticmethod
    def merge_mutations(column):
        """Collapse several classifications into a single cell value.

        Returns ``False`` when every entry is ``False``, ``"Multi_Hit"`` when
        more than one distinct non-``False`` value is present, and otherwise
        the single value itself.
        """
        if (column == False).all():
            return False
        # Distinct non-False mutation types.
        unique_mutations = column[column != False].unique()
        if len(unique_mutations) > 1:
            return "Multi_Hit"
        return unique_mutations[0]

    def to_pivot_table(self) -> "PivotTable":
        """Return a genes x cases ``PivotTable`` of variant classifications.

        Cells without any mutation are ``False``. The per-case row count and
        that count divided by 40 are stored in ``sample_metadata`` as
        ``mutations_count`` and ``TMB``.
        """
        pivot_table = self.pivot_table(
            values="Variant_Classification",
            index="Hugo_Symbol",
            columns="case_ID",
            aggfunc=MAF.merge_mutations
        ).fillna(False)
        pivot_table = PivotTable(pivot_table)
        pivot_table.sample_metadata["mutations_count"] = self.mutations_count
        pivot_table.sample_metadata["TMB"] = self.mutations_count / 40
        return pivot_table

    @property
    def mutations_count(self) -> pd.Series:
        """Number of rows per ``case_ID``."""
        return self.groupby(self.case_ID).size()
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pymaftools
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: pymaftools is a Python package for handling and analyzing Mutation Annotation Format (MAF) files. It provides utilities for data manipulation and visualization, including classes for MAF parsing and oncoplot generation.
|
|
5
|
+
Home-page: https://github.com/xu62u4u6/pymaftools
|
|
6
|
+
Author: xu62u4u6
|
|
7
|
+
Author-email: 199928ltyos@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# pymaftools
|
|
17
|
+
|
|
18
|
+
`pymaftools` is a Python package designed to handle and analyze MAF (Mutation Annotation Format) files. It provides utilities for working with mutation data, including the `MAF` and `PivotTable` classes for data manipulation, and functions for visualizing mutation data with oncoplots.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- **MAF Class**: A utility to load, parse, and manipulate MAF files.
|
|
23
|
+
- **PivotTable Class**: A custom pivot table implementation for summarizing mutation frequencies and sorting genes and samples.
|
|
24
|
+
- **Oncoplot Visualization**: Generate oncoplot visualizations with mutation data and frequencies.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
### Using pip (from PyPI)
|
|
29
|
+
You can install the `pymaftools` package directly from PyPI using pip:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install pymaftools
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Using GitHub (for the latest version)
|
|
36
|
+
To install directly from GitHub (if you want the latest changes):
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install git+https://github.com/xu62u4u6/pymaftools.git
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Importing the Package
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from pymaftools.maf_utils import MAF, PivotTable
|
|
49
|
+
from pymaftools.maf_plots import create_oncoplot
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Example
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
# Load the MAF file
|
|
56
|
+
all_case_maf = MAF.read_maf("path_to_maf_file.maf", case_ID="case_1")
|
|
57
|
+
|
|
58
|
+
# Filter to keep only nonsynonymous mutations
|
|
59
|
+
filtered_all_case_maf = all_case_maf.filter_maf(MAF.nonsynonymous_types)
|
|
60
|
+
|
|
61
|
+
# Convert to pivot table (genes x samples table, mutation classification as values)
|
|
62
|
+
pivot_table = filtered_all_case_maf.to_pivot_table()
|
|
63
|
+
|
|
64
|
+
# Calculate mutation frequencies
|
|
65
|
+
pivot_table = pivot_table.add_freq()
|
|
66
|
+
|
|
67
|
+
# Sort the pivot table (by gene frequency and sample mutation count)
|
|
68
|
+
sorted_pivot_table = (pivot_table
|
|
69
|
+
.sort_genes_by_freq()
|
|
70
|
+
.sort_samples_by_mutations()
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Generate an oncoplot to show the top 50 genes with the highest mutation frequencies
|
|
74
|
+
create_oncoplot(sorted_pivot_table.top(50),
|
|
75
|
+
figsize=(26, 15),
|
|
76
|
+
ax_main_range=(0, 28),
|
|
77
|
+
ax_freq_range=(28, 29),
|
|
78
|
+
ax_legend_range=(29, 31),
|
|
79
|
+
mutation_counts=True)
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Requirements
|
|
84
|
+
Python 3.x
|
|
85
|
+
pandas, numpy, matplotlib, seaborn
|
|
86
|
+
### License
|
|
87
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
88
|
+
|
|
89
|
+
### Author
|
|
90
|
+
xu62u4u6
|
|
91
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
pymaftools/__init__.py
|
|
5
|
+
pymaftools/maf_plots.py
|
|
6
|
+
pymaftools/maf_utils.py
|
|
7
|
+
pymaftools.egg-info/PKG-INFO
|
|
8
|
+
pymaftools.egg-info/SOURCES.txt
|
|
9
|
+
pymaftools.egg-info/dependency_links.txt
|
|
10
|
+
pymaftools.egg-info/requires.txt
|
|
11
|
+
pymaftools.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pymaftools
|
pymaftools-0.1/setup.cfg
ADDED
pymaftools-0.1/setup.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Read the README with an explicit encoding and close the file afterwards,
# instead of relying on the locale default and on garbage collection.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='pymaftools',
    version='0.1',
    author="xu62u4u6",
    author_email="199928ltyos@gmail.com",
    description='pymaftools is a Python package for handling and analyzing Mutation Annotation Format (MAF) files. It provides utilities for data manipulation and visualization, including classes for MAF parsing and oncoplot generation.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    packages=find_packages(),
    install_requires=[
        'pandas',
        'numpy',
        'matplotlib',
        'seaborn',
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    license='MIT',
    url='https://github.com/xu62u4u6/pymaftools',
)
|