pychnosz 1.1.12__cp310-cp310-macosx_15_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pychnosz/.dylibs/libgcc_s.1.1.dylib +0 -0
- pychnosz/.dylibs/libgfortran.5.dylib +0 -0
- pychnosz/.dylibs/libquadmath.0.dylib +0 -0
- pychnosz/__init__.py +129 -0
- pychnosz/_version.py +34 -0
- pychnosz/biomolecules/__init__.py +29 -0
- pychnosz/biomolecules/ionize_aa.py +197 -0
- pychnosz/biomolecules/proteins.py +595 -0
- pychnosz/core/__init__.py +46 -0
- pychnosz/core/affinity.py +1256 -0
- pychnosz/core/animation.py +593 -0
- pychnosz/core/balance.py +334 -0
- pychnosz/core/basis.py +716 -0
- pychnosz/core/diagram.py +3336 -0
- pychnosz/core/equilibrate.py +813 -0
- pychnosz/core/equilibrium.py +554 -0
- pychnosz/core/info.py +821 -0
- pychnosz/core/retrieve.py +364 -0
- pychnosz/core/speciation.py +580 -0
- pychnosz/core/species.py +599 -0
- pychnosz/core/subcrt.py +1696 -0
- pychnosz/core/thermo.py +593 -0
- pychnosz/core/unicurve.py +1226 -0
- pychnosz/data/__init__.py +11 -0
- pychnosz/data/add_obigt.py +327 -0
- pychnosz/data/extdata/Berman/BDat17_2017.csv +2 -0
- pychnosz/data/extdata/Berman/Ber88_1988.csv +68 -0
- pychnosz/data/extdata/Berman/Ber90_1990.csv +5 -0
- pychnosz/data/extdata/Berman/DS10_2010.csv +6 -0
- pychnosz/data/extdata/Berman/FDM+14_2014.csv +2 -0
- pychnosz/data/extdata/Berman/Got04_2004.csv +5 -0
- pychnosz/data/extdata/Berman/JUN92_1992.csv +3 -0
- pychnosz/data/extdata/Berman/SHD91_1991.csv +12 -0
- pychnosz/data/extdata/Berman/VGT92_1992.csv +2 -0
- pychnosz/data/extdata/Berman/VPT01_2001.csv +3 -0
- pychnosz/data/extdata/Berman/VPV05_2005.csv +2 -0
- pychnosz/data/extdata/Berman/ZS92_1992.csv +11 -0
- pychnosz/data/extdata/Berman/sympy.R +99 -0
- pychnosz/data/extdata/Berman/testing/BA96.bib +12 -0
- pychnosz/data/extdata/Berman/testing/BA96_Berman.csv +21 -0
- pychnosz/data/extdata/Berman/testing/BA96_OBIGT.csv +21 -0
- pychnosz/data/extdata/Berman/testing/BA96_refs.csv +6 -0
- pychnosz/data/extdata/OBIGT/AD.csv +25 -0
- pychnosz/data/extdata/OBIGT/Berman_cr.csv +93 -0
- pychnosz/data/extdata/OBIGT/DEW.csv +211 -0
- pychnosz/data/extdata/OBIGT/H2O_aq.csv +4 -0
- pychnosz/data/extdata/OBIGT/SLOP98.csv +411 -0
- pychnosz/data/extdata/OBIGT/SUPCRT92.csv +178 -0
- pychnosz/data/extdata/OBIGT/inorganic_aq.csv +729 -0
- pychnosz/data/extdata/OBIGT/inorganic_cr.csv +273 -0
- pychnosz/data/extdata/OBIGT/inorganic_gas.csv +20 -0
- pychnosz/data/extdata/OBIGT/organic_aq.csv +1104 -0
- pychnosz/data/extdata/OBIGT/organic_cr.csv +481 -0
- pychnosz/data/extdata/OBIGT/organic_gas.csv +268 -0
- pychnosz/data/extdata/OBIGT/organic_liq.csv +533 -0
- pychnosz/data/extdata/OBIGT/testing/GEMSFIT.csv +43 -0
- pychnosz/data/extdata/OBIGT/testing/IGEM.csv +17 -0
- pychnosz/data/extdata/OBIGT/testing/Sandia.csv +8 -0
- pychnosz/data/extdata/OBIGT/testing/SiO2.csv +4 -0
- pychnosz/data/extdata/misc/AD03_Fig1a.csv +69 -0
- pychnosz/data/extdata/misc/AD03_Fig1b.csv +43 -0
- pychnosz/data/extdata/misc/AD03_Fig1c.csv +89 -0
- pychnosz/data/extdata/misc/AD03_Fig1d.csv +30 -0
- pychnosz/data/extdata/misc/BZA10.csv +5 -0
- pychnosz/data/extdata/misc/HW97_Cp.csv +90 -0
- pychnosz/data/extdata/misc/HWM96_V.csv +229 -0
- pychnosz/data/extdata/misc/LA19_test.csv +7 -0
- pychnosz/data/extdata/misc/Mer75_Table4.csv +42 -0
- pychnosz/data/extdata/misc/OBIGT_check.csv +423 -0
- pychnosz/data/extdata/misc/PM90.csv +7 -0
- pychnosz/data/extdata/misc/RH95.csv +23 -0
- pychnosz/data/extdata/misc/RH98_Table15.csv +17 -0
- pychnosz/data/extdata/misc/SC10_Rainbow.csv +19 -0
- pychnosz/data/extdata/misc/SK95.csv +55 -0
- pychnosz/data/extdata/misc/SOJSH.csv +61 -0
- pychnosz/data/extdata/misc/SS98_Fig5a.csv +81 -0
- pychnosz/data/extdata/misc/SS98_Fig5b.csv +84 -0
- pychnosz/data/extdata/misc/TKSS14_Fig2.csv +25 -0
- pychnosz/data/extdata/misc/bluered.txt +1000 -0
- pychnosz/data/extdata/protein/Cas/Cas_aa.csv +177 -0
- pychnosz/data/extdata/protein/Cas/Cas_uniprot.csv +186 -0
- pychnosz/data/extdata/protein/Cas/download.R +34 -0
- pychnosz/data/extdata/protein/Cas/mkaa.R +34 -0
- pychnosz/data/extdata/protein/POLG.csv +12 -0
- pychnosz/data/extdata/protein/TBD+05.csv +393 -0
- pychnosz/data/extdata/protein/TBD+05_aa.csv +393 -0
- pychnosz/data/extdata/protein/rubisco.csv +28 -0
- pychnosz/data/extdata/protein/rubisco.fasta +239 -0
- pychnosz/data/extdata/protein/rubisco_aa.csv +28 -0
- pychnosz/data/extdata/src/H2O92D.f.orig +3457 -0
- pychnosz/data/extdata/src/README.txt +5 -0
- pychnosz/data/extdata/taxonomy/names.dmp +215 -0
- pychnosz/data/extdata/taxonomy/nodes.dmp +63 -0
- pychnosz/data/extdata/thermo/Bdot_acirc.csv +60 -0
- pychnosz/data/extdata/thermo/buffer.csv +40 -0
- pychnosz/data/extdata/thermo/element.csv +135 -0
- pychnosz/data/extdata/thermo/groups.csv +6 -0
- pychnosz/data/extdata/thermo/opt.csv +2 -0
- pychnosz/data/extdata/thermo/protein.csv +506 -0
- pychnosz/data/extdata/thermo/refs.csv +343 -0
- pychnosz/data/extdata/thermo/stoich.csv.xz +0 -0
- pychnosz/data/loader.py +431 -0
- pychnosz/data/mod_obigt.py +322 -0
- pychnosz/data/obigt.py +471 -0
- pychnosz/data/worm.py +228 -0
- pychnosz/fortran/.gitignore +6 -0
- pychnosz/fortran/__init__.py +16 -0
- pychnosz/fortran/h2o92.dylib +0 -0
- pychnosz/fortran/h2o92_interface.py +527 -0
- pychnosz/geochemistry/__init__.py +21 -0
- pychnosz/geochemistry/minerals.py +514 -0
- pychnosz/geochemistry/redox.py +500 -0
- pychnosz/models/__init__.py +47 -0
- pychnosz/models/archer_wang.py +165 -0
- pychnosz/models/berman.py +309 -0
- pychnosz/models/cgl.py +381 -0
- pychnosz/models/dew.py +997 -0
- pychnosz/models/hkf.py +523 -0
- pychnosz/models/hkf_helpers.py +231 -0
- pychnosz/models/iapws95.py +1113 -0
- pychnosz/models/supcrt92_fortran.py +238 -0
- pychnosz/models/water.py +480 -0
- pychnosz/utils/__init__.py +27 -0
- pychnosz/utils/expression.py +1074 -0
- pychnosz/utils/formula.py +830 -0
- pychnosz/utils/formula_ox.py +227 -0
- pychnosz/utils/reset.py +33 -0
- pychnosz/utils/units.py +259 -0
- pychnosz-1.1.12.dist-info/METADATA +197 -0
- pychnosz-1.1.12.dist-info/RECORD +133 -0
- pychnosz-1.1.12.dist-info/WHEEL +5 -0
- pychnosz-1.1.12.dist-info/licenses/LICENSE.txt +19 -0
- pychnosz-1.1.12.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Protein functions for CHNOSZ.
|
|
3
|
+
|
|
4
|
+
This module implements protein-related functions from CHNOSZ including
|
|
5
|
+
add_protein, protein_length, protein_formula, protein_OBIGT, and protein_basis.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from typing import Union, Optional, List
|
|
11
|
+
import warnings
|
|
12
|
+
|
|
13
|
+
from ..core.thermo import thermo
|
|
14
|
+
from ..utils.formula import i2A, as_chemical_formula, species_basis
|
|
15
|
+
from ..biomolecules.ionize_aa import ionize_aa
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pinfo(protein: Union[str, int, pd.DataFrame, List],
          organism: Optional[str] = None,
          residue: bool = False,
          regexp: bool = False) -> Union[pd.DataFrame, np.ndarray, int]:
    """
    Get protein information from thermo().protein.

    The behavior depends on the type of ``protein``:

    - DataFrame: a copy of the DataFrame is returned (per residue if
      ``residue`` is True)
    - int or list/array of ints: the matching rows of thermo().protein are
      returned; indices outside ``0 .. len(thermo().protein) - 1`` are
      silently dropped, and an empty DataFrame is returned if none is valid
    - str or list of str: the protein table is searched by name and the
      row number(s) are returned

    Parameters
    ----------
    protein : str, int, DataFrame, or list
        Protein identifier(s) or data
    organism : str, optional
        Organism identifier. For exact matching it is appended to each name
        as ``"<protein>_<organism>"``; for regexp matching it is used as a
        pattern on the 'organism' column. Ignored for numeric/DataFrame input.
    residue : bool, default False
        Return per-residue amino acid composition: the chains column and all
        20 amino acid count columns (positions 4 to 24) are divided by the
        sum of the amino acid counts (positions 5 to 24)
    regexp : bool, default False
        Use regular expression matching for protein search

    Returns
    -------
    DataFrame, array, int, or float
        - DataFrame of compositions for DataFrame or numeric input
        - int row number when the search yields exactly one match
        - ``np.nan`` when the search yields exactly one entry and it is
          a miss
        - array of row numbers otherwise (``np.nan`` marks names that were
          not found)

    Raises
    ------
    RuntimeError
        If thermo().protein is None
    TypeError
        If ``protein`` is none of the supported types

    Examples
    --------
    >>> # Get protein by name
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> # Get protein data by row number
    >>> protein_data = pinfo(iprotein)
    """
    t_p = thermo().protein

    if t_p is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    # If input is a DataFrame, return a copy of it (possibly per residue)
    if isinstance(protein, pd.DataFrame):
        out = protein.copy()
        if residue:
            out = _pinfo_per_residue(out)
        return out

    # If input is numeric, get rows from thermo().protein
    if isinstance(protein, (int, np.integer)):
        protein = [protein]

    if isinstance(protein, (list, np.ndarray)) and all(isinstance(x, (int, np.integer)) for x in protein):
        # Keep only indices that address an existing row (negative values
        # are rejected, they are not interpreted as "from the end")
        n_rows = len(t_p)
        valid_indices = [p for p in protein if 0 <= p < n_rows]

        if not valid_indices:
            return pd.DataFrame()

        out = t_p.iloc[valid_indices].copy()
        if residue:
            out = _pinfo_per_residue(out)
        return out

    # If input is string or list of strings, search for protein
    if isinstance(protein, str):
        protein = [protein]

    if isinstance(protein, list) and all(isinstance(x, str) for x in protein):
        matches = []

        if regexp:
            # The organism filter does not depend on the protein pattern,
            # so it is evaluated once
            organism_mask = None
            if organism is not None:
                organism_mask = t_p['organism'].str.contains(organism, regex=True, na=False)

            for pattern in protein:
                mask = t_p['protein'].str.contains(pattern, regex=True, na=False)
                if organism_mask is not None:
                    mask = mask & organism_mask
                hits = np.where(mask)[0]
                if len(hits) > 0:
                    # A pattern may match several rows; all are reported
                    matches.extend(hits.tolist())
                else:
                    matches.append(np.nan)
        else:
            # Exact matching against "<protein>_<organism>"
            t_p_names = t_p['protein'] + '_' + t_p['organism']
            if organism is None:
                my_names = protein
            else:
                my_names = [f"{p}_{organism}" for p in protein]

            for name in my_names:
                hits = np.where(t_p_names == name)[0]
                # Only the first matching row is reported for each name
                matches.append(hits[0] if len(hits) > 0 else np.nan)

        if len(matches) == 1:
            if np.isnan(matches[0]):
                return np.nan
            return int(matches[0])
        return np.array(matches)

    raise TypeError(f"Unsupported protein type: {type(protein)}")


def _pinfo_per_residue(frame: pd.DataFrame) -> pd.DataFrame:
    """Divide the chains and amino acid counts of *frame* by protein length.

    Positions 5 to 24 hold the 20 amino acid counts and position 4 holds the
    number of chains; all 21 columns are scaled by the row-wise sum of the
    amino acid counts. *frame* is modified and returned.
    """
    count_cols = frame.columns[4:25]
    if len(count_cols) == 0:
        # Nothing to normalize (e.g. an empty DataFrame without columns)
        return frame
    lengths = frame.iloc[:, 5:25].sum(axis=1).to_numpy()
    # Cast to float first so fractional values are not written into
    # integer-typed count columns
    scaled = frame[count_cols].astype(float).div(lengths, axis=0)
    frame[count_cols] = scaled.to_numpy()
    return frame
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def add_protein(aa: pd.DataFrame, as_residue: bool = False) -> np.ndarray:
    """
    Add protein amino acid compositions to thermo().protein.

    Proteins whose ``"<protein>_<organism>"`` ID is already present in
    thermo().protein are replaced; the others are appended. The input
    DataFrame itself is never modified.

    Parameters
    ----------
    aa : DataFrame
        DataFrame with protein amino acid compositions.
        Must have same columns as thermo().protein
    as_residue : bool, default False
        Normalize the chains and amino acid counts (column positions 4 to 24)
        by protein length before storing them

    Returns
    -------
    array
        Row numbers of added/updated proteins in thermo().protein
        (an empty array if ``aa`` has no rows)

    Raises
    ------
    RuntimeError
        If thermo().protein is None
    ValueError
        If the columns of ``aa`` differ from those of thermo().protein,
        or if ``aa`` contains duplicated protein IDs

    Examples
    --------
    >>> import pandas as pd
    >>> from pychnosz import *
    >>> aa = pd.read_csv("POLG.csv")
    >>> iprotein = add_protein(aa)
    """
    t = thermo()

    if t.protein is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    # Check that columns match
    if list(aa.columns) != list(t.protein.columns):
        raise ValueError("'aa' does not have the same columns as thermo().protein")

    # Nothing to add: leave the database untouched
    if len(aa) == 0:
        return np.array([], dtype=int)

    # Check that new protein IDs are unique
    po = aa['protein'] + '_' + aa['organism']
    idup = po.duplicated()
    if idup.any():
        dup_proteins = po[idup].unique()
        raise ValueError(f"some protein IDs are duplicated: {' '.join(dup_proteins)}")

    # Normalize by protein length if as_residue = True
    if as_residue:
        # Work on a copy so the caller's DataFrame keeps its raw counts
        aa = aa.copy()
        pl = protein_length(aa)
        # chains (position 4) plus all 20 amino acid columns (5 to 24)
        count_cols = aa.columns[4:25]
        scaled = aa[count_cols].astype(float).div(pl, axis=0)
        aa[count_cols] = scaled.to_numpy()

    # Find any protein IDs that are already present
    # (pinfo returns a scalar for a single ID, an array otherwise)
    ip = np.atleast_1d(pinfo(po.tolist()))
    ip_present = ~np.isnan(ip)

    # Now we're ready to go
    tp_new = t.protein.copy()

    # Add new proteins (appended at the end, so existing row numbers
    # stay valid for the update below)
    if not ip_present.all():
        new_proteins = aa[~ip_present].copy()
        tp_new = pd.concat([tp_new, new_proteins], ignore_index=True)

    # Update existing proteins
    if ip_present.any():
        valid_ip = ip[ip_present].astype(int)
        tp_new.iloc[valid_ip] = aa[ip_present].values

    # Update the protein database
    tp_new.reset_index(drop=True, inplace=True)
    t.protein = tp_new

    # Return the new row numbers
    ip_new = np.atleast_1d(pinfo(po.tolist()))

    # Print messages
    n_added = int((~ip_present).sum())
    n_replaced = int(ip_present.sum())

    if n_added > 0:
        print(f"add_protein: added {n_added} new protein(s) to thermo().protein")
    if n_replaced > 0:
        print(f"add_protein: replaced {n_replaced} existing protein(s) in thermo().protein")

    return ip_new
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def protein_length(protein: Union[int, List[int], pd.DataFrame],
                   organism: Optional[str] = None) -> Union[int, np.ndarray]:
    """
    Calculate the length(s) of proteins.

    The input is resolved to an amino acid composition table through
    ``pinfo(pinfo(protein, organism))``; the length of each protein is the
    sum of its amino acid counts (column positions 5 to 24).

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, forwarded to the inner pinfo() call

    Returns
    -------
    int or array
        Array with one length per protein, or 0 if the lookup did not
        produce a DataFrame

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> length = protein_length(iprotein)
    """
    composition = pinfo(pinfo(protein, organism))

    # Guard clause: no composition table means no length to report
    if not isinstance(composition, pd.DataFrame):
        return 0

    residue_counts = composition.iloc[:, 5:25]
    return residue_counts.sum(axis=1).values
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def group_formulas() -> pd.DataFrame:
    """
    Return chemical formulas of H2O and the 20 amino acid residues.

    Each residue formula is the formula of the amino acid sidechain group
    plus the formula of the unfolded protein backbone group [UPBB]
    (C2H2NO). The backbone group itself is not included as a row.

    Returns
    -------
    DataFrame
        21 rows (``'H2O'`` followed by ``'[Ala]'`` ... ``'[Tyr]'``) and the
        element columns C, H, N, O, S
    """
    elements = ['C', 'H', 'N', 'O', 'S']

    # Element counts (C, H, N, O, S) of water and of the backbone group
    water = (0, 2, 0, 1, 0)
    backbone = np.array((2, 2, 1, 1, 0))

    # Element counts (C, H, N, O, S) of the bare sidechain groups
    sidechains = (
        ('[Ala]', (1, 3, 0, 0, 0)),
        ('[Cys]', (1, 3, 0, 0, 1)),
        ('[Asp]', (2, 3, 0, 2, 0)),
        ('[Glu]', (3, 5, 0, 2, 0)),
        ('[Phe]', (7, 7, 0, 0, 0)),
        ('[Gly]', (0, 1, 0, 0, 0)),
        ('[His]', (4, 5, 2, 0, 0)),
        ('[Ile]', (4, 9, 0, 0, 0)),
        ('[Lys]', (4, 10, 1, 0, 0)),
        ('[Leu]', (4, 9, 0, 0, 0)),
        ('[Met]', (3, 7, 0, 0, 1)),
        ('[Asn]', (2, 4, 1, 1, 0)),
        ('[Pro]', (3, 5, 0, 0, 0)),
        ('[Gln]', (3, 6, 1, 1, 0)),
        ('[Arg]', (4, 10, 3, 0, 0)),
        ('[Ser]', (1, 3, 0, 1, 0)),
        ('[Thr]', (2, 5, 0, 1, 0)),
        ('[Val]', (3, 7, 0, 0, 0)),
        ('[Trp]', (9, 8, 1, 0, 0)),
        ('[Tyr]', (7, 7, 0, 1, 0)),
    )

    labels = ['H2O'] + [label for label, _ in sidechains]

    # sidechain + backbone = residue (the backbone row is broadcast)
    residue_counts = np.array([counts for _, counts in sidechains]) + backbone
    table = np.vstack([np.array([water]), residue_counts])

    return pd.DataFrame(table, index=labels, columns=elements)
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def protein_formula(protein: Union[int, List[int], pd.DataFrame],
                    organism: Optional[str] = None,
                    residue: bool = False) -> pd.DataFrame:
    """
    Calculate chemical formulas of proteins.

    The formula of a protein is the sum of its residue formulas (from
    group_formulas()) weighted by the amino acid counts, plus one H2O per
    polypeptide chain.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, forwarded to the inner pinfo() call
    residue : bool, default False
        Divide each formula by the protein length (sum of amino acid counts)

    Returns
    -------
    DataFrame
        One row per protein, labelled ``"<protein>_<organism>"``, with the
        element columns C, H, N, O, S. Repeated labels get the suffixes
        ``.1``, ``.2``, ... from their second occurrence on.

    Raises
    ------
    TypeError
        If the lookup does not produce a DataFrame

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> formula = protein_formula(iprotein)
    """
    composition = pinfo(pinfo(protein, organism))
    if not isinstance(composition, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Row 0 of the group table is H2O, the remaining rows are the residues
    residue_table = group_formulas()
    water_formula = residue_table.iloc[0, :].values.astype(float)
    residue_matrix = residue_table.iloc[1:, :].values.astype(float)

    # Position 4 holds the number of chains, positions 5 to 24 the counts
    n_chains = composition.iloc[:, 4].values.astype(float)
    n_residues = composition.iloc[:, 5:25].values.astype(float)

    # counts x residue formulas, plus one water per chain
    formula_matrix = n_residues @ residue_matrix + np.outer(n_chains, water_formula)

    if residue:
        lengths = composition.iloc[:, 5:25].sum(axis=1).values
        formula_matrix = formula_matrix / lengths[:, np.newaxis]

    labels = composition['protein'] + '_' + composition['organism']
    if labels.duplicated().any():
        # Disambiguate repeated labels: the first keeps its name, later
        # ones are numbered from 1
        occurrences = {}
        deduplicated = []
        for label in labels:
            seen_before = occurrences.get(label, -1) + 1
            occurrences[label] = seen_before
            deduplicated.append(label if seen_before == 0 else f"{label}.{seen_before}")
        labels = deduplicated

    return pd.DataFrame(formula_matrix,
                        index=labels,
                        columns=['C', 'H', 'N', 'O', 'S'])
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def protein_OBIGT(protein: Union[int, List[int], pd.DataFrame],
                  organism: Optional[str] = None,
                  state: Optional[str] = None) -> pd.DataFrame:
    """
    Calculate protein properties using group additivity.

    The properties of each protein are the sum of the properties of the
    [AABB] group (one per chain), the 20 sidechain groups (one per amino
    acid) and the protein backbone group (length minus number of chains),
    all looked up in thermo().obigt for the requested state.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, forwarded to the inner pinfo() call
    state : str, optional
        Physical state ('aq' or 'cr'). If None, uses thermo().opt['state']
        (falling back to 'aq' if that option is absent). 'aq' selects the
        [UPBB] backbone group and the 'HKF' model; any other value selects
        [PBB] and 'CGL'.

    Returns
    -------
    DataFrame
        One row per protein with the header columns name, abbrv, formula,
        state, ref1, ref2, date, model, E_units followed by the summed
        group properties (columns 9 to 21 of thermo().obigt)

    Raises
    ------
    TypeError
        If the lookup does not produce a DataFrame
    ValueError
        If a required group is not found in thermo().obigt for ``state``

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> props = protein_OBIGT(iprotein)
    """
    # Get amino acid composition
    aa = pinfo(pinfo(protein, organism))

    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Get state
    if state is None:
        state = thermo().opt.get('state', 'aq')

    # The name of the protein backbone group and the model depend on the
    # state: [UPBB]/HKF for aq, [PBB]/CGL otherwise
    bbgroup = 'UPBB' if state == 'aq' else 'PBB'
    model = 'HKF' if state == 'aq' else 'CGL'

    # Bracketed names of the AABB, sidechain and protein backbone groups
    aa_cols = aa.columns[5:25].tolist()
    groups = [f"[{g}]" for g in ['AABB'] + aa_cols + [bbgroup]]

    obigt = thermo().obigt

    # Find the index labels of the groups in OBIGT
    in_state = obigt['state'] == state
    igroup = []
    for group_name in groups:
        matches = obigt[(obigt['name'] == group_name) & in_state]
        if len(matches) == 0:
            # Try without brackets if not found
            matches = obigt[(obigt['name'] == group_name.strip('[]')) & in_state]
        if len(matches) == 0:
            raise ValueError(f"Group {group_name} not found in OBIGT for state {state}")
        igroup.append(matches.index[0])

    # The properties are in columns 9 to 21 of thermo().obigt
    # (presumably G, H, S, Cp, V and the equation-of-state parameters up to
    # z.T -- verify against the OBIGT column layout)
    groupprops = obigt.loc[igroup, obigt.columns[9:22]]

    # The elements in each of the groups
    groupelements = i2A(igroup)
    element_names = groupelements.columns

    results = []

    # Process each protein
    for idx in range(len(aa)):
        aa_row = aa.iloc[idx]

        # Numbers of groups: chains [=AABB], sidechains, protein backbone
        nchains = float(aa_row.iloc[4])
        length = float(aa_row.iloc[5:25].sum())
        npbb = length - nchains
        ngroups = np.array([nchains] + aa_row.iloc[5:25].tolist() + [npbb], dtype=float)

        # Thermodynamic properties by group additivity
        eos = (groupprops.values * ngroups[:, np.newaxis]).sum(axis=0)

        # Elemental composition, dropping elements that don't appear
        f_in = (groupelements.values * ngroups[:, np.newaxis]).sum(axis=0).round(3)
        f_dict = {elem: f_in[i] for i, elem in enumerate(element_names) if f_in[i] != 0}

        # Turn it into a formula string
        f = as_chemical_formula(f_dict)

        # Species name
        name = f"{aa_row['protein']}_{aa_row['organism']}"

        # Print message
        print(f"protein_OBIGT: found {name} ({f}, {round(length, 3)} residues)")

        header = {
            'name': name,
            'abbrv': None,
            'formula': f,
            'state': state,
            'ref1': aa_row['ref'],
            'ref2': None,
            'date': None,
            'model': model,
            'E_units': 'cal'
        }

        # Combine header and eos
        results.append({**header, **dict(zip(groupprops.columns, eos))})

    # A DataFrame built from a list of dicts already has a 0..n-1 index
    return pd.DataFrame(results)
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def protein_basis(protein: Union[int, List[int], pd.DataFrame],
                  T: float = 25.0,
                  normalize: bool = False) -> pd.DataFrame:
    """
    Calculate coefficients of basis species in protein formation reactions.

    The protein formulas are passed to species_basis(). If 'H+' is one of
    the currently defined basis species, the value returned by ionize_aa()
    for each protein (at temperature ``T`` and the pH implied by the
    activity of H+) is added to the H+ coefficient.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    T : float, default 25.0
        Temperature in degrees Celsius, passed to ionize_aa()
    normalize : bool, default False
        Divide the coefficients by protein length

    Returns
    -------
    DataFrame
        Coefficients of basis species, one row per protein

    Raises
    ------
    TypeError
        If the lookup does not produce a DataFrame

    Examples
    --------
    >>> from pychnosz import *
    >>> basis("CHNOSe")
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> coeffs = protein_basis(iprotein)
    """
    composition = pinfo(pinfo(protein))
    if not isinstance(composition, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Formation-reaction coefficients from the protein formulas
    coefficients = species_basis(protein_formula(composition))

    system = thermo()
    if system.basis is not None:
        basis_names = system.basis.index.tolist()
        if 'H+' in basis_names:
            # Position of H+ among the basis species; assumes the columns of
            # `coefficients` follow the same order -- TODO confirm
            h_column = basis_names.index('H+')
            pH = -system.basis.loc['H+', 'logact']
            # First row of the ionize_aa() result; presumably one value per
            # protein for the single T/pH condition -- verify against ionize_aa
            charges = ionize_aa(composition, T=T, pH=pH).iloc[0, :]
            coefficients.iloc[:, h_column] = coefficients.iloc[:, h_column] + charges.values

    if normalize:
        coefficients = coefficients.div(protein_length(composition), axis=0)

    return coefficients
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Core thermodynamic calculation functions for CHNOSZ."""
|
|
2
|
+
|
|
3
|
+
from .thermo import ThermoSystem, thermo
|
|
4
|
+
from .info import info, find_species, get_species_data, list_species
|
|
5
|
+
from .basis import basis, get_basis, is_basis_defined, preset_basis, BasisError
|
|
6
|
+
from .species import species, get_species, is_species_defined, n_species, SpeciesError
|
|
7
|
+
from .retrieve import retrieve
|
|
8
|
+
|
|
9
|
+
# Best-effort imports: if one of these submodules fails to import, the
# corresponding name is bound to None instead of breaking the package import.
try:
    from .subcrt import subcrt
except ImportError:
    subcrt = None

try:
    from .affinity import affinity
except ImportError:
    affinity = None

try:
    from .equilibrate import equilibrate
except ImportError:
    equilibrate = None

try:
    from .diagram import diagram
except ImportError:
    diagram = None

# Names that are always exported
__all__ = [
    'ThermoSystem', 'thermo',
    'info', 'find_species', 'get_species_data', 'list_species',
    'basis', 'get_basis', 'is_basis_defined', 'preset_basis', 'BasisError',
    'species', 'get_species', 'is_species_defined', 'n_species', 'SpeciesError',
    'retrieve'
]

# Export the optional functions only when their import succeeded
__all__.extend(
    exported_name
    for exported_name, imported_object in (
        ('subcrt', subcrt),
        ('affinity', affinity),
        ('equilibrate', equilibrate),
        ('diagram', diagram),
    )
    if imported_object is not None
)
|