pychnosz 1.1.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. pychnosz/__init__.py +129 -0
  2. pychnosz/biomolecules/__init__.py +29 -0
  3. pychnosz/biomolecules/ionize_aa.py +197 -0
  4. pychnosz/biomolecules/proteins.py +595 -0
  5. pychnosz/core/__init__.py +46 -0
  6. pychnosz/core/affinity.py +1256 -0
  7. pychnosz/core/animation.py +593 -0
  8. pychnosz/core/balance.py +334 -0
  9. pychnosz/core/basis.py +716 -0
  10. pychnosz/core/diagram.py +3336 -0
  11. pychnosz/core/equilibrate.py +813 -0
  12. pychnosz/core/equilibrium.py +554 -0
  13. pychnosz/core/info.py +821 -0
  14. pychnosz/core/retrieve.py +364 -0
  15. pychnosz/core/speciation.py +580 -0
  16. pychnosz/core/species.py +599 -0
  17. pychnosz/core/subcrt.py +1700 -0
  18. pychnosz/core/thermo.py +593 -0
  19. pychnosz/core/unicurve.py +1226 -0
  20. pychnosz/data/__init__.py +11 -0
  21. pychnosz/data/add_obigt.py +327 -0
  22. pychnosz/data/extdata/Berman/BDat17_2017.csv +2 -0
  23. pychnosz/data/extdata/Berman/Ber88_1988.csv +68 -0
  24. pychnosz/data/extdata/Berman/Ber90_1990.csv +5 -0
  25. pychnosz/data/extdata/Berman/DS10_2010.csv +6 -0
  26. pychnosz/data/extdata/Berman/FDM+14_2014.csv +2 -0
  27. pychnosz/data/extdata/Berman/Got04_2004.csv +5 -0
  28. pychnosz/data/extdata/Berman/JUN92_1992.csv +3 -0
  29. pychnosz/data/extdata/Berman/SHD91_1991.csv +12 -0
  30. pychnosz/data/extdata/Berman/VGT92_1992.csv +2 -0
  31. pychnosz/data/extdata/Berman/VPT01_2001.csv +3 -0
  32. pychnosz/data/extdata/Berman/VPV05_2005.csv +2 -0
  33. pychnosz/data/extdata/Berman/ZS92_1992.csv +11 -0
  34. pychnosz/data/extdata/Berman/sympy.R +99 -0
  35. pychnosz/data/extdata/Berman/testing/BA96.bib +12 -0
  36. pychnosz/data/extdata/Berman/testing/BA96_Berman.csv +21 -0
  37. pychnosz/data/extdata/Berman/testing/BA96_OBIGT.csv +21 -0
  38. pychnosz/data/extdata/Berman/testing/BA96_refs.csv +6 -0
  39. pychnosz/data/extdata/OBIGT/AD.csv +25 -0
  40. pychnosz/data/extdata/OBIGT/Berman_cr.csv +93 -0
  41. pychnosz/data/extdata/OBIGT/DEW.csv +211 -0
  42. pychnosz/data/extdata/OBIGT/H2O_aq.csv +4 -0
  43. pychnosz/data/extdata/OBIGT/SLOP98.csv +411 -0
  44. pychnosz/data/extdata/OBIGT/SUPCRT92.csv +178 -0
  45. pychnosz/data/extdata/OBIGT/inorganic_aq.csv +729 -0
  46. pychnosz/data/extdata/OBIGT/inorganic_cr.csv +273 -0
  47. pychnosz/data/extdata/OBIGT/inorganic_gas.csv +20 -0
  48. pychnosz/data/extdata/OBIGT/organic_aq.csv +1104 -0
  49. pychnosz/data/extdata/OBIGT/organic_cr.csv +481 -0
  50. pychnosz/data/extdata/OBIGT/organic_gas.csv +268 -0
  51. pychnosz/data/extdata/OBIGT/organic_liq.csv +533 -0
  52. pychnosz/data/extdata/OBIGT/testing/GEMSFIT.csv +43 -0
  53. pychnosz/data/extdata/OBIGT/testing/IGEM.csv +17 -0
  54. pychnosz/data/extdata/OBIGT/testing/Sandia.csv +8 -0
  55. pychnosz/data/extdata/OBIGT/testing/SiO2.csv +4 -0
  56. pychnosz/data/extdata/misc/AD03_Fig1a.csv +69 -0
  57. pychnosz/data/extdata/misc/AD03_Fig1b.csv +43 -0
  58. pychnosz/data/extdata/misc/AD03_Fig1c.csv +89 -0
  59. pychnosz/data/extdata/misc/AD03_Fig1d.csv +30 -0
  60. pychnosz/data/extdata/misc/BZA10.csv +5 -0
  61. pychnosz/data/extdata/misc/HW97_Cp.csv +90 -0
  62. pychnosz/data/extdata/misc/HWM96_V.csv +229 -0
  63. pychnosz/data/extdata/misc/LA19_test.csv +7 -0
  64. pychnosz/data/extdata/misc/Mer75_Table4.csv +42 -0
  65. pychnosz/data/extdata/misc/OBIGT_check.csv +423 -0
  66. pychnosz/data/extdata/misc/PM90.csv +7 -0
  67. pychnosz/data/extdata/misc/RH95.csv +23 -0
  68. pychnosz/data/extdata/misc/RH98_Table15.csv +17 -0
  69. pychnosz/data/extdata/misc/SC10_Rainbow.csv +19 -0
  70. pychnosz/data/extdata/misc/SK95.csv +55 -0
  71. pychnosz/data/extdata/misc/SOJSH.csv +61 -0
  72. pychnosz/data/extdata/misc/SS98_Fig5a.csv +81 -0
  73. pychnosz/data/extdata/misc/SS98_Fig5b.csv +84 -0
  74. pychnosz/data/extdata/misc/TKSS14_Fig2.csv +25 -0
  75. pychnosz/data/extdata/misc/bluered.txt +1000 -0
  76. pychnosz/data/extdata/protein/Cas/Cas_aa.csv +177 -0
  77. pychnosz/data/extdata/protein/Cas/Cas_uniprot.csv +186 -0
  78. pychnosz/data/extdata/protein/Cas/download.R +34 -0
  79. pychnosz/data/extdata/protein/Cas/mkaa.R +34 -0
  80. pychnosz/data/extdata/protein/POLG.csv +12 -0
  81. pychnosz/data/extdata/protein/TBD+05.csv +393 -0
  82. pychnosz/data/extdata/protein/TBD+05_aa.csv +393 -0
  83. pychnosz/data/extdata/protein/rubisco.csv +28 -0
  84. pychnosz/data/extdata/protein/rubisco.fasta +239 -0
  85. pychnosz/data/extdata/protein/rubisco_aa.csv +28 -0
  86. pychnosz/data/extdata/src/H2O92D.f.orig +3457 -0
  87. pychnosz/data/extdata/src/README.txt +5 -0
  88. pychnosz/data/extdata/taxonomy/names.dmp +215 -0
  89. pychnosz/data/extdata/taxonomy/nodes.dmp +63 -0
  90. pychnosz/data/extdata/thermo/Bdot_acirc.csv +60 -0
  91. pychnosz/data/extdata/thermo/buffer.csv +40 -0
  92. pychnosz/data/extdata/thermo/element.csv +135 -0
  93. pychnosz/data/extdata/thermo/groups.csv +6 -0
  94. pychnosz/data/extdata/thermo/opt.csv +2 -0
  95. pychnosz/data/extdata/thermo/protein.csv +506 -0
  96. pychnosz/data/extdata/thermo/refs.csv +343 -0
  97. pychnosz/data/extdata/thermo/stoich.csv.xz +0 -0
  98. pychnosz/data/loader.py +431 -0
  99. pychnosz/data/mod_obigt.py +322 -0
  100. pychnosz/data/obigt.py +471 -0
  101. pychnosz/data/worm.py +228 -0
  102. pychnosz/fortran/__init__.py +16 -0
  103. pychnosz/fortran/h2o92.dll +0 -0
  104. pychnosz/fortran/h2o92_interface.py +527 -0
  105. pychnosz/geochemistry/__init__.py +21 -0
  106. pychnosz/geochemistry/minerals.py +514 -0
  107. pychnosz/geochemistry/redox.py +500 -0
  108. pychnosz/models/__init__.py +47 -0
  109. pychnosz/models/archer_wang.py +165 -0
  110. pychnosz/models/berman.py +309 -0
  111. pychnosz/models/cgl.py +381 -0
  112. pychnosz/models/dew.py +997 -0
  113. pychnosz/models/hkf.py +523 -0
  114. pychnosz/models/hkf_helpers.py +222 -0
  115. pychnosz/models/iapws95.py +1113 -0
  116. pychnosz/models/supcrt92_fortran.py +238 -0
  117. pychnosz/models/water.py +480 -0
  118. pychnosz/utils/__init__.py +27 -0
  119. pychnosz/utils/expression.py +1074 -0
  120. pychnosz/utils/formula.py +830 -0
  121. pychnosz/utils/formula_ox.py +227 -0
  122. pychnosz/utils/reset.py +33 -0
  123. pychnosz/utils/units.py +259 -0
  124. pychnosz-1.1.1.dist-info/METADATA +197 -0
  125. pychnosz-1.1.1.dist-info/RECORD +128 -0
  126. pychnosz-1.1.1.dist-info/WHEEL +5 -0
  127. pychnosz-1.1.1.dist-info/licenses/LICENSE.txt +19 -0
  128. pychnosz-1.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,595 @@
1
+ """
2
+ Protein functions for CHNOSZ.
3
+
4
+ This module implements protein-related functions from CHNOSZ including
5
+ add_protein, protein_length, protein_formula, protein_OBIGT, and protein_basis.
6
+ """
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from typing import Union, Optional, List
11
+ import warnings
12
+
13
+ from ..core.thermo import thermo
14
+ from ..utils.formula import i2A, as_chemical_formula, species_basis
15
+ from ..biomolecules.ionize_aa import ionize_aa
16
+
17
+
18
def pinfo(protein: Union[str, int, pd.DataFrame, List],
          organism: Optional[str] = None,
          residue: bool = False,
          regexp: bool = False) -> Union[pd.DataFrame, np.ndarray, int]:
    """
    Get protein information from thermo().protein.

    The behavior depends on the type of ``protein``:

    - DataFrame: a copy of the DataFrame is returned (per residue if
      requested); the protein database is not consulted
    - int or list/array of ints: the matching rows of thermo().protein are
      returned; indices outside ``0 .. len(thermo().protein) - 1`` are
      silently dropped
    - str or list of str: thermo().protein is searched and row number(s)
      are returned

    Parameters
    ----------
    protein : str, int, DataFrame, or list
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier. For exact matching it is joined to each protein
        name as ``protein_organism``; for regexp matching it is used as a
        pattern on the organism column
    residue : bool, default False
        Divide the chains column (4) and the amino acid columns (5:25) by
        the protein length (row sum of columns 5:25)
    regexp : bool, default False
        Use regular expression matching for the protein search

    Returns
    -------
    DataFrame, array, int, or float
        A DataFrame of compositions (DataFrame / integer input), or row
        number(s) for string input: a single int (``np.nan`` when nothing
        matched) for one result, otherwise an array in which unmatched names
        appear as NaN

    Raises
    ------
    RuntimeError
        If the protein database is needed but not loaded
    TypeError
        If ``protein`` is of an unsupported type

    Examples
    --------
    >>> # Get protein by name
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> # Get protein data by row number
    >>> protein_data = pinfo(iprotein)
    """
    # A DataFrame already carries the composition, so this path is handled
    # first and does not depend on the database being loaded.
    if isinstance(protein, pd.DataFrame):
        return _per_residue(protein) if residue else protein.copy()

    t_p = thermo().protein
    if t_p is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    # Numeric input: return rows from thermo().protein
    if isinstance(protein, (int, np.integer)):
        protein = [protein]

    if isinstance(protein, (list, np.ndarray)) and all(
            isinstance(x, (int, np.integer)) for x in protein):
        n_rows = len(t_p)
        valid_indices = [p for p in protein if 0 <= p < n_rows]
        if not valid_indices:
            return pd.DataFrame()
        rows = t_p.iloc[valid_indices]
        return _per_residue(rows) if residue else rows.copy()

    # String input: search for the protein and return row number(s)
    if isinstance(protein, str):
        protein = [protein]

    if isinstance(protein, list) and all(isinstance(x, str) for x in protein):
        matches = []
        if regexp:
            # The organism filter is the same for every pattern
            org_mask = None
            if organism is not None:
                org_mask = t_p['organism'].str.contains(organism, regex=True, na=False)
            for pattern in protein:
                mask = t_p['protein'].str.contains(pattern, regex=True, na=False)
                if org_mask is not None:
                    mask = mask & org_mask
                hits = np.flatnonzero(mask.to_numpy())
                # Every hit is reported; a pattern without hits yields one NaN
                matches.extend(hits.tolist() if len(hits) > 0 else [np.nan])
        else:
            t_p_names = t_p['protein'] + '_' + t_p['organism']
            if organism is None:
                my_names = protein
            else:
                my_names = [f"{p}_{organism}" for p in protein]
            for name in my_names:
                hits = np.flatnonzero((t_p_names == name).to_numpy())
                # Only the first exact match is reported
                matches.append(hits[0] if len(hits) > 0 else np.nan)
        return _collapse_matches(matches)

    raise TypeError(f"Unsupported protein type: {type(protein)}")


def _per_residue(aa: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *aa* with columns 4:25 divided by the row sums of columns 5:25."""
    count_cols = list(aa.columns[4:25])
    if not count_cols:
        return aa.copy()
    # Cast first so fractional values are not written into integer columns
    out = aa.astype({col: float for col in count_cols})
    lengths = out.iloc[:, 5:25].sum(axis=1).to_numpy()
    # Positional (ndarray) assignment avoids index alignment, which would
    # fail when the same row was requested more than once.
    out.iloc[:, 4:25] = out.iloc[:, 4:25].to_numpy() / lengths[:, np.newaxis]
    return out


def _collapse_matches(matches: list):
    """Return a lone match as an int (or NaN); otherwise return all matches as an array."""
    if len(matches) == 1:
        only = matches[0]
        return np.nan if np.isnan(only) else int(only)
    return np.array(matches)
142
+
143
+
144
def add_protein(aa: pd.DataFrame, as_residue: bool = False) -> np.ndarray:
    """
    Add protein amino acid compositions to thermo().protein.

    Proteins whose ``protein_organism`` ID is already present replace the
    existing rows; all others are appended. The input DataFrame itself is
    never modified.

    Parameters
    ----------
    aa : DataFrame
        DataFrame with protein amino acid compositions.
        Must have same columns as thermo().protein
    as_residue : bool, default False
        Normalize the chains column (4) and the amino acid counts
        (columns 5:25) by protein length before storing

    Returns
    -------
    array
        Row numbers of added/updated proteins in thermo().protein

    Raises
    ------
    RuntimeError
        If the protein database is not loaded
    ValueError
        If the columns differ from thermo().protein or IDs are duplicated

    Examples
    --------
    >>> import pandas as pd
    >>> from pychnosz import *
    >>> aa = pd.read_csv("POLG.csv")
    >>> iprotein = add_protein(aa)
    """
    t = thermo()

    if t.protein is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    # Check that columns match
    if list(aa.columns) != list(t.protein.columns):
        raise ValueError("'aa' does not have the same columns as thermo().protein")

    # Nothing to add: leave the database untouched
    if len(aa) == 0:
        return np.array([], dtype=int)

    # Check that new protein IDs are unique
    po = aa['protein'] + '_' + aa['organism']
    idup = po.duplicated()
    if idup.any():
        dup_proteins = po[idup].unique()
        raise ValueError(f"some protein IDs are duplicated: {' '.join(dup_proteins)}")

    # Normalize by protein length if as_residue = True
    if as_residue:
        pl = np.asarray(protein_length(aa), dtype=float)
        # astype returns a new frame, so the caller's data stays intact, and
        # the float cast avoids writing fractions into integer columns.
        aa = aa.astype({col: float for col in aa.columns[4:25]})
        # Columns 4:25 = chains + all 20 amino acids
        aa.iloc[:, 4:25] = aa.iloc[:, 4:25].to_numpy() / pl[:, np.newaxis]

    # Find any protein IDs that are already present (NaN = not present)
    ip = np.atleast_1d(np.asarray(pinfo(po.tolist()), dtype=float))
    ip_present = ~np.isnan(ip)

    tp_new = t.protein.copy()

    # Add new proteins
    if not ip_present.all():
        tp_new = pd.concat([tp_new, aa[~ip_present]], ignore_index=True)

    # Update existing proteins
    if ip_present.any():
        valid_ip = ip[ip_present].astype(int)
        tp_new.iloc[valid_ip] = aa[ip_present].values

    # Update the protein database
    tp_new.reset_index(drop=True, inplace=True)
    t.protein = tp_new

    # Look the IDs up again to get the row numbers after the update
    ip_new = np.atleast_1d(pinfo(po.tolist()))

    # Print messages
    n_added = int((~ip_present).sum())
    n_replaced = int(ip_present.sum())

    if n_added > 0:
        print(f"add_protein: added {n_added} new protein(s) to thermo().protein")
    if n_replaced > 0:
        print(f"add_protein: replaced {n_replaced} existing protein(s) in thermo().protein")

    return ip_new
230
+
231
+
232
def protein_length(protein: Union[int, List[int], pd.DataFrame],
                   organism: Optional[str] = None) -> Union[int, np.ndarray]:
    """
    Calculate the length(s) of proteins.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, passed through to pinfo()

    Returns
    -------
    int or array
        Number of amino acid residues per protein (row sums of columns
        5:25), or 0 if pinfo() did not return a DataFrame

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> length = protein_length(iprotein)
    """
    # Inner pinfo() resolves identifiers; outer pinfo() fetches compositions
    composition = pinfo(pinfo(protein, organism))

    if not isinstance(composition, pd.DataFrame):
        return 0

    # Columns 5:25 hold the amino acid counts
    residue_counts = composition.iloc[:, 5:25]
    return residue_counts.sum(axis=1).values
263
+
264
+
265
def group_formulas() -> pd.DataFrame:
    """
    Return chemical formulas of H2O and the amino acid residues.

    Each residue formula is the sidechain group plus the unfolded protein
    backbone group [UPBB]; the backbone group itself is not a row of the
    result.

    Returns
    -------
    DataFrame
        21 rows (H2O followed by the 20 residues, labelled by their
        sidechain group names) and the element columns C, H, N, O, S
    """
    elements = ['C', 'H', 'N', 'O', 'S']

    water = (0, 2, 0, 1, 0)
    backbone = np.array([2, 2, 1, 1, 0])  # [UPBB]

    # Sidechain group compositions in the order C, H, N, O, S
    sidechains = {
        '[Ala]': (1, 3, 0, 0, 0),
        '[Cys]': (1, 3, 0, 0, 1),
        '[Asp]': (2, 3, 0, 2, 0),
        '[Glu]': (3, 5, 0, 2, 0),
        '[Phe]': (7, 7, 0, 0, 0),
        '[Gly]': (0, 1, 0, 0, 0),
        '[His]': (4, 5, 2, 0, 0),
        '[Ile]': (4, 9, 0, 0, 0),
        '[Lys]': (4, 10, 1, 0, 0),
        '[Leu]': (4, 9, 0, 0, 0),
        '[Met]': (3, 7, 0, 0, 1),
        '[Asn]': (2, 4, 1, 1, 0),
        '[Pro]': (3, 5, 0, 0, 0),
        '[Gln]': (3, 6, 1, 1, 0),
        '[Arg]': (4, 10, 3, 0, 0),
        '[Ser]': (1, 3, 0, 1, 0),
        '[Thr]': (2, 5, 0, 1, 0),
        '[Val]': (3, 7, 0, 0, 0),
        '[Trp]': (9, 8, 1, 0, 0),
        '[Tyr]': (7, 7, 0, 1, 0),
    }

    # residue = sidechain + backbone (broadcast over all 20 rows)
    residues = np.array(list(sidechains.values())) + backbone
    table = np.vstack([np.array([water]), residues])

    return pd.DataFrame(table, index=['H2O', *sidechains], columns=elements)
324
+
325
+
326
def protein_formula(protein: Union[int, List[int], pd.DataFrame],
                    organism: Optional[str] = None,
                    residue: bool = False) -> pd.DataFrame:
    """
    Calculate chemical formulas of proteins.

    The formula is the sum of the residue formulas weighted by the amino
    acid counts, plus one H2O per polypeptide chain.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, passed through to pinfo()
    residue : bool, default False
        Divide each formula by the protein length (per-residue formula)

    Returns
    -------
    DataFrame
        One row per protein (indexed by ``protein_organism``, with a numeric
        suffix appended to repeated names) and the element columns
        C, H, N, O, S

    Raises
    ------
    TypeError
        If pinfo() does not return a DataFrame

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> formula = protein_formula(iprotein)
    """
    composition = pinfo(pinfo(protein, organism))
    if not isinstance(composition, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    group_table = group_formulas()

    # Column 4 is the number of chains; columns 5:25 are the amino acid counts
    counts_frame = composition.iloc[:, 5:25]
    residue_counts = counts_frame.values.astype(float)
    n_chains = composition.iloc[:, 4].values.astype(float)

    # Row 0 of the group table is H2O; the remaining rows are the residues
    residue_formulas = group_table.iloc[1:, :].values.astype(float)
    water_formula = group_table.iloc[0, :].values.astype(float)

    totals = residue_counts.dot(residue_formulas) + np.outer(n_chains, water_formula)

    if residue:
        lengths = counts_frame.sum(axis=1).values
        totals = totals / lengths[:, np.newaxis]

    labels = composition['protein'] + '_' + composition['organism']
    if labels.duplicated().any():
        # Keep the first occurrence as is; later ones get ".1", ".2", ...
        seen = {}
        deduped = []
        for label in labels:
            if label not in seen:
                seen[label] = 0
                deduped.append(label)
                continue
            seen[label] += 1
            deduped.append(f"{label}.{seen[label]}")
        labels = deduped

    return pd.DataFrame(totals, index=labels, columns=['C', 'H', 'N', 'O', 'S'])
396
+
397
+
398
def protein_OBIGT(protein: Union[int, List[int], pd.DataFrame],
                  organism: Optional[str] = None,
                  state: Optional[str] = None) -> pd.DataFrame:
    """
    Calculate protein properties using group additivity.

    The properties of each protein are the sum of the properties of the
    amino acid backbone group [AABB] (one per chain), the sidechain groups
    (one per amino acid) and the protein backbone group (one per residue
    that is not a chain terminus), all taken from thermo().obigt.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, passed through to pinfo()
    state : str, optional
        Physical state ('aq' or 'cr'). If None, uses thermo().opt['state']
        (falling back to 'aq' when that option is absent)

    Returns
    -------
    DataFrame
        One row per protein: header columns (name, abbrv, formula, state,
        ref1, ref2, date, model, E_units) followed by the summed values of
        columns 9:22 of thermo().obigt

    Raises
    ------
    TypeError
        If pinfo() does not return a DataFrame
    ValueError
        If a required group is not found in thermo().obigt for the state

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> props = protein_OBIGT(iprotein)
    """
    # Get amino acid composition
    aa = pinfo(pinfo(protein, organism))

    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    if state is None:
        state = thermo().opt.get('state', 'aq')

    # The protein backbone group depends on the state:
    # [UPBB] for aq, [PBB] otherwise
    bbgroup = 'UPBB' if state == 'aq' else 'PBB'

    # Bracketed names of the AABB, sidechain and protein backbone groups;
    # the sidechain names are the amino acid column names (columns 5:25)
    groups = [f"[{g}]" for g in ['AABB'] + aa.columns[5:25].tolist() + [bbgroup]]

    # Index labels of the groups in thermo().obigt
    obigt = thermo().obigt
    in_state = obigt['state'] == state
    igroup = [_find_group_index(obigt, in_state, g, state) for g in groups]

    # Property columns are positions 9..21 (slice 9:22) of thermo().obigt
    groupprops = obigt.loc[igroup, obigt.columns[9:22]]
    prop_values = groupprops.values

    # The elements in each of the groups
    groupelements = i2A(igroup)
    element_values = groupelements.values
    element_names = groupelements.columns

    # Same for every protein, so computed once outside the loop
    model = 'HKF' if state == 'aq' else 'CGL'

    results = []
    for idx in range(len(aa)):
        aa_row = aa.iloc[idx]
        counts = aa_row.iloc[5:25]

        # Numbers of groups: chains [=AABB], sidechains, protein backbone
        nchains = float(aa_row.iloc[4])
        length = float(counts.sum())
        npbb = length - nchains
        ngroups = np.array([nchains] + counts.tolist() + [npbb], dtype=float)
        weights = ngroups[:, np.newaxis]

        # Thermodynamic properties by group additivity
        eos = (prop_values * weights).sum(axis=0)

        # Elemental composition, dropping elements that do not appear
        f_in = (element_values * weights).sum(axis=0).round(3)
        f_dict = {elem: f_in[i] for i, elem in enumerate(element_names) if f_in[i] != 0}
        f = as_chemical_formula(f_dict)

        name = f"{aa_row['protein']}_{aa_row['organism']}"
        print(f"protein_OBIGT: found {name} ({f}, {round(length, 3)} residues)")

        header = {
            'name': name,
            'abbrv': None,
            'formula': f,
            'state': state,
            'ref1': aa_row['ref'],
            'ref2': None,
            'date': None,
            'model': model,
            'E_units': 'cal'
        }

        # Combine header and eos
        results.append({**header, **dict(zip(groupprops.columns, eos))})

    out = pd.DataFrame(results)
    out.reset_index(drop=True, inplace=True)

    return out


def _find_group_index(obigt: pd.DataFrame, in_state: pd.Series, group_name: str, state: str):
    """Return the index label of the first row of *obigt* named *group_name* in *state*."""
    # Try the bracketed name first, then the bare name as a fallback
    for candidate in (group_name, group_name.strip('[]')):
        matches = obigt[(obigt['name'] == candidate) & in_state]
        if len(matches) > 0:
            return matches.index[0]
    raise ValueError(f"Group {group_name} not found in OBIGT for state {state}")
539
+
540
+
541
def protein_basis(protein: Union[int, List[int], pd.DataFrame],
                  T: float = 25.0,
                  normalize: bool = False) -> pd.DataFrame:
    """
    Calculate coefficients of basis species in protein formation reactions.

    When H+ is among the currently defined basis species, the result of
    ionize_aa() (evaluated at T and at pH = -logact of H+) is added to the
    H+ coefficients.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    T : float, default 25.0
        Temperature in degrees Celsius
    normalize : bool, default False
        Divide the coefficients by protein length

    Returns
    -------
    DataFrame
        Coefficients of basis species

    Raises
    ------
    TypeError
        If pinfo() does not return a DataFrame

    Examples
    --------
    >>> from pychnosz import *
    >>> basis("CHNOSe")
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> coeffs = protein_basis(iprotein)
    """
    composition = pinfo(pinfo(protein))
    if not isinstance(composition, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Formulas -> coefficients of the basis species in formation reactions
    coeffs = species_basis(protein_formula(composition))

    system = thermo()
    basis_names = [] if system.basis is None else system.basis.index.tolist()
    if 'H+' in basis_names:
        hplus_col = basis_names.index('H+')
        pH = -system.basis.loc['H+', 'logact']
        # First row of the ionize_aa() result
        charges = ionize_aa(composition, T=T, pH=pH).iloc[0, :]
        coeffs.iloc[:, hplus_col] = coeffs.iloc[:, hplus_col] + charges.values

    if normalize:
        lengths = protein_length(composition)
        coeffs = coeffs.div(lengths, axis=0)

    return coeffs
@@ -0,0 +1,46 @@
1
+ """Core thermodynamic calculation functions for CHNOSZ."""
2
+
3
+ from .thermo import ThermoSystem, thermo
4
+ from .info import info, find_species, get_species_data, list_species
5
+ from .basis import basis, get_basis, is_basis_defined, preset_basis, BasisError
6
+ from .species import species, get_species, is_species_defined, n_species, SpeciesError
7
+ from .retrieve import retrieve
8
+
9
# Higher-level entry points are imported defensively: when one of these
# submodules cannot be imported, its name is bound to None instead of
# making the whole package import fail.
try:
    from .subcrt import subcrt
except ImportError:
    subcrt = None

try:
    from .affinity import affinity
except ImportError:
    affinity = None

try:
    from .equilibrate import equilibrate
except ImportError:
    equilibrate = None

try:
    from .diagram import diagram
except ImportError:
    diagram = None

# Names that are always exported
__all__ = [
    'ThermoSystem', 'thermo',
    'info', 'find_species', 'get_species_data', 'list_species',
    'basis', 'get_basis', 'is_basis_defined', 'preset_basis', 'BasisError',
    'species', 'get_species', 'is_species_defined', 'n_species', 'SpeciesError',
    'retrieve'
]

# Export only those optional functions that were actually imported
__all__ += [
    label
    for label, obj in (
        ('subcrt', subcrt),
        ('affinity', affinity),
        ('equilibrate', equilibrate),
        ('diagram', diagram),
    )
    if obj is not None
]