pychnosz 1.1.11__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pychnosz/__init__.py +129 -0
- pychnosz/biomolecules/__init__.py +29 -0
- pychnosz/biomolecules/ionize_aa.py +197 -0
- pychnosz/biomolecules/proteins.py +595 -0
- pychnosz/core/__init__.py +46 -0
- pychnosz/core/affinity.py +1256 -0
- pychnosz/core/animation.py +593 -0
- pychnosz/core/balance.py +334 -0
- pychnosz/core/basis.py +716 -0
- pychnosz/core/diagram.py +3336 -0
- pychnosz/core/equilibrate.py +813 -0
- pychnosz/core/equilibrium.py +554 -0
- pychnosz/core/info.py +821 -0
- pychnosz/core/retrieve.py +364 -0
- pychnosz/core/speciation.py +580 -0
- pychnosz/core/species.py +599 -0
- pychnosz/core/subcrt.py +1696 -0
- pychnosz/core/thermo.py +593 -0
- pychnosz/core/unicurve.py +1226 -0
- pychnosz/data/__init__.py +11 -0
- pychnosz/data/add_obigt.py +327 -0
- pychnosz/data/extdata/Berman/BDat17_2017.csv +2 -0
- pychnosz/data/extdata/Berman/Ber88_1988.csv +68 -0
- pychnosz/data/extdata/Berman/Ber90_1990.csv +5 -0
- pychnosz/data/extdata/Berman/DS10_2010.csv +6 -0
- pychnosz/data/extdata/Berman/FDM+14_2014.csv +2 -0
- pychnosz/data/extdata/Berman/Got04_2004.csv +5 -0
- pychnosz/data/extdata/Berman/JUN92_1992.csv +3 -0
- pychnosz/data/extdata/Berman/SHD91_1991.csv +12 -0
- pychnosz/data/extdata/Berman/VGT92_1992.csv +2 -0
- pychnosz/data/extdata/Berman/VPT01_2001.csv +3 -0
- pychnosz/data/extdata/Berman/VPV05_2005.csv +2 -0
- pychnosz/data/extdata/Berman/ZS92_1992.csv +11 -0
- pychnosz/data/extdata/Berman/sympy.R +99 -0
- pychnosz/data/extdata/Berman/testing/BA96.bib +12 -0
- pychnosz/data/extdata/Berman/testing/BA96_Berman.csv +21 -0
- pychnosz/data/extdata/Berman/testing/BA96_OBIGT.csv +21 -0
- pychnosz/data/extdata/Berman/testing/BA96_refs.csv +6 -0
- pychnosz/data/extdata/OBIGT/AD.csv +25 -0
- pychnosz/data/extdata/OBIGT/Berman_cr.csv +93 -0
- pychnosz/data/extdata/OBIGT/DEW.csv +211 -0
- pychnosz/data/extdata/OBIGT/H2O_aq.csv +4 -0
- pychnosz/data/extdata/OBIGT/SLOP98.csv +411 -0
- pychnosz/data/extdata/OBIGT/SUPCRT92.csv +178 -0
- pychnosz/data/extdata/OBIGT/inorganic_aq.csv +729 -0
- pychnosz/data/extdata/OBIGT/inorganic_cr.csv +273 -0
- pychnosz/data/extdata/OBIGT/inorganic_gas.csv +20 -0
- pychnosz/data/extdata/OBIGT/organic_aq.csv +1104 -0
- pychnosz/data/extdata/OBIGT/organic_cr.csv +481 -0
- pychnosz/data/extdata/OBIGT/organic_gas.csv +268 -0
- pychnosz/data/extdata/OBIGT/organic_liq.csv +533 -0
- pychnosz/data/extdata/OBIGT/testing/GEMSFIT.csv +43 -0
- pychnosz/data/extdata/OBIGT/testing/IGEM.csv +17 -0
- pychnosz/data/extdata/OBIGT/testing/Sandia.csv +8 -0
- pychnosz/data/extdata/OBIGT/testing/SiO2.csv +4 -0
- pychnosz/data/extdata/misc/AD03_Fig1a.csv +69 -0
- pychnosz/data/extdata/misc/AD03_Fig1b.csv +43 -0
- pychnosz/data/extdata/misc/AD03_Fig1c.csv +89 -0
- pychnosz/data/extdata/misc/AD03_Fig1d.csv +30 -0
- pychnosz/data/extdata/misc/BZA10.csv +5 -0
- pychnosz/data/extdata/misc/HW97_Cp.csv +90 -0
- pychnosz/data/extdata/misc/HWM96_V.csv +229 -0
- pychnosz/data/extdata/misc/LA19_test.csv +7 -0
- pychnosz/data/extdata/misc/Mer75_Table4.csv +42 -0
- pychnosz/data/extdata/misc/OBIGT_check.csv +423 -0
- pychnosz/data/extdata/misc/PM90.csv +7 -0
- pychnosz/data/extdata/misc/RH95.csv +23 -0
- pychnosz/data/extdata/misc/RH98_Table15.csv +17 -0
- pychnosz/data/extdata/misc/SC10_Rainbow.csv +19 -0
- pychnosz/data/extdata/misc/SK95.csv +55 -0
- pychnosz/data/extdata/misc/SOJSH.csv +61 -0
- pychnosz/data/extdata/misc/SS98_Fig5a.csv +81 -0
- pychnosz/data/extdata/misc/SS98_Fig5b.csv +84 -0
- pychnosz/data/extdata/misc/TKSS14_Fig2.csv +25 -0
- pychnosz/data/extdata/misc/bluered.txt +1000 -0
- pychnosz/data/extdata/protein/Cas/Cas_aa.csv +177 -0
- pychnosz/data/extdata/protein/Cas/Cas_uniprot.csv +186 -0
- pychnosz/data/extdata/protein/Cas/download.R +34 -0
- pychnosz/data/extdata/protein/Cas/mkaa.R +34 -0
- pychnosz/data/extdata/protein/POLG.csv +12 -0
- pychnosz/data/extdata/protein/TBD+05.csv +393 -0
- pychnosz/data/extdata/protein/TBD+05_aa.csv +393 -0
- pychnosz/data/extdata/protein/rubisco.csv +28 -0
- pychnosz/data/extdata/protein/rubisco.fasta +239 -0
- pychnosz/data/extdata/protein/rubisco_aa.csv +28 -0
- pychnosz/data/extdata/src/H2O92D.f.orig +3457 -0
- pychnosz/data/extdata/src/README.txt +5 -0
- pychnosz/data/extdata/taxonomy/names.dmp +215 -0
- pychnosz/data/extdata/taxonomy/nodes.dmp +63 -0
- pychnosz/data/extdata/thermo/Bdot_acirc.csv +60 -0
- pychnosz/data/extdata/thermo/buffer.csv +40 -0
- pychnosz/data/extdata/thermo/element.csv +135 -0
- pychnosz/data/extdata/thermo/groups.csv +6 -0
- pychnosz/data/extdata/thermo/opt.csv +2 -0
- pychnosz/data/extdata/thermo/protein.csv +506 -0
- pychnosz/data/extdata/thermo/refs.csv +343 -0
- pychnosz/data/extdata/thermo/stoich.csv.xz +0 -0
- pychnosz/data/loader.py +431 -0
- pychnosz/data/mod_obigt.py +322 -0
- pychnosz/data/obigt.py +471 -0
- pychnosz/data/worm.py +228 -0
- pychnosz/fortran/__init__.py +16 -0
- pychnosz/fortran/h2o92.dll +0 -0
- pychnosz/fortran/h2o92_interface.py +527 -0
- pychnosz/geochemistry/__init__.py +21 -0
- pychnosz/geochemistry/minerals.py +514 -0
- pychnosz/geochemistry/redox.py +500 -0
- pychnosz/models/__init__.py +47 -0
- pychnosz/models/archer_wang.py +165 -0
- pychnosz/models/berman.py +309 -0
- pychnosz/models/cgl.py +381 -0
- pychnosz/models/dew.py +997 -0
- pychnosz/models/hkf.py +523 -0
- pychnosz/models/hkf_helpers.py +231 -0
- pychnosz/models/iapws95.py +1113 -0
- pychnosz/models/supcrt92_fortran.py +238 -0
- pychnosz/models/water.py +480 -0
- pychnosz/utils/__init__.py +27 -0
- pychnosz/utils/expression.py +1074 -0
- pychnosz/utils/formula.py +830 -0
- pychnosz/utils/formula_ox.py +227 -0
- pychnosz/utils/reset.py +33 -0
- pychnosz/utils/units.py +259 -0
- pychnosz-1.1.11.dist-info/METADATA +197 -0
- pychnosz-1.1.11.dist-info/RECORD +128 -0
- pychnosz-1.1.11.dist-info/WHEEL +5 -0
- pychnosz-1.1.11.dist-info/licenses/LICENSE.txt +19 -0
- pychnosz-1.1.11.dist-info/top_level.txt +1 -0
pychnosz/data/obigt.py
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OBIGT database access module.
|
|
3
|
+
|
|
4
|
+
This module provides a high-level interface to the OBIGT database (the name
|
|
5
|
+
derives from the OrganoBioGeoTherm software), which contains thermodynamic
|
|
6
|
+
parameters for chemical species.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
from typing import Dict, List, Optional, Union, Tuple
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from .loader import DataLoader
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OBIGTDatabase:
    """
    High-level interface to the OBIGT thermodynamic database.

    The class loads every OBIGT file through a ``DataLoader``, stacks them
    into one table and offers lookup, search, validation and export helpers.

    The combined table uses a 1-based index so that row labels match the row
    numbers of the R implementation. All internal lookups therefore address
    rows by *label* (``.loc``), never by position.
    """

    def __init__(self, data_loader: Optional[DataLoader] = None):
        """
        Initialize the OBIGT database.

        Parameters:
        -----------
        data_loader : DataLoader, optional
            DataLoader instance to use. If None, creates a default loader.
        """
        if data_loader is None:
            from .loader import get_default_loader
            self.loader = get_default_loader()
        else:
            self.loader = data_loader

        # Cache for the combined table and the lookup index built from it
        self._combined_data = None
        self._species_index = None

        # Define the expected columns for OBIGT data
        self.obigt_columns = [
            'name', 'abbrv', 'formula', 'state', 'ref1', 'ref2', 'date', 'model',
            'E_units', 'G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d',
            'c1.e', 'c2.f', 'omega.lambda', 'z.T'
        ]

        # State classifications
        self.aqueous_states = ['aq']
        self.crystalline_states = ['cr']
        self.gas_states = ['gas']
        self.liquid_states = ['liq']

    def _ensure_loaded(self) -> None:
        """Load the combined table and build the lookup index if not done yet."""
        if self._combined_data is None:
            self.load_all_data()
        if self._species_index is None:
            self._create_species_index()

    @staticmethod
    def _literal_contains(series: pd.Series, text: str) -> pd.Series:
        """
        Case-insensitive *literal* substring test for every value of a column.

        Chemical identifiers such as ``Ca+2`` or ``H2O(l)`` contain regex
        metacharacters, so the text is never interpreted as a pattern.
        Missing values never match.
        """
        hits = series.astype(str).str.contains(text, case=False, na=False, regex=False)
        return hits & series.notna()

    def load_all_data(self, force_reload: bool = False) -> pd.DataFrame:
        """
        Load and combine all OBIGT data files.

        Parameters:
        -----------
        force_reload : bool, default False
            Force reloading of data even if cached

        Returns:
        --------
        pd.DataFrame
            Copy of the combined OBIGT database (1-based index, with a
            ``source_file`` column naming the file each row came from)
        """
        if self._combined_data is not None and not force_reload:
            return self._combined_data.copy()

        obigt_files = self.loader.load_all_obigt_files()

        # Tag every row with its source file before stacking the tables
        frames = [df.assign(source_file=filename) for filename, df in obigt_files.items()]
        combined = pd.concat(frames, ignore_index=True)

        # IMPORTANT: R uses 1-based indexing, so the index is shifted to match
        # R's row numbers. Row 0 in pandas becomes row 1.
        combined.index = combined.index + 1

        self._combined_data = combined
        self._create_species_index()

        return self._combined_data.copy()

    def get_combined_data(self) -> pd.DataFrame:
        """
        Get combined OBIGT thermodynamic data.

        Falls back to a small built-in table of essential species when the
        data files cannot be loaded (a warning is printed in that case).

        Returns
        -------
        pd.DataFrame
            Copy of the combined OBIGT data with all species
        """
        if self._combined_data is not None:
            return self._combined_data.copy()

        try:
            return self.load_all_data()
        except Exception as e:
            # Deliberate best-effort: keep basic functionality without the files
            print(f"Warning: Could not load OBIGT data: {e}")
            return self._create_fallback_data()

    def _create_fallback_data(self) -> pd.DataFrame:
        """
        Create, cache and return minimal fallback data for essential species.

        The values are approximate and only meant to keep basic functionality
        available. The frame uses the same 1-based index as the loaded data.
        """
        species = ['water', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2']
        formulas = ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2']
        count = len(species)

        fallback_data = {
            'name': species,
            'abbrv': list(formulas),
            'formula': list(formulas),
            'state': ['liq', 'aq', 'aq', 'aq', 'aq', 'aq'],
            'G': [-56688.1, 0.0, -37595.0, -92307.0, -140314.0, -126172.0],
            'H': [-68317.0, 0.0, -54977.0, -98900.0, -165180.0, -161963.0],
            'S': [16.712, 0.0, -2.56, -39.75, 98.4, -50.0],
            'Cp': [18.0, 0.0, -36.4, 37.11, 25.0, -53.1],
            'V': [18.068, 0.0, -4.71, 34.0, 25.0, -6.0],
            'z.T': [0, 1, -1, 0, -1, -2],
        }
        # Bookkeeping columns carry no information in the fallback table
        for column in ('ref1', 'ref2', 'date', 'model', 'E_units'):
            fallback_data[column] = [''] * count
        # Equation-of-state parameters are not available: filled with zeros
        for column in ('a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e', 'c2.f', 'omega.lambda'):
            fallback_data[column] = [0.0] * count

        df = pd.DataFrame(fallback_data)
        # Same 1-based row labels as the table built by load_all_data()
        df.index = df.index + 1

        # Cache the fallback data
        self._combined_data = df
        self._create_species_index()

        return df.copy()

    def _create_species_index(self):
        """
        Create an index for fast species lookups.

        The index maps three kinds of keys to lists of row labels:
        ``name``, ``formula:<formula>`` and ``name(state)``. Rows with a
        missing or blank value are not indexed under that key.
        """
        if self._combined_data is None:
            return

        data = self._combined_data

        def _cleaned(column: str) -> List[str]:
            # Missing columns/values become '' so they are skipped below
            if column not in data.columns:
                return [''] * len(data)
            return ['' if pd.isna(value) else str(value).strip() for value in data[column]]

        index: Dict[str, list] = {}
        rows = zip(data.index, _cleaned('name'), _cleaned('formula'), _cleaned('state'))
        for label, name, formula, state in rows:
            if name:
                index.setdefault(name, []).append(label)
            if formula:
                index.setdefault(f"formula:{formula}", []).append(label)
            if name and state:
                index.setdefault(f"{name}({state})", []).append(label)

        self._species_index = index

    def get_species(self, identifier: str, state: Optional[str] = None) -> pd.DataFrame:
        """
        Get species data by name, formula, or identifier.

        Exact matches (by name, by formula, by ``name(state)``) are tried
        first; each matching row is returned once. Only when there is no
        exact match is a case-insensitive literal substring search over the
        ``name`` and ``formula`` columns performed.

        Parameters:
        -----------
        identifier : str
            Species name, formula, or identifier
        state : str, optional
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Matching species data, re-indexed from 0 (empty if nothing matches)
        """
        self._ensure_loaded()
        data = self._combined_data
        index = self._species_index

        keys = [identifier, f"formula:{identifier}"]
        if state:
            keys.append(f"{identifier}({state})")

        # Collect row labels in lookup order, dropping repeats (a species whose
        # name equals its formula would otherwise be returned several times)
        labels = list(dict.fromkeys(
            label for key in keys for label in index.get(key, ())
        ))

        if labels:
            # The index stores labels of the 1-based index, hence .loc
            matches = data.loc[labels]
            if state is not None:
                keep = matches['state'].astype(str).str.strip() == state
                matches = matches[keep]
        else:
            matches = data.iloc[0:0]

        # If no exact matches, try partial matching
        if len(matches) == 0:
            mask = (self._literal_contains(data['name'], identifier)
                    | self._literal_contains(data['formula'], identifier))
            if state:
                mask &= (data['state'] == state)
            matches = data[mask]

        if len(matches) == 0:
            return pd.DataFrame(columns=data.columns)
        return matches.reset_index(drop=True)

    def search_species(self, query: str, search_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for species using a text query.

        The query is matched as a case-insensitive literal substring.

        Parameters:
        -----------
        query : str
            Search query
        search_columns : List[str], optional
            Columns to search in. Default: ['name', 'formula', 'abbrv'].
            Columns that do not exist in the data are skipped.

        Returns:
        --------
        pd.DataFrame
            Matching species data, re-indexed from 0
        """
        self._ensure_loaded()
        data = self._combined_data

        if search_columns is None:
            search_columns = ['name', 'formula', 'abbrv']

        # The mask must share the data's (1-based) index to combine correctly
        mask = pd.Series(False, index=data.index)
        for col in search_columns:
            if col in data.columns:
                mask |= self._literal_contains(data[col], query)

        return data[mask].reset_index(drop=True)

    def get_species_by_state(self, state: str) -> pd.DataFrame:
        """
        Get all species in a specific physical state.

        Parameters:
        -----------
        state : str
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Species data for the specified state, re-indexed from 0
        """
        self._ensure_loaded()
        data = self._combined_data
        return data[data['state'] == state].reset_index(drop=True)

    def get_aqueous_species(self) -> pd.DataFrame:
        """Get all aqueous species."""
        return self.get_species_by_state('aq')

    def get_crystalline_species(self) -> pd.DataFrame:
        """Get all crystalline species."""
        return self.get_species_by_state('cr')

    def get_gas_species(self) -> pd.DataFrame:
        """Get all gas species."""
        return self.get_species_by_state('gas')

    def get_liquid_species(self) -> pd.DataFrame:
        """Get all liquid species."""
        return self.get_species_by_state('liq')

    def get_species_by_elements(self, elements: List[str]) -> pd.DataFrame:
        """
        Get species whose formula contains at least one of the given elements.

        Element symbols are matched as whole symbols: 'C' matches 'CO2' and
        'CaCO3' (through the carbonate carbon) but not 'Ca+2' or 'Cl-'.
        Symbols may be given in any letter case.

        Parameters:
        -----------
        elements : List[str]
            List of element symbols

        Returns:
        --------
        pd.DataFrame
            Species containing any of the specified elements, re-indexed
            from 0 (empty when no element symbols are given)
        """
        import re

        self._ensure_loaded()
        data = self._combined_data

        # Normalise to the conventional capitalisation used in formulas
        symbols = [
            re.escape(symbol[0].upper() + symbol[1:].lower())
            for symbol in (str(element).strip() for element in elements)
            if symbol
        ]
        if not symbols:
            return data.iloc[0:0].reset_index(drop=True)

        # A symbol ends where the next character is not a lowercase letter;
        # matching is case-sensitive so that 'Co' is not found in 'CO2'.
        pattern = '(?:' + '|'.join(symbols) + ')(?![a-z])'
        mask = data['formula'].astype(str).str.contains(pattern, case=True, na=False, regex=True)
        mask &= data['formula'].notna()

        return data[mask].reset_index(drop=True)

    def get_thermodynamic_properties(self, species_data: pd.DataFrame) -> pd.DataFrame:
        """
        Extract thermodynamic properties from species data.

        Parameters:
        -----------
        species_data : pd.DataFrame
            Species data from get_species or similar methods; must contain
            the 'name', 'formula' and 'state' columns

        Returns:
        --------
        pd.DataFrame
            Identification columns followed by the thermodynamic properties
            (G, H, S, Cp, V, etc.) that are present, converted to numeric
            values; entries that cannot be converted become NaN
        """
        thermo_columns = ['G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d',
                          'c1.e', 'c2.f', 'omega.lambda', 'z.T']

        available_columns = [col for col in thermo_columns if col in species_data.columns]

        result = species_data[['name', 'formula', 'state'] + available_columns].copy()

        # Convert numeric columns to proper numeric types
        for col in available_columns:
            result[col] = pd.to_numeric(result[col], errors='coerce')

        return result

    def get_database_stats(self) -> Dict[str, Union[int, Dict[str, int]]]:
        """
        Get statistics about the database.

        Returns:
        --------
        Dict
            Database statistics: 'total_species', 'states' (count per state),
            'source_files' (count per source file; empty when the data has no
            'source_file' column, as is the case for the fallback table),
            'unique_names' and 'unique_formulas'
        """
        self._ensure_loaded()
        data = self._combined_data

        if 'source_file' in data.columns:
            source_files = data['source_file'].value_counts().to_dict()
        else:
            source_files = {}

        return {
            'total_species': len(data),
            'states': data['state'].value_counts().to_dict(),
            'source_files': source_files,
            'unique_names': data['name'].nunique(),
            'unique_formulas': data['formula'].nunique(),
        }

    def validate_data(self) -> Dict[str, List]:
        """
        Validate the OBIGT database for common issues.

        Returns:
        --------
        Dict
            Validation results. Every value is a list of row labels, except
            'invalid_numeric_values', which holds (row label, column) tuples.
        """
        self._ensure_loaded()
        data = self._combined_data

        def _blank_rows(column: str) -> list:
            # Row labels where a critical field is missing or empty
            values = data[column]
            return data.index[values.isna() | (values == '')].tolist()

        issues = {
            'missing_names': _blank_rows('name'),
            'missing_formulas': _blank_rows('formula'),
            'missing_states': _blank_rows('state'),
            'invalid_numeric_values': [],
            'duplicate_entries': []
        }

        # Values that are present but cannot be parsed as numbers
        for col in ['G', 'H', 'S', 'Cp']:
            if col in data.columns:
                numeric_data = pd.to_numeric(data[col], errors='coerce')
                invalid_mask = numeric_data.isna() & data[col].notna()
                issues['invalid_numeric_values'].extend(
                    (idx, col) for idx in data.index[invalid_mask]
                )

        # Rows sharing name, formula and state are reported as potential duplicates
        duplicate_mask = data.duplicated(subset=['name', 'formula', 'state'], keep=False)
        issues['duplicate_entries'] = data.index[duplicate_mask].tolist()

        return issues

    def export_to_csv(self, filename: str, species_filter: Optional[str] = None):
        """
        Export database or filtered data to CSV.

        Parameters:
        -----------
        filename : str
            Output filename
        species_filter : str, optional
            Filter to apply (state name: 'aq', 'cr', 'gas' or 'liq'). An
            unrecognised filter is ignored with a warning and the whole
            database is exported.
        """
        self._ensure_loaded()

        data_to_export = self._combined_data

        if species_filter:
            known_states = (self.aqueous_states + self.crystalline_states
                            + self.gas_states + self.liquid_states)
            if species_filter in known_states:
                data_to_export = self.get_species_by_state(species_filter)
            else:
                import warnings
                warnings.warn(
                    f"Unknown species_filter {species_filter!r}; "
                    "exporting the complete database",
                    stacklevel=2,
                )

        data_to_export.to_csv(filename, index=False)
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def get_default_obigt() -> OBIGTDatabase:
    """
    Build an OBIGT database object with default settings.

    Returns:
    --------
    OBIGTDatabase
        A new instance created without an explicit data loader
    """
    database = OBIGTDatabase()
    return database
|
pychnosz/data/worm.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""
|
|
2
|
+
WORM database loader for CHNOSZ.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to load the Water-Organic-Rock-Microbe (WORM)
|
|
5
|
+
thermodynamic database from the WORM-db GitHub repository.
|
|
6
|
+
|
|
7
|
+
Reference: https://github.com/worm-portal/WORM-db
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from io import StringIO
|
|
12
|
+
from urllib.request import urlopen
|
|
13
|
+
from typing import Optional, Tuple
|
|
14
|
+
import warnings
|
|
15
|
+
|
|
16
|
+
from ..core.thermo import thermo
|
|
17
|
+
from .add_obigt import add_OBIGT
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def can_connect_to(url: str, timeout: int = 5) -> bool:
    """
    Report whether a URL answers with HTTP status 200.

    Parameters
    ----------
    url : str
        The URL to probe
    timeout : int, default 5
        Connection timeout in seconds

    Returns
    -------
    bool
        True if the request succeeded with status 200; False for any other
        status or if any exception occurred while making the request
    """
    try:
        from urllib.request import Request

        # A browser-like User-Agent header is sent with the request
        probe = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(probe, timeout=timeout) as reply:
            reachable = reply.status == 200
    except Exception:
        reachable = False
    return reachable
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def download_worm_data(url: str) -> Optional[pd.DataFrame]:
    """
    Fetch a comma-separated file from a URL and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        URL to the WORM CSV file

    Returns
    -------
    pd.DataFrame or None
        The parsed table; None (after issuing a warning) if the download,
        the UTF-8 decoding or the CSV parsing fails
    """
    try:
        from urllib.request import Request

        request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(request, timeout=30) as webpage:
            payload = webpage.read()
        text = payload.decode('utf-8')
        table = pd.read_csv(StringIO(text), sep=",")
    except Exception as exc:
        warnings.warn(f"Failed to download WORM data from {url}: {exc}")
        table = None
    return table
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def load_WORM(keep_default: bool = False, messages: bool = True) -> bool:
    """
    Load the WORM (Water-Organic-Rock-Microbe) thermodynamic database.

    This function downloads and loads the WORM database from the WORM-db GitHub
    repository. By default, it replaces the OBIGT database with WORM data,
    keeping only water, H+, and e- from the original database.

    If adding the WORM species fails after the OBIGT table was reduced to the
    essential species, the previous OBIGT table is restored, so a failed call
    does not leave a truncated database behind.

    Parameters
    ----------
    keep_default : bool, default False
        If False, replace OBIGT with minimal species (water, H+, e-) before
        loading WORM. If True, add WORM species to the existing OBIGT database.
    messages : bool, default True
        Whether to print informational messages

    Returns
    -------
    bool
        True if WORM database was loaded successfully, False otherwise

    Examples
    --------
    >>> import pychnosz
    >>> pychnosz.reset()
    >>> # Load WORM database (replaces default OBIGT)
    >>> pychnosz.load_WORM()
    >>>
    >>> # Load WORM database while keeping default OBIGT species
    >>> pychnosz.reset()
    >>> pychnosz.load_WORM(keep_default=True)

    Notes
    -----
    The WORM database is downloaded from:
    - Species data: https://raw.githubusercontent.com/worm-portal/WORM-db/master/wrm_data_latest.csv
    - References: https://raw.githubusercontent.com/worm-portal/WORM-db/master/references.csv

    This feature is exclusive to the Python version of CHNOSZ.
    """

    # WORM database URLs
    url_data = "https://raw.githubusercontent.com/worm-portal/WORM-db/master/wrm_data_latest.csv"
    url_refs = "https://raw.githubusercontent.com/worm-portal/WORM-db/master/references.csv"

    # Name for source_file column
    worm_source_name = "wrm_data_latest.csv"

    # Check if we can connect to the WORM database
    if not can_connect_to(url_data):
        if messages:
            print("load_WORM: could not reach WORM database repository")
        return False

    # Download WORM species data
    worm_data = download_worm_data(url_data)
    if worm_data is None:
        if messages:
            print("load_WORM: failed to download WORM species data")
        return False

    # Get the thermodynamic system
    thermo_sys = thermo()

    # Set only when the OBIGT table is actually replaced below; used to roll
    # back if the WORM species cannot be added afterwards.
    previous_obigt = None

    if not keep_default:
        # Keep only essential species (water, H+, e-)
        from ..core.info import info
        try:
            # Get indices for essential species
            essential_species = []
            for species in ["water", "H+", "e-"]:
                idx = info(species)
                if idx is None:
                    continue
                if isinstance(idx, (list, tuple)):
                    essential_species.extend(idx)
                else:
                    essential_species.append(idx)

            if essential_species:
                current_obigt = thermo_sys.obigt
                # .copy() keeps the reduced table independent of the original
                minimal_obigt = current_obigt.loc[essential_species].copy()
                thermo_sys.obigt = minimal_obigt
                previous_obigt = current_obigt
        except Exception as e:
            # Best-effort: continue with whatever table is currently active
            if messages:
                print(f"load_WORM: warning - error keeping essential species: {e}")

    # Add WORM species data (suppress add_OBIGT messages)
    try:
        # Add source_file column to worm_data before adding
        worm_data['source_file'] = worm_source_name

        add_OBIGT(worm_data, messages=False)
    except Exception as e:
        if previous_obigt is not None:
            # Undo the reduction so callers really keep the previous database
            # NOTE(review): assumes add_OBIGT did not modify the previous table
            # object itself - confirm against add_OBIGT
            thermo_sys.obigt = previous_obigt
        if messages:
            print(f"load_WORM: failed to add WORM species: {e}")
        return False

    # Try to download and load WORM references
    if can_connect_to(url_refs):
        worm_refs = download_worm_data(url_refs)
        if worm_refs is not None:
            # Replace refs with WORM refs
            thermo_sys.refs = worm_refs

    # Rebuild the name/formula_ox table from the final OBIGT table when that
    # column exists, so it is consistent with the loaded species
    if 'formula_ox' in thermo_sys.obigt.columns:
        formula_ox_df = pd.DataFrame({
            'name': thermo_sys.obigt['name'],
            'formula_ox': thermo_sys.obigt['formula_ox']
        })
        formula_ox_df.index = thermo_sys.obigt.index
        thermo_sys.formula_ox = formula_ox_df

    # Print single summary message
    if messages:
        final_obigt = thermo_sys.obigt
        total_species = len(final_obigt)
        aqueous_species = len(final_obigt[final_obigt['state'] == 'aq'])
        print(f"The WORM thermodynamic database has been loaded: {aqueous_species} aqueous, {total_species} total species")

    return True
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def reset_WORM(messages: bool = True) -> None:
    """
    Reset the thermodynamic system and then load the WORM database.

    Convenience wrapper that calls reset() followed by
    load_WORM(keep_default=False). If loading WORM does not succeed, a
    notice is printed (when messages are enabled).

    Parameters
    ----------
    messages : bool, default True
        Whether to print informational messages

    Examples
    --------
    >>> import pychnosz
    >>> # Initialize with WORM database
    >>> pychnosz.reset_WORM()

    Notes
    -----
    This is equivalent to:
        pychnosz.reset()
        pychnosz.load_WORM()
    """
    from ..utils.reset import reset

    # Start from a freshly reset system, then try to switch to WORM
    reset(messages=messages)
    loaded = load_WORM(keep_default=False, messages=messages)

    if messages and not loaded:
        print("reset_WORM: falling back to default OBIGT database")
|