pychnosz 1.1.1__cp311-cp311-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pychnosz/.dylibs/libgcc_s.1.1.dylib +0 -0
- pychnosz/.dylibs/libgfortran.5.dylib +0 -0
- pychnosz/.dylibs/libquadmath.0.dylib +0 -0
- pychnosz/__init__.py +129 -0
- pychnosz/biomolecules/__init__.py +29 -0
- pychnosz/biomolecules/ionize_aa.py +197 -0
- pychnosz/biomolecules/proteins.py +595 -0
- pychnosz/core/__init__.py +46 -0
- pychnosz/core/affinity.py +1256 -0
- pychnosz/core/animation.py +593 -0
- pychnosz/core/balance.py +334 -0
- pychnosz/core/basis.py +716 -0
- pychnosz/core/diagram.py +3336 -0
- pychnosz/core/equilibrate.py +813 -0
- pychnosz/core/equilibrium.py +554 -0
- pychnosz/core/info.py +821 -0
- pychnosz/core/retrieve.py +364 -0
- pychnosz/core/speciation.py +580 -0
- pychnosz/core/species.py +599 -0
- pychnosz/core/subcrt.py +1700 -0
- pychnosz/core/thermo.py +593 -0
- pychnosz/core/unicurve.py +1226 -0
- pychnosz/data/__init__.py +11 -0
- pychnosz/data/add_obigt.py +327 -0
- pychnosz/data/extdata/Berman/BDat17_2017.csv +2 -0
- pychnosz/data/extdata/Berman/Ber88_1988.csv +68 -0
- pychnosz/data/extdata/Berman/Ber90_1990.csv +5 -0
- pychnosz/data/extdata/Berman/DS10_2010.csv +6 -0
- pychnosz/data/extdata/Berman/FDM+14_2014.csv +2 -0
- pychnosz/data/extdata/Berman/Got04_2004.csv +5 -0
- pychnosz/data/extdata/Berman/JUN92_1992.csv +3 -0
- pychnosz/data/extdata/Berman/SHD91_1991.csv +12 -0
- pychnosz/data/extdata/Berman/VGT92_1992.csv +2 -0
- pychnosz/data/extdata/Berman/VPT01_2001.csv +3 -0
- pychnosz/data/extdata/Berman/VPV05_2005.csv +2 -0
- pychnosz/data/extdata/Berman/ZS92_1992.csv +11 -0
- pychnosz/data/extdata/Berman/sympy.R +99 -0
- pychnosz/data/extdata/Berman/testing/BA96.bib +12 -0
- pychnosz/data/extdata/Berman/testing/BA96_Berman.csv +21 -0
- pychnosz/data/extdata/Berman/testing/BA96_OBIGT.csv +21 -0
- pychnosz/data/extdata/Berman/testing/BA96_refs.csv +6 -0
- pychnosz/data/extdata/OBIGT/AD.csv +25 -0
- pychnosz/data/extdata/OBIGT/Berman_cr.csv +93 -0
- pychnosz/data/extdata/OBIGT/DEW.csv +211 -0
- pychnosz/data/extdata/OBIGT/H2O_aq.csv +4 -0
- pychnosz/data/extdata/OBIGT/SLOP98.csv +411 -0
- pychnosz/data/extdata/OBIGT/SUPCRT92.csv +178 -0
- pychnosz/data/extdata/OBIGT/inorganic_aq.csv +729 -0
- pychnosz/data/extdata/OBIGT/inorganic_cr.csv +273 -0
- pychnosz/data/extdata/OBIGT/inorganic_gas.csv +20 -0
- pychnosz/data/extdata/OBIGT/organic_aq.csv +1104 -0
- pychnosz/data/extdata/OBIGT/organic_cr.csv +481 -0
- pychnosz/data/extdata/OBIGT/organic_gas.csv +268 -0
- pychnosz/data/extdata/OBIGT/organic_liq.csv +533 -0
- pychnosz/data/extdata/OBIGT/testing/GEMSFIT.csv +43 -0
- pychnosz/data/extdata/OBIGT/testing/IGEM.csv +17 -0
- pychnosz/data/extdata/OBIGT/testing/Sandia.csv +8 -0
- pychnosz/data/extdata/OBIGT/testing/SiO2.csv +4 -0
- pychnosz/data/extdata/misc/AD03_Fig1a.csv +69 -0
- pychnosz/data/extdata/misc/AD03_Fig1b.csv +43 -0
- pychnosz/data/extdata/misc/AD03_Fig1c.csv +89 -0
- pychnosz/data/extdata/misc/AD03_Fig1d.csv +30 -0
- pychnosz/data/extdata/misc/BZA10.csv +5 -0
- pychnosz/data/extdata/misc/HW97_Cp.csv +90 -0
- pychnosz/data/extdata/misc/HWM96_V.csv +229 -0
- pychnosz/data/extdata/misc/LA19_test.csv +7 -0
- pychnosz/data/extdata/misc/Mer75_Table4.csv +42 -0
- pychnosz/data/extdata/misc/OBIGT_check.csv +423 -0
- pychnosz/data/extdata/misc/PM90.csv +7 -0
- pychnosz/data/extdata/misc/RH95.csv +23 -0
- pychnosz/data/extdata/misc/RH98_Table15.csv +17 -0
- pychnosz/data/extdata/misc/SC10_Rainbow.csv +19 -0
- pychnosz/data/extdata/misc/SK95.csv +55 -0
- pychnosz/data/extdata/misc/SOJSH.csv +61 -0
- pychnosz/data/extdata/misc/SS98_Fig5a.csv +81 -0
- pychnosz/data/extdata/misc/SS98_Fig5b.csv +84 -0
- pychnosz/data/extdata/misc/TKSS14_Fig2.csv +25 -0
- pychnosz/data/extdata/misc/bluered.txt +1000 -0
- pychnosz/data/extdata/protein/Cas/Cas_aa.csv +177 -0
- pychnosz/data/extdata/protein/Cas/Cas_uniprot.csv +186 -0
- pychnosz/data/extdata/protein/Cas/download.R +34 -0
- pychnosz/data/extdata/protein/Cas/mkaa.R +34 -0
- pychnosz/data/extdata/protein/POLG.csv +12 -0
- pychnosz/data/extdata/protein/TBD+05.csv +393 -0
- pychnosz/data/extdata/protein/TBD+05_aa.csv +393 -0
- pychnosz/data/extdata/protein/rubisco.csv +28 -0
- pychnosz/data/extdata/protein/rubisco.fasta +239 -0
- pychnosz/data/extdata/protein/rubisco_aa.csv +28 -0
- pychnosz/data/extdata/src/H2O92D.f.orig +3457 -0
- pychnosz/data/extdata/src/README.txt +5 -0
- pychnosz/data/extdata/taxonomy/names.dmp +215 -0
- pychnosz/data/extdata/taxonomy/nodes.dmp +63 -0
- pychnosz/data/extdata/thermo/Bdot_acirc.csv +60 -0
- pychnosz/data/extdata/thermo/buffer.csv +40 -0
- pychnosz/data/extdata/thermo/element.csv +135 -0
- pychnosz/data/extdata/thermo/groups.csv +6 -0
- pychnosz/data/extdata/thermo/opt.csv +2 -0
- pychnosz/data/extdata/thermo/protein.csv +506 -0
- pychnosz/data/extdata/thermo/refs.csv +343 -0
- pychnosz/data/extdata/thermo/stoich.csv.xz +0 -0
- pychnosz/data/loader.py +431 -0
- pychnosz/data/mod_obigt.py +322 -0
- pychnosz/data/obigt.py +471 -0
- pychnosz/data/worm.py +228 -0
- pychnosz/fortran/__init__.py +16 -0
- pychnosz/fortran/h2o92.dylib +0 -0
- pychnosz/fortran/h2o92_interface.py +527 -0
- pychnosz/geochemistry/__init__.py +21 -0
- pychnosz/geochemistry/minerals.py +514 -0
- pychnosz/geochemistry/redox.py +500 -0
- pychnosz/models/__init__.py +47 -0
- pychnosz/models/archer_wang.py +165 -0
- pychnosz/models/berman.py +309 -0
- pychnosz/models/cgl.py +381 -0
- pychnosz/models/dew.py +997 -0
- pychnosz/models/hkf.py +523 -0
- pychnosz/models/hkf_helpers.py +222 -0
- pychnosz/models/iapws95.py +1113 -0
- pychnosz/models/supcrt92_fortran.py +238 -0
- pychnosz/models/water.py +480 -0
- pychnosz/utils/__init__.py +27 -0
- pychnosz/utils/expression.py +1074 -0
- pychnosz/utils/formula.py +830 -0
- pychnosz/utils/formula_ox.py +227 -0
- pychnosz/utils/reset.py +33 -0
- pychnosz/utils/units.py +259 -0
- pychnosz-1.1.1.dist-info/METADATA +197 -0
- pychnosz-1.1.1.dist-info/RECORD +131 -0
- pychnosz-1.1.1.dist-info/WHEEL +5 -0
- pychnosz-1.1.1.dist-info/licenses/LICENSE.txt +19 -0
- pychnosz-1.1.1.dist-info/top_level.txt +1 -0
pychnosz/data/loader.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data loader module for CHNOSZ thermodynamic database files.
|
|
3
|
+
|
|
4
|
+
This module provides utilities to load and manage the thermodynamic database
|
|
5
|
+
files from the R CHNOSZ package, converting them to pandas-compatible formats.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import lzma
|
|
11
|
+
import warnings
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Dict, Optional, Union, List
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DataLoader:
    """
    Main data loader class for CHNOSZ thermodynamic database files.

    This class handles loading of various data files from the CHNOSZ R package,
    including compressed files, and converts them to pandas DataFrames while
    preserving data integrity.
    """

    def __init__(self, data_path: Optional[Union[str, Path]] = None):
        """
        Initialize the DataLoader.

        Parameters:
        -----------
        data_path : str or Path, optional
            Path to the CHNOSZ data directory. If None, will attempt to find
            the data/extdata directory relative to this file within the package.

        Raises:
        -------
        FileNotFoundError
            If the resolved data directory does not exist.
        """
        if data_path is None:
            # Try to find the data directory relative to this file.
            # We're in pychnosz/data/, so extdata is in the same directory.
            current_dir = Path(__file__).parent
            self.data_path = current_dir / "extdata"
        else:
            self.data_path = Path(data_path)

        if not self.data_path.exists():
            raise FileNotFoundError(f"Data directory not found: {self.data_path}")

        self.obigt_path = self.data_path / "OBIGT"
        self.thermo_path = self.data_path / "thermo"

        # Cache for loaded data; keys are "obigt_<filename>" / "thermo_<filename>"
        self._cache = {}

    def _read_csv_safe(self, filepath: Path, **kwargs) -> pd.DataFrame:
        """
        Safely read a CSV file with appropriate error handling.

        Tries a list of common encodings; if all fail, falls back to
        byte-replacement decoding and emits a warning.

        Parameters:
        -----------
        filepath : Path
            Path to the CSV file
        **kwargs
            Additional arguments to pass to pd.read_csv

        Returns:
        --------
        pd.DataFrame
            Loaded DataFrame

        Raises:
        -------
        IOError
            If the file cannot be read at all.
        """
        try:
            # Handle potential encoding issues
            encodings = ['utf-8', 'latin-1', 'cp1252']

            for encoding in encodings:
                try:
                    df = pd.read_csv(filepath, encoding=encoding, **kwargs)
                    return df
                except UnicodeDecodeError:
                    continue

            # If all encodings fail, decode with replacement characters.
            # FIX: pd.read_csv has no 'errors' keyword; the correct parameter
            # (pandas >= 1.3) is 'encoding_errors'. The old call raised
            # TypeError instead of degrading gracefully.
            df = pd.read_csv(filepath, encoding='utf-8',
                             encoding_errors='replace', **kwargs)
            warnings.warn(f"Used error replacement for file {filepath}")
            return df

        except Exception as e:
            raise IOError(f"Failed to read {filepath}: {str(e)}")

    def _read_compressed_csv(self, filepath: Path, **kwargs) -> pd.DataFrame:
        """
        Read a compressed CSV file (e.g., .xz format).

        Parameters:
        -----------
        filepath : Path
            Path to the compressed CSV file
        **kwargs
            Additional arguments to pass to pd.read_csv

        Returns:
        --------
        pd.DataFrame
            Loaded DataFrame

        Raises:
        -------
        ValueError
            If the file suffix is not a supported compression format.
        """
        if filepath.suffix == '.xz':
            with lzma.open(filepath, 'rt', encoding='utf-8') as f:
                df = pd.read_csv(f, **kwargs)
            return df
        else:
            raise ValueError(f"Unsupported compression format: {filepath.suffix}")

    def load_obigt_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific OBIGT database file.

        Parameters:
        -----------
        filename : str
            Name of the OBIGT file to load (e.g., 'inorganic_aq.csv')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Loaded OBIGT data

        Raises:
        -------
        FileNotFoundError
            If the requested OBIGT file does not exist.
        """
        # FIX: the cache key must include the filename; previously every file
        # shared one literal key, so a cached load of one file was returned
        # for ALL subsequent files.
        cache_key = f"obigt_{filename}"

        if use_cache and cache_key in self._cache:
            return self._cache[cache_key].copy()

        filepath = self.obigt_path / filename

        if not filepath.exists():
            raise FileNotFoundError(f"OBIGT file not found: {filepath}")

        # Load the data
        df = self._read_csv_safe(filepath)

        # Clean up column names (remove any whitespace)
        df.columns = df.columns.str.strip()

        # Cache a copy so callers cannot mutate the cached frame
        if use_cache:
            self._cache[cache_key] = df.copy()

        return df

    def load_all_obigt_files(self, use_cache: bool = True) -> Dict[str, pd.DataFrame]:
        """
        Load all OBIGT database files in the same order as R CHNOSZ.

        This mirrors the exact loading order from R CHNOSZ/thermo.R OBIGT() function
        to ensure identical species indices between R and Python versions.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        Dict[str, pd.DataFrame]
            Dictionary with filenames as keys and DataFrames as values, ordered like R CHNOSZ

        Raises:
        -------
        FileNotFoundError
            If the OBIGT directory does not exist.
        """
        obigt_files = {}

        if not self.obigt_path.exists():
            raise FileNotFoundError(f"OBIGT directory not found: {self.obigt_path}")

        # Use exact same order as R CHNOSZ (from thermo.R lines 63-67)
        # sources_aq <- paste0(c("H2O", "inorganic", "organic"), "_aq")
        # sources_cr <- paste0(c("Berman", "inorganic", "organic"), "_cr")
        # sources_liq <- paste0(c("organic"), "_liq")
        # sources_gas <- paste0(c("inorganic", "organic"), "_gas")
        # sources <- c(sources_aq, sources_cr, sources_gas, sources_liq)
        r_chnosz_order = [
            "H2O_aq.csv",
            "inorganic_aq.csv",
            "organic_aq.csv",
            "Berman_cr.csv",
            "inorganic_cr.csv",
            "organic_cr.csv",
            "inorganic_gas.csv",
            "organic_gas.csv",
            "organic_liq.csv"
        ]

        # Load files in R CHNOSZ order
        for filename in r_chnosz_order:
            file_path = self.obigt_path / filename
            if file_path.exists():
                obigt_files[filename] = self.load_obigt_file(filename, use_cache=use_cache)
            else:
                # FIX: report which file is missing (the message previously
                # contained a dead "(unknown)" placeholder).
                warnings.warn(f"OBIGT file not found: {file_path}")

        return obigt_files

    def load_thermo_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific thermo database file.

        Parameters:
        -----------
        filename : str
            Name of the thermo file to load (e.g., 'element.csv', 'stoich.csv.xz')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Loaded thermo data

        Raises:
        -------
        FileNotFoundError
            If the requested thermo file does not exist.
        """
        # FIX: per-file cache key (see load_obigt_file)
        cache_key = f"thermo_{filename}"

        if use_cache and cache_key in self._cache:
            return self._cache[cache_key].copy()

        filepath = self.thermo_path / filename

        if not filepath.exists():
            raise FileNotFoundError(f"Thermo file not found: {filepath}")

        # Handle compressed files
        if filepath.suffix == '.xz':
            df = self._read_compressed_csv(filepath)
        else:
            df = self._read_csv_safe(filepath)

        # Clean up column names
        df.columns = df.columns.str.strip()

        # Cache the result
        if use_cache:
            self._cache[cache_key] = df.copy()

        return df

    def load_elements(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the elements data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Elements data with columns: element, state, source, mass, s, n
        """
        return self.load_thermo_file('element.csv', use_cache=use_cache)

    def load_buffer(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the buffer data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Buffer data with columns: name, species, state, logact
        """
        return self.load_thermo_file('buffer.csv', use_cache=use_cache)

    def load_protein(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the protein data file.

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Protein data with amino acid compositions
        """
        return self.load_thermo_file('protein.csv', use_cache=use_cache)

    def load_stoich(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the stoichiometry data file (compressed).

        Parameters:
        -----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns:
        --------
        pd.DataFrame
            Stoichiometry matrix for all species
        """
        return self.load_thermo_file('stoich.csv.xz', use_cache=use_cache)

    def get_available_obigt_files(self) -> List[str]:
        """
        Get list of available OBIGT files.

        Returns:
        --------
        List[str]
            List of available OBIGT filenames
        """
        if not self.obigt_path.exists():
            return []

        return [f.name for f in self.obigt_path.glob("*.csv")]

    def get_available_thermo_files(self) -> List[str]:
        """
        Get list of available thermo files.

        Returns:
        --------
        List[str]
            List of available thermo filenames
        """
        if not self.thermo_path.exists():
            return []

        # Get both .csv and .csv.xz files
        csv_files = [f.name for f in self.thermo_path.glob("*.csv")]
        xz_files = [f.name for f in self.thermo_path.glob("*.csv.xz")]

        return sorted(csv_files + xz_files)

    def clear_cache(self):
        """Clear all cached data."""
        self._cache.clear()

    def get_cache_info(self) -> Dict[str, int]:
        """
        Get information about cached data.

        Returns:
        --------
        Dict[str, int]
            Dictionary with cache keys and DataFrame sizes (row counts)
        """
        return {key: len(df) for key, df in self._cache.items()}

    def get_data_path(self) -> Path:
        """
        Get the data directory path.

        Returns
        -------
        Path
            Path to the data directory
        """
        return self.data_path

    def load_buffers(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load buffer data (alias for load_buffer for compatibility).

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Buffer data, or an empty frame with the expected columns if the
            buffer file is not available
        """
        try:
            return self.load_buffer(use_cache=use_cache)
        except Exception:
            # Return empty DataFrame if buffer data not available
            return pd.DataFrame(columns=['name', 'species', 'state', 'logact'])

    def load_proteins(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load protein data (alias for load_protein for compatibility).

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Protein data, or an empty frame with the expected columns if the
            protein file is not available
        """
        try:
            return self.load_protein(use_cache=use_cache)
        except Exception:
            # Return empty DataFrame if protein data not available
            return pd.DataFrame(columns=['protein', 'organism', 'ref', 'abbrv', 'chains'])

    def load_refs(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load references data file.

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            References data, or an empty frame with the expected columns if
            the refs file is not available
        """
        try:
            return self.load_thermo_file('refs.csv', use_cache=use_cache)
        except Exception:
            # Return empty DataFrame if refs data not available
            return pd.DataFrame(columns=['key', 'author', 'year', 'citation'])
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def get_default_loader() -> DataLoader:
    """
    Construct a DataLoader pointed at the package's bundled data directory.

    Returns
    -------
    DataLoader
        A DataLoader instance initialized with the default data path.
    """
    loader = DataLoader()
    return loader
|