pychnosz 1.1.4__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128):
  1. pychnosz/__init__.py +129 -0
  2. pychnosz/biomolecules/__init__.py +29 -0
  3. pychnosz/biomolecules/ionize_aa.py +197 -0
  4. pychnosz/biomolecules/proteins.py +595 -0
  5. pychnosz/core/__init__.py +46 -0
  6. pychnosz/core/affinity.py +1256 -0
  7. pychnosz/core/animation.py +593 -0
  8. pychnosz/core/balance.py +334 -0
  9. pychnosz/core/basis.py +716 -0
  10. pychnosz/core/diagram.py +3336 -0
  11. pychnosz/core/equilibrate.py +813 -0
  12. pychnosz/core/equilibrium.py +554 -0
  13. pychnosz/core/info.py +821 -0
  14. pychnosz/core/retrieve.py +364 -0
  15. pychnosz/core/speciation.py +580 -0
  16. pychnosz/core/species.py +599 -0
  17. pychnosz/core/subcrt.py +1700 -0
  18. pychnosz/core/thermo.py +593 -0
  19. pychnosz/core/unicurve.py +1226 -0
  20. pychnosz/data/__init__.py +11 -0
  21. pychnosz/data/add_obigt.py +327 -0
  22. pychnosz/data/extdata/Berman/BDat17_2017.csv +2 -0
  23. pychnosz/data/extdata/Berman/Ber88_1988.csv +68 -0
  24. pychnosz/data/extdata/Berman/Ber90_1990.csv +5 -0
  25. pychnosz/data/extdata/Berman/DS10_2010.csv +6 -0
  26. pychnosz/data/extdata/Berman/FDM+14_2014.csv +2 -0
  27. pychnosz/data/extdata/Berman/Got04_2004.csv +5 -0
  28. pychnosz/data/extdata/Berman/JUN92_1992.csv +3 -0
  29. pychnosz/data/extdata/Berman/SHD91_1991.csv +12 -0
  30. pychnosz/data/extdata/Berman/VGT92_1992.csv +2 -0
  31. pychnosz/data/extdata/Berman/VPT01_2001.csv +3 -0
  32. pychnosz/data/extdata/Berman/VPV05_2005.csv +2 -0
  33. pychnosz/data/extdata/Berman/ZS92_1992.csv +11 -0
  34. pychnosz/data/extdata/Berman/sympy.R +99 -0
  35. pychnosz/data/extdata/Berman/testing/BA96.bib +12 -0
  36. pychnosz/data/extdata/Berman/testing/BA96_Berman.csv +21 -0
  37. pychnosz/data/extdata/Berman/testing/BA96_OBIGT.csv +21 -0
  38. pychnosz/data/extdata/Berman/testing/BA96_refs.csv +6 -0
  39. pychnosz/data/extdata/OBIGT/AD.csv +25 -0
  40. pychnosz/data/extdata/OBIGT/Berman_cr.csv +93 -0
  41. pychnosz/data/extdata/OBIGT/DEW.csv +211 -0
  42. pychnosz/data/extdata/OBIGT/H2O_aq.csv +4 -0
  43. pychnosz/data/extdata/OBIGT/SLOP98.csv +411 -0
  44. pychnosz/data/extdata/OBIGT/SUPCRT92.csv +178 -0
  45. pychnosz/data/extdata/OBIGT/inorganic_aq.csv +729 -0
  46. pychnosz/data/extdata/OBIGT/inorganic_cr.csv +273 -0
  47. pychnosz/data/extdata/OBIGT/inorganic_gas.csv +20 -0
  48. pychnosz/data/extdata/OBIGT/organic_aq.csv +1104 -0
  49. pychnosz/data/extdata/OBIGT/organic_cr.csv +481 -0
  50. pychnosz/data/extdata/OBIGT/organic_gas.csv +268 -0
  51. pychnosz/data/extdata/OBIGT/organic_liq.csv +533 -0
  52. pychnosz/data/extdata/OBIGT/testing/GEMSFIT.csv +43 -0
  53. pychnosz/data/extdata/OBIGT/testing/IGEM.csv +17 -0
  54. pychnosz/data/extdata/OBIGT/testing/Sandia.csv +8 -0
  55. pychnosz/data/extdata/OBIGT/testing/SiO2.csv +4 -0
  56. pychnosz/data/extdata/misc/AD03_Fig1a.csv +69 -0
  57. pychnosz/data/extdata/misc/AD03_Fig1b.csv +43 -0
  58. pychnosz/data/extdata/misc/AD03_Fig1c.csv +89 -0
  59. pychnosz/data/extdata/misc/AD03_Fig1d.csv +30 -0
  60. pychnosz/data/extdata/misc/BZA10.csv +5 -0
  61. pychnosz/data/extdata/misc/HW97_Cp.csv +90 -0
  62. pychnosz/data/extdata/misc/HWM96_V.csv +229 -0
  63. pychnosz/data/extdata/misc/LA19_test.csv +7 -0
  64. pychnosz/data/extdata/misc/Mer75_Table4.csv +42 -0
  65. pychnosz/data/extdata/misc/OBIGT_check.csv +423 -0
  66. pychnosz/data/extdata/misc/PM90.csv +7 -0
  67. pychnosz/data/extdata/misc/RH95.csv +23 -0
  68. pychnosz/data/extdata/misc/RH98_Table15.csv +17 -0
  69. pychnosz/data/extdata/misc/SC10_Rainbow.csv +19 -0
  70. pychnosz/data/extdata/misc/SK95.csv +55 -0
  71. pychnosz/data/extdata/misc/SOJSH.csv +61 -0
  72. pychnosz/data/extdata/misc/SS98_Fig5a.csv +81 -0
  73. pychnosz/data/extdata/misc/SS98_Fig5b.csv +84 -0
  74. pychnosz/data/extdata/misc/TKSS14_Fig2.csv +25 -0
  75. pychnosz/data/extdata/misc/bluered.txt +1000 -0
  76. pychnosz/data/extdata/protein/Cas/Cas_aa.csv +177 -0
  77. pychnosz/data/extdata/protein/Cas/Cas_uniprot.csv +186 -0
  78. pychnosz/data/extdata/protein/Cas/download.R +34 -0
  79. pychnosz/data/extdata/protein/Cas/mkaa.R +34 -0
  80. pychnosz/data/extdata/protein/POLG.csv +12 -0
  81. pychnosz/data/extdata/protein/TBD+05.csv +393 -0
  82. pychnosz/data/extdata/protein/TBD+05_aa.csv +393 -0
  83. pychnosz/data/extdata/protein/rubisco.csv +28 -0
  84. pychnosz/data/extdata/protein/rubisco.fasta +239 -0
  85. pychnosz/data/extdata/protein/rubisco_aa.csv +28 -0
  86. pychnosz/data/extdata/src/H2O92D.f.orig +3457 -0
  87. pychnosz/data/extdata/src/README.txt +5 -0
  88. pychnosz/data/extdata/taxonomy/names.dmp +215 -0
  89. pychnosz/data/extdata/taxonomy/nodes.dmp +63 -0
  90. pychnosz/data/extdata/thermo/Bdot_acirc.csv +60 -0
  91. pychnosz/data/extdata/thermo/buffer.csv +40 -0
  92. pychnosz/data/extdata/thermo/element.csv +135 -0
  93. pychnosz/data/extdata/thermo/groups.csv +6 -0
  94. pychnosz/data/extdata/thermo/opt.csv +2 -0
  95. pychnosz/data/extdata/thermo/protein.csv +506 -0
  96. pychnosz/data/extdata/thermo/refs.csv +343 -0
  97. pychnosz/data/extdata/thermo/stoich.csv.xz +0 -0
  98. pychnosz/data/loader.py +431 -0
  99. pychnosz/data/mod_obigt.py +322 -0
  100. pychnosz/data/obigt.py +471 -0
  101. pychnosz/data/worm.py +228 -0
  102. pychnosz/fortran/__init__.py +16 -0
  103. pychnosz/fortran/h2o92.dll +0 -0
  104. pychnosz/fortran/h2o92_interface.py +527 -0
  105. pychnosz/geochemistry/__init__.py +21 -0
  106. pychnosz/geochemistry/minerals.py +514 -0
  107. pychnosz/geochemistry/redox.py +500 -0
  108. pychnosz/models/__init__.py +47 -0
  109. pychnosz/models/archer_wang.py +165 -0
  110. pychnosz/models/berman.py +309 -0
  111. pychnosz/models/cgl.py +381 -0
  112. pychnosz/models/dew.py +997 -0
  113. pychnosz/models/hkf.py +523 -0
  114. pychnosz/models/hkf_helpers.py +222 -0
  115. pychnosz/models/iapws95.py +1113 -0
  116. pychnosz/models/supcrt92_fortran.py +238 -0
  117. pychnosz/models/water.py +480 -0
  118. pychnosz/utils/__init__.py +27 -0
  119. pychnosz/utils/expression.py +1074 -0
  120. pychnosz/utils/formula.py +830 -0
  121. pychnosz/utils/formula_ox.py +227 -0
  122. pychnosz/utils/reset.py +33 -0
  123. pychnosz/utils/units.py +259 -0
  124. pychnosz-1.1.4.dist-info/METADATA +197 -0
  125. pychnosz-1.1.4.dist-info/RECORD +128 -0
  126. pychnosz-1.1.4.dist-info/WHEEL +5 -0
  127. pychnosz-1.1.4.dist-info/licenses/LICENSE.txt +19 -0
  128. pychnosz-1.1.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,431 @@
1
+ """
2
+ Data loader module for CHNOSZ thermodynamic database files.
3
+
4
+ This module provides utilities to load and manage the thermodynamic database
5
+ files from the R CHNOSZ package, converting them to pandas-compatible formats.
6
+ """
7
+
8
+ import os
9
+ import pandas as pd
10
+ import lzma
11
+ import warnings
12
+ from pathlib import Path
13
+ from typing import Dict, Optional, Union, List
14
+
15
+
16
class DataLoader:
    """
    Main data loader class for CHNOSZ thermodynamic database files.

    This class handles loading of various data files from the CHNOSZ R package,
    including compressed files, and converts them to pandas DataFrames while
    preserving data integrity.

    Loaded tables are memoised in an in-memory cache keyed by
    ``"obigt_<filename>"`` / ``"thermo_<filename>"``. Cached tables are always
    handed out as copies so callers cannot mutate the cached data.
    """

    # Encodings tried, in order, by _read_csv_safe. 'latin-1' must stay last:
    # it maps every possible byte to a code point and therefore never raises
    # UnicodeDecodeError, so anything listed after it would never be tried.
    _ENCODINGS = ('utf-8', 'cp1252', 'latin-1')

    # Use exact same order as R CHNOSZ (from thermo.R lines 63-67)
    # sources_aq <- paste0(c("H2O", "inorganic", "organic"), "_aq")
    # sources_cr <- paste0(c("Berman", "inorganic", "organic"), "_cr")
    # sources_liq <- paste0(c("organic"), "_liq")
    # sources_gas <- paste0(c("inorganic", "organic"), "_gas")
    # sources <- c(sources_aq, sources_cr, sources_gas, sources_liq)
    _OBIGT_LOAD_ORDER = (
        "H2O_aq.csv",
        "inorganic_aq.csv",
        "organic_aq.csv",
        "Berman_cr.csv",
        "inorganic_cr.csv",
        "organic_cr.csv",
        "inorganic_gas.csv",
        "organic_gas.csv",
        "organic_liq.csv",
    )

    def __init__(self, data_path: Optional[Union[str, Path]] = None):
        """
        Initialize the DataLoader.

        Parameters
        ----------
        data_path : str or Path, optional
            Path to the CHNOSZ data directory. If None, the ``extdata``
            directory located next to this file is used.

        Raises
        ------
        FileNotFoundError
            If the resolved data directory does not exist.
        """
        if data_path is None:
            # This module lives in pychnosz/data/, so extdata is a sibling
            # of this file.
            self.data_path = Path(__file__).parent / "extdata"
        else:
            self.data_path = Path(data_path)

        if not self.data_path.exists():
            raise FileNotFoundError(f"Data directory not found: {self.data_path}")

        self.obigt_path = self.data_path / "OBIGT"
        self.thermo_path = self.data_path / "thermo"

        # Cache for loaded data
        self._cache: Dict[str, pd.DataFrame] = {}

    def _read_csv_safe(self, filepath: Path, **kwargs) -> pd.DataFrame:
        """
        Read a CSV file, retrying with progressively more permissive encodings.

        Parameters
        ----------
        filepath : Path
            Path to the CSV file
        **kwargs
            Additional arguments to pass to pd.read_csv

        Returns
        -------
        pd.DataFrame
            Loaded DataFrame

        Raises
        ------
        IOError
            If the file cannot be read or parsed. The underlying exception is
            attached as ``__cause__``.
        """
        try:
            for encoding in self._ENCODINGS:
                try:
                    return pd.read_csv(filepath, encoding=encoding, **kwargs)
                except UnicodeDecodeError:
                    continue

            # Safety net in case every encoding above is rejected. The pandas
            # keyword for decode-error policy is `encoding_errors`; passing
            # `errors=` is rejected by read_csv.
            df = pd.read_csv(
                filepath, encoding='utf-8', encoding_errors='replace', **kwargs
            )
            warnings.warn(f"Used error replacement for file {filepath}")
            return df

        except Exception as e:
            # Chain the original exception so the real cause stays visible.
            raise IOError(f"Failed to read {filepath}: {str(e)}") from e

    def _read_compressed_csv(self, filepath: Path, **kwargs) -> pd.DataFrame:
        """
        Read a compressed CSV file (e.g., .xz format).

        Parameters
        ----------
        filepath : Path
            Path to the compressed CSV file
        **kwargs
            Additional arguments to pass to pd.read_csv

        Returns
        -------
        pd.DataFrame
            Loaded DataFrame

        Raises
        ------
        ValueError
            If the file suffix is anything other than ``.xz``.
        """
        if filepath.suffix != '.xz':
            raise ValueError(f"Unsupported compression format: {filepath.suffix}")

        with lzma.open(filepath, 'rt', encoding='utf-8') as f:
            return pd.read_csv(f, **kwargs)

    def _load_cached(self, cache_key: str, filepath: Path, kind: str,
                     use_cache: bool) -> pd.DataFrame:
        """
        Shared implementation behind load_obigt_file and load_thermo_file.

        Parameters
        ----------
        cache_key : str
            Key under which the table is stored in the cache
        filepath : Path
            File to read when the cache cannot satisfy the request
        kind : str
            Label used in the "file not found" message ("OBIGT" or "Thermo")
        use_cache : bool
            Whether to consult and populate the cache

        Returns
        -------
        pd.DataFrame
            The loaded table with whitespace stripped from column names

        Raises
        ------
        FileNotFoundError
            If ``filepath`` does not exist.
        """
        if use_cache and cache_key in self._cache:
            return self._cache[cache_key].copy()

        if not filepath.exists():
            raise FileNotFoundError(f"{kind} file not found: {filepath}")

        # Handle compressed files
        if filepath.suffix == '.xz':
            df = self._read_compressed_csv(filepath)
        else:
            df = self._read_csv_safe(filepath)

        # Clean up column names (remove any surrounding whitespace)
        df.columns = df.columns.str.strip()

        # Store a copy so later mutation of the returned frame cannot leak
        # into the cache.
        if use_cache:
            self._cache[cache_key] = df.copy()

        return df

    def load_obigt_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific OBIGT database file.

        Parameters
        ----------
        filename : str
            Name of the OBIGT file to load (e.g., 'inorganic_aq.csv')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Loaded OBIGT data

        Raises
        ------
        FileNotFoundError
            If the file does not exist in the OBIGT directory.
        """
        return self._load_cached(
            f"obigt_{filename}", self.obigt_path / filename, "OBIGT", use_cache
        )

    def load_all_obigt_files(self, use_cache: bool = True) -> Dict[str, pd.DataFrame]:
        """
        Load all OBIGT database files in the same order as R CHNOSZ.

        This mirrors the exact loading order from R CHNOSZ/thermo.R OBIGT() function
        to ensure identical species indices between R and Python versions.
        A file from that list that is missing on disk is skipped with a warning.

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        Dict[str, pd.DataFrame]
            Dictionary with filenames as keys and DataFrames as values, ordered like R CHNOSZ

        Raises
        ------
        FileNotFoundError
            If the OBIGT directory itself does not exist.
        """
        if not self.obigt_path.exists():
            raise FileNotFoundError(f"OBIGT directory not found: {self.obigt_path}")

        obigt_files = {}

        # Load files in R CHNOSZ order
        for filename in self._OBIGT_LOAD_ORDER:
            if (self.obigt_path / filename).exists():
                obigt_files[filename] = self.load_obigt_file(filename, use_cache=use_cache)
            else:
                warnings.warn(f"OBIGT file not found: {filename}")

        return obigt_files

    def load_thermo_file(self, filename: str, use_cache: bool = True) -> pd.DataFrame:
        """
        Load a specific thermo database file.

        Parameters
        ----------
        filename : str
            Name of the thermo file to load (e.g., 'element.csv', 'stoich.csv.xz')
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Loaded thermo data

        Raises
        ------
        FileNotFoundError
            If the file does not exist in the thermo directory.
        """
        return self._load_cached(
            f"thermo_{filename}", self.thermo_path / filename, "Thermo", use_cache
        )

    def load_elements(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the elements data file ('element.csv').

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Elements data with columns: element, state, source, mass, s, n
        """
        return self.load_thermo_file('element.csv', use_cache=use_cache)

    def load_buffer(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the buffer data file ('buffer.csv').

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Buffer data with columns: name, species, state, logact
        """
        return self.load_thermo_file('buffer.csv', use_cache=use_cache)

    def load_protein(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the protein data file ('protein.csv').

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Protein data with amino acid compositions
        """
        return self.load_thermo_file('protein.csv', use_cache=use_cache)

    def load_stoich(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load the stoichiometry data file ('stoich.csv.xz', compressed).

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Stoichiometry matrix for all species
        """
        return self.load_thermo_file('stoich.csv.xz', use_cache=use_cache)

    def get_available_obigt_files(self) -> List[str]:
        """
        Get list of available OBIGT files.

        Returns
        -------
        List[str]
            Sorted list of ``*.csv`` filenames in the OBIGT directory, or an
            empty list if that directory does not exist
        """
        if not self.obigt_path.exists():
            return []

        # Sorted for a deterministic result (glob order is filesystem
        # dependent), matching get_available_thermo_files.
        return sorted(f.name for f in self.obigt_path.glob("*.csv"))

    def get_available_thermo_files(self) -> List[str]:
        """
        Get list of available thermo files.

        Returns
        -------
        List[str]
            Sorted list of ``*.csv`` and ``*.csv.xz`` filenames in the thermo
            directory, or an empty list if that directory does not exist
        """
        if not self.thermo_path.exists():
            return []

        # Get both .csv and .csv.xz files
        csv_files = [f.name for f in self.thermo_path.glob("*.csv")]
        xz_files = [f.name for f in self.thermo_path.glob("*.csv.xz")]

        return sorted(csv_files + xz_files)

    def clear_cache(self):
        """Clear all cached data."""
        self._cache.clear()

    def get_cache_info(self) -> Dict[str, int]:
        """
        Get information about cached data.

        Returns
        -------
        Dict[str, int]
            Dictionary mapping each cache key to the row count of its DataFrame
        """
        return {key: len(df) for key, df in self._cache.items()}

    def get_data_path(self) -> Path:
        """
        Get the data directory path.

        Returns
        -------
        Path
            Path to the data directory
        """
        return self.data_path

    def load_buffers(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load buffer data (alias for load_buffer for compatibility).

        Unlike load_buffer, a missing or unreadable file does not raise: a
        warning is issued and an empty table is returned.

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Buffer data, or an empty DataFrame with columns
            name, species, state, logact
        """
        try:
            return self.load_buffer(use_cache=use_cache)
        except OSError as e:
            # Missing files (FileNotFoundError) and read failures (IOError)
            # are both OSError; report them instead of hiding them.
            warnings.warn(f"Buffer data not available: {e}")
            return pd.DataFrame(columns=['name', 'species', 'state', 'logact'])

    def load_proteins(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load protein data (alias for load_protein for compatibility).

        Unlike load_protein, a missing or unreadable file does not raise: a
        warning is issued and an empty table is returned.

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            Protein data, or an empty DataFrame with columns
            protein, organism, ref, abbrv, chains
        """
        try:
            return self.load_protein(use_cache=use_cache)
        except OSError as e:
            warnings.warn(f"Protein data not available: {e}")
            return pd.DataFrame(columns=['protein', 'organism', 'ref', 'abbrv', 'chains'])

    def load_refs(self, use_cache: bool = True) -> pd.DataFrame:
        """
        Load references data file ('refs.csv').

        A missing or unreadable file does not raise: a warning is issued and
        an empty table is returned.

        Parameters
        ----------
        use_cache : bool, default True
            Whether to use cached data if available

        Returns
        -------
        pd.DataFrame
            References data, or an empty DataFrame with columns
            key, author, year, citation
        """
        try:
            return self.load_thermo_file('refs.csv', use_cache=use_cache)
        except OSError as e:
            warnings.warn(f"References data not available: {e}")
            return pd.DataFrame(columns=['key', 'author', 'year', 'citation'])
420
+
421
+
422
def get_default_loader() -> DataLoader:
    """
    Build a DataLoader using its default data directory.

    Returns
    -------
    DataLoader
        A new DataLoader constructed with no arguments
    """
    default_loader = DataLoader()
    return default_loader