pychnosz 1.1.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. pychnosz/__init__.py +129 -0
  2. pychnosz/biomolecules/__init__.py +29 -0
  3. pychnosz/biomolecules/ionize_aa.py +197 -0
  4. pychnosz/biomolecules/proteins.py +595 -0
  5. pychnosz/core/__init__.py +46 -0
  6. pychnosz/core/affinity.py +1256 -0
  7. pychnosz/core/animation.py +593 -0
  8. pychnosz/core/balance.py +334 -0
  9. pychnosz/core/basis.py +716 -0
  10. pychnosz/core/diagram.py +3336 -0
  11. pychnosz/core/equilibrate.py +813 -0
  12. pychnosz/core/equilibrium.py +554 -0
  13. pychnosz/core/info.py +821 -0
  14. pychnosz/core/retrieve.py +364 -0
  15. pychnosz/core/speciation.py +580 -0
  16. pychnosz/core/species.py +599 -0
  17. pychnosz/core/subcrt.py +1700 -0
  18. pychnosz/core/thermo.py +593 -0
  19. pychnosz/core/unicurve.py +1226 -0
  20. pychnosz/data/__init__.py +11 -0
  21. pychnosz/data/add_obigt.py +327 -0
  22. pychnosz/data/extdata/Berman/BDat17_2017.csv +2 -0
  23. pychnosz/data/extdata/Berman/Ber88_1988.csv +68 -0
  24. pychnosz/data/extdata/Berman/Ber90_1990.csv +5 -0
  25. pychnosz/data/extdata/Berman/DS10_2010.csv +6 -0
  26. pychnosz/data/extdata/Berman/FDM+14_2014.csv +2 -0
  27. pychnosz/data/extdata/Berman/Got04_2004.csv +5 -0
  28. pychnosz/data/extdata/Berman/JUN92_1992.csv +3 -0
  29. pychnosz/data/extdata/Berman/SHD91_1991.csv +12 -0
  30. pychnosz/data/extdata/Berman/VGT92_1992.csv +2 -0
  31. pychnosz/data/extdata/Berman/VPT01_2001.csv +3 -0
  32. pychnosz/data/extdata/Berman/VPV05_2005.csv +2 -0
  33. pychnosz/data/extdata/Berman/ZS92_1992.csv +11 -0
  34. pychnosz/data/extdata/Berman/sympy.R +99 -0
  35. pychnosz/data/extdata/Berman/testing/BA96.bib +12 -0
  36. pychnosz/data/extdata/Berman/testing/BA96_Berman.csv +21 -0
  37. pychnosz/data/extdata/Berman/testing/BA96_OBIGT.csv +21 -0
  38. pychnosz/data/extdata/Berman/testing/BA96_refs.csv +6 -0
  39. pychnosz/data/extdata/OBIGT/AD.csv +25 -0
  40. pychnosz/data/extdata/OBIGT/Berman_cr.csv +93 -0
  41. pychnosz/data/extdata/OBIGT/DEW.csv +211 -0
  42. pychnosz/data/extdata/OBIGT/H2O_aq.csv +4 -0
  43. pychnosz/data/extdata/OBIGT/SLOP98.csv +411 -0
  44. pychnosz/data/extdata/OBIGT/SUPCRT92.csv +178 -0
  45. pychnosz/data/extdata/OBIGT/inorganic_aq.csv +729 -0
  46. pychnosz/data/extdata/OBIGT/inorganic_cr.csv +273 -0
  47. pychnosz/data/extdata/OBIGT/inorganic_gas.csv +20 -0
  48. pychnosz/data/extdata/OBIGT/organic_aq.csv +1104 -0
  49. pychnosz/data/extdata/OBIGT/organic_cr.csv +481 -0
  50. pychnosz/data/extdata/OBIGT/organic_gas.csv +268 -0
  51. pychnosz/data/extdata/OBIGT/organic_liq.csv +533 -0
  52. pychnosz/data/extdata/OBIGT/testing/GEMSFIT.csv +43 -0
  53. pychnosz/data/extdata/OBIGT/testing/IGEM.csv +17 -0
  54. pychnosz/data/extdata/OBIGT/testing/Sandia.csv +8 -0
  55. pychnosz/data/extdata/OBIGT/testing/SiO2.csv +4 -0
  56. pychnosz/data/extdata/misc/AD03_Fig1a.csv +69 -0
  57. pychnosz/data/extdata/misc/AD03_Fig1b.csv +43 -0
  58. pychnosz/data/extdata/misc/AD03_Fig1c.csv +89 -0
  59. pychnosz/data/extdata/misc/AD03_Fig1d.csv +30 -0
  60. pychnosz/data/extdata/misc/BZA10.csv +5 -0
  61. pychnosz/data/extdata/misc/HW97_Cp.csv +90 -0
  62. pychnosz/data/extdata/misc/HWM96_V.csv +229 -0
  63. pychnosz/data/extdata/misc/LA19_test.csv +7 -0
  64. pychnosz/data/extdata/misc/Mer75_Table4.csv +42 -0
  65. pychnosz/data/extdata/misc/OBIGT_check.csv +423 -0
  66. pychnosz/data/extdata/misc/PM90.csv +7 -0
  67. pychnosz/data/extdata/misc/RH95.csv +23 -0
  68. pychnosz/data/extdata/misc/RH98_Table15.csv +17 -0
  69. pychnosz/data/extdata/misc/SC10_Rainbow.csv +19 -0
  70. pychnosz/data/extdata/misc/SK95.csv +55 -0
  71. pychnosz/data/extdata/misc/SOJSH.csv +61 -0
  72. pychnosz/data/extdata/misc/SS98_Fig5a.csv +81 -0
  73. pychnosz/data/extdata/misc/SS98_Fig5b.csv +84 -0
  74. pychnosz/data/extdata/misc/TKSS14_Fig2.csv +25 -0
  75. pychnosz/data/extdata/misc/bluered.txt +1000 -0
  76. pychnosz/data/extdata/protein/Cas/Cas_aa.csv +177 -0
  77. pychnosz/data/extdata/protein/Cas/Cas_uniprot.csv +186 -0
  78. pychnosz/data/extdata/protein/Cas/download.R +34 -0
  79. pychnosz/data/extdata/protein/Cas/mkaa.R +34 -0
  80. pychnosz/data/extdata/protein/POLG.csv +12 -0
  81. pychnosz/data/extdata/protein/TBD+05.csv +393 -0
  82. pychnosz/data/extdata/protein/TBD+05_aa.csv +393 -0
  83. pychnosz/data/extdata/protein/rubisco.csv +28 -0
  84. pychnosz/data/extdata/protein/rubisco.fasta +239 -0
  85. pychnosz/data/extdata/protein/rubisco_aa.csv +28 -0
  86. pychnosz/data/extdata/src/H2O92D.f.orig +3457 -0
  87. pychnosz/data/extdata/src/README.txt +5 -0
  88. pychnosz/data/extdata/taxonomy/names.dmp +215 -0
  89. pychnosz/data/extdata/taxonomy/nodes.dmp +63 -0
  90. pychnosz/data/extdata/thermo/Bdot_acirc.csv +60 -0
  91. pychnosz/data/extdata/thermo/buffer.csv +40 -0
  92. pychnosz/data/extdata/thermo/element.csv +135 -0
  93. pychnosz/data/extdata/thermo/groups.csv +6 -0
  94. pychnosz/data/extdata/thermo/opt.csv +2 -0
  95. pychnosz/data/extdata/thermo/protein.csv +506 -0
  96. pychnosz/data/extdata/thermo/refs.csv +343 -0
  97. pychnosz/data/extdata/thermo/stoich.csv.xz +0 -0
  98. pychnosz/data/loader.py +431 -0
  99. pychnosz/data/mod_obigt.py +322 -0
  100. pychnosz/data/obigt.py +471 -0
  101. pychnosz/data/worm.py +228 -0
  102. pychnosz/fortran/__init__.py +16 -0
  103. pychnosz/fortran/h2o92.dll +0 -0
  104. pychnosz/fortran/h2o92_interface.py +527 -0
  105. pychnosz/geochemistry/__init__.py +21 -0
  106. pychnosz/geochemistry/minerals.py +514 -0
  107. pychnosz/geochemistry/redox.py +500 -0
  108. pychnosz/models/__init__.py +47 -0
  109. pychnosz/models/archer_wang.py +165 -0
  110. pychnosz/models/berman.py +309 -0
  111. pychnosz/models/cgl.py +381 -0
  112. pychnosz/models/dew.py +997 -0
  113. pychnosz/models/hkf.py +523 -0
  114. pychnosz/models/hkf_helpers.py +222 -0
  115. pychnosz/models/iapws95.py +1113 -0
  116. pychnosz/models/supcrt92_fortran.py +238 -0
  117. pychnosz/models/water.py +480 -0
  118. pychnosz/utils/__init__.py +27 -0
  119. pychnosz/utils/expression.py +1074 -0
  120. pychnosz/utils/formula.py +830 -0
  121. pychnosz/utils/formula_ox.py +227 -0
  122. pychnosz/utils/reset.py +33 -0
  123. pychnosz/utils/units.py +259 -0
  124. pychnosz-1.1.1.dist-info/METADATA +197 -0
  125. pychnosz-1.1.1.dist-info/RECORD +128 -0
  126. pychnosz-1.1.1.dist-info/WHEEL +5 -0
  127. pychnosz-1.1.1.dist-info/licenses/LICENSE.txt +19 -0
  128. pychnosz-1.1.1.dist-info/top_level.txt +1 -0
pychnosz/data/obigt.py ADDED
@@ -0,0 +1,471 @@
1
+ """
2
+ OBIGT database access module.
3
+
4
+ This module provides a high-level interface to the OBIGT database (named
5
+ after the OrganoBioGeoTherm software package), which contains thermodynamic
6
+ parameters for chemical species.
7
+ """
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ from typing import Dict, List, Optional, Union, Tuple
12
+ from pathlib import Path
13
+ from .loader import DataLoader
14
+
15
+
16
class OBIGTDatabase:
    """
    High-level interface to the OBIGT thermodynamic database.

    This class provides methods to access, search, and manipulate the
    thermodynamic data from the OBIGT database files.

    The combined table is cached on the instance after the first load. Its
    row labels are 1-based, so every label-based lookup in this class uses
    ``.loc`` (never ``.iloc``) to stay consistent with that index.
    """

    def __init__(self, data_loader: Optional["DataLoader"] = None):
        """
        Initialize the OBIGT database.

        Parameters:
        -----------
        data_loader : DataLoader, optional
            DataLoader instance to use. If None, creates a default loader.
        """
        if data_loader is None:
            from .loader import get_default_loader
            self.loader = get_default_loader()
        else:
            self.loader = data_loader

        # Cache for combined data and the lookup index built from it
        self._combined_data = None
        self._species_index = None

        # Define the expected columns for OBIGT data
        self.obigt_columns = [
            'name', 'abbrv', 'formula', 'state', 'ref1', 'ref2', 'date', 'model',
            'E_units', 'G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d',
            'c1.e', 'c2.f', 'omega.lambda', 'z.T'
        ]

        # State classifications
        self.aqueous_states = ['aq']
        self.crystalline_states = ['cr']
        self.gas_states = ['gas']
        self.liquid_states = ['liq']

    def load_all_data(self, force_reload: bool = False) -> pd.DataFrame:
        """
        Load and combine all OBIGT data files.

        Parameters:
        -----------
        force_reload : bool, default False
            Force reloading of data even if cached

        Returns:
        --------
        pd.DataFrame
            Copy of the combined OBIGT database (1-based row labels, with a
            'source_file' column naming the file each row came from)
        """
        if self._combined_data is not None and not force_reload:
            return self._combined_data.copy()

        # Load all OBIGT files (mapping of file name -> DataFrame)
        obigt_files = self.loader.load_all_obigt_files()

        # Tag every frame with its source file before combining
        combined_data = []
        for filename, df in obigt_files.items():
            df_copy = df.copy()
            df_copy['source_file'] = filename
            combined_data.append(df_copy)

        self._combined_data = pd.concat(combined_data, ignore_index=True)

        # IMPORTANT: R uses 1-based indexing, so we need to shift the DataFrame index
        # to match R's row numbers. Row 0 in pandas should be row 1 in R.
        self._combined_data.index = self._combined_data.index + 1

        # Create species index for fast lookups
        self._create_species_index()

        return self._combined_data.copy()

    def get_combined_data(self) -> pd.DataFrame:
        """
        Get combined OBIGT thermodynamic data.

        If loading fails for any reason, a warning is printed and a small
        built-in fallback table is cached and returned instead.

        Returns
        -------
        pd.DataFrame
            Combined OBIGT data with all species
        """
        if self._combined_data is not None:
            return self._combined_data.copy()

        try:
            # Try to load data normally first
            return self.load_all_data()
        except Exception as e:
            print(f"Warning: Could not load OBIGT data: {e}")
            # Create minimal fallback data for essential species
            return self._create_fallback_data()

    def _create_fallback_data(self) -> pd.DataFrame:
        """Create, cache and return minimal fallback data for essential species."""

        # Essential species data (approximate values for basic functionality)
        fallback_data = {
            'name': ['water', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
            'abbrv': ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
            'formula': ['H2O', 'H+', 'OH-', 'CO2', 'HCO3-', 'CO3-2'],
            'state': ['liq', 'aq', 'aq', 'aq', 'aq', 'aq'],
            'G': [-56688.1, 0.0, -37595.0, -92307.0, -140314.0, -126172.0],
            'H': [-68317.0, 0.0, -54977.0, -98900.0, -165180.0, -161963.0],
            'S': [16.712, 0.0, -2.56, -39.75, 98.4, -50.0],
            'Cp': [18.0, 0.0, -36.4, 37.11, 25.0, -53.1],
            'V': [18.068, 0.0, -4.71, 34.0, 25.0, -6.0],
            'z.T': [0, 1, -1, 0, -1, -2],
            'ref1': ['', '', '', '', '', ''],
            'ref2': ['', '', '', '', '', ''],
            'date': ['', '', '', '', '', ''],
            'model': ['', '', '', '', '', ''],
            'E_units': ['', '', '', '', '', ''],
            'a1.a': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'a2.b': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'a3.c': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'a4.d': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'c1.e': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'c2.f': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'omega.lambda': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        }

        df = pd.DataFrame(fallback_data)

        # Use the same 1-based row labels as load_all_data() so that row
        # labels mean the same thing regardless of which path filled the cache.
        df.index = df.index + 1

        # Cache the fallback data
        self._combined_data = df
        self._create_species_index()

        return df.copy()

    def _create_species_index(self):
        """
        Create an index for fast species lookups.

        The index is a dict mapping three kinds of keys to lists of row
        labels of the combined table: ``name``, ``"formula:<formula>"`` and
        ``"<name>(<state>)"``.
        """
        if self._combined_data is None:
            return

        data = self._combined_data

        def _clean(value):
            # Missing values must not end up indexed under the string "nan".
            if value is None or pd.isna(value):
                return ''
            return str(value).strip()

        def _column(label):
            # Tolerate tables that lack one of the key columns.
            if label in data.columns:
                return data[label].tolist()
            return [''] * len(data)

        index = {}
        rows = zip(data.index, _column('name'), _column('formula'), _column('state'))
        for idx, raw_name, raw_formula, raw_state in rows:
            name = _clean(raw_name)
            formula = _clean(raw_formula)
            state = _clean(raw_state)

            # Index by name
            if name:
                index.setdefault(name, []).append(idx)

            # Index by formula
            if formula:
                index.setdefault(f"formula:{formula}", []).append(idx)

            # Index by name+state combination
            if name and state:
                index.setdefault(f"{name}({state})", []).append(idx)

        self._species_index = index

    def get_species(self, identifier: str, state: Optional[str] = None) -> pd.DataFrame:
        """
        Get species data by name, formula, or identifier.

        Exact matches (by name, by formula, and by name+state) are tried
        first; each matching row is returned once. Only when there is no
        exact match does the method fall back to a case-insensitive literal
        substring search over the 'name' and 'formula' columns.

        Parameters:
        -----------
        identifier : str
            Species name, formula, or identifier
        state : str, optional
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Matching species data (row labels reset to 0..n-1)
        """
        if self._combined_data is None:
            self.load_all_data()
        if self._species_index is None:
            self._create_species_index()

        data = self._combined_data
        lookup = self._species_index
        has_state_column = 'state' in data.columns

        matched = []   # row labels, in the order they were found
        seen = set()   # guards against returning the same row twice

        def _add(label):
            if label not in seen:
                seen.add(label)
                matched.append(label)

        def _state_matches(label):
            if state is None:
                return True
            value = data.at[label, 'state'] if has_state_column else ''
            return str(value).strip() == state

        # Exact name match, then exact formula match. The index stores row
        # *labels* (1-based after loading), so rows are addressed by label.
        for key in (identifier, f"formula:{identifier}"):
            for label in lookup.get(key, ()):
                if _state_matches(label):
                    _add(label)

        # Name+state combination
        if state:
            for label in lookup.get(f"{identifier}({state})", ()):
                _add(label)

        # If no exact matches, try partial matching. regex=False because
        # identifiers such as "Fe+2" or "Ca(OH)2" contain regex metacharacters.
        if not matched:
            mask = (
                data['name'].str.contains(identifier, case=False, na=False, regex=False)
                | data['formula'].str.contains(identifier, case=False, na=False, regex=False)
            )

            if state:
                mask &= (data['state'] == state)

            matched = data.index[mask.to_numpy()].tolist()

        if matched:
            return data.loc[matched].reset_index(drop=True)
        return pd.DataFrame(columns=data.columns)

    def search_species(self, query: str, search_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Search for species using a text query.

        The query is passed to ``Series.str.contains`` with its default
        settings apart from ``case=False`` and ``na=False``, i.e. it is
        interpreted as a case-insensitive regular expression.

        Parameters:
        -----------
        query : str
            Search query
        search_columns : List[str], optional
            Columns to search in. Default: ['name', 'formula', 'abbrv'].
            Columns that are not present in the table are ignored.

        Returns:
        --------
        pd.DataFrame
            Matching species data
        """
        if self._combined_data is None:
            self.load_all_data()

        if search_columns is None:
            search_columns = ['name', 'formula', 'abbrv']

        data = self._combined_data

        # The mask must share the data's (1-based) index; a default
        # RangeIndex would misalign when combined with per-column results.
        mask = pd.Series(False, index=data.index)

        for col in search_columns:
            if col in data.columns:
                mask |= data[col].str.contains(query, case=False, na=False)

        return data[mask].reset_index(drop=True)

    def get_species_by_state(self, state: str) -> pd.DataFrame:
        """
        Get all species in a specific physical state.

        Parameters:
        -----------
        state : str
            Physical state ('aq', 'cr', 'gas', 'liq')

        Returns:
        --------
        pd.DataFrame
            Species data for the specified state
        """
        if self._combined_data is None:
            self.load_all_data()

        mask = self._combined_data['state'] == state
        return self._combined_data[mask].reset_index(drop=True)

    def get_aqueous_species(self) -> pd.DataFrame:
        """Get all aqueous species."""
        return self.get_species_by_state('aq')

    def get_crystalline_species(self) -> pd.DataFrame:
        """Get all crystalline species."""
        return self.get_species_by_state('cr')

    def get_gas_species(self) -> pd.DataFrame:
        """Get all gas species."""
        return self.get_species_by_state('gas')

    def get_liquid_species(self) -> pd.DataFrame:
        """Get all liquid species."""
        return self.get_species_by_state('liq')

    def get_species_by_elements(self, elements: List[str]) -> pd.DataFrame:
        """
        Get species containing specific elements.

        A species is returned when its formula contains at least one of the
        requested element symbols. Symbols are matched as whole symbols, so
        'C' matches 'CO2' and 'CaCO3' but not 'Co' or 'Ca' alone.

        Parameters:
        -----------
        elements : List[str]
            List of element symbols. A single string is treated as a
            one-element list. Input case is normalised ('fe' -> 'Fe').

        Returns:
        --------
        pd.DataFrame
            Species containing any of the specified elements (empty when
            no element is given)
        """
        import re

        if self._combined_data is None:
            self.load_all_data()

        data = self._combined_data

        if isinstance(elements, str):
            elements = [elements]

        symbols = [str(element).strip().capitalize() for element in elements]
        symbols = [symbol for symbol in symbols if symbol]
        if not symbols:
            return data.iloc[0:0].reset_index(drop=True)

        # An element symbol is an uppercase letter optionally followed by
        # lowercase letters; the negative lookahead stops 'C' from matching
        # the first letter of 'Ca', 'Cl', 'Co', ...
        alternatives = '|'.join(re.escape(symbol) for symbol in symbols)
        pattern = f"(?:{alternatives})(?![a-z])"
        mask = data['formula'].str.contains(pattern, case=True, na=False, regex=True)

        return data[mask].reset_index(drop=True)

    def get_thermodynamic_properties(self, species_data: pd.DataFrame) -> pd.DataFrame:
        """
        Extract thermodynamic properties from species data.

        Parameters:
        -----------
        species_data : pd.DataFrame
            Species data from get_species or similar methods. Must contain
            the 'name', 'formula' and 'state' columns.

        Returns:
        --------
        pd.DataFrame
            'name', 'formula', 'state' plus whichever thermodynamic columns
            (G, H, S, Cp, V, etc.) are present, coerced to numeric
            (unparseable values become NaN)
        """
        thermo_columns = ['G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d',
                          'c1.e', 'c2.f', 'omega.lambda', 'z.T']

        available_columns = [col for col in thermo_columns if col in species_data.columns]

        result = species_data[['name', 'formula', 'state'] + available_columns].copy()

        # Convert numeric columns to proper numeric types
        for col in available_columns:
            result[col] = pd.to_numeric(result[col], errors='coerce')

        return result

    def get_database_stats(self) -> Dict[str, Union[int, Dict[str, int]]]:
        """
        Get statistics about the database.

        Returns:
        --------
        Dict
            Database statistics: 'total_species', 'states', 'source_files',
            'unique_names' and 'unique_formulas'. 'source_files' is empty
            when the table has no 'source_file' column (fallback data).
        """
        if self._combined_data is None:
            self.load_all_data()

        data = self._combined_data

        if 'source_file' in data.columns:
            source_files = data['source_file'].value_counts().to_dict()
        else:
            source_files = {}

        stats = {
            'total_species': len(data),
            'states': data['state'].value_counts().to_dict(),
            'source_files': source_files,
            'unique_names': data['name'].nunique(),
            'unique_formulas': data['formula'].nunique(),
        }

        return stats

    def validate_data(self) -> Dict[str, List]:
        """
        Validate the OBIGT database for common issues.

        Returns:
        --------
        Dict
            Validation results with issues found. Every entry is a list of
            row labels, except 'invalid_numeric_values', which holds
            (row label, column name) tuples.
        """
        if self._combined_data is None:
            self.load_all_data()

        data = self._combined_data

        issues = {
            'missing_names': [],
            'missing_formulas': [],
            'missing_states': [],
            'invalid_numeric_values': [],
            'duplicate_entries': []
        }

        # Check for missing critical fields (NaN or empty string)
        critical_fields = (
            ('name', 'missing_names'),
            ('formula', 'missing_formulas'),
            ('state', 'missing_states'),
        )
        for column, issue_key in critical_fields:
            missing = data[column].isna() | (data[column] == '')
            if missing.any():
                issues[issue_key] = data[missing].index.tolist()

        # Check for invalid numeric values in key thermodynamic properties:
        # a value is invalid when it is present but cannot be parsed.
        numeric_columns = ['G', 'H', 'S', 'Cp']
        for col in numeric_columns:
            if col in data.columns:
                numeric_data = pd.to_numeric(data[col], errors='coerce')
                invalid_mask = numeric_data.isna() & data[col].notna()
                if invalid_mask.any():
                    issues['invalid_numeric_values'].extend(
                        [(idx, col) for idx in data[invalid_mask].index]
                    )

        # Check for potential duplicates
        duplicate_mask = data.duplicated(subset=['name', 'formula', 'state'], keep=False)
        if duplicate_mask.any():
            issues['duplicate_entries'] = data[duplicate_mask].index.tolist()

        return issues

    def export_to_csv(self, filename: str, species_filter: Optional[str] = None):
        """
        Export database or filtered data to CSV.

        Parameters:
        -----------
        filename : str
            Output filename
        species_filter : str, optional
            Filter to apply (state name like 'aq', 'cr', etc.). Values other
            than 'aq', 'cr', 'gas' and 'liq' are ignored and the whole
            database is exported.
        """
        if self._combined_data is None:
            self.load_all_data()

        data_to_export = self._combined_data

        if species_filter:
            if species_filter in ['aq', 'cr', 'gas', 'liq']:
                data_to_export = self.get_species_by_state(species_filter)

        data_to_export.to_csv(filename, index=False)
461
+
462
def get_default_obigt() -> OBIGTDatabase:
    """
    Build an OBIGT database object with default settings.

    Returns:
    --------
    OBIGTDatabase
        A new OBIGTDatabase constructed without an explicit data loader
    """
    database = OBIGTDatabase()
    return database
pychnosz/data/worm.py ADDED
@@ -0,0 +1,228 @@
1
+ """
2
+ WORM database loader for CHNOSZ.
3
+
4
+ This module provides functionality to load the Water-Organic-Rock-Microbe (WORM)
5
+ thermodynamic database from the WORM-db GitHub repository.
6
+
7
+ Reference: https://github.com/worm-portal/WORM-db
8
+ """
9
+
10
+ import pandas as pd
11
+ from io import StringIO
12
+ from urllib.request import urlopen
13
+ from typing import Optional, Tuple
14
+ import warnings
15
+
16
+ from ..core.thermo import thermo
17
+ from .add_obigt import add_OBIGT
18
+
19
+
20
def can_connect_to(url: str, timeout: int = 5) -> bool:
    """
    Report whether a URL answers with HTTP status 200.

    Parameters
    ----------
    url : str
        The URL to check
    timeout : int, default 5
        Connection timeout in seconds

    Returns
    -------
    bool
        True if the URL could be opened and responded with status 200,
        False otherwise (including on any error while opening it)
    """
    reachable = False
    try:
        from urllib.request import Request

        # Send a browser-like User-Agent header with the probe request.
        probe = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(probe, timeout=timeout) as reply:
            reachable = reply.status == 200
    except Exception:
        # Any failure (bad URL, network error, timeout) means "not reachable".
        reachable = False
    return reachable
43
+
44
+
45
def download_worm_data(url: str) -> Optional[pd.DataFrame]:
    """
    Fetch a comma-separated file from a URL and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        URL to the WORM CSV file

    Returns
    -------
    pd.DataFrame or None
        DataFrame with the parsed CSV content, or None if fetching or
        parsing fails (a warning describing the failure is issued)
    """
    try:
        from urllib.request import Request

        request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(request, timeout=30) as webpage:
            payload = webpage.read()
            text = payload.decode('utf-8')
            table = pd.read_csv(StringIO(text), sep=",")
        return table
    except Exception as e:
        warnings.warn(f"Failed to download WORM data from {url}: {e}")
        return None
68
+
69
+
70
def load_WORM(keep_default: bool = False, messages: bool = True) -> bool:
    """
    Load the WORM (Water-Organic-Rock-Microbe) thermodynamic database.

    This function downloads and loads the WORM database from the WORM-db GitHub
    repository. By default, it replaces the OBIGT database with WORM data,
    keeping only water, H+, and e- from the original database.

    If adding the WORM species fails, the OBIGT table that was active when
    the function was called is put back, so a failed load does not leave the
    system with only the minimal species.

    Parameters
    ----------
    keep_default : bool, default False
        If False, replace OBIGT with minimal species (water, H+, e-) before
        loading WORM. If True, add WORM species to the existing OBIGT database.
    messages : bool, default True
        Whether to print informational messages

    Returns
    -------
    bool
        True if WORM database was loaded successfully, False otherwise

    Examples
    --------
    >>> import pychnosz
    >>> pychnosz.reset()
    >>> # Load WORM database (replaces default OBIGT)
    >>> pychnosz.load_WORM()
    >>>
    >>> # Load WORM database while keeping default OBIGT species
    >>> pychnosz.reset()
    >>> pychnosz.load_WORM(keep_default=True)

    Notes
    -----
    The WORM database is downloaded from:
    - Species data: https://raw.githubusercontent.com/worm-portal/WORM-db/master/wrm_data_latest.csv
    - References: https://raw.githubusercontent.com/worm-portal/WORM-db/master/references.csv

    This feature is exclusive to the Python version of CHNOSZ.
    """

    # WORM database URLs
    url_data = "https://raw.githubusercontent.com/worm-portal/WORM-db/master/wrm_data_latest.csv"
    url_refs = "https://raw.githubusercontent.com/worm-portal/WORM-db/master/references.csv"

    # Name for source_file column
    worm_source_name = "wrm_data_latest.csv"

    # Check if we can connect to the WORM database
    if not can_connect_to(url_data):
        if messages:
            print("load_WORM: could not reach WORM database repository")
        return False

    # Download WORM species data
    worm_data = download_worm_data(url_data)
    if worm_data is None:
        if messages:
            print("load_WORM: failed to download WORM species data")
        return False

    # Get the thermodynamic system
    thermo_sys = thermo()

    # Remember the table that is active now so it can be restored if adding
    # the WORM species fails after the table has been cut down below.
    original_obigt = getattr(thermo_sys, 'obigt', None)

    if not keep_default:
        # Keep only essential species (water, H+, e-)
        from ..core.info import info
        try:
            # Get indices for essential species
            essential_species = []
            for species in ["water", "H+", "e-"]:
                idx = info(species)
                if idx is not None:
                    if isinstance(idx, (list, tuple)):
                        essential_species.extend(idx)
                    else:
                        essential_species.append(idx)

            if essential_species:
                # Keep only essential species
                minimal_obigt = thermo_sys.obigt.loc[essential_species].copy()
                thermo_sys.obigt = minimal_obigt
        except Exception as e:
            # Best effort: if the essential species cannot be isolated, carry
            # on and add WORM to whatever table is currently active.
            if messages:
                print(f"load_WORM: warning - error keeping essential species: {e}")

    # Add WORM species data (suppress add_OBIGT messages)
    try:
        # Add source_file column to worm_data before adding
        worm_data['source_file'] = worm_source_name

        add_OBIGT(worm_data, messages=False)
    except Exception as e:
        # Undo the truncation above; otherwise a failed load would return
        # False while leaving only the minimal species in place.
        if original_obigt is not None:
            thermo_sys.obigt = original_obigt
        if messages:
            print(f"load_WORM: failed to add WORM species: {e}")
        return False

    # Try to download and load WORM references
    if can_connect_to(url_refs):
        worm_refs = download_worm_data(url_refs)
        if worm_refs is not None:
            # Replace refs with WORM refs
            # NOTE(review): this also replaces the refs when keep_default=True;
            # confirm whether the default references should be kept then.
            thermo_sys.refs = worm_refs

    # Update formula_ox if it exists in WORM data
    # This is already handled by add_OBIGT, but we ensure it's set correctly
    if 'formula_ox' in thermo_sys.obigt.columns:
        formula_ox_df = pd.DataFrame({
            'name': thermo_sys.obigt['name'],
            'formula_ox': thermo_sys.obigt['formula_ox']
        })
        formula_ox_df.index = thermo_sys.obigt.index
        thermo_sys.formula_ox = formula_ox_df

    # Print single summary message
    if messages:
        final_obigt = thermo_sys.obigt
        total_species = len(final_obigt)
        aqueous_species = len(final_obigt[final_obigt['state'] == 'aq'])
        print(f"The WORM thermodynamic database has been loaded: {aqueous_species} aqueous, {total_species} total species")

    return True
192
+
193
+
194
def reset_WORM(messages: bool = True) -> None:
    """
    Initialize the thermodynamic system with the WORM database.

    This is a convenience function that combines reset() and load_WORM().
    It initializes the system and loads the WORM database in one step.
    If loading WORM fails, the system is reset once more so that it really
    is left with the default OBIGT database.

    Parameters
    ----------
    messages : bool, default True
        Whether to print informational messages

    Examples
    --------
    >>> import pychnosz
    >>> # Initialize with WORM database
    >>> pychnosz.reset_WORM()

    Notes
    -----
    This is equivalent to:
        pychnosz.reset()
        pychnosz.load_WORM()
    """
    from ..utils.reset import reset

    # Reset the system first
    reset(messages=messages)

    # Load WORM database
    success = load_WORM(keep_default=False, messages=messages)

    if not success:
        if messages:
            print("reset_WORM: falling back to default OBIGT database")
        # load_WORM may already have cut the table down to the essential
        # species before failing; reset again so the message above is true.
        reset(messages=False)