pychnosz 1.1.11__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. pychnosz/__init__.py +129 -0
  2. pychnosz/biomolecules/__init__.py +29 -0
  3. pychnosz/biomolecules/ionize_aa.py +197 -0
  4. pychnosz/biomolecules/proteins.py +595 -0
  5. pychnosz/core/__init__.py +46 -0
  6. pychnosz/core/affinity.py +1256 -0
  7. pychnosz/core/animation.py +593 -0
  8. pychnosz/core/balance.py +334 -0
  9. pychnosz/core/basis.py +716 -0
  10. pychnosz/core/diagram.py +3336 -0
  11. pychnosz/core/equilibrate.py +813 -0
  12. pychnosz/core/equilibrium.py +554 -0
  13. pychnosz/core/info.py +821 -0
  14. pychnosz/core/retrieve.py +364 -0
  15. pychnosz/core/speciation.py +580 -0
  16. pychnosz/core/species.py +599 -0
  17. pychnosz/core/subcrt.py +1696 -0
  18. pychnosz/core/thermo.py +593 -0
  19. pychnosz/core/unicurve.py +1226 -0
  20. pychnosz/data/__init__.py +11 -0
  21. pychnosz/data/add_obigt.py +327 -0
  22. pychnosz/data/extdata/Berman/BDat17_2017.csv +2 -0
  23. pychnosz/data/extdata/Berman/Ber88_1988.csv +68 -0
  24. pychnosz/data/extdata/Berman/Ber90_1990.csv +5 -0
  25. pychnosz/data/extdata/Berman/DS10_2010.csv +6 -0
  26. pychnosz/data/extdata/Berman/FDM+14_2014.csv +2 -0
  27. pychnosz/data/extdata/Berman/Got04_2004.csv +5 -0
  28. pychnosz/data/extdata/Berman/JUN92_1992.csv +3 -0
  29. pychnosz/data/extdata/Berman/SHD91_1991.csv +12 -0
  30. pychnosz/data/extdata/Berman/VGT92_1992.csv +2 -0
  31. pychnosz/data/extdata/Berman/VPT01_2001.csv +3 -0
  32. pychnosz/data/extdata/Berman/VPV05_2005.csv +2 -0
  33. pychnosz/data/extdata/Berman/ZS92_1992.csv +11 -0
  34. pychnosz/data/extdata/Berman/sympy.R +99 -0
  35. pychnosz/data/extdata/Berman/testing/BA96.bib +12 -0
  36. pychnosz/data/extdata/Berman/testing/BA96_Berman.csv +21 -0
  37. pychnosz/data/extdata/Berman/testing/BA96_OBIGT.csv +21 -0
  38. pychnosz/data/extdata/Berman/testing/BA96_refs.csv +6 -0
  39. pychnosz/data/extdata/OBIGT/AD.csv +25 -0
  40. pychnosz/data/extdata/OBIGT/Berman_cr.csv +93 -0
  41. pychnosz/data/extdata/OBIGT/DEW.csv +211 -0
  42. pychnosz/data/extdata/OBIGT/H2O_aq.csv +4 -0
  43. pychnosz/data/extdata/OBIGT/SLOP98.csv +411 -0
  44. pychnosz/data/extdata/OBIGT/SUPCRT92.csv +178 -0
  45. pychnosz/data/extdata/OBIGT/inorganic_aq.csv +729 -0
  46. pychnosz/data/extdata/OBIGT/inorganic_cr.csv +273 -0
  47. pychnosz/data/extdata/OBIGT/inorganic_gas.csv +20 -0
  48. pychnosz/data/extdata/OBIGT/organic_aq.csv +1104 -0
  49. pychnosz/data/extdata/OBIGT/organic_cr.csv +481 -0
  50. pychnosz/data/extdata/OBIGT/organic_gas.csv +268 -0
  51. pychnosz/data/extdata/OBIGT/organic_liq.csv +533 -0
  52. pychnosz/data/extdata/OBIGT/testing/GEMSFIT.csv +43 -0
  53. pychnosz/data/extdata/OBIGT/testing/IGEM.csv +17 -0
  54. pychnosz/data/extdata/OBIGT/testing/Sandia.csv +8 -0
  55. pychnosz/data/extdata/OBIGT/testing/SiO2.csv +4 -0
  56. pychnosz/data/extdata/misc/AD03_Fig1a.csv +69 -0
  57. pychnosz/data/extdata/misc/AD03_Fig1b.csv +43 -0
  58. pychnosz/data/extdata/misc/AD03_Fig1c.csv +89 -0
  59. pychnosz/data/extdata/misc/AD03_Fig1d.csv +30 -0
  60. pychnosz/data/extdata/misc/BZA10.csv +5 -0
  61. pychnosz/data/extdata/misc/HW97_Cp.csv +90 -0
  62. pychnosz/data/extdata/misc/HWM96_V.csv +229 -0
  63. pychnosz/data/extdata/misc/LA19_test.csv +7 -0
  64. pychnosz/data/extdata/misc/Mer75_Table4.csv +42 -0
  65. pychnosz/data/extdata/misc/OBIGT_check.csv +423 -0
  66. pychnosz/data/extdata/misc/PM90.csv +7 -0
  67. pychnosz/data/extdata/misc/RH95.csv +23 -0
  68. pychnosz/data/extdata/misc/RH98_Table15.csv +17 -0
  69. pychnosz/data/extdata/misc/SC10_Rainbow.csv +19 -0
  70. pychnosz/data/extdata/misc/SK95.csv +55 -0
  71. pychnosz/data/extdata/misc/SOJSH.csv +61 -0
  72. pychnosz/data/extdata/misc/SS98_Fig5a.csv +81 -0
  73. pychnosz/data/extdata/misc/SS98_Fig5b.csv +84 -0
  74. pychnosz/data/extdata/misc/TKSS14_Fig2.csv +25 -0
  75. pychnosz/data/extdata/misc/bluered.txt +1000 -0
  76. pychnosz/data/extdata/protein/Cas/Cas_aa.csv +177 -0
  77. pychnosz/data/extdata/protein/Cas/Cas_uniprot.csv +186 -0
  78. pychnosz/data/extdata/protein/Cas/download.R +34 -0
  79. pychnosz/data/extdata/protein/Cas/mkaa.R +34 -0
  80. pychnosz/data/extdata/protein/POLG.csv +12 -0
  81. pychnosz/data/extdata/protein/TBD+05.csv +393 -0
  82. pychnosz/data/extdata/protein/TBD+05_aa.csv +393 -0
  83. pychnosz/data/extdata/protein/rubisco.csv +28 -0
  84. pychnosz/data/extdata/protein/rubisco.fasta +239 -0
  85. pychnosz/data/extdata/protein/rubisco_aa.csv +28 -0
  86. pychnosz/data/extdata/src/H2O92D.f.orig +3457 -0
  87. pychnosz/data/extdata/src/README.txt +5 -0
  88. pychnosz/data/extdata/taxonomy/names.dmp +215 -0
  89. pychnosz/data/extdata/taxonomy/nodes.dmp +63 -0
  90. pychnosz/data/extdata/thermo/Bdot_acirc.csv +60 -0
  91. pychnosz/data/extdata/thermo/buffer.csv +40 -0
  92. pychnosz/data/extdata/thermo/element.csv +135 -0
  93. pychnosz/data/extdata/thermo/groups.csv +6 -0
  94. pychnosz/data/extdata/thermo/opt.csv +2 -0
  95. pychnosz/data/extdata/thermo/protein.csv +506 -0
  96. pychnosz/data/extdata/thermo/refs.csv +343 -0
  97. pychnosz/data/extdata/thermo/stoich.csv.xz +0 -0
  98. pychnosz/data/loader.py +431 -0
  99. pychnosz/data/mod_obigt.py +322 -0
  100. pychnosz/data/obigt.py +471 -0
  101. pychnosz/data/worm.py +228 -0
  102. pychnosz/fortran/__init__.py +16 -0
  103. pychnosz/fortran/h2o92.dll +0 -0
  104. pychnosz/fortran/h2o92_interface.py +527 -0
  105. pychnosz/geochemistry/__init__.py +21 -0
  106. pychnosz/geochemistry/minerals.py +514 -0
  107. pychnosz/geochemistry/redox.py +500 -0
  108. pychnosz/models/__init__.py +47 -0
  109. pychnosz/models/archer_wang.py +165 -0
  110. pychnosz/models/berman.py +309 -0
  111. pychnosz/models/cgl.py +381 -0
  112. pychnosz/models/dew.py +997 -0
  113. pychnosz/models/hkf.py +523 -0
  114. pychnosz/models/hkf_helpers.py +231 -0
  115. pychnosz/models/iapws95.py +1113 -0
  116. pychnosz/models/supcrt92_fortran.py +238 -0
  117. pychnosz/models/water.py +480 -0
  118. pychnosz/utils/__init__.py +27 -0
  119. pychnosz/utils/expression.py +1074 -0
  120. pychnosz/utils/formula.py +830 -0
  121. pychnosz/utils/formula_ox.py +227 -0
  122. pychnosz/utils/reset.py +33 -0
  123. pychnosz/utils/units.py +259 -0
  124. pychnosz-1.1.11.dist-info/METADATA +197 -0
  125. pychnosz-1.1.11.dist-info/RECORD +128 -0
  126. pychnosz-1.1.11.dist-info/WHEEL +5 -0
  127. pychnosz-1.1.11.dist-info/licenses/LICENSE.txt +19 -0
  128. pychnosz-1.1.11.dist-info/top_level.txt +1 -0
pychnosz/core/info.py ADDED
@@ -0,0 +1,821 @@
1
+ """
2
+ Species database lookup and information retrieval module.
3
+
4
+ This module provides Python equivalents of the R functions in info.R:
5
+ - info(): Search for species by name, formula, or index
6
+ - species information retrieval and validation
7
+ - database summarization and query functions
8
+
9
+ Author: CHNOSZ Python port
10
+ """
11
+
12
+ import pandas as pd
13
+ import numpy as np
14
+ from typing import Union, List, Optional, Dict, Any
15
+ import warnings
16
+ import re
17
+
18
+ from .thermo import thermo
19
+ from ..utils.formula import makeup, as_chemical_formula
20
+
21
+
22
def info(species: Optional[Union[str, int, List[Union[str, int]], pd.Series]] = None,
         state: Optional[Union[str, List[str]]] = None,
         check_it: bool = True,
         messages: bool = True) -> Union[pd.DataFrame, int, List[int], None]:
    """
    Search for species in the thermodynamic database.

    Parameters
    ----------
    species : str, int, list of str/int, pd.Series, or None
        Species name, formula, abbreviation, or OBIGT index.
        Integer indices may be built-in ``int`` or NumPy integer scalars
        (e.g. values taken from a DataFrame index).
        Can also be a pandas Series (e.g., from retrieve()).
        If None, returns summary information about the database.
    state : str, list of str, or None
        Physical state(s) to match ('aq', 'cr', 'gas', 'liq')
    check_it : bool, default True
        Whether to perform consistency checks on thermodynamic data
    messages : bool, default True
        Whether to print informational messages

    Returns
    -------
    pd.DataFrame, int, list of int, or None
        - If species is None: prints database summary, returns None
        - If species is numeric: returns DataFrame with species data
        - If species is string: returns species index(es) or NA if not found

    Raises
    ------
    ValueError
        If ``species`` is of a type that cannot be interpreted.

    Examples
    --------
    >>> # Get database summary
    >>> info()

    >>> # Find species index
    >>> info("H2O")

    >>> # Get species data by index
    >>> info(1)

    >>> # Search with specific state
    >>> info("CO2", "aq")

    >>> # Use output from retrieve()
    >>> zn_species = retrieve("Zn", ["O", "H"], state="aq")
    >>> info(zn_species)
    """
    thermo_obj = thermo()

    # Initialize database if needed
    if not thermo_obj.is_initialized():
        thermo_obj.reset()

    # Return database summary if no species specified
    if species is None:
        return _print_database_summary(thermo_obj, messages)

    # Handle pandas Series (e.g., from retrieve()): its values are the indices
    if isinstance(species, pd.Series):
        return _info_numeric(species.values.tolist(), thermo_obj, check_it, messages)

    # Handle numeric species indices.  NumPy integers are not instances of
    # the built-in int, so they are accepted explicitly and converted to
    # plain ints before being handed to the numeric lookup.
    integer_types = (int, np.integer)
    if isinstance(species, integer_types):
        return _info_numeric(int(species), thermo_obj, check_it, messages)
    if isinstance(species, list) and all(isinstance(s, integer_types) for s in species):
        return _info_numeric([int(s) for s in species], thermo_obj, check_it, messages)

    # Handle string species names/formulas
    if isinstance(species, (str, list)):
        return _info_character(species, state, thermo_obj, messages)

    raise ValueError(f"Invalid species type: {type(species)}")
92
+
93
+
94
def _print_database_summary(thermo_obj, messages: bool = True) -> None:
    """
    Print a three-line overview of the loaded thermodynamic database.

    The overview reports the number of aqueous and total species in
    ``thermo_obj.obigt``, the number of references, elements and distinct
    buffer names, and the number of proteins and distinct organisms.
    Nothing is printed when ``messages`` is False.  Always returns None.
    """
    table = thermo_obj.obigt
    if table is None and messages:
        print("Database not initialized")
    if table is None or not messages:
        return

    # Species counts
    n_aqueous = int((table['state'] == 'aq').sum())
    n_total = len(table)
    print(f"info: thermo().obigt has {n_aqueous} aqueous, {n_total} total species")

    # Supporting tables; a missing table counts as zero entries
    refs = thermo_obj.refs
    elements = thermo_obj.element
    buffers = thermo_obj.buffer
    n_refs = 0 if refs is None else len(refs)
    n_elements = 0 if elements is None else len(elements)
    n_buffers = 0
    if buffers is not None and 'name' in buffers.columns:
        n_buffers = buffers['name'].nunique(dropna=False)
    print(f"number of literature sources: {n_refs}, elements: {n_elements}, buffers: {n_buffers}")

    # Protein table
    proteins = thermo_obj.protein
    n_proteins = 0
    n_organisms = 0
    if proteins is not None:
        n_proteins = len(proteins)
        if 'organism' in proteins.columns:
            n_organisms = proteins['organism'].nunique(dropna=False)
    print(f"number of proteins in thermo().protein is {n_proteins} from {n_organisms} organisms")
129
+
130
+
131
def _info_numeric(species: Union[int, List[int]], thermo_obj, check_it: bool, messages: bool = True) -> pd.DataFrame:
    """
    Look up species rows by their 1-based position in thermo().obigt.

    Parameters
    ----------
    species : int or list of int
        One index or several indices, counted from 1
    thermo_obj : ThermoSystem
        Object exposing the database as ``thermo_obj.obigt``
    check_it : bool
        Passed on to the per-species retrieval to enable consistency checks
    messages : bool, default True
        Passed on to the per-species retrieval to enable printed messages

    Returns
    -------
    pd.DataFrame
        One row per requested index, renumbered from 0

    Raises
    ------
    RuntimeError
        If the database has not been loaded
    IndexError
        If any index lies outside 1..len(obigt)
    """
    obigt = thermo_obj.obigt
    if obigt is None:
        raise RuntimeError("Thermodynamic database not initialized")

    # A lone index is treated as a one-element list
    requested = [species] if isinstance(species, int) else species

    # Reject every out-of-range index before retrieving anything
    n_rows = len(obigt)
    for number in requested:
        if number < 1 or number > n_rows:
            raise IndexError(f"Species index {number} not found in thermo().obigt (1-{n_rows})")

    # The retrieval helper works with 0-based positions
    frames = [_get_species_data(number - 1, obigt, check_it, messages) for number in requested]
    return pd.concat(frames, ignore_index=True)
174
+
175
+
176
def _info_character(species: Union[str, List[str]],
                    state: Optional[Union[str, List[str]]],
                    thermo_obj,
                    messages: bool = True) -> Union[int, List[int]]:
    """
    Search for species by name, formula, or abbreviation.

    Parameters
    ----------
    species : str or list of str
        Species name(s), formula(s), or abbreviation(s) to search for
    state : str, list of str, or None
        Physical state(s) to match.  A single string applies to every
        species; a list is recycled cyclically to the number of species.
        None or an empty list means "any state".
    thermo_obj : ThermoSystem
        The thermodynamic system object
    messages : bool, default True
        Whether to print informational messages

    Returns
    -------
    int or list of int
        Species index(es) or NA if not found.  A scalar is returned when
        ``species`` is a single string, otherwise a list.

    Raises
    ------
    RuntimeError
        If the database has not been loaded
    """
    obigt = thermo_obj.obigt
    if obigt is None:
        raise RuntimeError("Thermodynamic database not initialized")

    # Ensure species is a list, remembering whether to unwrap the result
    single_result = isinstance(species, str)
    names = [species] if single_result else list(species)

    # Build one state per species.  An empty state list previously caused a
    # ZeroDivisionError in the recycling arithmetic; it now means "no filter".
    if state is None or isinstance(state, str):
        states = [state] * len(names)
    elif len(state) == 0:
        states = [None] * len(names)
    else:
        states = [state[i % len(state)] for i in range(len(names))]

    results = []
    for sp, sp_state in zip(names, states):
        result = _find_species_index(sp, sp_state, obigt, messages)

        # Show approximate matches if exact match not found and not a protein
        # (protein names contain '_').  Non-string entries cannot be
        # substring-searched, so they are skipped.
        if pd.isna(result) and isinstance(sp, str) and '_' not in sp:
            _info_approx(sp, sp_state, obigt, messages)

        results.append(result)

    return results[0] if single_result else results
232
+
233
+
234
def _find_species_index(species: str, state: Optional[str], obigt: pd.DataFrame, messages: bool = True) -> Union[int, float]:
    """
    Locate the database row that exactly matches a species identifier.

    Parameters
    ----------
    species : str
        Text compared for equality against the 'name', 'abbrv' and
        'formula' columns
    state : str or None
        If given, only rows whose 'state' equals this value qualify
        ('aq' is translated to 'liq' for 'H2O' and 'water')
    obigt : pd.DataFrame
        The OBIGT database
    messages : bool, default True
        Whether to print informational messages

    Returns
    -------
    int or np.nan
        Index label of the chosen row, or NaN when nothing matches
    """
    # A row qualifies when any of the three identifier columns equals the query
    hit = obigt['name'].eq(species)
    for column in ('abbrv', 'formula'):
        hit = hit | obigt[column].eq(species)
    hit = hit.fillna(False)

    if not hit.any():
        return np.nan

    candidates = obigt.index[hit].tolist()

    if state is not None:
        # Water is stored as a liquid, so an 'aq' request is redirected
        wanted = 'liq' if (state == 'aq' and species in ['H2O', 'water']) else state

        candidate_states = obigt.loc[candidates, 'state']
        candidates = [label for label, st in zip(candidates, candidate_states) if st == wanted]

        if not candidates:
            # The species exists, but not in the requested state
            if messages:
                available_states = obigt.loc[hit, 'state'].unique()
                state_text = "', '".join(available_states)
                verb = "is" if len(available_states) == 1 else "are"
                print(f"info_character: requested state '{wanted}' for {species} "
                      f"but only '{state_text}' {verb} available")

            # Special warning for methane
            if wanted == 'aq' and species == 'methane':
                warnings.warn("'methane' is not an aqueous species; use 'CH4' instead\n"
                              "To revert to the old behavior, run mod_OBIGT(info('CH4'), name='methane')")

            return np.nan

    if not candidates:
        return np.nan

    # Labels are returned as-is: they are already 1-based
    # (shifted in obigt.py during data loading)
    if len(candidates) == 1:
        return candidates[0]

    # Several rows qualify: a unique exact name match wins, else the first row
    candidate_names = obigt.loc[candidates, 'name']
    by_name = [label for label, nm in zip(candidates, candidate_names) if nm == species]
    chosen = by_name[0] if len(by_name) == 1 else candidates[0]

    # Tell the user which alternatives exist
    if messages:
        _report_multiple_matches(species, chosen, candidates, obigt, messages=messages)

    return chosen
316
+
317
+
318
def _report_multiple_matches(species: str, selected_index: int, all_indices: List[int], obigt: pd.DataFrame, messages: bool = True):
    """
    Report information about multiple matches for a species.

    Parameters
    ----------
    species : str
        The identifier the user searched for
    selected_index : int
        Index label of the row that was chosen
    all_indices : list of int
        Index labels of every matching row (including the chosen one)
    obigt : pd.DataFrame
        The OBIGT database; the 'state' and 'name' columns are read
    messages : bool, default True
        Whether to print the report

    Notes
    -----
    Polymorphic transitions (states 'cr2' .. 'cr9') are counted, and the
    count is reported only when the chosen row is in state 'cr'; they are
    never listed as alternatives.  For a non-aqueous selection, other rows
    in the *same* state are isomers and are listed by name instead of state.
    Alternatives are listed once each, in database order.
    """
    selected_state = obigt.loc[selected_index, 'state']
    selected_name = obigt.loc[selected_index, 'name']
    other_indices = [idx for idx in all_indices if idx != selected_index]
    other_states = obigt.loc[other_indices, 'state'].tolist()

    # Handle polymorphic transitions
    trans_states = {'cr2', 'cr3', 'cr4', 'cr5', 'cr6', 'cr7', 'cr8', 'cr9'}
    n_trans = sum(1 for st in other_states if st in trans_states)

    trans_text = ""
    if selected_state == 'cr' and n_trans >= 1:
        plural = "" if n_trans == 1 else "s"
        trans_text = f" with {n_trans} polymorphic transition{plural}"

    # For non-aqueous species, show substance names
    name_text = ""
    if selected_state != 'aq' and species != selected_name:
        name_text = f" [{selected_name}]"

    # Collect the other available states.  Index and state are walked in
    # lockstep so that dropping transitions cannot shift the positions used
    # for the isomer-name substitution.
    alternatives = []
    for idx, st in zip(other_indices, other_states):
        if st in trans_states:
            continue
        if selected_state != 'aq' and st == selected_state:
            label = obigt.loc[idx, 'name']
        else:
            label = st
        if label not in alternatives:
            alternatives.append(label)

    other_text = ""
    if alternatives:
        other_text = f"; also available in {', '.join(str(a) for a in alternatives)}"

    if (trans_text or other_text) and messages:
        start_text = f"info_character: found {species}({selected_state}){name_text}"
        print(f"{start_text}{trans_text}{other_text}")
360
+
361
+
362
def _info_approx(species: str, state: Optional[str], obigt: pd.DataFrame, messages: bool = True) -> List[int]:
    """
    Find approximate matches for species name.

    A row is an approximate match when ``species`` occurs as a
    case-insensitive literal substring of its 'name', 'abbrv' or 'formula'.

    Parameters
    ----------
    species : str
        Species name to search for
    state : str or None
        Physical state to filter by
    obigt : pd.DataFrame
        The OBIGT database
    messages : bool, default True
        Whether to print the matches that were found

    Returns
    -------
    list of int
        Index labels of the approximate matches, without duplicates and in
        database order (so the result and the printed listing are stable
        from run to run)
    """
    # Simple approximate matching - find species containing the search term
    search_data = obigt if state is None else obigt[obigt['state'] == state]

    # Accumulate one positional mask over all identifier columns; selecting
    # with it keeps the matches in database order.
    hit = np.zeros(len(search_data), dtype=bool)
    for col in ('name', 'abbrv', 'formula'):
        if col in search_data.columns:
            col_hit = search_data[col].str.contains(species, case=False, na=False, regex=False)
            hit |= col_hit.to_numpy(dtype=bool)

    # dict.fromkeys drops repeated labels while preserving order
    approx_matches = list(dict.fromkeys(search_data.index[hit].tolist()))

    if not messages:
        return approx_matches

    if not approx_matches:
        print(f"info_approx: '{species}' has no approximate matches")
    elif len(approx_matches) == 1:
        species_info = _format_species_info(approx_matches[0], obigt)
        print(f"info_approx: '{species}' is similar to {species_info}")
    else:
        max_show = 100
        n_show = min(len(approx_matches), max_show)
        ext_text = f" (showing first {max_show})" if len(approx_matches) > max_show else ""
        print(f"info_approx: '{species}' is ambiguous; has approximate matches to "
              f"{len(approx_matches)} species{ext_text}:")

        # Print each name once (to avoid showing duplicates for polymorphs)
        seen = set()
        for idx in approx_matches[:n_show]:
            name = obigt.loc[idx, 'name']
            if name not in seen:
                seen.add(name)
                print(f"  {name}")

    return approx_matches
423
+
424
+
425
def _format_species_info(index: int, obigt: pd.DataFrame, with_source: bool = True) -> str:
    """
    Build a one-line, human-readable description of a database row.

    Parameters
    ----------
    index : int
        Index label of the row in the obigt DataFrame
    obigt : pd.DataFrame
        The OBIGT database
    with_source : bool
        If True, append the non-missing 'ref1', 'ref2' and 'date' values

    Returns
    -------
    str
        ``"name [formula(state)]"``, optionally followed by
        ``" (ref1, ref2, date)"`` with missing entries left out
    """
    record = obigt.loc[index]
    description = f"{record['name']} [{record['formula']}({record['state']})]"

    if not with_source:
        return description

    # Keep only the source fields that are present and not NA
    sources = [
        str(record[field])
        for field in ('ref1', 'ref2', 'date')
        if field in record and pd.notna(record[field])
    ]
    if sources:
        description += f" ({', '.join(sources)})"

    return description
463
+
464
+
465
def _get_species_data(index: int, obigt: pd.DataFrame, check_it: bool, messages: bool = True) -> pd.DataFrame:
    """
    Get and validate species thermodynamic data.

    Parameters
    ----------
    index : int
        Species index (0-based) in obigt DataFrame
    obigt : pd.DataFrame
        The OBIGT database
    check_it : bool
        Whether to perform consistency checks
    messages : bool, default True
        Whether to print informational messages

    Returns
    -------
    pd.DataFrame
        Single-row DataFrame with species data: 14 base columns followed by
        the 8 model-specific EOS columns (those that exist in the data)

    Raises
    ------
    ValueError
        If the species has no model assigned
    """
    # Get species data as a one-row frame (slice, not scalar, to keep a DataFrame)
    species_data = obigt.iloc[index:index+1].copy()

    # Remove scaling factors on EOS parameters (equivalent to OBIGT2eos)
    species_data = _remove_scaling_factors(species_data)

    # Check for missing model
    if pd.isna(species_data.iloc[0]['model']):
        species_name = species_data.iloc[0]['name']
        species_state = species_data.iloc[0]['state']
        raise ValueError(f"Species has NA model: {species_name}({species_state})")

    # Get the model for column selection (preserve case)
    model = str(species_data.iloc[0]['model'])

    # Fill in missing G, H, or S values, then Cp and V from EOS parameters
    if check_it:
        species_data = _check_and_fill_ghs(species_data, messages)
        species_data = _check_eos_parameters(species_data, messages)

    # Base columns (first 14)
    base_columns = ['name', 'abbrv', 'formula', 'state', 'ref1', 'ref2', 'date',
                    'model', 'E_units', 'G', 'H', 'S', 'Cp', 'V']

    # EOS column names depend on the model and must agree with the names
    # produced by _remove_scaling_factors().
    if model in ['HKF', 'DEW']:
        eos_columns = ['a1', 'a2', 'a3', 'a4', 'c1', 'c2', 'omega', 'Z']
    elif model == 'AD':
        # Previously AD fell through to the CGL names, which dropped the
        # xi and Z parameters from the result.
        eos_columns = ['a', 'b', 'xi', 'XX1', 'XX2', 'XX3', 'XX4', 'Z']
    else:
        # CGL and others
        eos_columns = ['a', 'b', 'c', 'd', 'e', 'f', 'lambda', 'T']

    r_columns = base_columns + eos_columns

    # Select only columns that exist (for compatibility)
    available_cols = [col for col in r_columns if col in species_data.columns]
    return species_data[available_cols].copy()
533
+
534
+
535
def _remove_scaling_factors(species_data: pd.DataFrame) -> pd.DataFrame:
    """
    Add model-specific EOS columns, un-scaling HKF/DEW parameters.

    Works on a copy of ``species_data``; the model is read from the first
    row.  The CSV-style source columns ('a1.a', ..., 'z.T') are kept and
    new columns are added alongside them:

    - HKF / DEW: a1, a2, a3, a4, c1, c2, omega, Z, each multiplied by its
      scaling factor; a target whose source column is absent is set to NaN
    - AD: a, b, xi, Z copied from the source; XX1..XX4 set to NaN
    - anything else: a, b, c, d, e, f, lambda, T copied unchanged
    """
    frame = species_data.copy()
    source_cols = ('a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e', 'c2.f', 'omega.lambda', 'z.T')
    model = str(frame.iloc[0]['model'])

    if model in ('HKF', 'DEW'):
        # (target name, scaling factor) per source column;
        # factors from R: 10^c(-1, 2, 0, 4, 0, 4, 5, 0)
        scaled_targets = (
            ('a1', 0.1), ('a2', 100), ('a3', 1), ('a4', 10000),
            ('c1', 1), ('c2', 10000), ('omega', 100000), ('Z', 1),
        )
        for source, (target, factor) in zip(source_cols, scaled_targets):
            if source in frame.columns:
                frame[target] = frame[source] * factor
            else:
                # All 8 targets are always created
                frame[target] = np.nan

    elif model == 'AD':
        ad_targets = ('a', 'b', 'xi', 'XX1', 'XX2', 'XX3', 'XX4', 'Z')
        unused = ('XX1', 'XX2', 'XX3', 'XX4')
        for source, target in zip(source_cols, ad_targets):
            if source not in frame.columns:
                continue
            # Unused slots are blanked rather than copied
            frame[target] = np.nan if target in unused else frame[source]

    else:
        generic_targets = ('a', 'b', 'c', 'd', 'e', 'f', 'lambda', 'T')
        for source, target in zip(source_cols, generic_targets):
            if source in frame.columns:
                frame[target] = frame[source]

    return frame
592
+
593
+
594
def _check_and_fill_ghs(species_data: pd.DataFrame, messages: bool = True) -> pd.DataFrame:
    """
    Check and fill missing G, H, S values.

    When exactly one of G, H and S is NA in the (single) row, it is
    calculated from the other two and written back into that row.

    Parameters
    ----------
    species_data : pd.DataFrame
        Single-row DataFrame with species data; not modified
    messages : bool, default True
        Whether to print a message when a value is filled in

    Returns
    -------
    pd.DataFrame
        Copy of ``species_data``, with the missing value filled in if the
        calculation succeeded and otherwise unchanged
    """
    data = species_data.copy()

    ghs_cols = ['G', 'H', 'S']
    row = data.iloc[0]

    # Name the missing columns directly, so that an absent column cannot
    # shift the position used to identify which property is NA.
    missing_cols = [col for col in ghs_cols if col in row.index and pd.isna(row[col])]

    if len(missing_cols) == 1:
        missing_col = missing_cols[0]

        # Calculate missing value from the other two
        formula = row['formula']
        G = row.get('G', np.nan)
        H = row.get('H', np.nan)
        S = row.get('S', np.nan)
        E_units = row.get('E_units', 'J')

        try:
            # Imported here rather than at module level
            from ..utils.formula import calculate_ghs
            calculated = calculate_ghs(formula, G=G, H=H, S=S, E_units=E_units)

            # Write into the existing row.  The row keeps its database index
            # label (not 0), so it is addressed via data.index[0]; using the
            # label 0 would append a new row instead of filling this one.
            data.at[data.index[0], missing_col] = calculated[missing_col]

            if messages:
                print(f"info_numeric: {missing_col} of {row['name']}({row['state']}) is NA; "
                      f"set to {calculated[missing_col]:.2f} {E_units} mol-1")

        except Exception:
            # Deliberately best-effort: if the calculation is unavailable or
            # fails, the value is left as NaN.
            # NOTE(review): this also hides an ImportError if calculate_ghs
            # does not exist in utils.formula - confirm that it does.
            pass

    return data
631
+
632
+
633
def _check_eos_parameters(species_data: pd.DataFrame, messages: bool = True) -> pd.DataFrame:
    """
    Fill NA Cp and V of HKF/DEW species from their EOS parameters.

    Follows the Cp and V calculations of R's check.EOS: for a species whose
    model is 'HKF' or 'DEW', a missing Cp is computed from c1, c2 and omega,
    and a missing V from a1..a4 and omega, provided all the required
    parameters are present.  Species of any other model are returned
    unchanged.

    Parameters
    ----------
    species_data : pd.DataFrame
        Single-row DataFrame with species data; not modified
    messages : bool, default True
        Whether to print informational messages

    Returns
    -------
    pd.DataFrame
        Copy of the species data with Cp and V filled in where possible
    """
    data = species_data.copy()
    first = data.iloc[0]
    model = str(first['model'])
    state = first['state']

    # Only the HKF and DEW models are handled here
    if model not in ('HKF', 'DEW'):
        return data

    Tr = 298.15   # reference temperature, K
    Theta = 228   # K
    name = first['name']
    E_units = first['E_units']
    label = data.index[0]

    # The EOS parameters are not touched by the assignments below, so they
    # can be read from the row up front.
    omega = first.get('omega', np.nan)

    if pd.isna(first.get('Cp')):
        c1 = first.get('c1', np.nan)
        c2 = first.get('c2', np.nan)
        if not any(pd.isna(value) for value in (c1, c2, omega)):
            # Value of X consistent with SUPCRT92 or DEW
            X = {'HKF': -3.055586E-7, 'DEW': -3.09E-7}[model]

            # Cp = c1 + c2/(Tr-Theta)^2 + omega*Tr*X
            calcCp = c1 + c2 / ((Tr - Theta) ** 2) + omega * Tr * X

            data.at[label, 'Cp'] = calcCp
            if messages:
                print(f"info.numeric: Cp° of {name}({state}) is NA; set by EOS parameters to {calcCp:.2f} {E_units} K-1 mol-1")

    if pd.isna(first.get('V')):
        a1 = first.get('a1', np.nan)
        a2 = first.get('a2', np.nan)
        a3 = first.get('a3', np.nan)
        a4 = first.get('a4', np.nan)
        if not any(pd.isna(value) for value in (a1, a2, a3, a4, omega)):
            # Value of Q consistent with SUPCRT92 or DEW
            Q = {'HKF': 0.00002775729, 'DEW': 0.0000005903 * 41.84}[model]

            # V = 41.84*a1 + 41.84*a2/2601 + (41.84*a3 + 41.84*a4/2601)/(Tr-Theta) - Q*omega
            calcV = (41.84 * a1 + 41.84 * a2 / 2601 +
                     (41.84 * a3 + 41.84 * a4 / 2601) / (Tr - Theta) -
                     Q * omega)

            # Species with parameters in Joules get an extra conversion step
            if E_units == 'J':
                # Imported here to avoid circular import
                from ..utils.units import convert
                calcV = convert(calcV, 'cal', messages=messages)

            data.at[label, 'V'] = calcV
            if messages:
                print(f"info.numeric: V° of {name}({state}) is NA; set by EOS parameters to {calcV:.2f} cm3 mol-1")

    return data
725
+
726
+
727
+ # Convenience functions for common operations
728
def find_species(name: str, state: Optional[str] = None, messages: bool = True) -> int:
    """
    Resolve one species identifier to its database index.

    Parameters
    ----------
    name : str
        Species name, formula, or abbreviation
    state : str, optional
        Physical state
    messages : bool, default True
        If True, print informational messages

    Returns
    -------
    int
        Species index (1-based)

    Raises
    ------
    ValueError
        If species not found or multiple matches
    """
    found = info(name, state, messages=messages)

    if pd.isna(found):
        raise ValueError(f"Species '{name}' not found")

    # A scalar result is the common case
    if not isinstance(found, list):
        return int(found)

    if len(found) > 1:
        raise ValueError(f"Multiple matches found for '{name}'")
    return int(found[0])
762
+
763
+
764
def get_species_data(species: Union[str, int], state: Optional[str] = None, messages: bool = True) -> pd.DataFrame:
    """
    Get complete thermodynamic data for a species.

    Parameters
    ----------
    species : str or int
        Species name/formula or index
    state : str, optional
        Physical state (only used when ``species`` is a string)
    messages : bool, default True
        Display messages?  Applies to both the name lookup and the data
        retrieval.

    Returns
    -------
    pd.DataFrame
        Species thermodynamic data

    Raises
    ------
    ValueError
        If ``species`` is a string that cannot be resolved to one species
    """
    if isinstance(species, str):
        # Forward `messages` so that messages=False also silences the lookup
        species = find_species(species, state, messages=messages)

    return info(species, messages=messages)
786
+
787
+
788
def list_species(pattern: Optional[str] = None, state: Optional[str] = None) -> pd.DataFrame:
    """
    Return a table of the species that satisfy the given filters.

    Parameters
    ----------
    pattern : str, optional
        Text searched for (case-insensitively) in the species names
    state : str, optional
        Keep only species in this physical state

    Returns
    -------
    pd.DataFrame
        The 'name', 'formula', 'state', 'ref1' and 'model' columns of the
        matching species, renumbered from 0
    """
    db = thermo()
    if not db.is_initialized():
        db.reset()

    selection = db.obigt.copy()

    # Narrow by state first, then by name pattern
    if state is not None:
        in_state = selection['state'] == state
        selection = selection[in_state]

    if pattern is not None:
        name_hit = selection['name'].str.contains(pattern, case=False, na=False)
        selection = selection[name_hit]

    shown = ['name', 'formula', 'state', 'ref1', 'model']
    return selection[shown].reset_index(drop=True)