reaxkit 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reaxkit/__init__.py +0 -0
- reaxkit/analysis/__init__.py +0 -0
- reaxkit/analysis/composed/RDF_analyzer.py +560 -0
- reaxkit/analysis/composed/__init__.py +0 -0
- reaxkit/analysis/composed/connectivity_analyzer.py +706 -0
- reaxkit/analysis/composed/coordination_analyzer.py +144 -0
- reaxkit/analysis/composed/electrostatics_analyzer.py +687 -0
- reaxkit/analysis/per_file/__init__.py +0 -0
- reaxkit/analysis/per_file/control_analyzer.py +165 -0
- reaxkit/analysis/per_file/eregime_analyzer.py +108 -0
- reaxkit/analysis/per_file/ffield_analyzer.py +305 -0
- reaxkit/analysis/per_file/fort13_analyzer.py +79 -0
- reaxkit/analysis/per_file/fort57_analyzer.py +106 -0
- reaxkit/analysis/per_file/fort73_analyzer.py +61 -0
- reaxkit/analysis/per_file/fort74_analyzer.py +65 -0
- reaxkit/analysis/per_file/fort76_analyzer.py +191 -0
- reaxkit/analysis/per_file/fort78_analyzer.py +154 -0
- reaxkit/analysis/per_file/fort79_analyzer.py +83 -0
- reaxkit/analysis/per_file/fort7_analyzer.py +393 -0
- reaxkit/analysis/per_file/fort99_analyzer.py +411 -0
- reaxkit/analysis/per_file/molfra_analyzer.py +359 -0
- reaxkit/analysis/per_file/params_analyzer.py +258 -0
- reaxkit/analysis/per_file/summary_analyzer.py +84 -0
- reaxkit/analysis/per_file/trainset_analyzer.py +84 -0
- reaxkit/analysis/per_file/vels_analyzer.py +95 -0
- reaxkit/analysis/per_file/xmolout_analyzer.py +528 -0
- reaxkit/cli.py +181 -0
- reaxkit/count_loc.py +276 -0
- reaxkit/data/alias.yaml +89 -0
- reaxkit/data/constants.yaml +27 -0
- reaxkit/data/reaxff_input_files_contents.yaml +186 -0
- reaxkit/data/reaxff_output_files_contents.yaml +301 -0
- reaxkit/data/units.yaml +38 -0
- reaxkit/help/__init__.py +0 -0
- reaxkit/help/help_index_loader.py +531 -0
- reaxkit/help/introspection_utils.py +131 -0
- reaxkit/io/__init__.py +0 -0
- reaxkit/io/base_handler.py +165 -0
- reaxkit/io/generators/__init__.py +0 -0
- reaxkit/io/generators/control_generator.py +123 -0
- reaxkit/io/generators/eregime_generator.py +341 -0
- reaxkit/io/generators/geo_generator.py +967 -0
- reaxkit/io/generators/trainset_generator.py +1758 -0
- reaxkit/io/generators/tregime_generator.py +113 -0
- reaxkit/io/generators/vregime_generator.py +164 -0
- reaxkit/io/generators/xmolout_generator.py +304 -0
- reaxkit/io/handlers/__init__.py +0 -0
- reaxkit/io/handlers/control_handler.py +209 -0
- reaxkit/io/handlers/eregime_handler.py +122 -0
- reaxkit/io/handlers/ffield_handler.py +812 -0
- reaxkit/io/handlers/fort13_handler.py +123 -0
- reaxkit/io/handlers/fort57_handler.py +143 -0
- reaxkit/io/handlers/fort73_handler.py +145 -0
- reaxkit/io/handlers/fort74_handler.py +155 -0
- reaxkit/io/handlers/fort76_handler.py +195 -0
- reaxkit/io/handlers/fort78_handler.py +142 -0
- reaxkit/io/handlers/fort79_handler.py +227 -0
- reaxkit/io/handlers/fort7_handler.py +264 -0
- reaxkit/io/handlers/fort99_handler.py +128 -0
- reaxkit/io/handlers/geo_handler.py +224 -0
- reaxkit/io/handlers/molfra_handler.py +184 -0
- reaxkit/io/handlers/params_handler.py +137 -0
- reaxkit/io/handlers/summary_handler.py +135 -0
- reaxkit/io/handlers/trainset_handler.py +658 -0
- reaxkit/io/handlers/vels_handler.py +293 -0
- reaxkit/io/handlers/xmolout_handler.py +174 -0
- reaxkit/utils/__init__.py +0 -0
- reaxkit/utils/alias.py +219 -0
- reaxkit/utils/cache.py +77 -0
- reaxkit/utils/constants.py +75 -0
- reaxkit/utils/equation_of_states.py +96 -0
- reaxkit/utils/exceptions.py +27 -0
- reaxkit/utils/frame_utils.py +175 -0
- reaxkit/utils/log.py +43 -0
- reaxkit/utils/media/__init__.py +0 -0
- reaxkit/utils/media/convert.py +90 -0
- reaxkit/utils/media/make_video.py +91 -0
- reaxkit/utils/media/plotter.py +812 -0
- reaxkit/utils/numerical/__init__.py +0 -0
- reaxkit/utils/numerical/extrema_finder.py +96 -0
- reaxkit/utils/numerical/moving_average.py +103 -0
- reaxkit/utils/numerical/numerical_calcs.py +75 -0
- reaxkit/utils/numerical/signal_ops.py +135 -0
- reaxkit/utils/path.py +55 -0
- reaxkit/utils/units.py +104 -0
- reaxkit/webui/__init__.py +0 -0
- reaxkit/webui/app.py +0 -0
- reaxkit/webui/components.py +0 -0
- reaxkit/webui/layouts.py +0 -0
- reaxkit/webui/utils.py +0 -0
- reaxkit/workflows/__init__.py +0 -0
- reaxkit/workflows/composed/__init__.py +0 -0
- reaxkit/workflows/composed/coordination_workflow.py +393 -0
- reaxkit/workflows/composed/electrostatics_workflow.py +587 -0
- reaxkit/workflows/composed/xmolout_fort7_workflow.py +343 -0
- reaxkit/workflows/meta/__init__.py +0 -0
- reaxkit/workflows/meta/help_workflow.py +136 -0
- reaxkit/workflows/meta/introspection_workflow.py +235 -0
- reaxkit/workflows/meta/make_video_workflow.py +61 -0
- reaxkit/workflows/meta/plotter_workflow.py +601 -0
- reaxkit/workflows/per_file/__init__.py +0 -0
- reaxkit/workflows/per_file/control_workflow.py +110 -0
- reaxkit/workflows/per_file/eregime_workflow.py +267 -0
- reaxkit/workflows/per_file/ffield_workflow.py +390 -0
- reaxkit/workflows/per_file/fort13_workflow.py +86 -0
- reaxkit/workflows/per_file/fort57_workflow.py +137 -0
- reaxkit/workflows/per_file/fort73_workflow.py +151 -0
- reaxkit/workflows/per_file/fort74_workflow.py +88 -0
- reaxkit/workflows/per_file/fort76_workflow.py +188 -0
- reaxkit/workflows/per_file/fort78_workflow.py +135 -0
- reaxkit/workflows/per_file/fort79_workflow.py +314 -0
- reaxkit/workflows/per_file/fort7_workflow.py +592 -0
- reaxkit/workflows/per_file/fort83_workflow.py +60 -0
- reaxkit/workflows/per_file/fort99_workflow.py +223 -0
- reaxkit/workflows/per_file/geo_workflow.py +554 -0
- reaxkit/workflows/per_file/molfra_workflow.py +577 -0
- reaxkit/workflows/per_file/params_workflow.py +135 -0
- reaxkit/workflows/per_file/summary_workflow.py +161 -0
- reaxkit/workflows/per_file/trainset_workflow.py +356 -0
- reaxkit/workflows/per_file/tregime_workflow.py +79 -0
- reaxkit/workflows/per_file/vels_workflow.py +309 -0
- reaxkit/workflows/per_file/vregime_workflow.py +75 -0
- reaxkit/workflows/per_file/xmolout_workflow.py +678 -0
- reaxkit-1.0.0.dist-info/METADATA +128 -0
- reaxkit-1.0.0.dist-info/RECORD +130 -0
- reaxkit-1.0.0.dist-info/WHEEL +5 -0
- reaxkit-1.0.0.dist-info/entry_points.txt +2 -0
- reaxkit-1.0.0.dist-info/licenses/AUTHORS.md +20 -0
- reaxkit-1.0.0.dist-info/licenses/LICENSE +21 -0
- reaxkit-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,658 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ReaxFF training set definition (TRAINSET) handler.
|
|
3
|
+
|
|
4
|
+
This module provides a handler for parsing ReaxFF TRAINSET-style files,
|
|
5
|
+
which define reference data, weights, and targets used during
|
|
6
|
+
force-field parameter optimization.
|
|
7
|
+
|
|
8
|
+
TRAINSET files are sectioned and heterogeneous by design, containing
|
|
9
|
+
distinct blocks for charges, heats of formation, geometries, cell
|
|
10
|
+
parameters, and energies.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import Dict, Any, List, Optional
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
from reaxkit.io.base_handler import BaseHandler
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Map raw section labels in the file to canonical section names.
# Labels whose canonical name equals the raw label are generated in bulk;
# the two spellings of the cell block both collapse to CELL_PARAMETERS.
SECTION_MAP = {label: label for label in ("CHARGE", "HEATFO", "GEOMETRY")}
SECTION_MAP["CELL PARAMETERS"] = "CELL_PARAMETERS"
SECTION_MAP["CELL"] = "CELL_PARAMETERS"  # in case it's written as CELL
SECTION_MAP["ENERGY"] = "ENERGY"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _split_inline_comment(line: str) -> tuple[str, str]:
|
|
34
|
+
"""Return (data_part, inline_comment) split on '!' (if present)."""
|
|
35
|
+
if "!" in line:
|
|
36
|
+
data, comment = line.split("!", 1)
|
|
37
|
+
return data.strip(), comment.strip()
|
|
38
|
+
return line.strip(), ""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _parse_charge(lines: List[str], section_name: str) -> pd.DataFrame:
    """
    Parse a CHARGE block into a DataFrame.

        CHARGE
        #Iden Weight Atom Lit
        # group line 1
        # group line 2
        AlNH2q 0.10 1 0.83215 !charge for Al atom in AlNH2
        ...
        ENDCHARGE

    Columns: section, group_comment, iden, weight, atom, lit,
             inline_comment

    group_comment behavior:
        - Consecutive '#' lines are concatenated with " /// ".
        - All following data lines share that block.
        - When a new '#' block appears after data, it overwrites.

    Data lines whose numeric fields cannot be parsed are skipped
    instead of aborting the whole parse with a ValueError.
    """
    rows = []
    group_comment = ""
    last_was_comment = False  # True while inside a consecutive '#' block

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        # Comment lines update group_comment.
        if line.startswith("#"):
            text = line.lstrip("#").strip()

            # Skip header-like lines (Weigh / Weight ...); a header must
            # not be joined into a group comment.
            if "weigh" in text.lower():
                last_was_comment = False
                continue

            # Previous line was a comment → same block, append.
            # Previous line was data/start → new block, overwrite.
            if last_was_comment and group_comment:
                group_comment += " /// " + text
            else:
                group_comment = text

            last_was_comment = True
            continue

        # Data line → the next comment block should replace, not append.
        last_was_comment = False

        data, inline_comment = _split_inline_comment(line)
        tokens = data.split()
        if len(tokens) < 4:
            continue

        try:
            weight = float(tokens[1])
            atom = int(tokens[2])
            lit = float(tokens[3])
        except ValueError:
            # Malformed numeric fields: skip this line rather than crash.
            continue

        rows.append(
            {
                "section": section_name,
                "group_comment": group_comment,
                "iden": tokens[0],
                "weight": weight,
                "atom": atom,
                "lit": lit,
                "inline_comment": inline_comment,
            }
        )

    return pd.DataFrame(rows)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _parse_heatfo(lines: List[str], section_name: str) -> pd.DataFrame:
    """
    Parse a HEATFO block into a DataFrame.

        HEATFO
        #Iden Weight Lit
        # group line 1
        # group line 2
        benzene 1.0 -19.82 !heat of formation
        ...
        ENDHEATFO

    Columns: section, group_comment, iden, weight, lit, inline_comment

    group_comment behavior:
        - Consecutive '#' lines are concatenated with " /// ".
        - All following data lines share that comment until a new '#'
          block appears, which overwrites the previous one.
    """
    records = []
    block_comment = ""
    in_comment_run = False  # True while scanning consecutive '#' lines

    for raw_line in lines:
        stripped = raw_line.strip()
        if not stripped:
            continue

        if stripped.startswith("#"):
            body = stripped.lstrip("#").strip()

            # Header rows ("Weigh"/"Weight" ...) never become group comments
            # and must not join an open comment run.
            if "weigh" in body.lower():
                in_comment_run = False
                continue

            # Continue the current run, or start a fresh block.
            block_comment = (
                block_comment + " /// " + body
                if in_comment_run and block_comment
                else body
            )
            in_comment_run = True
            continue

        # A data line terminates any comment run; the next '#' block
        # then overwrites instead of appending.
        in_comment_run = False

        payload, note = _split_inline_comment(stripped)
        fields = payload.split()
        if len(fields) < 3:
            continue

        records.append(
            {
                "section": section_name,
                "group_comment": block_comment,
                "iden": fields[0],
                "weight": float(fields[1]),
                "lit": float(fields[2]),
                "inline_comment": note,
            }
        )

    return pd.DataFrame(records)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _parse_geometry(lines: List[str], section_name: str) -> pd.DataFrame:
    """
    Parse a GEOMETRY block into a DataFrame.

        GEOMETRY
        #Iden Weight At1 At2 At3 At4 Lit
        # group line 1
        # group line 2
        chexane 0.01 1 2 1.54     !bond
        chexane 1.00 1 2 3 111.0  !valence angle
        chexane 1.00 1 2 3 4 56.0 !torsion angle
        chexane 1.00 0.01         !RMSG

    Required data per row:
        - iden, weight, lit
    Optional:
        - at1, at2, at3, at4 (however many atom indices appear between
          weight and lit)

    group_comment behavior:
        - Multiple '#' lines in a row are concatenated with " /// ".
        - All following data lines share that group_comment until a new
          '#' block appears, which overwrites the previous one.

    Data lines whose weight/lit fields are not numeric are skipped
    instead of aborting the whole parse with a ValueError.
    """
    rows = []
    group_comment = ""
    last_was_comment = False  # True while inside a consecutive '#' block

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        # Comment lines update group_comment (possibly multi-line).
        if line.startswith("#"):
            text = line.lstrip("#").strip()

            # Skip header-like lines (Iden / Weight / Weigh ...).
            if "weigh" in text.lower() or "iden" in text.lower():
                last_was_comment = False
                continue

            # Same block → append; new block → overwrite.
            if last_was_comment and group_comment:
                group_comment += " /// " + text
            else:
                group_comment = text

            last_was_comment = True
            continue

        # Data line → next comment block should overwrite, not append.
        last_was_comment = False

        data, inline_comment = _split_inline_comment(line)
        tokens = data.split()

        # Need at least: iden, weight, lit.
        if len(tokens) < 3:
            continue

        try:
            weight = float(tokens[1])
            lit = float(tokens[-1])
        except ValueError:
            # Malformed numeric fields: skip this line rather than crash.
            continue

        row = {
            "section": section_name,
            "iden": tokens[0],
            "weight": weight,
            "lit": lit,
            "inline_comment": inline_comment,
            "group_comment": group_comment,
        }

        # Middle tokens (between weight and lit) are optional atom indices;
        # fill at1–at4 only for those present and integer-valued.
        for i, tok in enumerate(tokens[2:-1][:4], start=1):
            try:
                row[f"at{i}"] = int(tok)
            except ValueError:
                # Non-integer where an index is expected: skip that token.
                continue

        rows.append(row)

    # Build DataFrame and order columns nicely.
    df = pd.DataFrame(rows)
    if not df.empty:
        base_cols = ["section", "iden", "weight"]
        atom_cols = [c for c in ["at1", "at2", "at3", "at4"] if c in df.columns]
        end_cols = [c for c in ["lit", "inline_comment", "group_comment"] if c in df.columns]
        other_cols = [c for c in df.columns if c not in (base_cols + atom_cols + end_cols)]
        df = df[base_cols + atom_cols + other_cols + end_cols]

    return df
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _parse_cell_parameters(lines: List[str], section_name: str) -> pd.DataFrame:
    """
    Parse a CELL PARAMETERS block into a DataFrame.

        CELL PARAMETERS
        #Iden Weight Type Lit
        mycell 1.0 1 0.0 !some description
        ...
        ENDCELLPARAMETERS (or similar)

    Columns: section, group_comment, iden, weight, type, lit,
             inline_comment

    The ``type`` field is kept as a string. Data lines whose weight/lit
    fields are not numeric are skipped instead of aborting the whole
    parse with a ValueError.
    """
    rows = []
    group_comment = ""
    last_was_comment = False  # True while inside a consecutive '#' block

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        # Comment lines update group_comment (possibly multi-line).
        if line.startswith("#"):
            text = line.lstrip("#").strip()

            # Skip header-like lines (Weigh / Weight ...); a header must
            # not be joined into a group comment.
            if "weigh" in text.lower():
                last_was_comment = False
                continue

            # Previous line was also a comment → append (same block);
            # previous line was data or section start → new block.
            if last_was_comment and group_comment:
                group_comment += " /// " + text
            else:
                group_comment = text

            last_was_comment = True
            continue

        # Data line → next comments should be treated as a new block.
        last_was_comment = False

        data, inline_comment = _split_inline_comment(line)
        tokens = data.split()
        if len(tokens) < 4:
            continue

        try:
            weight = float(tokens[1])
            lit = float(tokens[3])
        except ValueError:
            # Malformed numeric fields: skip this line rather than crash.
            continue

        rows.append(
            {
                "section": section_name,
                "group_comment": group_comment,
                "iden": tokens[0],
                "weight": weight,
                "type": tokens[2],  # keep as string
                "lit": lit,
                "inline_comment": inline_comment,
            }
        )

    return pd.DataFrame(rows)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _parse_energy(lines: List[str], section_name: str) -> pd.DataFrame:
    """
    Parse an ENERGY block into a DataFrame.

    Each data line encodes a composite energy expression:

        weight op1 id1/n1 op2 id2/n2 ... lit

    e.g. ``4.0 + mol_a /1 - mol_b /2 12.5 !reaction energy``

    The divisor ``/n`` may be attached to the identifier or stand alone;
    it defaults to 1.0 when absent or unparsable. En dashes are
    normalized to ``-`` in operator position.

    Produced columns (dynamic per number of terms):
        ["section", "group_comment", "weight",
         "op1", "id1", "n1", "op2", "id2", "n2", ...,
         "lit", "inline_comment"]

    group_comment behavior matches the other sections: consecutive '#'
    lines are concatenated with " /// " and a new '#' block after data
    overwrites the previous one. Lines whose weight or lit field is not
    numeric are skipped.
    """
    rows: List[Dict[str, Any]] = []
    group_comment = ""
    last_was_comment = False  # True while inside a consecutive '#' block

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        # Comment lines (header or group).
        if line.startswith("#"):
            text = line.lstrip("#").strip()

            # Skip header-like lines (Weigh / Weight ...); a header must
            # not be joined into a group comment.
            if "weigh" in text.lower():
                last_was_comment = False
                continue

            # Previous line was also a comment → same block, append;
            # previous line was data or start → new block, overwrite.
            if last_was_comment and group_comment:
                group_comment += " /// " + text
            else:
                group_comment = text

            last_was_comment = True
            continue

        # ---- data line ----
        last_was_comment = False  # next comment block should overwrite

        data, inline_comment = _split_inline_comment(line)
        tokens = data.split()
        if len(tokens) < 3:
            # Need at least: weight, one term token, lit.
            continue

        # First token: weight.
        try:
            weight = float(tokens[0])
        except ValueError:
            # Not a valid energy line.
            continue

        # Last token: lit (target energy).
        try:
            lit = float(tokens[-1])
        except ValueError:
            continue

        # Everything between weight and lit are the expression terms.
        middle_tokens = tokens[1:-1]
        if not middle_tokens:
            continue

        # Normalize so that an attached divisor ("id/n") becomes two
        # tokens ("id", "/n"); a bare "/" or a leading "/" stays as-is.
        norm: List[str] = []
        for tok in middle_tokens:
            if "/" in tok and tok != "/" and not tok.startswith("/"):
                base, rest = tok.split("/", 1)
                norm.append(base)
                norm.append("/" + rest)
            else:
                norm.append(tok)

        row: Dict[str, Any] = {
            "section": section_name,
            "group_comment": group_comment,
            "weight": weight,
        }

        # Consume (op, iden, /n) triples.
        i = 0
        group_idx = 1
        while i + 2 < len(norm):
            op = norm[i]
            if op == "–":  # normalize en dash just in case
                op = "-"

            iden = norm[i + 1]
            n_tok = norm[i + 2]

            n = 1.0
            if "/" in n_tok:
                _, n_str = n_tok.split("/", 1)
                try:
                    n = float(n_str.strip())
                except ValueError:
                    n = 1.0

            row[f"op{group_idx}"] = op
            row[f"id{group_idx}"] = iden
            row[f"n{group_idx}"] = n

            group_idx += 1
            i += 3

        row["lit"] = lit
        row["inline_comment"] = inline_comment

        rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        # No valid lines: nothing to reorder.
        return df

    # Reorder columns: dynamic term columns first, then lit, inline_comment.
    end_cols = [c for c in ("lit", "inline_comment") if c in df.columns]
    start_cols = [c for c in df.columns if c not in end_cols]
    return df[start_cols + end_cols]
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
class TrainsetHandler(BaseHandler):
    """
    Parser for ReaxFF training set definition files (TRAINSET).

    This class parses TRAINSET files and exposes their contents as
    section-specific tables, one per training target category.

    Parsed Data
    -----------
    Summary table
        The main ``dataframe()`` is intentionally empty.
        TRAINSET files do not have a single global tabular representation.

    Section tables
        Returned via ``metadata()["tables"]`` or convenience accessors,
        with one table per section:

        - ``CHARGE``:
            Charge fitting targets, with columns:
            ["section", "iden", "weight", "atom", "lit",
             "inline_comment", "group_comment"]

        - ``HEATFO``:
            Heats of formation targets, with columns:
            ["section", "iden", "weight", "lit",
             "inline_comment", "group_comment"]

        - ``GEOMETRY``:
            Geometry-related targets (bond, angle, torsion, RMSG), with columns:
            ["section", "iden", "weight", "at1", "at2", "at3", "at4",
             "lit", "inline_comment", "group_comment"]
            (atom index columns are optional depending on the entry type)

        - ``CELL_PARAMETERS``:
            Cell and lattice targets, with columns:
            ["section", "iden", "weight", "type", "lit",
             "inline_comment", "group_comment"]

        - ``ENERGY``:
            Composite energy expressions, with dynamically generated columns:
            ["section", "weight",
             "op1", "id1", "n1",
             "op2", "id2", "n2", ...,
             "lit", "inline_comment"]

    Metadata
        Returned by ``metadata()``, containing:
        {
            "sections": list[str],          # present section names
            "tables": dict[str, DataFrame]  # section → parsed table
        }

    Notes
    -----
    - Consecutive ``#`` comment lines are grouped and stored as
      ``group_comment`` using ``" /// "`` as a separator.
    - Inline comments following ``!`` are preserved verbatim.
    - Sections appearing multiple times are concatenated automatically.
    - This handler is not frame-based; ``n_frames()`` always returns 0.
    """

    filetype = "trainset"

    def _parse(self) -> tuple[pd.DataFrame, Dict[str, Any]]:
        """
        Parse the whole TRAINSET file.

        BaseHandler expects ``_parse(self)`` with NO arguments, so the
        file content is loaded here.

        Returns
        -------
        tuple[pd.DataFrame, dict]
            An intentionally empty summary DataFrame, and a metadata dict
            with "sections" (section names found, in file order) and
            "tables" (section name → parsed DataFrame).
        """
        # Explicit encoding: TRAINSET files are plain text and parsing
        # must not depend on the platform's default locale encoding.
        with open(self.path, "r", encoding="utf-8") as f:
            lines = f.read().splitlines()

        # Dispatch table: canonical section name → parser function.
        parsers = {
            "CHARGE": _parse_charge,
            "HEATFO": _parse_heatfo,
            "GEOMETRY": _parse_geometry,
            "CELL_PARAMETERS": _parse_cell_parameters,
            "ENERGY": _parse_energy,
        }

        tables: Dict[str, pd.DataFrame] = {}
        current_raw_label: Optional[str] = None
        current_canonical: Optional[str] = None
        buffer: List[str] = []

        def flush_section() -> None:
            """Parse the buffered lines of the current section, if any."""
            nonlocal buffer, current_canonical, tables

            if not current_canonical or not buffer:
                buffer = []
                return

            name = current_canonical
            parse = parsers.get(name)
            df = parse(buffer, name) if parse else pd.DataFrame()

            # A section may appear multiple times in one file:
            # append to the existing table rather than overwrite it.
            if name in tables and not tables[name].empty:
                tables[name] = pd.concat([tables[name], df], ignore_index=True)
            else:
                tables[name] = df

            buffer = []

        for raw in lines:
            stripped = raw.strip()
            if not stripped:
                continue

            upper = stripped.upper()

            # Section start?
            if upper in SECTION_MAP:
                flush_section()
                current_raw_label = stripped
                current_canonical = SECTION_MAP[upper]
                buffer = []
                continue

            # Inside a section: watch for the matching END marker, built
            # from the raw label (e.g. "CELL PARAMETERS" → "ENDCELLPARAMETERS").
            if current_raw_label and current_canonical:
                end_token = "END" + current_raw_label.replace(" ", "").upper()

                if upper.startswith(end_token):
                    flush_section()
                    current_raw_label = None
                    current_canonical = None
                    buffer = []
                    continue

                buffer.append(raw)

        # Final flush in case the file ends without an END marker.
        flush_section()

        # Empty summary DataFrame + section metadata.
        return pd.DataFrame(), {
            "sections": list(tables.keys()),
            "tables": tables,
        }

    # ------------------------------------------------------------------
    # Convenience accessors
    # ------------------------------------------------------------------
    def section(self, name: str) -> pd.DataFrame:
        """Return the table for a given section (case-insensitive).

        Raises
        ------
        KeyError
            If the section is not present in the parsed file.
        """
        tables = self.metadata().get("tables", {})
        key = name.upper()
        # Normalize CELL / "CELL PARAMETERS" to the canonical name.
        if key in ("CELL", "CELL PARAMETERS"):
            key = "CELL_PARAMETERS"
        if key not in tables:
            raise KeyError(f"Section '{name}' not found in trainset.")
        return tables[key]

    def charges(self) -> pd.DataFrame:
        """CHARGE section table."""
        return self.section("CHARGE")

    def heatfo(self) -> pd.DataFrame:
        """HEATFO section table."""
        return self.section("HEATFO")

    def geometry(self) -> pd.DataFrame:
        """GEOMETRY section table."""
        return self.section("GEOMETRY")

    def cell_parameters(self) -> pd.DataFrame:
        """CELL_PARAMETERS section table."""
        return self.section("CELL_PARAMETERS")

    def energy_terms(self) -> pd.DataFrame:
        """ENERGY section table."""
        return self.section("ENERGY")
|