reaxkit 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. reaxkit/__init__.py +0 -0
  2. reaxkit/analysis/__init__.py +0 -0
  3. reaxkit/analysis/composed/RDF_analyzer.py +560 -0
  4. reaxkit/analysis/composed/__init__.py +0 -0
  5. reaxkit/analysis/composed/connectivity_analyzer.py +706 -0
  6. reaxkit/analysis/composed/coordination_analyzer.py +144 -0
  7. reaxkit/analysis/composed/electrostatics_analyzer.py +687 -0
  8. reaxkit/analysis/per_file/__init__.py +0 -0
  9. reaxkit/analysis/per_file/control_analyzer.py +165 -0
  10. reaxkit/analysis/per_file/eregime_analyzer.py +108 -0
  11. reaxkit/analysis/per_file/ffield_analyzer.py +305 -0
  12. reaxkit/analysis/per_file/fort13_analyzer.py +79 -0
  13. reaxkit/analysis/per_file/fort57_analyzer.py +106 -0
  14. reaxkit/analysis/per_file/fort73_analyzer.py +61 -0
  15. reaxkit/analysis/per_file/fort74_analyzer.py +65 -0
  16. reaxkit/analysis/per_file/fort76_analyzer.py +191 -0
  17. reaxkit/analysis/per_file/fort78_analyzer.py +154 -0
  18. reaxkit/analysis/per_file/fort79_analyzer.py +83 -0
  19. reaxkit/analysis/per_file/fort7_analyzer.py +393 -0
  20. reaxkit/analysis/per_file/fort99_analyzer.py +411 -0
  21. reaxkit/analysis/per_file/molfra_analyzer.py +359 -0
  22. reaxkit/analysis/per_file/params_analyzer.py +258 -0
  23. reaxkit/analysis/per_file/summary_analyzer.py +84 -0
  24. reaxkit/analysis/per_file/trainset_analyzer.py +84 -0
  25. reaxkit/analysis/per_file/vels_analyzer.py +95 -0
  26. reaxkit/analysis/per_file/xmolout_analyzer.py +528 -0
  27. reaxkit/cli.py +181 -0
  28. reaxkit/count_loc.py +276 -0
  29. reaxkit/data/alias.yaml +89 -0
  30. reaxkit/data/constants.yaml +27 -0
  31. reaxkit/data/reaxff_input_files_contents.yaml +186 -0
  32. reaxkit/data/reaxff_output_files_contents.yaml +301 -0
  33. reaxkit/data/units.yaml +38 -0
  34. reaxkit/help/__init__.py +0 -0
  35. reaxkit/help/help_index_loader.py +531 -0
  36. reaxkit/help/introspection_utils.py +131 -0
  37. reaxkit/io/__init__.py +0 -0
  38. reaxkit/io/base_handler.py +165 -0
  39. reaxkit/io/generators/__init__.py +0 -0
  40. reaxkit/io/generators/control_generator.py +123 -0
  41. reaxkit/io/generators/eregime_generator.py +341 -0
  42. reaxkit/io/generators/geo_generator.py +967 -0
  43. reaxkit/io/generators/trainset_generator.py +1758 -0
  44. reaxkit/io/generators/tregime_generator.py +113 -0
  45. reaxkit/io/generators/vregime_generator.py +164 -0
  46. reaxkit/io/generators/xmolout_generator.py +304 -0
  47. reaxkit/io/handlers/__init__.py +0 -0
  48. reaxkit/io/handlers/control_handler.py +209 -0
  49. reaxkit/io/handlers/eregime_handler.py +122 -0
  50. reaxkit/io/handlers/ffield_handler.py +812 -0
  51. reaxkit/io/handlers/fort13_handler.py +123 -0
  52. reaxkit/io/handlers/fort57_handler.py +143 -0
  53. reaxkit/io/handlers/fort73_handler.py +145 -0
  54. reaxkit/io/handlers/fort74_handler.py +155 -0
  55. reaxkit/io/handlers/fort76_handler.py +195 -0
  56. reaxkit/io/handlers/fort78_handler.py +142 -0
  57. reaxkit/io/handlers/fort79_handler.py +227 -0
  58. reaxkit/io/handlers/fort7_handler.py +264 -0
  59. reaxkit/io/handlers/fort99_handler.py +128 -0
  60. reaxkit/io/handlers/geo_handler.py +224 -0
  61. reaxkit/io/handlers/molfra_handler.py +184 -0
  62. reaxkit/io/handlers/params_handler.py +137 -0
  63. reaxkit/io/handlers/summary_handler.py +135 -0
  64. reaxkit/io/handlers/trainset_handler.py +658 -0
  65. reaxkit/io/handlers/vels_handler.py +293 -0
  66. reaxkit/io/handlers/xmolout_handler.py +174 -0
  67. reaxkit/utils/__init__.py +0 -0
  68. reaxkit/utils/alias.py +219 -0
  69. reaxkit/utils/cache.py +77 -0
  70. reaxkit/utils/constants.py +75 -0
  71. reaxkit/utils/equation_of_states.py +96 -0
  72. reaxkit/utils/exceptions.py +27 -0
  73. reaxkit/utils/frame_utils.py +175 -0
  74. reaxkit/utils/log.py +43 -0
  75. reaxkit/utils/media/__init__.py +0 -0
  76. reaxkit/utils/media/convert.py +90 -0
  77. reaxkit/utils/media/make_video.py +91 -0
  78. reaxkit/utils/media/plotter.py +812 -0
  79. reaxkit/utils/numerical/__init__.py +0 -0
  80. reaxkit/utils/numerical/extrema_finder.py +96 -0
  81. reaxkit/utils/numerical/moving_average.py +103 -0
  82. reaxkit/utils/numerical/numerical_calcs.py +75 -0
  83. reaxkit/utils/numerical/signal_ops.py +135 -0
  84. reaxkit/utils/path.py +55 -0
  85. reaxkit/utils/units.py +104 -0
  86. reaxkit/webui/__init__.py +0 -0
  87. reaxkit/webui/app.py +0 -0
  88. reaxkit/webui/components.py +0 -0
  89. reaxkit/webui/layouts.py +0 -0
  90. reaxkit/webui/utils.py +0 -0
  91. reaxkit/workflows/__init__.py +0 -0
  92. reaxkit/workflows/composed/__init__.py +0 -0
  93. reaxkit/workflows/composed/coordination_workflow.py +393 -0
  94. reaxkit/workflows/composed/electrostatics_workflow.py +587 -0
  95. reaxkit/workflows/composed/xmolout_fort7_workflow.py +343 -0
  96. reaxkit/workflows/meta/__init__.py +0 -0
  97. reaxkit/workflows/meta/help_workflow.py +136 -0
  98. reaxkit/workflows/meta/introspection_workflow.py +235 -0
  99. reaxkit/workflows/meta/make_video_workflow.py +61 -0
  100. reaxkit/workflows/meta/plotter_workflow.py +601 -0
  101. reaxkit/workflows/per_file/__init__.py +0 -0
  102. reaxkit/workflows/per_file/control_workflow.py +110 -0
  103. reaxkit/workflows/per_file/eregime_workflow.py +267 -0
  104. reaxkit/workflows/per_file/ffield_workflow.py +390 -0
  105. reaxkit/workflows/per_file/fort13_workflow.py +86 -0
  106. reaxkit/workflows/per_file/fort57_workflow.py +137 -0
  107. reaxkit/workflows/per_file/fort73_workflow.py +151 -0
  108. reaxkit/workflows/per_file/fort74_workflow.py +88 -0
  109. reaxkit/workflows/per_file/fort76_workflow.py +188 -0
  110. reaxkit/workflows/per_file/fort78_workflow.py +135 -0
  111. reaxkit/workflows/per_file/fort79_workflow.py +314 -0
  112. reaxkit/workflows/per_file/fort7_workflow.py +592 -0
  113. reaxkit/workflows/per_file/fort83_workflow.py +60 -0
  114. reaxkit/workflows/per_file/fort99_workflow.py +223 -0
  115. reaxkit/workflows/per_file/geo_workflow.py +554 -0
  116. reaxkit/workflows/per_file/molfra_workflow.py +577 -0
  117. reaxkit/workflows/per_file/params_workflow.py +135 -0
  118. reaxkit/workflows/per_file/summary_workflow.py +161 -0
  119. reaxkit/workflows/per_file/trainset_workflow.py +356 -0
  120. reaxkit/workflows/per_file/tregime_workflow.py +79 -0
  121. reaxkit/workflows/per_file/vels_workflow.py +309 -0
  122. reaxkit/workflows/per_file/vregime_workflow.py +75 -0
  123. reaxkit/workflows/per_file/xmolout_workflow.py +678 -0
  124. reaxkit-1.0.0.dist-info/METADATA +128 -0
  125. reaxkit-1.0.0.dist-info/RECORD +130 -0
  126. reaxkit-1.0.0.dist-info/WHEEL +5 -0
  127. reaxkit-1.0.0.dist-info/entry_points.txt +2 -0
  128. reaxkit-1.0.0.dist-info/licenses/AUTHORS.md +20 -0
  129. reaxkit-1.0.0.dist-info/licenses/LICENSE +21 -0
  130. reaxkit-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,658 @@
1
+ """
2
+ ReaxFF training set definition (TRAINSET) handler.
3
+
4
+ This module provides a handler for parsing ReaxFF TRAINSET-style files,
5
+ which define reference data, weights, and targets used during
6
+ force-field parameter optimization.
7
+
8
+ TRAINSET files are sectioned and heterogeneous by design, containing
9
+ distinct blocks for charges, heats of formation, geometries, cell
10
+ parameters, and energies.
11
+ """
12
+
13
+
14
+ from __future__ import annotations
15
+
16
+ from typing import Dict, Any, List, Optional
17
+ import pandas as pd
18
+
19
+ from reaxkit.io.base_handler import BaseHandler
20
+
21
+
22
# Map raw section labels in the file to canonical section names.
# Keys are compared against the upper-cased, stripped line in _parse(),
# so header matching is effectively case-insensitive.
SECTION_MAP = {
    "CHARGE": "CHARGE",
    "HEATFO": "HEATFO",
    "GEOMETRY": "GEOMETRY",
    "CELL PARAMETERS": "CELL_PARAMETERS",
    "CELL": "CELL_PARAMETERS",  # in case it's written as CELL
    "ENERGY": "ENERGY",
}
31
+
32
+
33
+ def _split_inline_comment(line: str) -> tuple[str, str]:
34
+ """Return (data_part, inline_comment) split on '!' (if present)."""
35
+ if "!" in line:
36
+ data, comment = line.split("!", 1)
37
+ return data.strip(), comment.strip()
38
+ return line.strip(), ""
39
+
40
+
41
+ def _parse_charge(lines: List[str], section_name: str) -> pd.DataFrame:
42
+ """
43
+ CHARGE block:
44
+
45
+ CHARGE
46
+ #Iden Weight Atom Lit
47
+ # group line 1
48
+ # group line 2
49
+ AlNH2q 0.10 1 0.83215 !charge for Al atom in AlNH2
50
+ ...
51
+ ENDCHARGE
52
+
53
+ Columns: section, iden, weight, atom, lit,
54
+ inline_comment, group_comment
55
+
56
+ group_comment behavior:
57
+ - Consecutive '#' lines are concatenated with " /// ".
58
+ - All following data lines share that block.
59
+ - When a new '#' block appears after data, it overwrites.
60
+ """
61
+ rows = []
62
+ group_comment = ""
63
+ last_was_comment = False # track previous processed line
64
+
65
+ for raw in lines:
66
+ line = raw.strip()
67
+ if not line:
68
+ continue
69
+
70
+ # comment lines → update group_comment
71
+ if line.startswith("#"):
72
+ text = line.lstrip("#").strip()
73
+
74
+ # skip header-like lines (Weigh / Weight ...)
75
+ if "weigh" in text.lower():
76
+ # header shouldn't join with group comments
77
+ last_was_comment = False
78
+ continue
79
+
80
+ # If previous line was comment → append
81
+ # If previous line was data/start → new block
82
+ if last_was_comment and group_comment:
83
+ group_comment += " /// " + text
84
+ else:
85
+ group_comment = text
86
+
87
+ last_was_comment = True
88
+ continue
89
+
90
+ # data line → the next comment block should replace, not append
91
+ last_was_comment = False
92
+
93
+ data, inline_comment = _split_inline_comment(line)
94
+ tokens = data.split()
95
+ if len(tokens) < 4:
96
+ continue
97
+
98
+ iden = tokens[0]
99
+ weight = float(tokens[1])
100
+ atom = int(tokens[2])
101
+ lit = float(tokens[3])
102
+
103
+ rows.append(
104
+ {
105
+ "section": section_name,
106
+ "group_comment": group_comment,
107
+ "iden": iden,
108
+ "weight": weight,
109
+ "atom": atom,
110
+ "lit": lit,
111
+ "inline_comment": inline_comment,
112
+ }
113
+ )
114
+
115
+ return pd.DataFrame(rows)
116
+
117
+
118
+ def _parse_heatfo(lines: List[str], section_name: str) -> pd.DataFrame:
119
+ """
120
+ HEATFO block:
121
+
122
+ HEATFO
123
+ #Iden Weight Lit
124
+ # group line 1
125
+ # group line 2
126
+ benzene 1.0 -19.82 !heat of formation
127
+ ...
128
+ ENDHEATFO
129
+
130
+ Columns: section, iden, weight, lit, inline_comment, group_comment
131
+
132
+ group_comment behavior:
133
+ - Consecutive '#' lines are concatenated with " /// ".
134
+ - All following data lines share that comment until a new '#' block
135
+ appears, which overwrites the previous one.
136
+ """
137
+ rows = []
138
+ group_comment = ""
139
+ last_was_comment = False # track whether previous processed line was a comment
140
+
141
+ for raw in lines:
142
+ line = raw.strip()
143
+ if not line:
144
+ continue
145
+
146
+ # comment lines → update group_comment (possibly multi-line)
147
+ if line.startswith("#"):
148
+ text = line.lstrip("#").strip()
149
+
150
+ # skip header-like lines (Weigh / Weight ...)
151
+ if "weigh" in text.lower():
152
+ # header shouldn't join with group comments
153
+ last_was_comment = False
154
+ continue
155
+
156
+ # Same block → append; new block → overwrite
157
+ if last_was_comment and group_comment:
158
+ group_comment += " /// " + text
159
+ else:
160
+ group_comment = text
161
+
162
+ last_was_comment = True
163
+ continue
164
+
165
+ # data line → next comment block should overwrite, not append
166
+ last_was_comment = False
167
+
168
+ data, inline_comment = _split_inline_comment(line)
169
+ tokens = data.split()
170
+ if len(tokens) < 3:
171
+ continue
172
+
173
+ iden = tokens[0]
174
+ weight = float(tokens[1])
175
+ lit = float(tokens[2])
176
+
177
+ rows.append(
178
+ {
179
+ "section": section_name,
180
+ "group_comment": group_comment,
181
+ "iden": iden,
182
+ "weight": weight,
183
+ "lit": lit,
184
+ "inline_comment": inline_comment,
185
+ }
186
+ )
187
+
188
+ return pd.DataFrame(rows)
189
+
190
+
191
+ def _parse_geometry(lines: List[str], section_name: str) -> pd.DataFrame:
192
+ """
193
+ GEOMETRY block:
194
+
195
+ GEOMETRY
196
+ #Iden Weight At1 At2 At3 At4 Lit
197
+ # group line 1
198
+ # group line 2
199
+ chexane 0.01 1 2 1.54 !bond
200
+ chexane 1.00 1 2 3 111.0 !valence angle
201
+ chexane 1.00 1 2 3 4 56.0 !torsion angle
202
+ chexane 1.00 0.01 !RMSG
203
+
204
+ Required data per row:
205
+ - iden, weight, lit
206
+ Optional:
207
+ - at1, at2, at3, at4 (if present)
208
+
209
+ group_comment behavior:
210
+ - Multiple '#' lines in a row are concatenated with " /// ".
211
+ - All following data lines share that group_comment until a new
212
+ '#' block appears, which overwrites the previous one.
213
+ """
214
+ rows = []
215
+ group_comment = ""
216
+ last_was_comment = False # track whether previous processed line was a comment
217
+
218
+ for raw in lines:
219
+ line = raw.strip()
220
+ if not line:
221
+ continue
222
+
223
+ # comment lines → update group_comment (possibly multi-line)
224
+ if line.startswith("#"):
225
+ text = line.lstrip("#").strip()
226
+
227
+ # skip header-like lines (Iden / Weight / Weigh ...)
228
+ if "weigh" in text.lower() or "iden" in text.lower():
229
+ last_was_comment = False
230
+ continue
231
+
232
+ # Same block → append; new block → overwrite
233
+ if last_was_comment and group_comment:
234
+ group_comment += " /// " + text
235
+ else:
236
+ group_comment = text
237
+
238
+ last_was_comment = True
239
+ continue
240
+
241
+ # data line → next comment block should overwrite, not append
242
+ last_was_comment = False
243
+
244
+ data, inline_comment = _split_inline_comment(line)
245
+ tokens = data.split()
246
+
247
+ # Need at least: iden, weight, lit
248
+ if len(tokens) < 3:
249
+ continue
250
+
251
+ iden = tokens[0]
252
+ weight = float(tokens[1])
253
+ lit = float(tokens[-1])
254
+
255
+ # Middle tokens (between weight and lit) are optional atom indices
256
+ atom_tokens = tokens[2:-1]
257
+
258
+ row = {
259
+ "section": section_name,
260
+ "iden": iden,
261
+ "weight": weight,
262
+ "lit": lit,
263
+ "inline_comment": inline_comment,
264
+ "group_comment": group_comment,
265
+ }
266
+
267
+ # Fill at1–at4 only if present
268
+ for i, tok in enumerate(atom_tokens[:4], start=1):
269
+ try:
270
+ row[f"at{i}"] = int(tok)
271
+ except ValueError:
272
+ # If something weird appears where an int is expected, skip it
273
+ continue
274
+
275
+ rows.append(row)
276
+
277
+ # Build DataFrame and order columns nicely
278
+ df = pd.DataFrame(rows)
279
+ if not df.empty:
280
+ base_cols = ["section", "iden", "weight"]
281
+ atom_cols = [c for c in ["at1", "at2", "at3", "at4"] if c in df.columns]
282
+ end_cols = [c for c in ["lit", "inline_comment", "group_comment"] if c in df.columns]
283
+ other_cols = [c for c in df.columns if c not in (base_cols + atom_cols + end_cols)]
284
+ df = df[base_cols + atom_cols + other_cols + end_cols]
285
+
286
+ return df
287
+
288
+
289
+ def _parse_cell_parameters(lines: List[str], section_name: str) -> pd.DataFrame:
290
+ """
291
+ CELL PARAMETERS block:
292
+
293
+ CELL PARAMETERS
294
+ #Iden Weight Type Lit
295
+ mycell 1.0 1 0.0 !some description
296
+ ...
297
+ ENDCELLPARAMETERS (or similar)
298
+
299
+ Columns: section, iden, weight, type, lit,
300
+ inline_comment, group_comment
301
+ """
302
+ rows = []
303
+ group_comment = ""
304
+ last_was_comment = False # track whether previous processed line was a comment
305
+
306
+ for raw in lines:
307
+ line = raw.strip()
308
+ if not line:
309
+ continue
310
+
311
+ # comment lines update group_comment (possibly multi-line)
312
+ if line.startswith("#"):
313
+ text = line.lstrip("#").strip()
314
+
315
+ # skip header-like lines (Weigh / Weight ...)
316
+ if "weigh" in text.lower():
317
+ # header shouldn't join with group comments
318
+ last_was_comment = False
319
+ continue
320
+
321
+ # If previous line was also a comment, append (same block)
322
+ # If previous line was data or start of section, start a new block
323
+ if last_was_comment and group_comment:
324
+ group_comment += " /// " + text
325
+ else:
326
+ group_comment = text
327
+
328
+ last_was_comment = True
329
+ continue
330
+
331
+ # data line → next comments should be treated as a new block
332
+ last_was_comment = False
333
+
334
+ data, inline_comment = _split_inline_comment(line)
335
+ tokens = data.split()
336
+ if len(tokens) < 4:
337
+ continue
338
+
339
+ iden = tokens[0]
340
+ weight = float(tokens[1])
341
+ type_ = tokens[2] # keep as string
342
+ lit = float(tokens[3])
343
+
344
+ rows.append(
345
+ {
346
+ "section": section_name,
347
+ "group_comment": group_comment,
348
+ "iden": iden,
349
+ "weight": weight,
350
+ "type": type_,
351
+ "lit": lit,
352
+ "inline_comment": inline_comment,
353
+ }
354
+ )
355
+
356
+ return pd.DataFrame(rows)
357
+
358
+
359
+ def _parse_energy(lines: List[str], section_name: str) -> pd.DataFrame:
360
+ rows: List[Dict[str, Any]] = []
361
+ group_comment = ""
362
+ last_was_comment = False # track if previous processed line was a comment
363
+
364
+ for raw in lines:
365
+ line = raw.strip()
366
+ if not line:
367
+ continue
368
+
369
+ # comment lines (header or group)
370
+ if line.startswith("#"):
371
+ text = line.lstrip("#").strip()
372
+
373
+ # skip header-like lines (Weigh / Weight ...)
374
+ if "weigh" in text.lower():
375
+ # header shouldn't join with group comments
376
+ last_was_comment = False
377
+ continue
378
+
379
+ # If previous line was also a comment → same block, append
380
+ # If previous line was data or start → new block, overwrite
381
+ if last_was_comment and group_comment:
382
+ group_comment += " /// " + text
383
+ else:
384
+ group_comment = text
385
+
386
+ last_was_comment = True
387
+ continue
388
+
389
+ # ---- data line ----
390
+ last_was_comment = False # next comment block should overwrite
391
+
392
+ data, inline_comment = _split_inline_comment(line)
393
+ tokens = data.split()
394
+ if len(tokens) < 3:
395
+ # need at least weight, something, lit
396
+ continue
397
+
398
+ # first token: weight
399
+ try:
400
+ weight = float(tokens[0])
401
+ except ValueError:
402
+ # not a valid energy line
403
+ continue
404
+
405
+ # last token: lit (target energy)
406
+ try:
407
+ lit = float(tokens[-1])
408
+ except ValueError:
409
+ continue
410
+
411
+ # middle_part: everything between weight and lit
412
+ middle_part = " ".join(tokens[1:-1]).strip()
413
+ if not middle_part:
414
+ continue
415
+
416
+ middle_tokens = middle_part.split()
417
+
418
+ # --- normalize middle tokens ---
419
+ norm: List[str] = []
420
+ for tok in middle_tokens:
421
+ if "/" in tok and tok != "/":
422
+ if tok.startswith("/"):
423
+ norm.append(tok)
424
+ else:
425
+ base, rest = tok.split("/", 1)
426
+ norm.append(base)
427
+ norm.append("/" + rest)
428
+ else:
429
+ norm.append(tok)
430
+
431
+ row: Dict[str, Any] = {
432
+ "section": section_name,
433
+ "group_comment": group_comment,
434
+ "weight": weight,
435
+ }
436
+
437
+ i = 0
438
+ group_idx = 1
439
+ while i + 2 < len(norm):
440
+ op = norm[i]
441
+ if op == "–": # normalize en dash just in case
442
+ op = "-"
443
+
444
+ iden = norm[i + 1]
445
+ n_tok = norm[i + 2]
446
+
447
+ n = 1.0
448
+ if "/" in n_tok:
449
+ _, n_str = n_tok.split("/", 1)
450
+ try:
451
+ n = float(n_str.strip())
452
+ except ValueError:
453
+ n = 1.0
454
+
455
+ row[f"op{group_idx}"] = op
456
+ row[f"id{group_idx}"] = iden
457
+ row[f"n{group_idx}"] = n
458
+
459
+ group_idx += 1
460
+ i += 3
461
+
462
+ row["lit"] = lit
463
+ row["inline_comment"] = inline_comment
464
+
465
+ rows.append(row)
466
+
467
+ df = pd.DataFrame(rows)
468
+ # --- Reorder columns: dynamic terms first, then lit, then inline_comment ---
469
+ cols = list(df.columns)
470
+
471
+ # Fixed columns to move to the end
472
+ end_cols = ["lit", "inline_comment"]
473
+
474
+ # Keep only those that exist (in case some are missing)
475
+ end_cols = [c for c in end_cols if c in cols]
476
+
477
+ # All other columns come first
478
+ start_cols = [c for c in cols if c not in end_cols]
479
+
480
+ # New column order
481
+ df = df[start_cols + end_cols]
482
+
483
+ return df
484
+
485
+
486
class TrainsetHandler(BaseHandler):
    """
    Parser for ReaxFF training set definition files (TRAINSET).

    This class parses TRAINSET files and exposes their contents as
    section-specific tables, one per training target category.

    Parsed Data
    -----------
    Summary table
        The main ``dataframe()`` is intentionally empty.
        TRAINSET files do not have a single global tabular representation.

    Section tables
        Returned via ``metadata()["tables"]`` or convenience accessors,
        with one table per section:

        - ``CHARGE``:
            Charge fitting targets, with columns:
            ["section", "iden", "weight", "atom", "lit",
             "inline_comment", "group_comment"]

        - ``HEATFO``:
            Heats of formation targets, with columns:
            ["section", "iden", "weight", "lit",
             "inline_comment", "group_comment"]

        - ``GEOMETRY``:
            Geometry-related targets (bond, angle, torsion, RMSG), with columns:
            ["section", "iden", "weight", "at1", "at2", "at3", "at4",
             "lit", "inline_comment", "group_comment"]
            (atom index columns are optional depending on the entry type)

        - ``CELL_PARAMETERS``:
            Cell and lattice targets, with columns:
            ["section", "iden", "weight", "type", "lit",
             "inline_comment", "group_comment"]

        - ``ENERGY``:
            Composite energy expressions, with dynamically generated columns:
            ["section", "weight",
             "op1", "id1", "n1",
             "op2", "id2", "n2", ...,
             "lit", "inline_comment"]

    Metadata
        Returned by ``metadata()``, containing:
            {
                "sections": list[str],       # present section names
                "tables": dict[str, DataFrame]  # section -> parsed table
            }

    Notes
    -----
    - Consecutive ``#`` comment lines are grouped and stored as
      ``group_comment`` using ``" /// "`` as a separator.
    - Inline comments following ``!`` are preserved verbatim.
    - Sections appearing multiple times are concatenated automatically.
    - This handler is not frame-based; ``n_frames()`` always returns 0.
    """

    # File-type tag used by the surrounding reaxkit I/O machinery.
    filetype = "trainset"

    def _parse(self) -> tuple[pd.DataFrame, Dict[str, Any]]:
        """
        Parse the TRAINSET file at ``self.path``.

        ``BaseHandler`` expects ``_parse(self)`` with no arguments, so the
        file content is loaded here rather than passed in.

        Returns
        -------
        tuple[pd.DataFrame, dict]
            An intentionally empty summary DataFrame, plus a metadata dict
            with keys ``"sections"`` (section names found) and
            ``"tables"`` (section name -> parsed DataFrame).
        """
        # Read the whole file; sections are detected line by line below.
        with open(self.path, "r") as f:
            lines = f.read().splitlines()

        tables: Dict[str, pd.DataFrame] = {}
        current_raw_label: Optional[str] = None   # section label exactly as written
        current_canonical: Optional[str] = None   # canonical SECTION_MAP name
        buffer: List[str] = []                    # raw lines of the section being read

        def flush_section():
            # Parse the buffered lines of the current section (if any) and
            # store the resulting table under the canonical section name.
            nonlocal buffer, current_canonical, tables

            # Nothing to do outside a section or with an empty buffer.
            if not current_canonical or not buffer:
                buffer = []
                return

            name = current_canonical

            # Dispatch to the section-specific parser.
            if name == "CHARGE":
                df = _parse_charge(buffer, name)
            elif name == "HEATFO":
                df = _parse_heatfo(buffer, name)
            elif name == "GEOMETRY":
                df = _parse_geometry(buffer, name)
            elif name == "CELL_PARAMETERS":
                df = _parse_cell_parameters(buffer, name)
            elif name == "ENERGY":
                df = _parse_energy(buffer, name)
            else:
                df = pd.DataFrame()

            # A section may appear multiple times in one file: append the
            # new rows to any existing table instead of overwriting it.
            if name in tables and not tables[name].empty:
                tables[name] = pd.concat([tables[name], df], ignore_index=True)
            else:
                tables[name] = df

            buffer = []

        for raw in lines:
            stripped = raw.strip()
            if not stripped:
                continue

            upper = stripped.upper()

            # Section start? (SECTION_MAP keys are upper-case labels.)
            if upper in SECTION_MAP:
                flush_section()
                current_raw_label = stripped
                current_canonical = SECTION_MAP[upper]
                buffer = []
                continue

            # Inside a section: watch for the matching END marker.
            if current_raw_label and current_canonical:
                # e.g. "CELL PARAMETERS" -> "ENDCELLPARAMETERS"
                end_token = "END" + current_raw_label.replace(" ", "").upper()

                if upper.startswith(end_token):
                    flush_section()
                    current_raw_label = None
                    current_canonical = None
                    buffer = []
                    continue

                buffer.append(raw)

        # Flush a trailing section that has no END marker.
        flush_section()

        # The summary DataFrame is intentionally empty; all parsed data
        # lives in the per-section tables inside metadata.
        return pd.DataFrame(), {
            "sections": list(tables.keys()),
            "tables": tables,
        }

    # ------------------------------------------------------------------
    # Convenience accessors
    # ------------------------------------------------------------------
    def section(self, name: str) -> pd.DataFrame:
        """Return the table for a given section (case-insensitive).

        Raises
        ------
        KeyError
            If the section is not present in the parsed file.
        """
        meta = self.metadata()
        tables = meta.get("tables", {})
        key = name.upper()
        # Normalize CELL vs CELL_PARAMETERS.
        if key in ("CELL", "CELL PARAMETERS"):
            key = "CELL_PARAMETERS"
        if key not in tables:
            raise KeyError(f"Section '{name}' not found in trainset.")
        return tables[key]

    def charges(self) -> pd.DataFrame:
        # Shorthand for the CHARGE section table.
        return self.section("CHARGE")

    def heatfo(self) -> pd.DataFrame:
        # Shorthand for the HEATFO section table.
        return self.section("HEATFO")

    def geometry(self) -> pd.DataFrame:
        # Shorthand for the GEOMETRY section table.
        return self.section("GEOMETRY")

    def cell_parameters(self) -> pd.DataFrame:
        # Shorthand for the CELL PARAMETERS section table.
        return self.section("CELL_PARAMETERS")

    def energy_terms(self) -> pd.DataFrame:
        # Shorthand for the ENERGY section table.
        return self.section("ENERGY")