hea-bench 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hea_bench/__init__.py +37 -0
- hea_bench/benchmark/__init__.py +4 -0
- hea_bench/benchmark/composition.py +16 -0
- hea_bench/benchmark/consolidate.py +296 -0
- hea_bench/benchmark/coverage.py +169 -0
- hea_bench/benchmark/loaders/__init__.py +54 -0
- hea_bench/benchmark/loaders/borg2020.py +85 -0
- hea_bench/benchmark/loaders/pei2020.py +82 -0
- hea_bench/benchmark/loaders/peivaste.py +104 -0
- hea_bench/benchmark/taxonomy.py +86 -0
- hea_bench/classifiers/__init__.py +4 -0
- hea_bench/classifiers/diagnostic_stats.py +208 -0
- hea_bench/cli.py +36 -0
- hea_bench/composition.py +138 -0
- hea_bench/constants.py +3 -0
- hea_bench/descriptors/__init__.py +6 -0
- hea_bench/descriptors/data/LICENSE.matminer.txt +46 -0
- hea_bench/descriptors/data/README.md +109 -0
- hea_bench/descriptors/data/__init__.py +7 -0
- hea_bench/descriptors/data/elemental.py +98 -0
- hea_bench/descriptors/data/miedema_parameters.csv +74 -0
- hea_bench/descriptors/data/pair_enthalpies.py +99 -0
- hea_bench/descriptors/data/pair_enthalpies.tsv +2629 -0
- hea_bench/descriptors/entropy.py +55 -0
- hea_bench/descriptors/melting.py +47 -0
- hea_bench/descriptors/miedema.py +84 -0
- hea_bench/descriptors/omega.py +77 -0
- hea_bench/descriptors/size.py +80 -0
- hea_bench/descriptors/vec.py +56 -0
- hea_bench/evaluate.py +256 -0
- hea_bench/rules/__init__.py +15 -0
- hea_bench/rules/guo_vec.py +48 -0
- hea_bench/rules/yang_omega.py +38 -0
- hea_bench/rules/yeh_smix.py +37 -0
- hea_bench/rules/zhang_delta.py +32 -0
- hea_bench-0.1.0.dist-info/METADATA +307 -0
- hea_bench-0.1.0.dist-info/RECORD +40 -0
- hea_bench-0.1.0.dist-info/WHEEL +4 -0
- hea_bench-0.1.0.dist-info/entry_points.txt +2 -0
- hea_bench-0.1.0.dist-info/licenses/LICENSE +21 -0
hea_bench/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""hea-bench: an open benchmark suite for high-entropy alloy phase prediction.
|
|
2
|
+
|
|
3
|
+
Top-level convenience imports for the most-used names; full surface
|
|
4
|
+
documented in ``README.md``.
|
|
5
|
+
|
|
6
|
+
Quick start::
|
|
7
|
+
|
|
8
|
+
from hea_bench import smix, delta, vec, mixing_enthalpy, omega
|
|
9
|
+
cantor = {"Co": 0.2, "Cr": 0.2, "Fe": 0.2, "Mn": 0.2, "Ni": 0.2}
|
|
10
|
+
print(smix(cantor), delta(cantor), vec(cantor),
|
|
11
|
+
mixing_enthalpy(cantor), omega(cantor))
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .composition import Composition, normalize, parse_formula
|
|
15
|
+
from .descriptors.entropy import smix
|
|
16
|
+
from .descriptors.melting import melting_temperature
|
|
17
|
+
from .descriptors.miedema import mixing_enthalpy
|
|
18
|
+
from .descriptors.omega import omega
|
|
19
|
+
from .descriptors.size import delta
|
|
20
|
+
from .descriptors.vec import vec
|
|
21
|
+
|
|
22
|
+
__version__ = "0.1.0"
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"__version__",
|
|
26
|
+
# core type + parsing
|
|
27
|
+
"Composition",
|
|
28
|
+
"parse_formula",
|
|
29
|
+
"normalize",
|
|
30
|
+
# descriptors
|
|
31
|
+
"smix",
|
|
32
|
+
"delta",
|
|
33
|
+
"vec",
|
|
34
|
+
"melting_temperature",
|
|
35
|
+
"mixing_enthalpy",
|
|
36
|
+
"omega",
|
|
37
|
+
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Re-export of composition primitives from the package root.
|
|
2
|
+
|
|
3
|
+
`composition.py` was originally defined here; it moved to the package
|
|
4
|
+
root so descriptor modules can consume it without depending on the
|
|
5
|
+
benchmark subpackage. This shim preserves the historical import path
|
|
6
|
+
for any external code that grew to depend on it.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from ..composition import (
|
|
10
|
+
Composition,
|
|
11
|
+
from_element_columns,
|
|
12
|
+
normalize,
|
|
13
|
+
parse_formula,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = ["Composition", "from_element_columns", "normalize", "parse_formula"]
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Consolidate the three source datasets into one canonical benchmark.
|
|
2
|
+
|
|
3
|
+
Walks each source loader, merges records on a *composition-only* join
|
|
4
|
+
key (Borg's processing column is preserved as side-channel data but
|
|
5
|
+
does **not** participate in the join), and produces:
|
|
6
|
+
|
|
7
|
+
- ``data/consolidated/v<version>/consolidated.csv`` — the benchmark
|
|
8
|
+
- ``data/consolidated/v<version>/manifest.json`` — provenance, dedup
|
|
9
|
+
stats, SHA-256s
|
|
10
|
+
|
|
11
|
+
Label conflict handling: when two or more sources contribute records for
|
|
12
|
+
the same composition but disagree on the canonical phase class, the
|
|
13
|
+
output row has ``canonical_phase`` blank and ``has_conflict=1``. The
|
|
14
|
+
per-source labels are still recorded so users can resolve the conflict
|
|
15
|
+
however they want.
|
|
16
|
+
|
|
17
|
+
Run as a module to (re)build v0.1.0:
|
|
18
|
+
|
|
19
|
+
python -m hea_bench.benchmark.consolidate
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import csv
|
|
25
|
+
import datetime
|
|
26
|
+
import hashlib
|
|
27
|
+
import json
|
|
28
|
+
import pathlib
|
|
29
|
+
import sys
|
|
30
|
+
from collections import defaultdict
|
|
31
|
+
from collections.abc import Iterable
|
|
32
|
+
from dataclasses import dataclass, field
|
|
33
|
+
|
|
34
|
+
from .composition import Composition
|
|
35
|
+
from .loaders import AlloyRecord, borg2020, pei2020, peivaste
|
|
36
|
+
from .taxonomy import PhaseClass
|
|
37
|
+
|
|
38
|
+
DEFAULT_VERSION = "0.1.0"
|
|
39
|
+
|
|
40
|
+
# Repo-relative paths (repo root is 3 parents up from this file:
|
|
41
|
+
# hea-bench/src/hea_bench/benchmark/consolidate.py → hea-bench/).
|
|
42
|
+
_REPO_ROOT = pathlib.Path(__file__).resolve().parents[3]
|
|
43
|
+
_RAW_DIR = _REPO_ROOT / "data" / "raw"
|
|
44
|
+
_CONSOLIDATED_DIR = _REPO_ROOT / "data" / "consolidated"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def canonical_formula_key(composition: Composition, precision: int = 4) -> str:
    """Build the deterministic join key for a composition.

    Element symbols are emitted in sorted order, each followed by its
    fraction rounded to ``precision`` decimals and printed with exactly
    that many decimals. Elements whose rounded fraction is not positive
    are omitted, so float noise below the rounding precision does not
    change the key.

    Parameters
    ----------
    composition
        Mapping of element symbol to fraction.
    precision
        Number of decimals used for both rounding and formatting.

    Returns
    -------
    str
        Concatenated ``<symbol><fraction>`` tokens; empty when no element
        has a positive rounded fraction.

    >>> canonical_formula_key({'Fe': 0.5, 'Co': 0.5})
    'Co0.5000Fe0.5000'
    >>> canonical_formula_key({'Co': 0.500000001, 'Fe': 0.499999999})
    'Co0.5000Fe0.5000'
    """
    rounded = (
        (symbol, round(composition[symbol], precision))
        for symbol in sorted(composition)
    )
    return "".join(
        f"{symbol}{value:.{precision}f}" for symbol, value in rounded if value > 0
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class ConsolidatedRow:
    """One merged benchmark row: all source records sharing a composition key.

    Instances are built by ``consolidate`` and flattened to CSV cells by
    ``_row_to_csv``.
    """

    # Join key string produced by ``canonical_formula_key``.
    composition_key: str
    # Composition of the first record in the merged group.
    composition: Composition
    # Number of elements with a strictly positive fraction in ``composition``.
    n_elements: int
    # Sorted names of the sources that contributed a record.
    sources: list[str]
    # The single agreed phase class, or ``None`` when sources disagree.
    canonical_phase: PhaseClass | None
    # True when contributing sources gave more than one distinct phase class.
    has_conflict: bool
    # Source name -> canonical phase class (first record seen per source).
    per_source_canonical: dict[str, PhaseClass]
    # Source name -> upstream label string (first record seen per source).
    per_source_raw_labels: dict[str, str]
    # Processing route and DOI of the first ``borg2020`` record, if any.
    borg_processing: str | None = None
    borg_doi: str | None = None
    # Source name -> upstream row identifier (first record seen per source).
    source_row_ids: dict[str, str] = field(default_factory=dict)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def consolidate(records: Iterable[AlloyRecord], precision: int = 4) -> list[ConsolidatedRow]:
    """Group records by composition key and merge each group into one row.

    Parameters
    ----------
    records
        Records from any number of sources, in any order.
    precision
        Decimal precision forwarded to ``canonical_formula_key``.

    Returns
    -------
    list[ConsolidatedRow]
        One row per distinct key, in order of first appearance. When a
        source contributes several records for the same key, only the
        first one is used. If the contributing sources disagree on the
        canonical phase, the row has ``canonical_phase=None`` and
        ``has_conflict=True``.
    """
    grouped: dict[str, list[AlloyRecord]] = defaultdict(list)
    for record in records:
        grouped[canonical_formula_key(record.composition, precision)].append(record)

    merged: list[ConsolidatedRow] = []
    for key, group in grouped.items():
        # First record per source wins; later ones for the same source
        # (e.g. another Borg processing of the same composition) are ignored.
        first_by_source: dict[str, AlloyRecord] = {}
        for record in group:
            first_by_source.setdefault(record.source, record)

        phases = {name: rec.canonical_phase for name, rec in first_by_source.items()}
        distinct_phases = set(phases.values())
        conflict = len(distinct_phases) > 1
        agreed = None if conflict else next(iter(distinct_phases))

        borg = first_by_source.get("borg2020")
        lead = group[0]

        merged.append(
            ConsolidatedRow(
                composition_key=key,
                composition=lead.composition,
                n_elements=sum(1 for fraction in lead.composition.values() if fraction > 0),
                sources=sorted(phases),
                canonical_phase=agreed,
                has_conflict=conflict,
                per_source_canonical=phases,
                per_source_raw_labels={
                    name: rec.source_label for name, rec in first_by_source.items()
                },
                borg_processing=borg.processing if borg else None,
                borg_doi=borg.source_doi if borg else None,
                source_row_ids={
                    name: rec.source_row_id for name, rec in first_by_source.items()
                },
            )
        )
    return merged
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ---- Output writers -------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
# Header row written by ``write_consolidated_csv``. ``_row_to_csv`` returns
# its cells in this same order, so the two must be kept in sync.
_CSV_COLUMNS = [
    "composition_key",
    "n_elements",
    "sources",
    "canonical_phase",
    "has_conflict",
    "borg_label",
    "pei_label",
    "peivaste_label",
    "borg_raw_label",
    "pei_raw_label",
    "peivaste_raw_label",
    "borg_processing",
    "borg_doi",
    "source_row_ids",
]
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _row_to_csv(row: ConsolidatedRow) -> list[str]:
|
|
147
|
+
def lbl(src: str) -> str:
|
|
148
|
+
v = row.per_source_canonical.get(src)
|
|
149
|
+
return v.value if v else ""
|
|
150
|
+
|
|
151
|
+
return [
|
|
152
|
+
row.composition_key,
|
|
153
|
+
str(row.n_elements),
|
|
154
|
+
";".join(row.sources),
|
|
155
|
+
row.canonical_phase.value if row.canonical_phase else "",
|
|
156
|
+
"1" if row.has_conflict else "0",
|
|
157
|
+
lbl("borg2020"),
|
|
158
|
+
lbl("pei2020"),
|
|
159
|
+
lbl("peivaste"),
|
|
160
|
+
row.per_source_raw_labels.get("borg2020", ""),
|
|
161
|
+
row.per_source_raw_labels.get("pei2020", ""),
|
|
162
|
+
row.per_source_raw_labels.get("peivaste", ""),
|
|
163
|
+
row.borg_processing or "",
|
|
164
|
+
row.borg_doi or "",
|
|
165
|
+
";".join(f"{s}:{rid}" for s, rid in sorted(row.source_row_ids.items())),
|
|
166
|
+
]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def write_consolidated_csv(rows: Iterable[ConsolidatedRow], path: pathlib.Path) -> int:
    """Write ``rows`` to ``path`` as UTF-8 CSV, sorted by ``composition_key``.

    The parent directory is created if missing, and the header row comes
    from ``_CSV_COLUMNS``.

    Returns
    -------
    int
        Number of data rows written (header excluded).
    """
    ordered = sorted(rows, key=lambda row: row.composition_key)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(_CSV_COLUMNS)
        writer.writerows(_row_to_csv(row) for row in ordered)
    return len(ordered)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _sha256(p: pathlib.Path) -> str:
|
|
182
|
+
return hashlib.sha256(p.read_bytes()).hexdigest() if p.exists() else ""
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def build_manifest(
    rows: list[ConsolidatedRow],
    source_counts: dict[str, int],
    source_paths: dict[str, pathlib.Path],
    version: str,
) -> dict:
    """Assemble the JSON-serializable manifest describing a release.

    Parameters
    ----------
    rows
        Consolidated rows of the release.
    source_counts
        Source name -> number of records its loader yielded.
    source_paths
        Source name -> path of the raw file, hashed with ``_sha256``.
    version
        Version string recorded in the manifest.

    Returns
    -------
    dict
        Manifest with ``version``, a UTC ``created`` timestamp, ``schema``
        notes, per-source metadata, ``totals`` (unique compositions,
        source-overlap buckets, canonical-phase distribution including a
        ``conflict`` bucket) and ``consolidation_rules``.
    """
    # Phase distribution: one bucket per PhaseClass value plus "conflict".
    phase_tally = dict.fromkeys((phase.value for phase in PhaseClass), 0)
    phase_tally["conflict"] = 0
    for row in rows:
        if row.has_conflict:
            phase_tally["conflict"] += 1
        elif row.canonical_phase:
            phase_tally[row.canonical_phase.value] += 1

    # Overlap buckets: rows backed by 1, 2, or 3-or-more sources.
    overlap_tally = {"single_source": 0, "two_sources": 0, "three_sources": 0}
    bucket_for = {1: "single_source", 2: "two_sources", 3: "three_sources"}
    for row in rows:
        n_sources = len(row.sources)
        if n_sources >= 1:
            overlap_tally[bucket_for[min(n_sources, 3)]] += 1

    static_meta = {
        "borg2020": {"license": "CC-BY-4.0", "doi": "10.1038/s41597-020-00768-9"},
        "pei2020": {"license": "CC-BY-4.0", "doi": "10.1038/s41524-020-0308-7"},
        "peivaste": {"license": "none-declared", "url": "https://github.com/Iman-Peivaste/ML_HEAs_Phase_Dataset"},
    }
    per_source = [
        {
            "name": name,
            **static_meta[name],
            "rows_yielded": source_counts.get(name, 0),
            "sha256": _sha256(source_paths.get(name, pathlib.Path())),
        }
        for name in ("borg2020", "pei2020", "peivaste")
    ]

    schema_notes = {
        "composition_key": (
            "Stable hash of normalized composition: alphabetically sorted "
            "elements at 4-decimal mole-fraction precision."
        ),
        "canonical_phase": (
            "One of BCC, FCC, HCP, multi-phase. Empty when sources disagree."
        ),
        "has_conflict": (
            "1 if any pair of contributing sources gave different canonical labels."
        ),
        "borg_processing": "Preserved side-channel from Borg only. Not part of join key.",
    }
    rules = {
        "join_key": "composition_only, 4-decimal mole-fraction precision",
        "label_policy": "agreement → canonical; disagreement → has_conflict=1, canonical_phase blank",
        "borg_processing": "Preserved as side-channel; not part of join key",
        "per_source_dedup": "Each loader deduplicates internally before consolidation",
    }

    created = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
    return {
        "version": version,
        "created": created,
        "schema": schema_notes,
        "sources": per_source,
        "totals": {
            "unique_compositions": len(rows),
            "by_source_overlap": overlap_tally,
            "canonical_distribution": phase_tally,
        },
        "consolidation_rules": rules,
    }
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def build(version: str = DEFAULT_VERSION, out_dir: pathlib.Path | None = None) -> dict:
    """Load every source, consolidate, and write the release files.

    Writes ``consolidated.csv`` and ``manifest.json`` into ``out_dir``
    (``_CONSOLIDATED_DIR / "v<version>"`` when ``out_dir`` is not given),
    creating the directory if needed.

    Returns the manifest dict (also written to disk).
    """
    target_dir = out_dir or (_CONSOLIDATED_DIR / f"v{version}")
    target_dir.mkdir(parents=True, exist_ok=True)

    source_paths = {
        "borg2020": _RAW_DIR / "borg2020" / "MPEA_dataset.csv",
        "pei2020": _RAW_DIR / "pei2020" / "pei2020_alloys_phases.csv",
        "peivaste": _RAW_DIR / "peivaste" / "dataset11252_79.csv",
    }
    # Load order (borg, pei, peivaste) also fixes the record order handed
    # to consolidate().
    loaders = {"borg2020": borg2020, "pei2020": pei2020, "peivaste": peivaste}
    loaded = {
        name: list(module.load(source_paths[name])) for name, module in loaders.items()
    }
    source_counts = {name: len(records) for name, records in loaded.items()}

    all_records = [record for records in loaded.values() for record in records]
    consolidated = consolidate(all_records)
    write_consolidated_csv(consolidated, target_dir / "consolidated.csv")

    manifest = build_manifest(consolidated, source_counts, source_paths, version)
    (target_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
    return manifest
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def main() -> int:
    """Build the default release and print a short summary; return 0."""
    if hasattr(sys.stdout, "reconfigure"):
        # The summary may contain non-ASCII text; avoid encode errors on
        # consoles with a narrow default encoding.
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    manifest = build()
    totals = manifest["totals"]
    summary = (
        f"=== v{manifest['version']} built {manifest['created']} ===",
        f"unique compositions: {totals['unique_compositions']}",
        f"by_source_overlap: {totals['by_source_overlap']}",
        f"canonical: {totals['canonical_distribution']}",
    )
    for line in summary:
        print(line)
    return 0
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
if __name__ == "__main__":
|
|
296
|
+
sys.exit(main())
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Coverage analysis of the consolidated benchmark against the
|
|
2
|
+
descriptor data tables.
|
|
3
|
+
|
|
4
|
+
For each composition in ``data/consolidated/v<version>/consolidated.csv``,
|
|
5
|
+
checks whether all of its elements are present in:
|
|
6
|
+
|
|
7
|
+
(a) :data:`hea_bench.descriptors.data.elemental.ELEMENTAL_DATA`
|
|
8
|
+
— needed for δ, VEC, T_m (the "basic" descriptor set)
|
|
9
|
+
(b) the Miedema pair table at
|
|
10
|
+
:mod:`hea_bench.descriptors.data.pair_enthalpies`
|
|
11
|
+
— needed for ΔH_mix and Ω
|
|
12
|
+
|
|
13
|
+
Outputs:
|
|
14
|
+
|
|
15
|
+
- per-composition counts (in_basic_only / in_miedema_only / in_both / no_coverage)
|
|
16
|
+
- ranked list of elements missing from each table, by alloy frequency
|
|
17
|
+
- the "next-best-add" candidates — elements present in the pair table
|
|
18
|
+
but missing from ELEMENTAL_DATA, sorted by how many alloys they'd
|
|
19
|
+
unlock if added
|
|
20
|
+
|
|
21
|
+
Run as a module to score the released v0.1.0:
|
|
22
|
+
|
|
23
|
+
python -m hea_bench.benchmark.coverage
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import csv
|
|
29
|
+
import json
|
|
30
|
+
import pathlib
|
|
31
|
+
import sys
|
|
32
|
+
from collections import Counter
|
|
33
|
+
from collections.abc import Iterable
|
|
34
|
+
|
|
35
|
+
from ..composition import parse_formula
|
|
36
|
+
from ..descriptors.data.elemental import covered_elements as _elemental_covered
|
|
37
|
+
from ..descriptors.data.pair_enthalpies import covered_elements as _pair_covered
|
|
38
|
+
|
|
39
|
+
_REPO_ROOT = pathlib.Path(__file__).resolve().parents[3]
|
|
40
|
+
_DEFAULT_CSV = _REPO_ROOT / "data" / "consolidated" / "v0.1.0" / "consolidated.csv"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def analyze(csv_path: pathlib.Path = _DEFAULT_CSV) -> dict:
    """Compute descriptor-table coverage for a consolidated CSV.

    Each row's ``composition_key`` is parsed back into a composition and
    its element set compared with the elemental table and the Miedema
    pair table. Rows whose key is missing or unparseable still count
    towards ``total`` but contribute to no other statistic.

    Returns a dict with:

    - ``csv_path`` : the input path as a string
    - ``total`` : number of rows read from the CSV
    - ``in_basic``, ``in_miedema``, ``in_both``, ``in_neither`` : counts of
      compositions fully covered by the elemental table, by the pair
      table, by both, and by neither
    - ``missing_from_basic`` : plain dict of element → number of
      compositions containing it, for elements absent from the elemental
      table
    - ``missing_from_miedema`` : same for the pair table
    - ``next_best_adds`` : list of ``(element, n_alloys, in_pair_table)``
      tuples for elements absent from the elemental table, ordered by
      descending alloy count
    """
    basic_elements = _elemental_covered()
    pair_elements = _pair_covered()

    n_rows = n_basic = n_miedema = n_both = n_neither = 0
    absent_basic: Counter[str] = Counter()
    absent_miedema: Counter[str] = Counter()
    # Proxy for "alloys unlocked": how many compositions contain each
    # element that the elemental table lacks.
    unlock_counts: Counter[str] = Counter()

    with csv_path.open(newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            n_rows += 1
            try:
                parsed = parse_formula(record["composition_key"])
            except (KeyError, ValueError):
                continue
            symbols = set(parsed)
            lacking_basic = symbols - basic_elements
            lacking_miedema = symbols - pair_elements

            if not lacking_miedema:
                n_miedema += 1
            if not lacking_basic:
                n_basic += 1
                if not lacking_miedema:
                    n_both += 1
            elif lacking_miedema:
                n_neither += 1

            absent_basic.update(lacking_basic)
            absent_miedema.update(lacking_miedema)
            unlock_counts.update(symbols - basic_elements)

    # Flag elements already present in the pair table: those only need
    # elemental data added, not a full new set of pair enthalpies.
    ranking: list[tuple[str, int, bool]] = [
        (symbol, count, symbol in pair_elements)
        for symbol, count in unlock_counts.most_common()
    ]

    return {
        "csv_path": str(csv_path),
        "total": n_rows,
        "in_basic": n_basic,
        "in_miedema": n_miedema,
        "in_both": n_both,
        "in_neither": n_neither,
        "missing_from_basic": dict(absent_basic),
        "missing_from_miedema": dict(absent_miedema),
        "next_best_adds": ranking,
    }
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def format_report(report: dict) -> str:
    """Render `analyze()` output as a human-readable text report.

    Reads ``csv_path``, ``total``, the four ``in_*`` counts,
    ``missing_from_basic`` and ``next_best_adds`` from ``report``. Both
    element listings are capped at 15 entries. Percentages are relative
    to ``total`` and shown as ``0.0%`` when ``total`` is zero.
    """
    n_total = report["total"]

    def share(count: int) -> str:
        if not n_total:
            return "0.0%"
        return f"{count / n_total * 100:.1f}%"

    coverage_rows = (
        (" ELEMENTAL_DATA (δ/VEC/T_m): ", "in_basic"),
        (" Miedema pair table (ΔH/Ω): ", "in_miedema"),
        (" both (all 6 descriptors): ", "in_both"),
        (" neither (cannot score): ", "in_neither"),
    )

    lines: list[str] = [
        f"=== Coverage report: {report['csv_path']} ===",
        "",
        f"total compositions: {n_total:>5}",
        "",
        "Coverage by descriptor data table:",
    ]
    lines.extend(
        f"{label}{report[key]:>5} / {n_total} ({share(report[key])})"
        for label, key in coverage_rows
    )

    lines.append("")
    lines.append("Top 15 elements missing from ELEMENTAL_DATA (basic descriptors):")
    by_frequency = sorted(report["missing_from_basic"].items(), key=lambda item: -item[1])
    lines.extend(
        f" {symbol:>3} appears in {count:>5} alloys ({share(count)})"
        for symbol, count in by_frequency[:15]
    )

    lines.append("")
    lines.append("Next-best-add ranking (elements outside ELEMENTAL_DATA, by alloy count):")
    lines.append(" symbol alloys-unlocked already-in-pair-table")
    for symbol, count, in_pair_table in report["next_best_adds"][:15]:
        note = "no (needs full sourcing)"
        if in_pair_table:
            note = "YES (low-effort win)"
        lines.append(f" {symbol:>3} {count:>5} {note}")

    return "\n".join(lines)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def main(argv: Iterable[str] | None = None) -> int:
    """Print the coverage report and save it as JSON next to the CSV.

    ``argv`` is accepted but not read. The JSON file is named
    ``coverage_report.json`` and is written into the directory of the
    analyzed CSV. Always returns 0.
    """
    if hasattr(sys.stdout, "reconfigure"):
        # The report contains non-ASCII symbols (δ, Ω, ...).
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    report = analyze()
    print(format_report(report))

    # Persist a machine-readable copy; any Counter values become plain dicts.
    serializable = {}
    for key, value in report.items():
        serializable[key] = dict(value) if isinstance(value, Counter) else value
    destination = pathlib.Path(report["csv_path"]).parent / "coverage_report.json"
    destination.write_text(json.dumps(serializable, indent=2))

    print(f"\nwrote {destination}")
    return 0
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
if __name__ == "__main__":
|
|
169
|
+
sys.exit(main())
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Per-source dataset loaders.
|
|
2
|
+
|
|
3
|
+
Each loader returns an iterable of `AlloyRecord` rows in a canonical
|
|
4
|
+
schema, hiding the upstream column conventions, formula syntax, and
|
|
5
|
+
phase-label vocabulary from the consolidator.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from ..composition import Composition
|
|
13
|
+
from ..taxonomy import PhaseClass
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class AlloyRecord:
    """One upstream alloy record, expressed in the canonical loader schema.

    Every per-source loader yields instances of this class, and the
    consolidator merges them. The dataclass is frozen, so fields cannot be
    reassigned after construction.

    Attributes
    ----------
    source
        Identifier of the upstream source (e.g., ``"borg2020"``,
        ``"pei2020"``, ``"peivaste"``).
    source_row_id
        Stable identifier from the upstream row — index, Reference ID,
        whatever the source uses to point back to its own records.
    formula_raw
        The composition string as it appeared in the upstream CSV,
        verbatim. Useful for traceability.
    composition
        Normalized mole-fraction dict (sums to 1.0).
    canonical_phase
        One of the `PhaseClass` values: ``BCC``, ``FCC``, ``HCP``, or
        ``multi-phase``.
    source_label
        The upstream phase label string, preserved verbatim for users
        who want finer granularity than the canonical 4-class.
    processing
        Processing route, if the upstream source records it (Borg only).
        ``None`` for sources that don't.
    source_doi
        DOI of the primary-literature paper this alloy was reported in,
        if the upstream source records it (Borg only). Defaults to
        ``None``.
    """

    source: str
    source_row_id: str
    formula_raw: str
    composition: Composition
    canonical_phase: PhaseClass
    source_label: str
    processing: str | None = None
    source_doi: str | None = None
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Loader for the Borg et al. 2020 MPEA dataset.
|
|
2
|
+
|
|
3
|
+
Source: ``data/raw/borg2020/MPEA_dataset.csv`` (CC-BY-4.0, mirrored
|
|
4
|
+
verbatim from figshare 10.6084/m9.figshare.12642953 v9).
|
|
5
|
+
|
|
6
|
+
Schema, dedup behaviour, and label-normalization decisions are
|
|
7
|
+
documented inline below; see ``data/raw/borg2020/README.md`` for the
|
|
8
|
+
full source-level context.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import csv
|
|
14
|
+
import pathlib
|
|
15
|
+
from collections.abc import Iterator
|
|
16
|
+
|
|
17
|
+
from ..composition import parse_formula
|
|
18
|
+
from ..taxonomy import BORG_SIMPLE_MAP
|
|
19
|
+
from . import AlloyRecord
|
|
20
|
+
|
|
21
|
+
SOURCE_NAME = "borg2020"
|
|
22
|
+
|
|
23
|
+
# Column names — Borg's upstream headers, verbatim (including raw LaTeX).
|
|
24
|
+
_COL_REF_ID = "IDENTIFIER: Reference ID"
|
|
25
|
+
_COL_FORMULA = "FORMULA"
|
|
26
|
+
_COL_MICROSTRUCTURE = "PROPERTY: Microstructure"
|
|
27
|
+
_COL_PROCESSING = "PROPERTY: Processing method"
|
|
28
|
+
_COL_SIMPLE_PHASE = "PROPERTY: BCC/FCC/other"
|
|
29
|
+
_COL_DOI = "REFERENCE: doi"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def load(csv_path: pathlib.Path) -> Iterator[AlloyRecord]:
    """Yield one `AlloyRecord` per unique ``(formula, processing)`` alloy.

    Borg has 1,545 measurement rows for 740 unique ``(formula, processing)``
    alloys; the repeats are multiple mechanical tests of the same alloy.
    Phase belongs to the alloy and its processing, not to the test, so
    repeats are collapsed here.

    For each ``(formula, processing)`` pair, the first row that passes
    every check below is emitted and later rows for that pair are
    ignored, even if their phase label differs.

    Rows are skipped silently when the formula or the simplified phase
    label is blank, when the formula raises ``ValueError`` in
    ``parse_formula``, or when the simplified label has no entry in
    ``BORG_SIMPLE_MAP``. Compare the yielded count with the file's row
    count if the number of skipped rows matters.

    The emitted ``source_label`` is the full microstructure cell when it
    is non-empty, otherwise the simplified phase label.
    """

    def cell(row: dict, column: str) -> str:
        # Missing or None cells are treated as empty strings.
        return (row.get(column) or "").strip()

    emitted: set[tuple[str, str]] = set()

    with csv_path.open(newline="", encoding="utf-8-sig") as handle:
        for row in csv.DictReader(handle):
            formula = cell(row, _COL_FORMULA)
            simple_phase = cell(row, _COL_SIMPLE_PHASE)
            if not (formula and simple_phase):
                continue

            processing = cell(row, _COL_PROCESSING)
            dedup_key = (formula, processing)
            if dedup_key in emitted:
                continue

            try:
                composition = parse_formula(formula)
            except ValueError:
                continue

            phase = BORG_SIMPLE_MAP.get(simple_phase)
            if phase is None:
                continue

            # Mark the pair as emitted only once a row actually qualifies,
            # so an invalid first row does not block a valid later one.
            emitted.add(dedup_key)
            microstructure = cell(row, _COL_MICROSTRUCTURE)
            yield AlloyRecord(
                source=SOURCE_NAME,
                source_row_id=cell(row, _COL_REF_ID),
                formula_raw=formula,
                composition=composition,
                canonical_phase=phase,
                source_label=microstructure or simple_phase,
                processing=processing or None,
                source_doi=cell(row, _COL_DOI) or None,
            )
|