hea-bench 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. hea_bench/__init__.py +37 -0
  2. hea_bench/benchmark/__init__.py +4 -0
  3. hea_bench/benchmark/composition.py +16 -0
  4. hea_bench/benchmark/consolidate.py +296 -0
  5. hea_bench/benchmark/coverage.py +169 -0
  6. hea_bench/benchmark/loaders/__init__.py +54 -0
  7. hea_bench/benchmark/loaders/borg2020.py +85 -0
  8. hea_bench/benchmark/loaders/pei2020.py +82 -0
  9. hea_bench/benchmark/loaders/peivaste.py +104 -0
  10. hea_bench/benchmark/taxonomy.py +86 -0
  11. hea_bench/classifiers/__init__.py +4 -0
  12. hea_bench/classifiers/diagnostic_stats.py +208 -0
  13. hea_bench/cli.py +36 -0
  14. hea_bench/composition.py +138 -0
  15. hea_bench/constants.py +3 -0
  16. hea_bench/descriptors/__init__.py +6 -0
  17. hea_bench/descriptors/data/LICENSE.matminer.txt +46 -0
  18. hea_bench/descriptors/data/README.md +109 -0
  19. hea_bench/descriptors/data/__init__.py +7 -0
  20. hea_bench/descriptors/data/elemental.py +98 -0
  21. hea_bench/descriptors/data/miedema_parameters.csv +74 -0
  22. hea_bench/descriptors/data/pair_enthalpies.py +99 -0
  23. hea_bench/descriptors/data/pair_enthalpies.tsv +2629 -0
  24. hea_bench/descriptors/entropy.py +55 -0
  25. hea_bench/descriptors/melting.py +47 -0
  26. hea_bench/descriptors/miedema.py +84 -0
  27. hea_bench/descriptors/omega.py +77 -0
  28. hea_bench/descriptors/size.py +80 -0
  29. hea_bench/descriptors/vec.py +56 -0
  30. hea_bench/evaluate.py +256 -0
  31. hea_bench/rules/__init__.py +15 -0
  32. hea_bench/rules/guo_vec.py +48 -0
  33. hea_bench/rules/yang_omega.py +38 -0
  34. hea_bench/rules/yeh_smix.py +37 -0
  35. hea_bench/rules/zhang_delta.py +32 -0
  36. hea_bench-0.1.0.dist-info/METADATA +307 -0
  37. hea_bench-0.1.0.dist-info/RECORD +40 -0
  38. hea_bench-0.1.0.dist-info/WHEEL +4 -0
  39. hea_bench-0.1.0.dist-info/entry_points.txt +2 -0
  40. hea_bench-0.1.0.dist-info/licenses/LICENSE +21 -0
hea_bench/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ """hea-bench: an open benchmark suite for high-entropy alloy phase prediction.
2
+
3
+ Top-level convenience imports for the most-used names; full surface
4
+ documented in ``README.md``.
5
+
6
+ Quick start::
7
+
8
+ from hea_bench import smix, delta, vec, mixing_enthalpy, omega
9
+ cantor = {"Co": 0.2, "Cr": 0.2, "Fe": 0.2, "Mn": 0.2, "Ni": 0.2}
10
+ print(smix(cantor), delta(cantor), vec(cantor),
11
+ mixing_enthalpy(cantor), omega(cantor))
12
+ """
13
+
14
+ from .composition import Composition, normalize, parse_formula
15
+ from .descriptors.entropy import smix
16
+ from .descriptors.melting import melting_temperature
17
+ from .descriptors.miedema import mixing_enthalpy
18
+ from .descriptors.omega import omega
19
+ from .descriptors.size import delta
20
+ from .descriptors.vec import vec
21
+
22
+ __version__ = "0.1.0"
23
+
24
+ __all__ = [
25
+ "__version__",
26
+ # core type + parsing
27
+ "Composition",
28
+ "parse_formula",
29
+ "normalize",
30
+ # descriptors
31
+ "smix",
32
+ "delta",
33
+ "vec",
34
+ "melting_temperature",
35
+ "mixing_enthalpy",
36
+ "omega",
37
+ ]
@@ -0,0 +1,4 @@
1
+ """Dataset loaders, deduplication, and benchmark assembly.
2
+
3
+ Submodules: ``consolidate``, ``coverage``, ``loaders``, ``taxonomy``, and the ``composition`` re-export shim.
4
+ """
@@ -0,0 +1,16 @@
1
+ """Re-export of composition primitives from the package root.
2
+
3
+ `composition.py` was originally defined here; it moved to the package
4
+ root so descriptor modules can consume it without depending on the
5
+ benchmark subpackage. This shim preserves the historical import path
6
+ for any external code that grew to depend on it.
7
+ """
8
+
9
+ from ..composition import (
10
+ Composition,
11
+ from_element_columns,
12
+ normalize,
13
+ parse_formula,
14
+ )
15
+
16
+ __all__ = ["Composition", "from_element_columns", "normalize", "parse_formula"]
@@ -0,0 +1,296 @@
1
+ """Consolidate the three source datasets into one canonical benchmark.
2
+
3
+ Walks each source loader, merges records on a *composition-only* join
4
+ key (Borg's processing column is preserved as side-channel data but
5
+ does **not** participate in the join), and produces:
6
+
7
+ - ``data/consolidated/<version>/consolidated.csv`` — the benchmark
8
+ - ``data/consolidated/<version>/manifest.json`` — provenance, dedup
9
+ stats, SHA-256s
10
+
11
+ Label conflict handling: when two or more sources contribute records for
12
+ the same composition but disagree on the canonical phase class, the
13
+ output row has ``canonical_phase`` blank and ``has_conflict=1``. The
14
+ per-source labels are still recorded so users can resolve the conflict
15
+ however they want.
16
+
17
+ Run as a module to (re)build v0.1.0:
18
+
19
+ python -m hea_bench.benchmark.consolidate
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import csv
25
+ import datetime
26
+ import hashlib
27
+ import json
28
+ import pathlib
29
+ import sys
30
+ from collections import defaultdict
31
+ from collections.abc import Iterable
32
+ from dataclasses import dataclass, field
33
+
34
+ from .composition import Composition
35
+ from .loaders import AlloyRecord, borg2020, pei2020, peivaste
36
+ from .taxonomy import PhaseClass
37
+
38
# Version tag used by build() when the caller does not pass one.
DEFAULT_VERSION = "0.1.0"

# Repo-relative paths (repo root is 3 parents up from this file:
# hea-bench/src/hea_bench/benchmark/consolidate.py → hea-bench/).
# NOTE(review): this assumes the source-tree ``src/`` layout. From an
# installed wheel parents[3] presumably resolves outside the package, so
# the data/ directories below may not exist — confirm before calling build().
_REPO_ROOT = pathlib.Path(__file__).resolve().parents[3]
# Directory holding one sub-folder of upstream CSVs per source.
_RAW_DIR = _REPO_ROOT / "data" / "raw"
# Parent directory of the per-version output folders written by build().
_CONSOLIDATED_DIR = _REPO_ROOT / "data" / "consolidated"
45
+
46
+
47
def canonical_formula_key(composition: Composition, precision: int = 4) -> str:
    """Build a deterministic join key from a composition mapping.

    Element symbols are visited in sorted order and each fraction is
    rounded to ``precision`` decimals, then rendered with exactly that
    many decimals. Elements whose rounded fraction is not strictly
    positive are left out of the key, so float noise below the rounding
    threshold does not change the result.

    >>> canonical_formula_key({'Fe': 0.5, 'Co': 0.5})
    'Co0.5000Fe0.5000'
    >>> canonical_formula_key({'Co': 0.500000001, 'Fe': 0.499999999})
    'Co0.5000Fe0.5000'
    """
    rounded = (
        (symbol, round(composition[symbol], precision))
        for symbol in sorted(composition)
    )
    return "".join(
        f"{symbol}{amount:.{precision}f}"
        for symbol, amount in rounded
        if amount > 0
    )
65
+
66
+
67
@dataclass
class ConsolidatedRow:
    """One merged benchmark row: all source records sharing a composition key."""

    # Join key produced by canonical_formula_key() for this group.
    composition_key: str
    # Composition of the first record in the merged group.
    composition: Composition
    # Count of elements with a strictly positive fraction in `composition`.
    n_elements: int
    # Sorted names of the sources that contributed at least one record.
    sources: list[str]
    # The agreed phase class, or None when the sources disagree.
    canonical_phase: PhaseClass | None
    # True when the contributing sources carry more than one distinct class.
    has_conflict: bool
    # Source name -> canonical phase class (first record seen per source).
    per_source_canonical: dict[str, PhaseClass]
    # Source name -> upstream label string (first record seen per source).
    per_source_raw_labels: dict[str, str]
    # Processing route of the first "borg2020" record in the group, if any.
    borg_processing: str | None = None
    # DOI of the first "borg2020" record in the group, if any.
    borg_doi: str | None = None
    # Source name -> upstream row identifier (first record seen per source).
    source_row_ids: dict[str, str] = field(default_factory=dict)
80
+
81
+
82
def consolidate(records: Iterable[AlloyRecord], precision: int = 4) -> list[ConsolidatedRow]:
    """Group records by composition key and emit one row per group.

    Records are bucketed by ``canonical_formula_key(record.composition,
    precision)``. Within a bucket only the first record of each source is
    used for that source's label, raw label and row id. A bucket whose
    sources carry more than one distinct canonical phase is flagged as a
    conflict and gets ``canonical_phase=None``. Rows come back in the
    order their keys were first encountered.
    """
    groups: dict[str, list[AlloyRecord]] = {}
    for record in records:
        group_key = canonical_formula_key(record.composition, precision)
        groups.setdefault(group_key, []).append(record)

    merged: list[ConsolidatedRow] = []
    for group_key, members in groups.items():
        # A source can appear more than once in a group (e.g. two Borg
        # processings of one composition); the first occurrence wins.
        first_by_source: dict[str, AlloyRecord] = {}
        for member in members:
            first_by_source.setdefault(member.source, member)

        phases = {name: rec.canonical_phase for name, rec in first_by_source.items()}
        raw_labels = {name: rec.source_label for name, rec in first_by_source.items()}
        row_ids = {name: rec.source_row_id for name, rec in first_by_source.items()}

        distinct_phases = set(phases.values())
        conflicted = len(distinct_phases) > 1
        agreed = None if conflicted else next(iter(distinct_phases))

        borg = first_by_source.get("borg2020")
        lead = members[0]
        positive = [value for value in lead.composition.values() if value > 0]

        merged.append(
            ConsolidatedRow(
                composition_key=group_key,
                composition=lead.composition,
                n_elements=len(positive),
                sources=sorted(phases),
                canonical_phase=agreed,
                has_conflict=conflicted,
                per_source_canonical=phases,
                per_source_raw_labels=raw_labels,
                borg_processing=borg.processing if borg else None,
                borg_doi=borg.source_doi if borg else None,
                source_row_ids=row_ids,
            )
        )
    return merged
124
+
125
+
126
+ # ---- Output writers -------------------------------------------------------
127
+
128
# Header row written to consolidated.csv. The order here must match the
# order of the values returned by _row_to_csv().
_CSV_COLUMNS = [
    "composition_key",
    "n_elements",
    "sources",
    "canonical_phase",
    "has_conflict",
    # Per-source canonical phase labels.
    "borg_label",
    "pei_label",
    "peivaste_label",
    # Per-source upstream label strings.
    "borg_raw_label",
    "pei_raw_label",
    "peivaste_raw_label",
    "borg_processing",
    "borg_doi",
    "source_row_ids",
]
144
+
145
+
146
+ def _row_to_csv(row: ConsolidatedRow) -> list[str]:
147
+ def lbl(src: str) -> str:
148
+ v = row.per_source_canonical.get(src)
149
+ return v.value if v else ""
150
+
151
+ return [
152
+ row.composition_key,
153
+ str(row.n_elements),
154
+ ";".join(row.sources),
155
+ row.canonical_phase.value if row.canonical_phase else "",
156
+ "1" if row.has_conflict else "0",
157
+ lbl("borg2020"),
158
+ lbl("pei2020"),
159
+ lbl("peivaste"),
160
+ row.per_source_raw_labels.get("borg2020", ""),
161
+ row.per_source_raw_labels.get("pei2020", ""),
162
+ row.per_source_raw_labels.get("peivaste", ""),
163
+ row.borg_processing or "",
164
+ row.borg_doi or "",
165
+ ";".join(f"{s}:{rid}" for s, rid in sorted(row.source_row_ids.items())),
166
+ ]
167
+
168
+
169
def write_consolidated_csv(rows: Iterable[ConsolidatedRow], path: pathlib.Path) -> int:
    """Write the header plus one line per row, sorted by ``composition_key``.

    Parent directories of ``path`` are created when missing. The file is
    written as UTF-8. Returns the number of data rows written.
    """
    ordered = sorted(rows, key=lambda item: item.composition_key)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(_CSV_COLUMNS)
        # Lazy generator: each row is converted right before it is written.
        writer.writerows(_row_to_csv(item) for item in ordered)
    return len(ordered)
179
+
180
+
181
+ def _sha256(p: pathlib.Path) -> str:
182
+ return hashlib.sha256(p.read_bytes()).hexdigest() if p.exists() else ""
183
+
184
+
185
def build_manifest(
    rows: list[ConsolidatedRow],
    source_counts: dict[str, int],
    source_paths: dict[str, pathlib.Path],
    version: str,
) -> dict:
    """Assemble the manifest dictionary for a consolidated release.

    The manifest records the version, a UTC creation timestamp, schema
    notes, per-source metadata (license, DOI or URL, rows yielded, file
    SHA-256), totals (unique compositions, how many sources each row
    draws on, canonical phase distribution including a ``conflict``
    bucket), and the consolidation rules.
    """
    phase_counts = dict.fromkeys((p.value for p in PhaseClass), 0)
    phase_counts["conflict"] = 0
    overlap = {"single_source": 0, "two_sources": 0, "three_sources": 0}
    overlap_buckets = ("single_source", "two_sources", "three_sources")

    for row in rows:
        if row.has_conflict:
            phase_counts["conflict"] += 1
        elif row.canonical_phase:
            phase_counts[row.canonical_phase.value] += 1

        n_sources = len(row.sources)
        if n_sources >= 1:
            # Three or more sources all land in the last bucket.
            overlap[overlap_buckets[min(n_sources, 3) - 1]] += 1

    src_meta_table = {
        "borg2020": {"license": "CC-BY-4.0", "doi": "10.1038/s41597-020-00768-9"},
        "pei2020": {"license": "CC-BY-4.0", "doi": "10.1038/s41524-020-0308-7"},
        "peivaste": {"license": "none-declared", "url": "https://github.com/Iman-Peivaste/ML_HEAs_Phase_Dataset"},
    }

    sources_section = [
        {
            "name": name,
            **src_meta_table[name],
            "rows_yielded": source_counts.get(name, 0),
            "sha256": _sha256(source_paths.get(name, pathlib.Path())),
        }
        for name in ("borg2020", "pei2020", "peivaste")
    ]

    schema_notes = {
        "composition_key": (
            "Stable hash of normalized composition: alphabetically sorted "
            "elements at 4-decimal mole-fraction precision."
        ),
        "canonical_phase": (
            "One of BCC, FCC, HCP, multi-phase. Empty when sources disagree."
        ),
        "has_conflict": (
            "1 if any pair of contributing sources gave different canonical labels."
        ),
        "borg_processing": "Preserved side-channel from Borg only. Not part of join key.",
    }

    rules = {
        "join_key": "composition_only, 4-decimal mole-fraction precision",
        "label_policy": "agreement → canonical; disagreement → has_conflict=1, canonical_phase blank",
        "borg_processing": "Preserved as side-channel; not part of join key",
        "per_source_dedup": "Each loader deduplicates internally before consolidation",
    }

    return {
        "version": version,
        "created": datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds"),
        "schema": schema_notes,
        "sources": sources_section,
        "totals": {
            "unique_compositions": len(rows),
            "by_source_overlap": overlap,
            "canonical_distribution": phase_counts,
        },
        "consolidation_rules": rules,
    }
256
+
257
+
258
def build(version: str = DEFAULT_VERSION, out_dir: pathlib.Path | None = None) -> dict:
    """Load all three sources, consolidate them, and write the release files.

    Writes ``consolidated.csv`` and ``manifest.json`` into ``out_dir``,
    which defaults to ``_CONSOLIDATED_DIR / f"v{version}"`` and is created
    when missing. Returns the manifest dict that was written.
    """
    target_dir = out_dir or (_CONSOLIDATED_DIR / f"v{version}")
    target_dir.mkdir(parents=True, exist_ok=True)

    source_paths = {
        "borg2020": _RAW_DIR / "borg2020" / "MPEA_dataset.csv",
        "pei2020": _RAW_DIR / "pei2020" / "pei2020_alloys_phases.csv",
        "peivaste": _RAW_DIR / "peivaste" / "dataset11252_79.csv",
    }
    loader_modules = {"borg2020": borg2020, "pei2020": pei2020, "peivaste": peivaste}

    # Materialise each loader in source order so counts and merge order
    # are both Borg, then Pei, then Peivaste.
    loaded = {
        name: list(module.load(source_paths[name]))
        for name, module in loader_modules.items()
    }
    source_counts = {name: len(records) for name, records in loaded.items()}

    consolidated = consolidate(
        [record for records in loaded.values() for record in records]
    )
    write_consolidated_csv(consolidated, target_dir / "consolidated.csv")

    manifest = build_manifest(consolidated, source_counts, source_paths, version)
    manifest_path = target_dir / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))
    return manifest
282
+
283
+
284
def main() -> int:
    """Build the default release and print a short summary. Returns 0."""
    # Force UTF-8 output where the stream supports reconfiguration.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    manifest = build()
    print(f"=== v{manifest['version']} built {manifest['created']} ===")
    print(f"unique compositions: {manifest['totals']['unique_compositions']}")
    print(f"by_source_overlap: {manifest['totals']['by_source_overlap']}")
    print(f"canonical: {manifest['totals']['canonical_distribution']}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,169 @@
1
+ """Coverage analysis of the consolidated benchmark against the
2
+ descriptor data tables.
3
+
4
+ For each composition in ``data/consolidated/v<version>/consolidated.csv``,
5
+ checks whether all of its elements are present in:
6
+
7
+ (a) :data:`hea_bench.descriptors.data.elemental.ELEMENTAL_DATA`
8
+ — needed for δ, VEC, T_m (the "basic" descriptor set)
9
+ (b) the Miedema pair table at
10
+ :mod:`hea_bench.descriptors.data.pair_enthalpies`
11
+ — needed for ΔH_mix and Ω
12
+
13
+ Outputs:
14
+
15
+ - per-composition counts (in_basic_only / in_miedema_only / in_both / no_coverage)
16
+ - ranked list of elements missing from each table, by alloy frequency
17
+ - the "next-best-add" candidates — elements present in the pair table
18
+ but missing from ELEMENTAL_DATA, sorted by how many alloys they'd
19
+ unlock if added
20
+
21
+ Run as a module to score the released v0.1.0:
22
+
23
+ python -m hea_bench.benchmark.coverage
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import csv
29
+ import json
30
+ import pathlib
31
+ import sys
32
+ from collections import Counter
33
+ from collections.abc import Iterable
34
+
35
+ from ..composition import parse_formula
36
+ from ..descriptors.data.elemental import covered_elements as _elemental_covered
37
+ from ..descriptors.data.pair_enthalpies import covered_elements as _pair_covered
38
+
39
+ _REPO_ROOT = pathlib.Path(__file__).resolve().parents[3]
40
+ _DEFAULT_CSV = _REPO_ROOT / "data" / "consolidated" / "v0.1.0" / "consolidated.csv"
41
+
42
+
43
def analyze(csv_path: pathlib.Path = _DEFAULT_CSV) -> dict:
    """Measure how much of the consolidated CSV the descriptor tables cover.

    Every data row counts toward ``total``. Rows whose ``composition_key``
    is missing or does not parse are skipped after being counted. For the
    rest, the element set is compared with the elemental table ("basic")
    and the Miedema pair table.

    Returns a dict with:

    - ``csv_path`` : the analysed path as a string
    - ``total``, ``in_basic``, ``in_miedema``, ``in_both``, ``in_neither`` : counts
    - ``missing_from_basic`` / ``missing_from_miedema`` : plain dicts of
      element → number of compositions containing that uncovered element
    - ``next_best_adds`` : list of ``(element, n_alloys, in_pair_table)``
      tuples for elements outside the elemental table, most frequent first
    """
    basic_elements = _elemental_covered()
    pair_elements = _pair_covered()

    total = in_basic = in_miedema = in_both = in_neither = 0
    absent_basic: Counter[str] = Counter()
    absent_pair: Counter[str] = Counter()
    # Proxy for "alloys unlocked": how many compositions contain each
    # element that the elemental table does not cover.
    uncovered_usage: Counter[str] = Counter()

    with csv_path.open(newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            total += 1
            try:
                parsed = parse_formula(record["composition_key"])
            except (KeyError, ValueError):
                continue
            symbols = set(parsed)
            no_basic = symbols - basic_elements
            no_pair = symbols - pair_elements

            if not no_basic:
                in_basic += 1
                if not no_pair:
                    in_both += 1
            elif no_pair:
                in_neither += 1
            if not no_pair:
                in_miedema += 1

            absent_basic.update(no_basic)
            absent_pair.update(no_pair)
            uncovered_usage.update(symbols - basic_elements)

    # Flag elements already in the pair table: they only need elemental data.
    ranking: list[tuple[str, int, bool]] = [
        (symbol, count, symbol in pair_elements)
        for symbol, count in uncovered_usage.most_common()
    ]

    return {
        "csv_path": str(csv_path),
        "total": total,
        "in_basic": in_basic,
        "in_miedema": in_miedema,
        "in_both": in_both,
        "in_neither": in_neither,
        "missing_from_basic": dict(absent_basic),
        "missing_from_miedema": dict(absent_pair),
        "next_best_adds": ranking,
    }
120
+
121
+
122
def format_report(report: dict) -> str:
    """Turn the dict returned by `analyze()` into a plain-text report.

    Percentages are relative to ``report["total"]`` and shown as ``0.0%``
    when the total is zero. Both element listings are capped at 15 entries.
    """
    total = report["total"]

    def pct(n: int) -> str:
        if not total:
            return "0.0%"
        return f"{n / total * 100:.1f}%"

    lines: list[str] = [
        f"=== Coverage report: {report['csv_path']} ===",
        "",
        f"total compositions: {total:>5}",
        "",
        "Coverage by descriptor data table:",
    ]

    coverage_rows = (
        (" ELEMENTAL_DATA (δ/VEC/T_m): ", "in_basic"),
        (" Miedema pair table (ΔH/Ω): ", "in_miedema"),
        (" both (all 6 descriptors): ", "in_both"),
        (" neither (cannot score): ", "in_neither"),
    )
    for label, field_name in coverage_rows:
        count = report[field_name]
        lines.append(f"{label}{count:>5} / {total} ({pct(count)})")

    lines.append("")
    lines.append("Top 15 elements missing from ELEMENTAL_DATA (basic descriptors):")
    most_missing = sorted(report["missing_from_basic"].items(), key=lambda kv: -kv[1])
    lines.extend(
        f" {el:>3} appears in {cnt:>5} alloys ({pct(cnt)})"
        for el, cnt in most_missing[:15]
    )

    lines.append("")
    lines.append("Next-best-add ranking (elements outside ELEMENTAL_DATA, by alloy count):")
    lines.append(" symbol alloys-unlocked already-in-pair-table")
    for el, cnt, in_pair in report["next_best_adds"][:15]:
        if in_pair:
            flag = "YES (low-effort win)"
        else:
            flag = "no (needs full sourcing)"
        lines.append(f" {el:>3} {cnt:>5} {flag}")

    return "\n".join(lines)
151
+
152
+
153
def main(argv: Iterable[str] | None = None) -> int:
    """Analyse the default CSV, print the report, and save it as JSON.

    ``argv`` is accepted but not read. The JSON file is written as
    ``coverage_report.json`` next to the analysed CSV. Returns 0.
    """
    # Force UTF-8 output where the stream supports reconfiguration.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    report = analyze()
    print(format_report(report))

    # Persist machine-readable form alongside the v0.1.0 release.
    serializable = {}
    for name, value in report.items():
        serializable[name] = dict(value) if isinstance(value, Counter) else value
    out_json = pathlib.Path(report["csv_path"]).parent / "coverage_report.json"
    out_json.write_text(json.dumps(serializable, indent=2))

    print(f"\nwrote {out_json}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,54 @@
1
+ """Per-source dataset loaders.
2
+
3
+ Each loader returns an iterable of `AlloyRecord` rows in a canonical
4
+ schema, hiding the upstream column conventions, formula syntax, and
5
+ phase-label vocabulary from the consolidator.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+ from ..composition import Composition
13
+ from ..taxonomy import PhaseClass
14
+
15
+
16
# NOTE(review): frozen=True with the default eq=True makes the dataclass
# generate a field-based __hash__. That presumably fails at hash() time if
# Composition is a plain dict — confirm before using records in sets or as
# dict keys.
@dataclass(frozen=True)
class AlloyRecord:
    """One alloy record in the canonical schema, as yielded by a source loader.

    Attributes
    ----------
    source
        Identifier of the upstream source (e.g., ``"borg2020"``,
        ``"pei2020"``, ``"peivaste"``).
    source_row_id
        Stable identifier from the upstream row — index, Reference ID,
        whatever the source uses to point back to its own records.
    formula_raw
        The composition string as it appeared in the upstream CSV,
        verbatim. Useful for traceability.
    composition
        Normalized mole-fraction dict (sums to 1.0).
    canonical_phase
        One of the `PhaseClass` values: ``BCC``, ``FCC``, ``HCP``, or
        ``multi-phase``.
    source_label
        The upstream phase label string, preserved verbatim for users
        who want finer granularity than the canonical 4-class.
    processing
        Processing route, if the upstream source records it (Borg only).
        ``None`` for sources that don't.
    source_doi
        DOI of the primary-literature paper this alloy was reported in,
        if the upstream source records it (Borg only).
    """

    source: str
    source_row_id: str
    formula_raw: str
    composition: Composition
    canonical_phase: PhaseClass
    source_label: str
    # Optional fields default to None for sources that do not provide them.
    processing: str | None = None
    source_doi: str | None = None
@@ -0,0 +1,85 @@
1
+ """Loader for the Borg et al. 2020 MPEA dataset.
2
+
3
+ Source: ``data/raw/borg2020/MPEA_dataset.csv`` (CC-BY-4.0, mirrored
4
+ verbatim from figshare 10.6084/m9.figshare.12642953 v9).
5
+
6
+ Schema, dedup behaviour, and label-normalization decisions are
7
+ documented inline below; see ``data/raw/borg2020/README.md`` for the
8
+ full source-level context.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import csv
14
+ import pathlib
15
+ from collections.abc import Iterator
16
+
17
+ from ..composition import parse_formula
18
+ from ..taxonomy import BORG_SIMPLE_MAP
19
+ from . import AlloyRecord
20
+
21
# Value stored in `AlloyRecord.source` for every record this loader yields.
SOURCE_NAME = "borg2020"

# Column names — Borg's upstream headers, verbatim (including raw LaTeX).
_COL_REF_ID = "IDENTIFIER: Reference ID"
_COL_FORMULA = "FORMULA"
# Detailed microstructure text; preferred over the simple label for `source_label`.
_COL_MICROSTRUCTURE = "PROPERTY: Microstructure"
_COL_PROCESSING = "PROPERTY: Processing method"
# Simplified phase label; looked up in BORG_SIMPLE_MAP to get the canonical class.
_COL_SIMPLE_PHASE = "PROPERTY: BCC/FCC/other"
_COL_DOI = "REFERENCE: doi"
30
+
31
+
32
def load(csv_path: pathlib.Path) -> Iterator[AlloyRecord]:
    """Yield one `AlloyRecord` per distinct ``(formula, processing)`` pair.

    The file is read as UTF-8 with an optional BOM. A row is skipped
    silently when any of these holds:

    - the formula or the simplified phase label is blank
    - its ``(formula, processing)`` pair has already produced a record
    - ``parse_formula`` raises ``ValueError`` for the formula
    - the simplified label has no entry in ``BORG_SIMPLE_MAP``

    A pair is marked as seen only once a row for it passes every check,
    so the first *usable* row of each pair is the one that is yielded.
    ``source_label`` carries the full microstructure text when present
    and the simplified label otherwise.
    """
    emitted: set[tuple[str, str]] = set()

    def cell(row: dict, column: str) -> str:
        # Missing or None cells are treated as the empty string.
        return (row.get(column) or "").strip()

    with csv_path.open(newline="", encoding="utf-8-sig") as handle:
        for row in csv.DictReader(handle):
            formula = cell(row, _COL_FORMULA)
            simple_label = cell(row, _COL_SIMPLE_PHASE)
            if not (formula and simple_label):
                continue

            processing = cell(row, _COL_PROCESSING)
            dedup_key = (formula, processing)
            if dedup_key in emitted:
                continue

            try:
                composition = parse_formula(formula)
            except ValueError:
                continue

            canonical = BORG_SIMPLE_MAP.get(simple_label)
            if canonical is None:
                continue

            emitted.add(dedup_key)
            microstructure = cell(row, _COL_MICROSTRUCTURE)
            yield AlloyRecord(
                source=SOURCE_NAME,
                source_row_id=cell(row, _COL_REF_ID),
                formula_raw=formula,
                composition=composition,
                canonical_phase=canonical,
                source_label=microstructure or simple_label,
                processing=processing or None,
                source_doi=cell(row, _COL_DOI) or None,
            )