chessvision-py 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.4
2
+ Name: chessvision-py
3
+ Version: 0.1.0
4
+ Summary: Computational chess performance analysis — ML-powered insights from personal PGN archives
5
+ Author-email: Rakkshet Singhaal <rakkshetsinghaal99@gmail.com>
6
+ License: GPL-3.0-or-later
7
+ Project-URL: Homepage, https://github.com/chesslens-project/chessvision-py
8
+ Project-URL: Repository, https://github.com/chesslens-project/chessvision-py
9
+ Keywords: chess,machine-learning,performance-analysis,stockfish,pgn
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Games/Entertainment :: Board Games
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: chess>=1.10.0
20
+ Requires-Dist: pandas>=2.0.0
21
+ Requires-Dist: numpy>=1.24.0
22
+ Requires-Dist: pyarrow>=12.0.0
23
+ Requires-Dist: zstandard>=0.21.0
24
+ Requires-Dist: gensim>=4.3.0
25
+ Requires-Dist: requests>=2.31.0
26
+ Requires-Dist: tqdm>=4.65.0
27
+ Requires-Dist: scipy>=1.11.0
28
+ Requires-Dist: scikit-learn>=1.3.0
29
+ Requires-Dist: joblib>=1.3.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
32
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
33
+
34
+ # chessvision-py
35
+
36
+ A Python library for computational chess performance analysis. Takes your PGN game files and runs machine learning on them to tell you what kind of mistakes you make and how to improve.
37
+
38
+ ## What it does
39
+
40
+ - Parses PGN files from chess.com, lichess, or OTB tournaments
41
+ - Evaluates every position with Stockfish (cached, resumable)
42
+ - Clusters your errors into 5 distinct archetypes
43
+ - Embeds your playing style into a vector space trained on 29.3M elite games
44
+ - Forecasts Elo trajectory using a model trained on 67,115 elite players
45
+ - Generates a personalized weekly training plan
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install chessvision-py
51
+ ```
52
+
53
+ Requires Stockfish:
54
+ ```bash
55
+ brew install stockfish # Mac
56
+ apt install stockfish # Linux
57
+ ```
58
+
59
+ ## Quick start
60
+
61
+ ```python
62
+ import chessvision as cv
63
+
64
+ cv.download_models()
65
+
66
+ report = cv.analyze("my_games.pgn", player_name="YourUsername")
67
+ ```
68
+
69
+ Or step by step:
70
+
71
+ ```python
72
+ games, moves = cv.parse_pgn("my_games.pgn")
73
+ moves = cv.evaluate_games(moves)
74
+ moves = cv.engineer_features(games, moves)
75
+ errors = cv.run_archetype_analysis(moves)
76
+ report = cv.analyze_player(moves, games, errors, "YourUsername")
77
+ ```
78
+
79
+ ## Pre-trained models
80
+
81
+ Models are hosted on Hugging Face at `rakkshet/chessvision-models` and download automatically on first use:
82
+
83
+ ```python
84
+ cv.download_models()
85
+ ```
86
+
87
+ ## License
88
+
89
+ GPL-3.0-or-later
@@ -0,0 +1,56 @@
1
+ # chessvision-py
2
+
3
+ A Python library for computational chess performance analysis. Takes your PGN game files and runs machine learning on them to tell you what kind of mistakes you make and how to improve.
4
+
5
+ ## What it does
6
+
7
+ - Parses PGN files from chess.com, lichess, or OTB tournaments
8
+ - Evaluates every position with Stockfish (cached, resumable)
9
+ - Clusters your errors into 5 distinct archetypes
10
+ - Embeds your playing style into a vector space trained on 29.3M elite games
11
+ - Forecasts Elo trajectory using a model trained on 67,115 elite players
12
+ - Generates a personalized weekly training plan
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install chessvision-py
18
+ ```
19
+
20
+ Requires Stockfish:
21
+ ```bash
22
+ brew install stockfish # Mac
23
+ apt install stockfish # Linux
24
+ ```
25
+
26
+ ## Quick start
27
+
28
+ ```python
29
+ import chessvision as cv
30
+
31
+ cv.download_models()
32
+
33
+ report = cv.analyze("my_games.pgn", player_name="YourUsername")
34
+ ```
35
+
36
+ Or step by step:
37
+
38
+ ```python
39
+ games, moves = cv.parse_pgn("my_games.pgn")
40
+ moves = cv.evaluate_games(moves)
41
+ moves = cv.engineer_features(games, moves)
42
+ errors = cv.run_archetype_analysis(moves)
43
+ report = cv.analyze_player(moves, games, errors, "YourUsername")
44
+ ```
45
+
46
+ ## Pre-trained models
47
+
48
+ Models are hosted on Hugging Face at `rakkshet/chessvision-models` and download automatically on first use:
49
+
50
+ ```python
51
+ cv.download_models()
52
+ ```
53
+
54
+ ## License
55
+
56
+ GPL-3.0-or-later
@@ -0,0 +1,33 @@
1
+ from .analyze import analyze
2
+ from .parser import parse_pgn
3
+ from .evaluator import evaluate_games
4
+ from .features import engineer_features, feature_summary
5
+ from .archetypes import run_archetype_analysis, player_archetype_profile
6
+ from .recommender import (
7
+ build_player_profile,
8
+ generate_recommendations,
9
+ analyze_player,
10
+ )
11
+ from .models import (
12
+ download_models,
13
+ register_local_models,
14
+ list_models,
15
+ is_downloaded,
16
+ )
17
+
18
+ try:
19
+ from .elo_forecast import (
20
+ build_game_features,
21
+ train_elo_model,
22
+ train_win_classifier,
23
+ train_population_lstm,
24
+ fine_tune_on_personal,
25
+ predict_elo_trajectory,
26
+ load_model,
27
+ )
28
+ except ImportError:
29
+ pass
30
+
31
+ __version__ = "0.1.0"
32
+ __author__ = "Rakkshet Singhaal"
33
+ __email__ = "rakkshet.singhaal@kellogg.northwestern.edu"
@@ -0,0 +1,124 @@
1
+ """
2
+ analyze.py
3
+
4
+ Main entry point for chessvision.
5
+ One function call produces the full analysis report.
6
+ """
7
+
8
+ import pandas as pd
9
+ from pathlib import Path
10
+ from typing import Optional, Union
11
+
12
+ from .parser import parse_pgn
13
+ from .evaluator import evaluate_games
14
+ from .features import engineer_features
15
+ from .archetypes import run_archetype_analysis
16
+ from .recommender import analyze_player
17
+ from .models import get_cached_path
18
+
19
+
20
def analyze(
    pgn_path: Union[str, Path],
    player_name: Optional[str] = None,
    stockfish_path: str = "stockfish",
    depth: int = 15,
    output_path: Optional[Union[str, Path]] = None,
    verbose: bool = True,
) -> dict:
    """Run the complete chessvision pipeline with a single call.

    Parses a PGN file (or folder of PGN files), evaluates every position
    with Stockfish, engineers behavioral features, discovers error
    archetypes, and produces a personalized training report.

    Parameters
    ----------
    pgn_path : str or Path
        Path to a .pgn file or a folder of .pgn files.
    player_name : str, optional
        Player to analyze; auto-detected from game headers when omitted.
    stockfish_path : str
        Path to the Stockfish binary (default: 'stockfish' on PATH).
    depth : int
        Stockfish search depth (default: 15).
    output_path : str or Path, optional
        When given, the JSON report is also saved here.
    verbose : bool
        Print progress banners and the report while running.

    Returns
    -------
    dict
        Full analysis results.

    Example
    -------
    >>> import chessvision as cv
    >>> report = cv.analyze("my_games.pgn", player_name="MyUsername")
    """
    pgn_path = Path(pgn_path)

    def _say(msg: str = "") -> None:
        # All progress output is gated on the single verbose flag.
        if verbose:
            print(msg)

    _say("=" * 60)
    _say(" ChessVision Analysis Pipeline")
    _say("=" * 60)
    _say()

    # Step 1 — parse the archive into game-level and move-level frames.
    _say("[1/5] Parsing PGN files...")
    games, moves = parse_pgn(pgn_path)

    if player_name is None:
        # Fall back to the most frequently occurring player name.
        player_name = _detect_player(games)
        _say(f" Auto-detected player: {player_name}")

    # Step 2 — engine evaluation (cached, so restarting is cheap).
    _say("\n[2/5] Evaluating positions with Stockfish...")
    _say(" (This step is cached — safe to stop and restart)")
    moves = evaluate_games(
        moves,
        stockfish_path=stockfish_path,
        depth=depth,
        cache=True,
    )

    # Step 3 — behavioral feature engineering.
    _say("\n[3/5] Engineering behavioral features...")
    moves = engineer_features(games, moves)

    # Step 4 — unsupervised error archetype discovery.
    _say("\n[4/5] Discovering error archetypes...")
    # NOTE(review): return value is discarded — presumably called for its
    # download/caching side effect; confirm against models.get_cached_path.
    get_cached_path("error_archetypes", "hdbscan_model.joblib")
    errors = run_archetype_analysis(
        moves,
        min_cluster_size=200,
        min_samples=50,
    )

    # Step 5 — personalized recommendations.
    _say("\n[5/5] Generating personalized recommendations...")
    chess2vec_path = get_cached_path(
        "chess2vec", "chess2vec.wordvectors"
    )

    return analyze_player(
        moves_df=moves,
        games_df=games,
        error_df=errors,
        player_name=player_name,
        chess2vec_path=chess2vec_path,
        output_path=Path(output_path) if output_path else None,
    )
119
+
120
+
121
+ def _detect_player(games_df: pd.DataFrame) -> str:
122
+ """Auto-detect the focal player — the one who appears most often."""
123
+ all_players = pd.concat([games_df["white"], games_df["black"]])
124
+ return all_players.value_counts().index[0]
@@ -0,0 +1,393 @@
1
+ """
2
+ archetypes.py
3
+
4
+ Unsupervised error archetype discovery using HDBSCAN.
5
+
6
+ Takes the evaluated move dataframe and clusters error moves
7
+ into behaviorally distinct groups — replacing the naive
8
+ blunder/mistake/inaccuracy taxonomy with a data-driven one.
9
+
10
+ Expected archetypes:
11
+ - Tactical blindspot : sharp positions, sudden CPL spike
12
+ - Strategic drift : cumulative small losses, passive pieces
13
+ - Time pressure : low clock, CPL spikes regardless of position
14
+ - Endgame failure : winning positions lost in low material
15
+ - Preparation boundary : CPL spike at specific opening move number
16
+ - Positional confusion : complex pawn structures, gradual CPL creep
17
+ """
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ import joblib
22
+ from pathlib import Path
23
+ from typing import Optional
24
+
25
+ try:
26
+ import hdbscan
27
+ except ImportError:
28
+ raise ImportError("Install hdbscan: pip3 install hdbscan")
29
+
30
+ try:
31
+ import umap
32
+ except ImportError:
33
+ raise ImportError("Install umap-learn: pip3 install umap-learn")
34
+
35
+
36
# Canonical archetype names keyed by raw cluster id; -1 is HDBSCAN's
# conventional label for noise points.
# NOTE(review): label_archetypes() assigns names from feature signatures
# rather than from fixed cluster ids — confirm whether this map is still
# authoritative or only documentary.
ARCHETYPE_LABELS = {
    -1: "noise",
    0: "tactical_blindspot",
    1: "strategic_drift",
    2: "time_pressure_collapse",
    3: "endgame_failure",
    4: "preparation_boundary",
    5: "positional_confusion",
    6: "other",
}
46
+
47
+
48
def build_error_features(moves_df: pd.DataFrame) -> pd.DataFrame:
    """Filter to error moves (CPL >= 50) and attach clustering features.

    Adds normalized feature columns: cpl_norm, eval_before_norm,
    phase_encoded, move_number_norm, is_capture, cumulative_cpl(_norm),
    plus imputed clock_pressure_index, position_complexity_score and
    mobility_ratio(_norm).

    Returns
    -------
    pd.DataFrame
        The error subset of *moves_df* with feature columns added.
    """
    all_moves = moves_df.copy()

    # Keep only the moves that lost at least half a pawn.
    errs = all_moves[all_moves["cpl"] >= 50].copy()
    print(f"Error moves (CPL >= 50): {len(errs):,} of {len(all_moves):,} total")

    # Clip-then-scale normalizations; clipping tames mate-score outliers.
    errs["cpl_norm"] = errs["cpl"].clip(upper=1000) / 1000.0
    errs["eval_before_norm"] = errs["eval_before"].clip(-1000, 1000) / 1000.0

    # Encode game phase as an ordered scalar; unknown phases sit mid-range.
    errs["phase_encoded"] = (
        errs["phase"]
        .map({"opening": 0.0, "middlegame": 0.5, "endgame": 1.0})
        .fillna(0.5)
    )

    # Move number capped at 80 before scaling.
    errs["move_number_norm"] = errs["move_number"].clip(upper=80) / 80.0

    # SAN notation contains 'x' exactly when the move was a capture.
    errs["is_capture"] = errs["san"].str.contains("x", na=False).astype(float)

    # Running CPL is accumulated over the FULL game (not just error moves),
    # then sampled at the error rows, so earlier moves still contribute.
    running = all_moves.groupby("game_id")["cpl"].transform(
        lambda s: s.fillna(0).cumsum()
    )
    errs["cumulative_cpl"] = running.loc[errs.index]
    errs["cumulative_cpl_norm"] = errs["cumulative_cpl"].clip(upper=3000) / 3000.0

    # Impute missing behavioral features with neutral defaults.
    errs["clock_pressure_index"] = errs["clock_pressure_index"].fillna(0.5)
    errs["position_complexity_score"] = errs["position_complexity_score"].fillna(0.5)
    errs["mobility_ratio"] = errs["mobility_ratio"].fillna(1.0).clip(0.1, 5.0)
    errs["mobility_ratio_norm"] = (errs["mobility_ratio"] - 0.1) / (5.0 - 0.1)

    return errs
123
+
124
+
125
def get_feature_matrix(error_df: pd.DataFrame) -> np.ndarray:
    """Return the nine clustering features as a float32 matrix.

    Column order is fixed so downstream models stay aligned.

    Raises
    ------
    ValueError
        If any expected feature column is absent from *error_df*.
    """
    cols = [
        "cpl_norm",
        "eval_before_norm",
        "clock_pressure_index",
        "position_complexity_score",
        "mobility_ratio_norm",
        "phase_encoded",
        "move_number_norm",
        "is_capture",
        "cumulative_cpl_norm",
    ]
    missing = [c for c in cols if c not in error_df.columns]
    if missing:
        raise ValueError(f"Missing feature columns: {missing}")

    matrix = error_df[cols].to_numpy(dtype=np.float32)
    print(f"Feature matrix shape: {matrix.shape}")
    return matrix
145
+
146
+
147
def reduce_dimensions(
    X: np.ndarray,
    n_components: int = 10,
    random_state: int = 42,
) -> "tuple[np.ndarray, umap.UMAP]":
    """Project the feature matrix to *n_components* dimensions with UMAP.

    HDBSCAN's density estimates degrade in high dimensions, so clustering
    is run on this reduced embedding instead of the raw features.

    Parameters
    ----------
    X : (n_samples, n_features) feature matrix
    n_components : target dimensionality of the embedding
    random_state : seed for a reproducible embedding

    Returns
    -------
    (X_reduced, reducer)
        The embedded matrix and the fitted UMAP model (returned so it can
        be persisted and reused on new data).

    Note: the original signature was annotated ``-> np.ndarray`` even
    though the function returns a 2-tuple; the annotation is corrected.
    """
    print(f"Reducing {X.shape[1]}D → {n_components}D with UMAP...")
    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=30,
        # min_dist=0.0 packs points tightly, which favors density clustering.
        min_dist=0.0,
        metric="euclidean",
        random_state=random_state,
    )
    X_reduced = reducer.fit_transform(X)
    print(f"Reduction complete. Shape: {X_reduced.shape}")
    return X_reduced, reducer
167
+
168
+
169
def cluster_errors(
    X_reduced: np.ndarray,
    min_cluster_size: int = 200,
    min_samples: int = 50,
) -> "tuple[np.ndarray, hdbscan.HDBSCAN]":
    """Cluster the UMAP-reduced error moves with HDBSCAN.

    Parameters
    ----------
    X_reduced : embedded feature matrix from reduce_dimensions()
    min_cluster_size : minimum number of moves that can form a cluster
    min_samples : conservativeness knob — higher produces fewer, denser
        clusters and more points labeled as noise

    Returns
    -------
    (labels, clusterer)
        Per-row cluster ids (-1 = noise) and the fitted model
        (prediction_data=True keeps the data needed to label new points).

    Note: the original signature was annotated ``-> np.ndarray`` even
    though the function returns a 2-tuple; the annotation is corrected.
    """
    print(f"Running HDBSCAN "
          f"(min_cluster_size={min_cluster_size}, "
          f"min_samples={min_samples})...")

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric="euclidean",
        # "eom" (excess of mass) favors stable, variable-density clusters.
        cluster_selection_method="eom",
        prediction_data=True,
    )
    labels = clusterer.fit_predict(X_reduced)

    # -1 is the noise label, so it is excluded from the cluster count.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = (labels == -1).sum()
    print(f"Found {n_clusters} clusters, {n_noise:,} noise points "
          f"({n_noise/len(labels)*100:.1f}%)")

    return labels, clusterer
200
+
201
+
202
def label_archetypes(
    error_df: pd.DataFrame,
    labels: np.ndarray,
) -> "tuple[pd.DataFrame, dict]":
    """Name each cluster by inspecting its mean feature profile.

    Labeling rules, applied in priority order (first match wins):
        time_pressure_collapse : mean clock_pressure_index < 0.2
        endgame_failure        : mean phase_encoded > 0.75
        preparation_boundary   : mean move_number_norm < 0.2
        tactical_blindspot     : high cpl_norm and high complexity
        strategic_drift        : high cumulative CPL, low complexity
        positional_confusion   : moderate cumulative CPL, higher complexity
        other                  : everything else

    Returns
    -------
    (df, archetype_map)
        Copy of *error_df* with 'cluster' and 'archetype' columns added,
        and the cluster-id → archetype-name mapping.

    Note: the original annotation was ``-> pd.DataFrame`` although a
    2-tuple is returned, and the docstring cited a 0.8 phase threshold
    although the code uses 0.75; both are corrected here.
    """
    df = error_df.copy()
    df["cluster"] = labels
    df["archetype"] = "noise"  # noise points (cluster -1) keep this default

    cluster_ids = [c for c in sorted(df["cluster"].unique()) if c != -1]
    archetype_map = {}

    print("\nCluster profiles:")
    print(f"{'Cluster':>8} {'N':>7} {'CPL':>6} {'Clock':>6} "
          f"{'Phase':>6} {'Complexity':>10} {'MoveN':>6} Archetype")
    print("-" * 75)

    for cid in cluster_ids:
        sub = df[df["cluster"] == cid]
        n = len(sub)

        mean_cpl = sub["cpl_norm"].mean()
        mean_clock = sub["clock_pressure_index"].mean()
        mean_phase = sub["phase_encoded"].mean()
        mean_complexity = sub["position_complexity_score"].mean()
        mean_move = sub["move_number_norm"].mean()
        mean_cumul = sub["cumulative_cpl_norm"].mean()

        # Assign archetype by feature signature; ordering encodes priority.
        if mean_clock < 0.2:
            archetype = "time_pressure_collapse"
        elif mean_phase > 0.75:
            archetype = "endgame_failure"
        elif mean_move < 0.2:
            archetype = "preparation_boundary"
        elif mean_cpl > 0.5 and mean_complexity > 0.6:
            archetype = "tactical_blindspot"
        elif mean_cumul > 0.5 and mean_complexity < 0.4:
            archetype = "strategic_drift"
        elif mean_cumul > 0.4 and mean_complexity > 0.4:
            archetype = "positional_confusion"
        else:
            archetype = "other"

        archetype_map[cid] = archetype
        print(f"{cid:>8} {n:>7,} {mean_cpl:>6.3f} {mean_clock:>6.3f} "
              f"{mean_phase:>6.3f} {mean_complexity:>10.3f} "
              f"{mean_move:>6.3f} {archetype}")

    # Unmapped rows are exactly the -1 noise points.
    df["archetype"] = df["cluster"].map(archetype_map).fillna("noise")
    return df, archetype_map
266
+
267
+
268
def run_archetype_analysis(
    moves_df: pd.DataFrame,
    min_cluster_size: int = 200,
    min_samples: int = 50,
    umap_components: int = 10,
    output_dir: Optional[Path] = None,
) -> pd.DataFrame:
    """Run the full archetype pipeline: features → UMAP → HDBSCAN → labels.

    Parameters
    ----------
    moves_df : evaluated move dataframe; must already carry the outputs
        of evaluate_games() and engineer_features()
    min_cluster_size : HDBSCAN minimum cluster size
    min_samples : HDBSCAN minimum samples
    umap_components : UMAP output dimensionality
    output_dir : when set, the fitted models, the labeled error rows and
        the cluster → archetype map are persisted here

    Returns
    -------
    pd.DataFrame
        Error moves with 'cluster' and 'archetype' columns added.

    Raises
    ------
    ValueError
        If required feature columns are missing or no move is evaluated.
    """
    print("=" * 60)
    print("Phase 3B — Error Archetype Analysis")
    print("=" * 60)

    # Validate the input before doing any expensive work.
    required = ["cpl", "phase", "move_number", "clock_pressure_index",
                "position_complexity_score", "mobility_ratio"]
    missing = [c for c in required if c not in moves_df.columns]
    if missing:
        raise ValueError(
            f"Missing columns: {missing}. "
            f"Run evaluate_games() and engineer_features() first."
        )

    evaluated = moves_df["cpl"].notna().sum()
    if evaluated == 0:
        raise ValueError("No evaluated moves found. Run evaluate_games() first.")

    print(f"\nInput: {len(moves_df):,} moves, {evaluated:,} evaluated")

    # 1 — build the per-error feature columns.
    print("\n[1/4] Building error features...")
    error_df = build_error_features(moves_df)

    # 2 — extract the numeric matrix in canonical column order.
    print("\n[2/4] Extracting feature matrix...")
    features = get_feature_matrix(error_df)

    # 3 — embed before clustering (HDBSCAN degrades in high dimensions).
    print("\n[3/4] Dimensionality reduction...")
    embedded, reducer = reduce_dimensions(features, n_components=umap_components)

    # 4 — cluster and name the clusters.
    print("\n[4/4] Clustering...")
    labels, clusterer = cluster_errors(embedded, min_cluster_size, min_samples)
    error_df, archetype_map = label_archetypes(error_df, labels)

    # Summary of how the error moves split across archetypes.
    print("\nArchetype distribution:")
    for arch, count in error_df["archetype"].value_counts().items():
        pct = count / len(error_df) * 100
        print(f" {arch:<28} : {count:>6,} ({pct:.1f}%)")

    # Optional persistence of models and results.
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        joblib.dump(clusterer, output_dir / "hdbscan_model.joblib")
        joblib.dump(reducer, output_dir / "umap_reducer.joblib")
        error_df.to_parquet(output_dir / "error_archetypes.parquet",
                            index=False)

        import json
        with open(output_dir / "archetype_map.json", "w") as f:
            json.dump({str(k): v for k, v in archetype_map.items()}, f,
                      indent=2)

        print(f"\nSaved to {output_dir}")

    return error_df
357
+
358
+
359
def player_archetype_profile(
    error_df: pd.DataFrame,
    player_name: str,
    games_df: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Summarize one player's error-archetype distribution.

    When *games_df* is provided, errors are restricted to games where the
    player held either color; otherwise every row of *error_df* is
    attributed to the player.

    Returns
    -------
    pd.DataFrame
        Columns ['archetype', 'count', 'pct']; empty when the player has
        no error moves.
    """
    if games_df is None:
        player_errors = error_df
    else:
        played = (
            (games_df["white"] == player_name)
            | (games_df["black"] == player_name)
        )
        game_ids = games_df.loc[played, "game_id"].tolist()
        player_errors = error_df[error_df["game_id"].isin(game_ids)]

    if player_errors.empty:
        print(f"No error moves found for player: {player_name}")
        return pd.DataFrame()

    counts = player_errors["archetype"].value_counts()
    profile = counts.reset_index()
    profile.columns = ["archetype", "count"]
    profile["pct"] = (profile["count"] / len(player_errors) * 100).round(1)

    print(f"\nError archetype profile for {player_name}:")
    print(f"Total error moves: {len(player_errors):,}")
    print(profile.to_string(index=False))

    return profile