chessvision-py 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chessvision_py-0.1.0/PKG-INFO +89 -0
- chessvision_py-0.1.0/README.md +56 -0
- chessvision_py-0.1.0/chessvision/__init__.py +33 -0
- chessvision_py-0.1.0/chessvision/analyze.py +124 -0
- chessvision_py-0.1.0/chessvision/archetypes.py +393 -0
- chessvision_py-0.1.0/chessvision/elo_forecast.py +1243 -0
- chessvision_py-0.1.0/chessvision/evaluator.py +317 -0
- chessvision_py-0.1.0/chessvision/features.py +339 -0
- chessvision_py-0.1.0/chessvision/models.py +194 -0
- chessvision_py-0.1.0/chessvision/parser.py +208 -0
- chessvision_py-0.1.0/chessvision/recommender.py +617 -0
- chessvision_py-0.1.0/chessvision_py.egg-info/PKG-INFO +89 -0
- chessvision_py-0.1.0/chessvision_py.egg-info/SOURCES.txt +25 -0
- chessvision_py-0.1.0/chessvision_py.egg-info/dependency_links.txt +1 -0
- chessvision_py-0.1.0/chessvision_py.egg-info/requires.txt +15 -0
- chessvision_py-0.1.0/chessvision_py.egg-info/top_level.txt +1 -0
- chessvision_py-0.1.0/pyproject.toml +60 -0
- chessvision_py-0.1.0/setup.cfg +4 -0
- chessvision_py-0.1.0/tests/test_analyze.py +48 -0
- chessvision_py-0.1.0/tests/test_archetypes.py +131 -0
- chessvision_py-0.1.0/tests/test_elo_forecast.py +163 -0
- chessvision_py-0.1.0/tests/test_evaluator.py +110 -0
- chessvision_py-0.1.0/tests/test_features.py +142 -0
- chessvision_py-0.1.0/tests/test_parser.py +79 -0
- chessvision_py-0.1.0/tests/test_recommender.py +157 -0
- chessvision_py-0.1.0/tests/test_smoke.py +8 -0
- chessvision_py-0.1.0/tests/test_stream.py +181 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chessvision-py
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Computational chess performance analysis — ML-powered insights from personal PGN archives
|
|
5
|
+
Author-email: Rakkshet Singhaal <rakkshetsinghaal99@gmail.com>
|
|
6
|
+
License: GPL-3.0-or-later
|
|
7
|
+
Project-URL: Homepage, https://github.com/chesslens-project/chessvision-py
|
|
8
|
+
Project-URL: Repository, https://github.com/chesslens-project/chessvision-py
|
|
9
|
+
Keywords: chess,machine-learning,performance-analysis,stockfish,pgn
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Topic :: Games/Entertainment :: Board Games
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: chess>=1.10.0
|
|
20
|
+
Requires-Dist: pandas>=2.0.0
|
|
21
|
+
Requires-Dist: numpy>=1.24.0
|
|
22
|
+
Requires-Dist: pyarrow>=12.0.0
|
|
23
|
+
Requires-Dist: zstandard>=0.21.0
|
|
24
|
+
Requires-Dist: gensim>=4.3.0
|
|
25
|
+
Requires-Dist: requests>=2.31.0
|
|
26
|
+
Requires-Dist: tqdm>=4.65.0
|
|
27
|
+
Requires-Dist: scipy>=1.11.0
|
|
28
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
29
|
+
Requires-Dist: joblib>=1.3.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
33
|
+
|
|
34
|
+
# chessvision-py
|
|
35
|
+
|
|
36
|
+
A Python library for computational chess performance analysis. Takes your PGN game files and runs machine learning on them to tell you what kind of mistakes you make and how to improve.
|
|
37
|
+
|
|
38
|
+
## What it does
|
|
39
|
+
|
|
40
|
+
- Parses PGN files from chess.com, lichess, or OTB tournaments
|
|
41
|
+
- Evaluates every position with Stockfish (cached, resumable)
|
|
42
|
+
- Clusters your errors into 5 distinct archetypes
|
|
43
|
+
- Embeds your playing style into a vector space trained on 29.3M elite games
|
|
44
|
+
- Forecasts ELO trajectory using a model trained on 67,115 elite players
|
|
45
|
+
- Generates a personalized weekly training plan
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install chessvision-py
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Requires Stockfish:
|
|
54
|
+
```bash
|
|
55
|
+
brew install stockfish # Mac
|
|
56
|
+
apt install stockfish # Linux
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Quick start
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import chessvision as cv
|
|
63
|
+
|
|
64
|
+
cv.download_models()
|
|
65
|
+
|
|
66
|
+
report = cv.analyze("my_games.pgn", player_name="YourUsername")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Or step by step:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
games, moves = cv.parse_pgn("my_games.pgn")
|
|
73
|
+
moves = cv.evaluate_games(moves)
|
|
74
|
+
moves = cv.engineer_features(games, moves)
|
|
75
|
+
errors = cv.run_archetype_analysis(moves)
|
|
76
|
+
report = cv.analyze_player(moves, games, errors, "YourUsername")
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Pre-trained models
|
|
80
|
+
|
|
81
|
+
Models are hosted on Hugging Face at `rakkshet/chessvision-models` and download automatically on first use:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
cv.download_models()
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
GPL-3.0
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# chessvision-py
|
|
2
|
+
|
|
3
|
+
A Python library for computational chess performance analysis. Takes your PGN game files and runs machine learning on them to tell you what kind of mistakes you make and how to improve.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
- Parses PGN files from chess.com, lichess, or OTB tournaments
|
|
8
|
+
- Evaluates every position with Stockfish (cached, resumable)
|
|
9
|
+
- Clusters your errors into 5 distinct archetypes
|
|
10
|
+
- Embeds your playing style into a vector space trained on 29.3M elite games
|
|
11
|
+
- Forecasts ELO trajectory using a model trained on 67,115 elite players
|
|
12
|
+
- Generates a personalized weekly training plan
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install chessvision-py
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Requires Stockfish:
|
|
21
|
+
```bash
|
|
22
|
+
brew install stockfish # Mac
|
|
23
|
+
apt install stockfish # Linux
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick start
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import chessvision as cv
|
|
30
|
+
|
|
31
|
+
cv.download_models()
|
|
32
|
+
|
|
33
|
+
report = cv.analyze("my_games.pgn", player_name="YourUsername")
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Or step by step:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
games, moves = cv.parse_pgn("my_games.pgn")
|
|
40
|
+
moves = cv.evaluate_games(moves)
|
|
41
|
+
moves = cv.engineer_features(games, moves)
|
|
42
|
+
errors = cv.run_archetype_analysis(moves)
|
|
43
|
+
report = cv.analyze_player(moves, games, errors, "YourUsername")
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Pre-trained models
|
|
47
|
+
|
|
48
|
+
Models are hosted on Hugging Face at `rakkshet/chessvision-models` and download automatically on first use:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
cv.download_models()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## License
|
|
55
|
+
|
|
56
|
+
GPL-3.0
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from .analyze import analyze
|
|
2
|
+
from .parser import parse_pgn
|
|
3
|
+
from .evaluator import evaluate_games
|
|
4
|
+
from .features import engineer_features, feature_summary
|
|
5
|
+
from .archetypes import run_archetype_analysis, player_archetype_profile
|
|
6
|
+
from .recommender import (
|
|
7
|
+
build_player_profile,
|
|
8
|
+
generate_recommendations,
|
|
9
|
+
analyze_player,
|
|
10
|
+
)
|
|
11
|
+
from .models import (
|
|
12
|
+
download_models,
|
|
13
|
+
register_local_models,
|
|
14
|
+
list_models,
|
|
15
|
+
is_downloaded,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# The forecasting API is optional: if importing .elo_forecast fails
# (presumably because one of its heavier dependencies is not
# installed — TODO confirm which), the core package still imports
# cleanly and only these names are missing from the public API.
try:
    from .elo_forecast import (
        build_game_features,
        train_elo_model,
        train_win_classifier,
        train_population_lstm,
        fine_tune_on_personal,
        predict_elo_trajectory,
        load_model,
    )
except ImportError:
    # Forecasting extras unavailable — expose the core API only.
    pass

__version__ = "0.1.0"
__author__ = "Rakkshet Singhaal"
# NOTE(review): this address differs from the gmail address in the
# package metadata (PKG-INFO) — confirm which one is canonical.
__email__ = "rakkshet.singhaal@kellogg.northwestern.edu"
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
analyze.py
|
|
3
|
+
|
|
4
|
+
Main entry point for chessvision.
|
|
5
|
+
One function call produces the full analysis report.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional, Union
|
|
11
|
+
|
|
12
|
+
from .parser import parse_pgn
|
|
13
|
+
from .evaluator import evaluate_games
|
|
14
|
+
from .features import engineer_features
|
|
15
|
+
from .archetypes import run_archetype_analysis
|
|
16
|
+
from .recommender import analyze_player
|
|
17
|
+
from .models import get_cached_path
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def analyze(
    pgn_path: Union[str, Path],
    player_name: Optional[str] = None,
    stockfish_path: str = "stockfish",
    depth: int = 15,
    output_path: Optional[Union[str, Path]] = None,
    verbose: bool = True,
) -> dict:
    """
    Full chessvision analysis pipeline — one function call.

    Takes a PGN file (or folder of PGN files) and returns a
    comprehensive training report with error archetypes, style
    profile, and personalized recommendations.

    Parameters
    ----------
    pgn_path : path to .pgn file or folder of .pgn files
    player_name : name of the player to analyze
        (when omitted, auto-detected as the player who appears
        most often in the parsed games — see _detect_player)
    stockfish_path: path to Stockfish binary (default: 'stockfish' on PATH)
    depth : Stockfish search depth (default: 15)
    output_path : if set, saves JSON report here
    verbose : print progress and report

    Returns
    -------
    dict with full analysis results

    Example
    -------
    >>> import chessvision as cv
    >>> report = cv.analyze("my_games.pgn", player_name="MyUsername")
    """
    pgn_path = Path(pgn_path)

    if verbose:
        print("=" * 60)
        print(" ChessVision Analysis Pipeline")
        print("=" * 60)
        print()

    # ── Step 1: Parse ────────────────────────────────────────────
    if verbose:
        print("[1/5] Parsing PGN files...")
    games_df, moves_df = parse_pgn(pgn_path)

    # Auto-detect player when none was supplied.
    if player_name is None:
        player_name = _detect_player(games_df)
        if verbose:
            print(f"    Auto-detected player: {player_name}")

    # ── Step 2: Evaluate ─────────────────────────────────────────
    # Engine evaluation is the slow step; evaluate_games caches its
    # results so an interrupted run resumes where it stopped.
    if verbose:
        print("\n[2/5] Evaluating positions with Stockfish...")
        print("      (This step is cached — safe to stop and restart)")
    moves_df = evaluate_games(
        moves_df,
        stockfish_path = stockfish_path,
        depth = depth,
        cache = True,
    )

    # ── Step 3: Feature engineering ──────────────────────────────
    if verbose:
        print("\n[3/5] Engineering behavioral features...")
    moves_df = engineer_features(games_df, moves_df)

    # ── Step 4: Error archetypes ──────────────────────────────────
    if verbose:
        print("\n[4/5] Discovering error archetypes...")

    # Use pre-trained model if available
    # NOTE(review): return value is discarded — presumably this call
    # primes the local model cache (download side effect) before the
    # analysis runs; confirm against models.get_cached_path.
    get_cached_path("error_archetypes", "hdbscan_model.joblib")
    error_df = run_archetype_analysis(
        moves_df,
        min_cluster_size = 200,
        min_samples = 50,
    )

    # ── Step 5: Recommendations ──────────────────────────────────
    if verbose:
        print("\n[5/5] Generating personalized recommendations...")

    chess2vec_path = get_cached_path(
        "chess2vec", "chess2vec.wordvectors"
    )

    report = analyze_player(
        moves_df = moves_df,
        games_df = games_df,
        error_df = error_df,
        player_name = player_name,
        chess2vec_path = chess2vec_path,
        output_path = Path(output_path) if output_path else None,
    )

    return report
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _detect_player(games_df: pd.DataFrame) -> str:
|
|
122
|
+
"""Auto-detect the focal player — the one who appears most often."""
|
|
123
|
+
all_players = pd.concat([games_df["white"], games_df["black"]])
|
|
124
|
+
return all_players.value_counts().index[0]
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
"""
|
|
2
|
+
archetypes.py
|
|
3
|
+
|
|
4
|
+
Unsupervised error archetype discovery using HDBSCAN.
|
|
5
|
+
|
|
6
|
+
Takes the evaluated move dataframe and clusters error moves
|
|
7
|
+
into behaviorally distinct groups — replacing the naive
|
|
8
|
+
blunder/mistake/inaccuracy taxonomy with a data-driven one.
|
|
9
|
+
|
|
10
|
+
Expected archetypes:
|
|
11
|
+
- Tactical blindspot : sharp positions, sudden CPL spike
|
|
12
|
+
- Strategic drift : cumulative small losses, passive pieces
|
|
13
|
+
- Time pressure : low clock, CPL spikes regardless of position
|
|
14
|
+
- Endgame failure : winning positions lost in low material
|
|
15
|
+
- Preparation boundary : CPL spike at specific opening move number
|
|
16
|
+
- Positional confusion : complex pawn structures, gradual CPL creep
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
import joblib
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Optional
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
import hdbscan
|
|
27
|
+
except ImportError:
|
|
28
|
+
raise ImportError("Install hdbscan: pip3 install hdbscan")
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
import umap
|
|
32
|
+
except ImportError:
|
|
33
|
+
raise ImportError("Install umap-learn: pip3 install umap-learn")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Canonical archetype names, keyed by cluster id (-1 is HDBSCAN noise).
# NOTE(review): label_archetypes() assigns archetype strings from each
# cluster's feature signature rather than looking ids up in this table,
# so cluster 0 is not guaranteed to be "tactical_blindspot" etc. —
# presumably this documents the intended convention; confirm whether
# any other module actually reads it.
ARCHETYPE_LABELS = {
    -1: "noise",
    0: "tactical_blindspot",
    1: "strategic_drift",
    2: "time_pressure_collapse",
    3: "endgame_failure",
    4: "preparation_boundary",
    5: "positional_confusion",
    6: "other",
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def build_error_features(moves_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the per-move feature frame used for error clustering.

    Keeps only moves with centipawn loss >= 50 and derives the
    normalized columns consumed by get_feature_matrix():

      cpl_norm                 : CPL clipped at 1000, scaled to [0, 1]
      eval_before_norm         : eval clipped at ±1000, scaled to [-1, 1]
      phase_encoded            : opening=0, middlegame=0.5, endgame=1
      move_number_norm         : move number capped at 80, scaled
      is_capture               : 1.0 when the SAN contains an 'x'
      cumulative_cpl_norm      : running CPL in the game, capped at 3000
      clock_pressure_index     : filled with 0.5 when missing
      position_complexity_score: filled with 0.5 when missing
      mobility_ratio_norm      : mobility clamped to [0.1, 5.0], min-max scaled

    Returns
    -------
    pd.DataFrame
        The filtered error rows with the feature columns added.
    """
    df = moves_df.copy()

    # Only moves that actually cost evaluation count as errors.
    error_df = df[df["cpl"] >= 50].copy()
    print(f"Error moves (CPL >= 50): {len(error_df):,} of {len(df):,} total")

    # Clipping at 1000 cp keeps mate-score outliers from dominating.
    error_df["cpl_norm"] = error_df["cpl"].clip(upper=1000) / 1000.0
    error_df["eval_before_norm"] = (
        error_df["eval_before"].clip(-1000, 1000) / 1000.0
    )

    # Ordinal game phase; unknown phases default to middlegame.
    error_df["phase_encoded"] = (
        error_df["phase"]
        .map({"opening": 0.0, "middlegame": 0.5, "endgame": 1.0})
        .fillna(0.5)
    )

    error_df["move_number_norm"] = error_df["move_number"].clip(upper=80) / 80.0

    # Captures carry an 'x' in SAN notation.
    error_df["is_capture"] = (
        error_df["san"].str.contains("x", na=False).astype(float)
    )

    # Running CPL total per game — computed over ALL moves of the game,
    # then aligned back onto the error rows by index.
    running_cpl = df.groupby("game_id")["cpl"].transform(
        lambda s: s.fillna(0).cumsum()
    )
    error_df["cumulative_cpl"] = running_cpl.loc[error_df.index]
    error_df["cumulative_cpl_norm"] = (
        error_df["cumulative_cpl"].clip(upper=3000) / 3000.0
    )

    # Neutral defaults for missing behavioral signals.
    error_df["clock_pressure_index"] = error_df["clock_pressure_index"].fillna(0.5)
    error_df["position_complexity_score"] = (
        error_df["position_complexity_score"].fillna(0.5)
    )

    # Mobility: fill, clamp to [0.1, 5.0], then min-max scale.
    mobility = error_df["mobility_ratio"].fillna(1.0).clip(0.1, 5.0)
    error_df["mobility_ratio"] = mobility
    error_df["mobility_ratio_norm"] = (mobility - 0.1) / (5.0 - 0.1)

    return error_df
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def get_feature_matrix(error_df: pd.DataFrame) -> np.ndarray:
    """Return the clustering feature columns as a float32 numpy matrix.

    Raises
    ------
    ValueError
        If any required feature column is absent from ``error_df``.
    """
    wanted = (
        "cpl_norm",
        "eval_before_norm",
        "clock_pressure_index",
        "position_complexity_score",
        "mobility_ratio_norm",
        "phase_encoded",
        "move_number_norm",
        "is_capture",
        "cumulative_cpl_norm",
    )

    # Fail loudly if build_error_features() has not been run.
    missing = [col for col in wanted if col not in error_df.columns]
    if missing:
        raise ValueError(f"Missing feature columns: {missing}")

    X = error_df[list(wanted)].values.astype(np.float32)
    print(f"Feature matrix shape: {X.shape}")
    return X
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def reduce_dimensions(
    X: np.ndarray,
    n_components: int = 10,
    random_state: int = 42,
) -> "tuple[np.ndarray, umap.UMAP]":
    """
    Reduce feature dimensions with UMAP before clustering.

    HDBSCAN degrades in high dimensions — running UMAP first improves
    clustering results.

    Parameters
    ----------
    X : feature matrix of shape (n_samples, n_features)
    n_components : target dimensionality
    random_state : seed for a reproducible embedding

    Returns
    -------
    (X_reduced, reducer)
        The embedded matrix of shape (n_samples, n_components) and the
        fitted UMAP reducer (returned so it can be persisted and used
        to transform new points).  The previous annotation claimed a
        bare ndarray, which misled callers about the tuple return.
    """
    print(f"Reducing {X.shape[1]}D → {n_components}D with UMAP...")
    reducer = umap.UMAP(
        n_components = n_components,
        n_neighbors = 30,
        min_dist = 0.0,
        metric = "euclidean",
        random_state = random_state,
    )
    X_reduced = reducer.fit_transform(X)
    print(f"Reduction complete. Shape: {X_reduced.shape}")
    return X_reduced, reducer
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def cluster_errors(
    X_reduced: np.ndarray,
    min_cluster_size: int = 200,
    min_samples: int = 50,
) -> "tuple[np.ndarray, hdbscan.HDBSCAN]":
    """
    Run HDBSCAN on the UMAP-reduced feature matrix.

    Parameters
    ----------
    X_reduced : embedded feature matrix (n_samples, n_dims)
    min_cluster_size : minimum moves to form a cluster
    min_samples : controls how conservative clustering is
        (higher = fewer, denser clusters)

    Returns
    -------
    (labels, clusterer)
        Per-sample cluster labels (-1 marks noise) and the fitted
        clusterer; prediction_data=True keeps the data needed to
        assign labels to new points later.  The previous annotation
        claimed a bare ndarray, which misled callers about the tuple
        return.
    """
    print(f"Running HDBSCAN "
          f"(min_cluster_size={min_cluster_size}, "
          f"min_samples={min_samples})...")

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size = min_cluster_size,
        min_samples = min_samples,
        metric = "euclidean",
        cluster_selection_method = "eom",
        prediction_data = True,
    )
    labels = clusterer.fit_predict(X_reduced)

    # -1 is HDBSCAN's noise label, not a real cluster.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = (labels == -1).sum()
    print(f"Found {n_clusters} clusters, {n_noise:,} noise points "
          f"({n_noise/len(labels)*100:.1f}%)")

    return labels, clusterer
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def label_archetypes(
    error_df: pd.DataFrame,
    labels: np.ndarray,
) -> "tuple[pd.DataFrame, dict]":
    """
    Assign archetype labels to clusters by inspecting
    the mean feature profile of each cluster.

    Labeling rules (in priority order — first match wins):
      time_pressure_collapse : mean clock_pressure_index < 0.2
      endgame_failure        : mean phase_encoded > 0.75
      preparation_boundary   : mean move_number_norm < 0.2
      tactical_blindspot     : mean cpl_norm > 0.5 and complexity > 0.6
      strategic_drift        : mean cumulative_cpl_norm > 0.5, complexity < 0.4
      positional_confusion   : mean cumulative_cpl_norm > 0.4, complexity > 0.4
      other                  : everything else

    Returns
    -------
    (df, archetype_map)
        A copy of ``error_df`` with ``cluster`` and ``archetype``
        columns added, plus the {cluster_id: archetype} mapping.
        Noise points (cluster -1) are labeled "noise".  (The previous
        annotation claimed a bare DataFrame, which misled callers
        about the tuple return; the docstring also stated the 0.8
        endgame threshold where the code uses 0.75.)
    """
    df = error_df.copy()
    df["cluster"] = labels
    df["archetype"] = "noise"

    # -1 (noise) is excluded from profiling and mapping.
    cluster_ids = [c for c in sorted(df["cluster"].unique()) if c != -1]
    archetype_map = {}

    print("\nCluster profiles:")
    print(f"{'Cluster':>8} {'N':>7} {'CPL':>6} {'Clock':>6} "
          f"{'Phase':>6} {'Complexity':>10} {'MoveN':>6} Archetype")
    print("-" * 75)

    for cid in cluster_ids:
        mask = df["cluster"] == cid
        sub = df[mask]
        n = len(sub)

        mean_cpl = sub["cpl_norm"].mean()
        mean_clock = sub["clock_pressure_index"].mean()
        mean_phase = sub["phase_encoded"].mean()
        mean_complexity = sub["position_complexity_score"].mean()
        mean_move = sub["move_number_norm"].mean()
        mean_cumul = sub["cumulative_cpl_norm"].mean()

        # Assign archetype by feature signature (priority order above).
        if mean_clock < 0.2:
            archetype = "time_pressure_collapse"
        elif mean_phase > 0.75:
            archetype = "endgame_failure"
        elif mean_move < 0.2:
            archetype = "preparation_boundary"
        elif mean_cpl > 0.5 and mean_complexity > 0.6:
            archetype = "tactical_blindspot"
        elif mean_cumul > 0.5 and mean_complexity < 0.4:
            archetype = "strategic_drift"
        elif mean_cumul > 0.4 and mean_complexity > 0.4:
            archetype = "positional_confusion"
        else:
            archetype = "other"

        archetype_map[cid] = archetype
        print(f"{cid:>8} {n:>7,} {mean_cpl:>6.3f} {mean_clock:>6.3f} "
              f"{mean_phase:>6.3f} {mean_complexity:>10.3f} "
              f"{mean_move:>6.3f} {archetype}")

    # Clusters absent from the map (i.e. -1) fall back to "noise".
    df["archetype"] = df["cluster"].map(archetype_map).fillna("noise")
    return df, archetype_map
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def run_archetype_analysis(
    moves_df: pd.DataFrame,
    min_cluster_size: int = 200,
    min_samples: int = 50,
    umap_components: int = 10,
    output_dir: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Full pipeline: features → UMAP → HDBSCAN → archetypes.

    Parameters
    ----------
    moves_df : evaluated move dataframe
        (must carry cpl, phase, move_number, clock_pressure_index,
        position_complexity_score and mobility_ratio — i.e.
        evaluate_games() and engineer_features() have been run)
    min_cluster_size : HDBSCAN minimum cluster size
    min_samples : HDBSCAN minimum samples
    umap_components : UMAP output dimensions
    output_dir : if set, saves model and results here

    Returns
    -------
    error_df with cluster and archetype columns added

    Raises
    ------
    ValueError
        If required columns are missing or no move has an evaluation.
    """
    print("=" * 60)
    print("Phase 3B — Error Archetype Analysis")
    print("=" * 60)

    # Check input — fail fast with actionable guidance rather than a
    # KeyError deep inside the feature builder.
    required = ["cpl", "phase", "move_number", "clock_pressure_index",
                "position_complexity_score", "mobility_ratio"]
    missing = [c for c in required if c not in moves_df.columns]
    if missing:
        raise ValueError(
            f"Missing columns: {missing}. "
            f"Run evaluate_games() and engineer_features() first."
        )

    evaluated = moves_df["cpl"].notna().sum()
    if evaluated == 0:
        raise ValueError("No evaluated moves found. Run evaluate_games() first.")

    print(f"\nInput: {len(moves_df):,} moves, {evaluated:,} evaluated")

    # Step 1: build features (filters to CPL >= 50 error moves)
    print("\n[1/4] Building error features...")
    error_df = build_error_features(moves_df)

    # Step 2: feature matrix
    print("\n[2/4] Extracting feature matrix...")
    X = get_feature_matrix(error_df)

    # Step 3: UMAP reduction (HDBSCAN works better in low dimensions)
    print("\n[3/4] Dimensionality reduction...")
    X_reduced, reducer = reduce_dimensions(X, n_components=umap_components)

    # Step 4: clustering
    print("\n[4/4] Clustering...")
    labels, clusterer = cluster_errors(
        X_reduced, min_cluster_size, min_samples
    )

    # Label archetypes from each cluster's mean feature profile
    error_df, archetype_map = label_archetypes(error_df, labels)

    # Summary
    print("\nArchetype distribution:")
    arch_counts = error_df["archetype"].value_counts()
    for arch, count in arch_counts.items():
        pct = count / len(error_df) * 100
        print(f"  {arch:<28} : {count:>6,} ({pct:.1f}%)")

    # Save if requested — fitted models are persisted so new games can
    # be assigned to the same clusters later.
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        joblib.dump(clusterer, output_dir / "hdbscan_model.joblib")
        joblib.dump(reducer, output_dir / "umap_reducer.joblib")

        error_df.to_parquet(output_dir / "error_archetypes.parquet",
                            index=False)

        # json only needed on this save path; keys stringified because
        # JSON object keys must be strings (cluster ids are ints).
        import json
        with open(output_dir / "archetype_map.json", "w") as f:
            json.dump({str(k): v for k, v in archetype_map.items()}, f,
                      indent=2)

        print(f"\nSaved to {output_dir}")

    return error_df
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def player_archetype_profile(
|
|
360
|
+
error_df: pd.DataFrame,
|
|
361
|
+
player_name: str,
|
|
362
|
+
games_df: Optional[pd.DataFrame] = None,
|
|
363
|
+
) -> pd.DataFrame:
|
|
364
|
+
"""
|
|
365
|
+
Compute archetype distribution for a specific player.
|
|
366
|
+
Returns a dataframe with archetype counts and percentages.
|
|
367
|
+
"""
|
|
368
|
+
if games_df is not None:
|
|
369
|
+
player_games = games_df[
|
|
370
|
+
(games_df["white"] == player_name) |
|
|
371
|
+
(games_df["black"] == player_name)
|
|
372
|
+
]["game_id"].tolist()
|
|
373
|
+
player_errors = error_df[error_df["game_id"].isin(player_games)]
|
|
374
|
+
else:
|
|
375
|
+
player_errors = error_df
|
|
376
|
+
|
|
377
|
+
if len(player_errors) == 0:
|
|
378
|
+
print(f"No error moves found for player: {player_name}")
|
|
379
|
+
return pd.DataFrame()
|
|
380
|
+
|
|
381
|
+
profile = (
|
|
382
|
+
player_errors["archetype"]
|
|
383
|
+
.value_counts()
|
|
384
|
+
.reset_index()
|
|
385
|
+
)
|
|
386
|
+
profile.columns = ["archetype", "count"]
|
|
387
|
+
profile["pct"] = (profile["count"] / len(player_errors) * 100).round(1)
|
|
388
|
+
|
|
389
|
+
print(f"\nError archetype profile for {player_name}:")
|
|
390
|
+
print(f"Total error moves: {len(player_errors):,}")
|
|
391
|
+
print(profile.to_string(index=False))
|
|
392
|
+
|
|
393
|
+
return profile
|