nbastatpy 0.1.6__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nbastatpy might be problematic. Click here for more details.

@@ -0,0 +1,529 @@
1
+ from datetime import datetime
2
+ from typing import Optional
3
+
4
+ import pandas as pd
5
+ from loguru import logger
6
+
7
+ from nbastatpy.config import (
8
+ ColumnTypes,
9
+ DateFields,
10
+ IDFields,
11
+ SpecialFields,
12
+ TimeFields,
13
+ )
14
+
15
+
16
class DataStandardizer:
    """Base class for standardizing NBA data.

    The standardizer works on a copy of the DataFrame it receives, so the
    caller's frame is never modified. Subclasses extend ``standardize`` with
    additional, domain-specific steps.
    """

    def __init__(self, df: pd.DataFrame, add_metadata: bool = False):
        """Initialize the DataStandardizer.

        Args:
            df: The DataFrame to standardize (copied, not mutated).
            add_metadata: Whether to add metadata fields (standardized_at, etc.)
        """
        self.df = df.copy()
        self.add_metadata = add_metadata

    def standardize(self) -> pd.DataFrame:
        """Apply all standardization steps and return the standardized DataFrame.

        Steps run in a fixed order: lowercase column names, standardize IDs,
        parse dates, convert types, then (optionally) add metadata.

        Returns:
            Standardized DataFrame
        """
        self.lowercase_columns()
        self.standardize_ids()
        self.standardize_dates()
        self.standardize_types()

        if self.add_metadata:
            self.add_metadata_fields()

        return self.df

    def lowercase_columns(self) -> None:
        """Convert all string column names to lowercase.

        Non-string labels (e.g. integer column positions) are left untouched
        instead of raising ``AttributeError``.
        """
        self.df.columns = [
            col.lower() if isinstance(col, str) else col for col in self.df.columns
        ]

    @staticmethod
    def _format_id_series(series: pd.Series) -> pd.Series:
        """Render an ID column as zero-padded 10-character strings.

        Missing values stay missing rather than being turned into padded
        text such as ``"0000000nan"``. Only a trailing ``.0`` (the float
        rendering of an integer ID) is stripped, so a ``.0`` elsewhere in the
        value is preserved.

        Args:
            series: Raw ID values (ints, floats or strings).

        Returns:
            Series of padded ID strings with nulls preserved.
        """
        missing = series.isna()
        formatted = (
            series.astype(str)
            .str.replace(r"\.0$", "", regex=True)
            .str.zfill(10)
        )
        # Blank out positions that were null before the string conversion.
        return formatted.mask(missing)

    def standardize_ids(self) -> None:
        """Standardize ID columns: rename and zero-pad to 10 digits."""
        # First, rename inconsistent ID fields
        for old_name, new_name in IDFields.ID_FIELD_MAPPING.items():
            if old_name in self.df.columns:
                self.df = self.df.rename(columns={old_name: new_name})

        # Then, zero-pad all ID fields
        for id_field in IDFields.ID_FIELDS:
            if id_field not in self.df.columns:
                continue
            try:
                self.df[id_field] = self._format_id_series(self.df[id_field])
            except Exception as e:
                # Best effort: a column that cannot be formatted is left as-is.
                logger.warning(f"Could not standardize ID field {id_field}: {e}")

    def standardize_dates(self) -> None:
        """Parse and standardize date columns.

        Values that cannot be parsed are coerced to missing; the result holds
        ``datetime.date`` objects rather than timestamps.
        """
        for date_field in DateFields.DATE_FIELDS:
            if date_field not in self.df.columns:
                continue
            try:
                self.df[date_field] = pd.to_datetime(
                    self.df[date_field], errors="coerce"
                ).dt.date
            except Exception as e:
                logger.warning(f"Could not parse date field {date_field}: {e}")

    def standardize_types(self) -> None:
        """Convert columns to appropriate data types."""
        # Integer columns - try int first, fall back to float if needed
        for col in ColumnTypes.INTEGER_COLUMNS:
            if col not in self.df.columns:
                continue

            # Convert to numeric, coercing errors to NaN
            numeric_col = pd.to_numeric(self.df[col], errors="coerce")
            non_null = numeric_col.dropna()
            if non_null.empty:
                # Nothing parsed as a number: leave the column untouched.
                continue

            if (non_null == non_null.astype(int)).all():
                # All values are whole numbers: nullable Int64 keeps the NaNs.
                self.df[col] = numeric_col.astype("Int64")
            else:
                # Has decimal values, use float
                self.df[col] = numeric_col

        # Float columns
        for col in ColumnTypes.FLOAT_COLUMNS:
            if col in self.df.columns:
                self.df[col] = pd.to_numeric(self.df[col], errors="coerce")

        # String columns
        # NOTE: astype(str) also stringifies missing values (e.g. "nan").
        for col in ColumnTypes.STRING_COLUMNS:
            if col in self.df.columns:
                self.df[col] = self.df[col].astype(str)

    def add_metadata_fields(self) -> None:
        """Add metadata fields like standardization timestamp."""
        # Naive local time, ISO-8601 formatted, broadcast to every row.
        self.df["standardized_at"] = datetime.now().isoformat()
112
+
113
+
114
class PlayerDataStandardizer(DataStandardizer):
    """Standardizer for player-specific data."""

    # Columns that may carry a position label.
    _POSITION_FIELDS = ("position", "pos", "player_position")

    # Upper-cased spelling -> standard abbreviation. Labels that are not
    # listed here are kept as their upper-cased, stripped form.
    _POSITION_MAP = {
        "GUARD": "G",
        "FORWARD": "F",
        "CENTER": "C",
        "POINT GUARD": "PG",
        "SHOOTING GUARD": "SG",
        "SMALL FORWARD": "SF",
        "POWER FORWARD": "PF",
        "G-F": "GF",
        "F-G": "FG",
        "F-C": "FC",
        "C-F": "CF",
    }

    def standardize(self) -> pd.DataFrame:
        """Apply player-specific standardization steps.

        Returns:
            Standardized DataFrame
        """
        # Apply base standardization
        super().standardize()

        # Player-specific transformations
        self.convert_height()
        self.parse_birthdate()
        self.standardize_weight()
        self.standardize_position()

        return self.df

    @staticmethod
    def _height_to_inches(height_str):
        """Convert one 'feet-inches' value (e.g. '6-11') to total inches.

        Returns None for missing, empty or malformed values so that a single
        bad cell cannot abort the conversion of the whole column.
        """
        if pd.isna(height_str) or height_str == "":
            return None
        parts = str(height_str).split("-")
        if len(parts) != 2:
            return None
        try:
            return int(parts[0]) * 12 + int(parts[1])
        except ValueError:
            # Non-numeric feet or inches component.
            return None

    def convert_height(self) -> None:
        """Convert height from feet-inches format (e.g., '6-11') to total inches.

        The result is written to a new ``height_inches`` column; the source
        column is left unchanged. If several height fields are present, the
        last one processed wins.
        """
        for height_field in SpecialFields.HEIGHT_FIELDS:
            if height_field not in self.df.columns:
                continue
            try:
                self.df["height_inches"] = self.df[height_field].apply(
                    self._height_to_inches
                )
            except Exception as e:
                logger.warning(
                    f"Could not convert height field {height_field}: {e}"
                )

    def parse_birthdate(self) -> None:
        """Parse the birthdate column into ``datetime.date`` values.

        Unparseable entries are coerced to missing.
        """
        if "birthdate" not in self.df.columns:
            return
        try:
            self.df["birthdate"] = pd.to_datetime(
                self.df["birthdate"], errors="coerce"
            ).dt.date
        except Exception as e:
            logger.warning(f"Could not parse birthdate: {e}")

    def standardize_weight(self) -> None:
        """Standardize weight to numeric pounds.

        The first run of digits in each value is kept. Values with no digits
        (including missing ones) become 0, not null.
        """
        for weight_field in SpecialFields.WEIGHT_FIELDS:
            if weight_field not in self.df.columns:
                continue
            try:
                # Remove any text and convert to numeric
                self.df[weight_field] = (
                    pd.to_numeric(
                        self.df[weight_field].astype(str).str.extract(r"(\d+)")[0],
                        errors="coerce",
                    )
                    .fillna(0)
                    .astype("Int64")
                )
            except Exception as e:
                logger.warning(
                    f"Could not standardize weight field {weight_field}: {e}"
                )

    def _clean_position(self, pos):
        """Map one position label to its standard abbreviation."""
        if pd.isna(pos):
            return None
        pos_str = str(pos).upper().strip()
        return self._POSITION_MAP.get(pos_str, pos_str)

    def standardize_position(self) -> None:
        """Standardize position abbreviations."""
        for pos_field in self._POSITION_FIELDS:
            if pos_field not in self.df.columns:
                continue
            try:
                self.df[pos_field] = self.df[pos_field].apply(self._clean_position)
            except Exception as e:
                logger.warning(
                    f"Could not standardize position field {pos_field}: {e}"
                )
217
+
218
+
219
class GameDataStandardizer(DataStandardizer):
    """Standardizer for game-specific data."""

    # Upper-cased spellings recognised as a win / a loss.
    _WIN_VALUES = frozenset({"W", "WIN", "WON", "1", "TRUE"})
    _LOSS_VALUES = frozenset({"L", "LOSS", "LOST", "0", "FALSE"})

    def standardize(self) -> pd.DataFrame:
        """Apply game-specific standardization steps.

        Returns:
            Standardized DataFrame
        """
        # Apply base standardization
        super().standardize()

        # Game-specific transformations
        self.convert_minutes_to_seconds()
        self.convert_matchup_time()
        self.convert_clock_time()
        self.parse_matchup_string()
        self.standardize_wl()

        return self.df

    @staticmethod
    def _mmss_to_seconds(time_str):
        """Convert one 'MM:SS' value to total seconds.

        Both components are parsed through ``float`` first, so a fractional
        rendering such as '34.000000:12' is accepted. Missing, empty or
        malformed values yield None instead of aborting the whole column.
        """
        if pd.isna(time_str) or time_str == "":
            return None
        parts = str(time_str).split(":")
        if len(parts) != 2:
            return None
        try:
            minutes = int(float(parts[0]))
            seconds = int(float(parts[1]))
        except (ValueError, OverflowError):
            # Non-numeric, NaN or infinite component.
            return None
        return minutes * 60 + seconds

    def convert_minutes_to_seconds(self) -> None:
        """Convert MM:SS format to total seconds for minutes fields.

        The result goes to a column named after the source field with
        'minutes' (or 'min') replaced by 'seconds'.
        """
        for field in TimeFields.MINUTES_SECONDS_FIELDS:
            if field not in self.df.columns:
                continue
            try:
                seconds_field = field.replace("minutes", "seconds").replace(
                    "min", "seconds"
                )
                self.df[seconds_field] = self.df[field].apply(self._mmss_to_seconds)
            except Exception as e:
                logger.warning(f"Could not convert time field {field}: {e}")

    def convert_matchup_time(self) -> None:
        """Convert matchupminutes to matchup_seconds."""
        if "matchupminutes" not in self.df.columns:
            return
        try:
            self.df["matchup_seconds"] = self.df["matchupminutes"].apply(
                self._mmss_to_seconds
            )
        except Exception as e:
            logger.warning(f"Could not convert matchupminutes: {e}")

    def convert_clock_time(self) -> None:
        """Process play-by-play clock format (e.g., 'PT11M23.45S').

        Writes the clock value in seconds (float) to ``clock_seconds``.
        Values that do not contain both a minutes and a seconds part map
        to None.
        """
        if "clock" not in self.df.columns:
            return

        import re

        # Compiled once per call rather than once per row.
        minutes_pattern = re.compile(r"(\d+)M")
        seconds_pattern = re.compile(r"M(\d+(?:\.\d+)?)S")

        def parse_clock(clock_str):
            if pd.isna(clock_str) or clock_str == "":
                return None
            text = str(clock_str)
            minutes_match = minutes_pattern.search(text)
            seconds_match = seconds_pattern.search(text)
            if minutes_match and seconds_match:
                minutes = int(minutes_match.group(1))
                seconds = float(seconds_match.group(1))
                return minutes * 60 + seconds
            return None

        try:
            self.df["clock_seconds"] = self.df["clock"].apply(parse_clock)
        except Exception as e:
            logger.warning(f"Could not convert clock time: {e}")

    @staticmethod
    def _extract_teams(matchup_str):
        """Split one matchup string into an ``(away_team, home_team)`` pair.

        'A @ B' means A is away and B is home; 'A vs. B' (or 'A vs B') means
        A is home and B is away. Anything else yields ``(None, None)``.
        """
        if pd.isna(matchup_str) or matchup_str == "":
            return None, None
        parts = str(matchup_str).split()
        if len(parts) < 3:
            return None, None

        last = len(parts) - 1
        if "@" in parts:
            idx = parts.index("@")
            away_team = parts[idx - 1] if idx > 0 else None
            home_team = parts[idx + 1] if idx < last else None
            return away_team, home_team

        if "vs." in parts or "vs" in parts:
            vs_idx = parts.index("vs.") if "vs." in parts else parts.index("vs")
            home_team = parts[vs_idx - 1] if vs_idx > 0 else None
            away_team = parts[vs_idx + 1] if vs_idx < last else None
            return away_team, home_team

        return None, None

    def parse_matchup_string(self) -> None:
        """Parse matchup strings like 'TOR @ BOS' into home/away teams."""
        for matchup_field in SpecialFields.MATCHUP_FIELDS:
            if matchup_field not in self.df.columns:
                continue
            try:
                # Each element is an (away, home) tuple.
                teams = self.df[matchup_field].apply(self._extract_teams)
                self.df["away_team"] = teams.apply(lambda x: x[0] if x else None)
                self.df["home_team"] = teams.apply(lambda x: x[1] if x else None)
            except Exception as e:
                logger.warning(
                    f"Could not parse matchup field {matchup_field}: {e}"
                )

    def _standardize_outcome(self, val):
        """Map one win/loss value to 'W', 'L', or its upper-cased text."""
        if pd.isna(val):
            return None
        val_str = str(val).upper().strip()
        if val_str in self._WIN_VALUES:
            return "W"
        if val_str in self._LOSS_VALUES:
            return "L"
        # Unrecognised labels pass through upper-cased and stripped.
        return val_str

    def standardize_wl(self) -> None:
        """Standardize win/loss indicator fields to consistent format."""
        for wl_field in SpecialFields.WL_FIELDS:
            if wl_field not in self.df.columns:
                continue
            try:
                self.df[wl_field] = self.df[wl_field].apply(
                    self._standardize_outcome
                )
            except Exception as e:
                logger.warning(f"Could not standardize W/L field {wl_field}: {e}")
369
+
370
+
371
class SeasonDataStandardizer(DataStandardizer):
    """Standardizer for season-specific data."""

    def __init__(
        self,
        df: pd.DataFrame,
        season: Optional[str] = None,
        playoffs: bool = False,
        add_metadata: bool = False,
    ):
        """Create a standardizer for season-level frames.

        Args:
            df: The DataFrame to standardize
            season: Season ID (e.g., '2023-24')
            playoffs: Whether this is playoff data
            add_metadata: Whether to add metadata fields
        """
        super().__init__(df, add_metadata)
        self.season = season
        self.playoffs = playoffs

    def standardize(self) -> pd.DataFrame:
        """Run the base steps, then the season-specific ones.

        Returns:
            Standardized DataFrame
        """
        super().standardize()

        # Season-specific transformations, in order.
        self.add_season_id()
        self.add_playoff_flag()
        self.parse_game_dates()

        return self.df

    def add_season_id(self) -> None:
        """Fill in a ``season_id`` column when a season was supplied.

        An existing ``season_id`` column is never overwritten.
        """
        if not self.season:
            return
        if "season_id" in self.df.columns:
            return
        self.df["season_id"] = self.season

    def add_playoff_flag(self) -> None:
        """Create the ``is_playoffs`` label column, or normalize an existing one.

        The column holds the strings "PLAYOFFS" / "REGULAR_SEASON".
        """
        if "is_playoffs" not in self.df.columns:
            if self.playoffs:
                label = "PLAYOFFS"
            else:
                label = "REGULAR_SEASON"
            self.df["is_playoffs"] = label
            return

        def normalize_flag(raw):
            # Missing flags are treated as regular season.
            if pd.isna(raw):
                return "REGULAR_SEASON"
            text = str(raw).upper()
            looks_like_playoffs = (
                "PLAYOFF" in text or text == "TRUE" or text == "1"
            )
            return "PLAYOFFS" if looks_like_playoffs else "REGULAR_SEASON"

        current = self.df["is_playoffs"]
        self.df["is_playoffs"] = current.apply(normalize_flag)

    def parse_game_dates(self) -> None:
        """Convert ``game_date`` to ``datetime.date`` values, coercing failures."""
        if "game_date" not in self.df.columns:
            return
        try:
            parsed = pd.to_datetime(self.df["game_date"], errors="coerce")
            self.df["game_date"] = parsed.dt.date
        except Exception as e:
            logger.warning(f"Could not parse game_date: {e}")
441
+
442
+
443
class TeamDataStandardizer(DataStandardizer):
    """Standardizer for team-specific data."""

    def __init__(
        self,
        df: pd.DataFrame,
        season: Optional[str] = None,
        playoffs: bool = False,
        add_metadata: bool = False,
    ):
        """Create a standardizer for team-level frames.

        Args:
            df: The DataFrame to standardize
            season: Season ID (e.g., '2023-24')
            playoffs: Whether this is playoff data
            add_metadata: Whether to add metadata fields
        """
        super().__init__(df, add_metadata)
        self.season = season
        self.playoffs = playoffs

    def standardize(self) -> pd.DataFrame:
        """Run the base steps, then attach season metadata.

        Returns:
            Standardized DataFrame
        """
        super().standardize()
        self.add_season_metadata()
        return self.df

    def add_season_metadata(self) -> None:
        """Add ``season`` and ``season_type`` columns when they are absent.

        Existing columns are left alone; ``season`` is only added when a
        season value was supplied.
        """
        existing = self.df.columns

        if self.season and "season" not in existing:
            self.df["season"] = self.season

        if "season_type" not in self.df.columns:
            if self.playoffs:
                season_type = "Playoffs"
            else:
                season_type = "Regular Season"
            self.df["season_type"] = season_type
486
+
487
+
488
def standardize_dataframe(
    df: pd.DataFrame,
    data_type: str = "base",
    season: Optional[str] = None,
    playoffs: bool = False,
    add_metadata: bool = False,
) -> pd.DataFrame:
    """Standardize a DataFrame with the standardizer matching ``data_type``.

    The lookup is case-insensitive, and an unrecognised ``data_type`` falls
    back to the base ``DataStandardizer``.

    Args:
        df: The DataFrame to standardize
        data_type: Type of data ('player', 'game', 'season', 'team', or 'base')
        season: Season ID for season/team data
        playoffs: Whether this is playoff data
        add_metadata: Whether to add metadata fields

    Returns:
        Standardized DataFrame

    Example:
        >>> df = player.get_common_info()
        >>> standardized_df = standardize_dataframe(df, data_type='player')
    """
    registry = {
        "player": PlayerDataStandardizer,
        "game": GameDataStandardizer,
        "season": SeasonDataStandardizer,
        "team": TeamDataStandardizer,
        "base": DataStandardizer,
    }

    kind = data_type.lower()
    chosen = registry.get(kind, DataStandardizer)

    # Only the season and team standardizers accept season/playoffs.
    options = {"add_metadata": add_metadata}
    if kind in ("season", "team"):
        options["season"] = season
        options["playoffs"] = playoffs

    return chosen(df, **options).standardize()
nbastatpy/team.py CHANGED
@@ -6,17 +6,18 @@ import requests
6
6
  from bs4 import BeautifulSoup
7
7
  from nba_api.stats.static import teams
8
8
 
9
+ from nbastatpy.standardize import standardize_dataframe
9
10
  from nbastatpy.utils import Formatter, PlayTypes
10
11
 
11
12
 
12
13
  class Team:
13
14
  def __init__(
14
- self,
15
- team_abbreviation: str,
16
- season_year: str = None,
17
- playoffs=False,
18
- permode: str = "PerGame",
19
- ):
15
+ self,
16
+ team_abbreviation: str,
17
+ season_year: str = None,
18
+ playoffs=False,
19
+ permode: str = "PerGame",
20
+ ):
20
21
  """
21
22
  Initializes a Team object.
22
23
 
@@ -63,17 +64,33 @@ class Team:
63
64
  self.logo = pic.content
64
65
  return self.logo
65
66
 
66
def get_roster(self, standardize: bool = False) -> List[pd.DataFrame]:
    """
    Retrieves the roster of the team for the specified season.

    Args:
        standardize: Whether to apply data standardization

    Returns:
        List[pd.DataFrame]: A list of pandas DataFrames containing the roster data.
    """
    endpoint = nba.CommonTeamRoster(
        self.id,
        season=self.season,
    )
    roster_frames = endpoint.get_data_frames()

    if standardize:
        standardized = []
        for frame in roster_frames:
            # Every frame goes through the "team" standardizer.
            standardized.append(
                standardize_dataframe(
                    frame,
                    data_type="team",
                    season=self.season,
                    playoffs=(self.season_type == "Playoffs"),
                )
            )
        roster_frames = standardized

    # Stored on the instance as well as returned.
    self.roster = roster_frames
    return self.roster
78
95
 
79
96
  def get_salary(self) -> pd.DataFrame:
@@ -91,13 +108,16 @@ class Team:
91
108
  result = requests.get(self.salary_url)
92
109
  soup = BeautifulSoup(result.content, features="html.parser")
93
110
  tables = soup.find_all("table")
94
-
95
- rows = [[cell.text.strip() for cell in row.find_all('td')] for row in tables[0].find_all('tr')]
96
-
111
+
112
+ rows = [
113
+ [cell.text.strip() for cell in row.find_all("td")]
114
+ for row in tables[0].find_all("tr")
115
+ ]
116
+
97
117
  if not rows[0]:
98
118
  rows.pop(0)
99
119
  if not rows:
100
- raise(ValueError(f"Season data unavailable for: {season_string}"))
120
+ raise (ValueError(f"Season data unavailable for: {season_string}"))
101
121
  self.salary_df = pd.DataFrame(rows[1:], columns=rows[0])
102
122
  self.salary_df["Season"] = self.salary_df.columns[1].replace("/", "_")
103
123
  self.salary_df.columns = ["Player", "Salary", "Adjusted Salary", "Season"]
@@ -269,22 +289,35 @@ class Team:
269
289
 
270
290
  return self.player_shot_locations
271
291
 
272
def get_player_stats(self, standardize: bool = False) -> pd.DataFrame:
    """
    Retrieves the player statistics for the team.

    Args:
        standardize: Whether to apply data standardization

    Returns:
        pd.DataFrame: A DataFrame containing the player statistics.
    """
    endpoint = nba.LeagueDashPlayerStats(
        team_id_nullable=self.id,
        season=self.season,
        season_type_all_star=self.season_type,
        per_mode_detailed=self.permode,
    )
    # Only the first frame returned by the endpoint is used.
    stats = endpoint.get_data_frames()[0]

    # Tag every row with the season context before any standardization.
    stats["season"] = self.season
    stats["season_type"] = self.season_type

    if standardize:
        stats = standardize_dataframe(
            stats,
            data_type="team",
            season=self.season,
            playoffs=(self.season_type == "Playoffs"),
        )

    # Stored on the instance as well as returned.
    self.player_stats = stats
    return self.player_stats
289
322
 
290
323
  def get_player_point_defend(self) -> pd.DataFrame: