wsba-hockey 1.1.7__py3-none-any.whl → 1.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsba_hockey/__init__.py +22 -1
- wsba_hockey/data_pipelines.py +2 -2
- wsba_hockey/tools/scraping.py +20 -20
- wsba_hockey/workspace.py +2 -5
- wsba_hockey/wsba_main.py +347 -89
- {wsba_hockey-1.1.7.dist-info → wsba_hockey-1.1.9.dist-info}/METADATA +1 -1
- {wsba_hockey-1.1.7.dist-info → wsba_hockey-1.1.9.dist-info}/RECORD +10 -10
- {wsba_hockey-1.1.7.dist-info → wsba_hockey-1.1.9.dist-info}/WHEEL +0 -0
- {wsba_hockey-1.1.7.dist-info → wsba_hockey-1.1.9.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-1.1.7.dist-info → wsba_hockey-1.1.9.dist-info}/top_level.txt +0 -0
wsba_hockey/__init__.py
CHANGED
@@ -1 +1,22 @@
|
|
1
|
-
from wsba_hockey.wsba_main import
|
1
|
+
from wsba_hockey.wsba_main import (
|
2
|
+
nhl_scrape_game,
|
3
|
+
nhl_scrape_schedule,
|
4
|
+
nhl_scrape_season,
|
5
|
+
nhl_scrape_seasons_info,
|
6
|
+
nhl_scrape_standings,
|
7
|
+
nhl_scrape_roster,
|
8
|
+
nhl_scrape_draft_rankings,
|
9
|
+
nhl_scrape_prospects,
|
10
|
+
nhl_scrape_game_info,
|
11
|
+
nhl_calculate_stats,
|
12
|
+
nhl_apply_xG,
|
13
|
+
nhl_plot_skaters_shots,
|
14
|
+
nhl_plot_games,
|
15
|
+
repo_load_rosters,
|
16
|
+
repo_load_schedule,
|
17
|
+
repo_load_teaminfo,
|
18
|
+
repo_load_pbp,
|
19
|
+
repo_load_seasons
|
20
|
+
)
|
21
|
+
|
22
|
+
from wsba_hockey.wsba_main import NHL_Database
|
wsba_hockey/data_pipelines.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import os
|
2
2
|
import numpy as np
|
3
3
|
import pandas as pd
|
4
|
-
import wsba_main as wsba
|
4
|
+
import wsba_hockey.wsba_main as wsba
|
5
5
|
import numpy as np
|
6
6
|
from datetime import datetime
|
7
7
|
from gspread_pandas import Spread
|
@@ -14,7 +14,7 @@ def pbp(seasons):
|
|
14
14
|
for season in seasons:
|
15
15
|
errors=[]
|
16
16
|
for season in seasons:
|
17
|
-
data = wsba.nhl_scrape_season(season,remove=[],local=
|
17
|
+
data = wsba.nhl_scrape_season(season,remove=[],local=False,sources=True,errors=True)
|
18
18
|
errors.append(data['errors'])
|
19
19
|
data['pbp'].to_csv('temp.csv',index=False)
|
20
20
|
pd.read_csv('temp.csv').to_parquet(f'pbp/parquet/nhl_pbp_{season}.parquet',index=False)
|
wsba_hockey/tools/scraping.py
CHANGED
@@ -509,26 +509,26 @@ async def parse_html(info):
|
|
509
509
|
|
510
510
|
data = pd.concat(event_log)
|
511
511
|
data['event_type'] = data['event_type'].replace({
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
512
|
+
"PGSTR": "pre-game-start",
|
513
|
+
"PGEND": "pre-game-end",
|
514
|
+
'GSTR':"game-start",
|
515
|
+
"ANTHEM":"anthem",
|
516
|
+
"PSTR":"period-start",
|
517
|
+
"FAC":"faceoff",
|
518
|
+
"SHOT":"shot-on-goal",
|
519
|
+
"BLOCK":"blocked-shot",
|
520
|
+
"STOP":"stoppage",
|
521
|
+
"MISS":"missed-shot",
|
522
|
+
"HIT":"hit",
|
523
|
+
"GOAL":"goal",
|
524
|
+
"GIVE":"giveaway",
|
525
|
+
"TAKE":"takeaway",
|
526
|
+
"DELPEN":"delayed-penalty",
|
527
|
+
"PENL":"penalty",
|
528
|
+
"CHL":"challenge",
|
529
|
+
"SOC":'shootout-complete',
|
530
|
+
"PEND":"period-end",
|
531
|
+
"GEND":"game-end"
|
532
532
|
})
|
533
533
|
|
534
534
|
#Return: parsed HTML pbp
|
wsba_hockey/workspace.py
CHANGED
@@ -1,15 +1,12 @@
|
|
1
1
|
import numpy as np
|
2
2
|
import pandas as pd
|
3
|
-
import
|
4
|
-
import data_pipelines as data
|
3
|
+
import wsba_hockey as wsba
|
4
|
+
import wsba_hockey.data_pipelines as data
|
5
5
|
import numpy as np
|
6
6
|
|
7
7
|
season_load = wsba.repo_load_seasons()
|
8
|
-
|
9
8
|
select = season_load[0:3]
|
10
9
|
|
11
|
-
data.pbp(select)
|
12
|
-
|
13
10
|
#pbp = data.load_pbp_db(select)
|
14
11
|
|
15
12
|
#wsba.wsba_xG(pbp,hypertune=True,train=True,train_runs=30,cv_runs=30)
|
wsba_hockey/wsba_main.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
import random
|
2
2
|
import os
|
3
|
-
import requests as rs
|
4
|
-
import pandas as pd
|
5
3
|
import asyncio
|
6
4
|
import time
|
5
|
+
import requests as rs
|
6
|
+
import pandas as pd
|
7
|
+
import matplotlib.pyplot as plt
|
7
8
|
from typing import Literal, Union
|
8
9
|
from datetime import datetime, timedelta, date
|
9
10
|
from wsba_hockey.tools.scraping import *
|
@@ -16,44 +17,44 @@ from wsba_hockey.tools.plotting import *
|
|
16
17
|
|
17
18
|
## GLOBAL VARIABLES ##
|
18
19
|
SEASONS = [
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
20
|
+
20072008,
|
21
|
+
20082009,
|
22
|
+
20092010,
|
23
|
+
20102011,
|
24
|
+
20112012,
|
25
|
+
20122013,
|
26
|
+
20132014,
|
27
|
+
20142015,
|
28
|
+
20152016,
|
29
|
+
20162017,
|
30
|
+
20172018,
|
31
|
+
20182019,
|
32
|
+
20192020,
|
33
|
+
20202021,
|
34
|
+
20212022,
|
35
|
+
20222023,
|
36
|
+
20232024,
|
37
|
+
20242025
|
37
38
|
]
|
38
39
|
|
39
|
-
CONVERT_SEASONS = {
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
40
|
+
CONVERT_SEASONS = {2007: 20072008,
|
41
|
+
2008: 20082009,
|
42
|
+
2009: 20092010,
|
43
|
+
2010: 20102011,
|
44
|
+
2011: 20112012,
|
45
|
+
2012: 20122013,
|
46
|
+
2013: 20132014,
|
47
|
+
2014: 20142015,
|
48
|
+
2015: 20152016,
|
49
|
+
2016: 20162017,
|
50
|
+
2017: 20172018,
|
51
|
+
2018: 20182019,
|
52
|
+
2019: 20192020,
|
53
|
+
2020: 20202021,
|
54
|
+
2021: 20212022,
|
55
|
+
2022: 20222023,
|
56
|
+
2023: 20232024,
|
57
|
+
2024: 20242025}
|
57
58
|
|
58
59
|
CONVERT_TEAM_ABBR = {'L.A':'LAK',
|
59
60
|
'N.J':'NJD',
|
@@ -66,46 +67,25 @@ PER_SIXTY = ['Fi','xGi','Gi','A1','A2','P1','P','Si','OZF','NZF','DZF','FF','FA'
|
|
66
67
|
#Some games in the API are specifically known to cause errors in scraping.
|
67
68
|
#This list is updated as frequently as necessary
|
68
69
|
KNOWN_PROBS = {
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
70
|
+
2007020011:'Missing shifts data for game between Chicago and Minnesota.',
|
71
|
+
2007021178:'Game between the Bruins and Sabres is missing data after the second period, for some reason.',
|
72
|
+
2008020259:'HTML data is completely missing for this game.',
|
73
|
+
2008020409:'HTML data is completely missing for this game.',
|
74
|
+
2008021077:'HTML data is completely missing for this game.',
|
75
|
+
2009020081:'HTML pbp for this game between Pittsburgh and Carolina is missing all but the period start and first faceoff events, for some reason.',
|
76
|
+
2009020658:'Missing shifts data for game between New York Islanders and Dallas.',
|
77
|
+
2009020885:'Missing shifts data for game between Sharks and Blue Jackets.',
|
78
|
+
2010020124:'Game between Capitals and Hurricanes is sporadically missing player on-ice data',
|
79
|
+
2012020018:'HTML events contain mislabeled events.',
|
80
|
+
2013020971:'On March 10th, 2014, Stars forward Rich Peverley suffered from a cardiac episode midgame and as a result, the remainder of the game was postponed. \nThe game resumed on April 9th, and the only goal scorer in the game, Blue Jackets forward Nathan Horton, did not appear in the resumed game due to injury. Interestingly, Horton would never play in the NHL again.',
|
81
|
+
2018021133:'Game between Lightning and Capitals has incorrectly labeled event teams (i.e. WSH TAKEAWAY - #71 CIRELLI (Cirelli is a Tampa Bay skater in this game)).',
|
82
|
+
2019020876:'Due to the frightening collapse of Blues defensemen Jay Bouwmeester, a game on February 2nd, 2020 between the Ducks and Blues was postponed. \nWhen the game resumed, Ducks defensemen Hampus Lindholm, who assisted on a goal in the inital game, did not play in the resumed match.'
|
82
83
|
}
|
83
84
|
|
84
85
|
SHOT_TYPES = ['wrist','deflected','tip-in','slap','backhand','snap','wrap-around','poke','bat','cradle','between-legs']
|
85
86
|
|
86
87
|
NEW = 2024
|
87
88
|
|
88
|
-
STANDINGS_END = {
|
89
|
-
'20072008':'04-06',
|
90
|
-
'20082009':'04-12',
|
91
|
-
'20092010':'04-11',
|
92
|
-
'20102011':'04-10',
|
93
|
-
'20112012':'04-07',
|
94
|
-
'20122013':'04-28',
|
95
|
-
'20132014':'04-13',
|
96
|
-
'20142015':'04-11',
|
97
|
-
'20152016':'04-10',
|
98
|
-
'20162017':'04-09',
|
99
|
-
'20172018':'04-08',
|
100
|
-
'20182019':'04-06',
|
101
|
-
'20192020':'03-11',
|
102
|
-
'20202021':'05-19',
|
103
|
-
'20212022':'04-01',
|
104
|
-
'20222023':'04-14',
|
105
|
-
'20232024':'04-18',
|
106
|
-
'20242025':'04-17'
|
107
|
-
}
|
108
|
-
|
109
89
|
EVENTS = ['faceoff','hit','giveaway','takeaway','blocked-shot','missed-shot','shot-on-goal','goal','penalty']
|
110
90
|
|
111
91
|
DIR = os.path.dirname(os.path.realpath(__file__))
|
@@ -344,7 +324,7 @@ def nhl_scrape_season(season:int, split_shifts:bool = False, season_types:list[i
|
|
344
324
|
split_shifts (bool, optional):
|
345
325
|
If True, returns a dict with separate 'pbp' and 'shifts' DataFrames. Default is False.
|
346
326
|
season_types (List[int], optional):
|
347
|
-
List of season_types to include in scraping process. Default is all regular season and playoff games which are 2 and 3
|
327
|
+
List of season_types to include in scraping process. Default is all regular season and playoff games which are 2 and 3 respectively.
|
348
328
|
remove (List[str], optional):
|
349
329
|
List of event types to remove from the result. Default is an empty list.
|
350
330
|
start (str, optional):
|
@@ -455,12 +435,12 @@ def nhl_scrape_seasons_info(seasons:list[int] = []):
|
|
455
435
|
else:
|
456
436
|
return df.sort_values(by=['id'])
|
457
437
|
|
458
|
-
def nhl_scrape_standings(arg:str = "now", season_type:int = 2):
|
438
|
+
def nhl_scrape_standings(arg:str | int = "now", season_type:int = 2):
|
459
439
|
"""
|
460
440
|
Returns standings or playoff bracket
|
461
441
|
Args:
|
462
|
-
arg (str, optional):
|
463
|
-
Date formatted as 'YYYY-MM-DD' to scrape standings
|
442
|
+
arg (str or int, optional):
|
443
|
+
Date formatted as 'YYYY-MM-DD' to scrape standings, NHL season such as "20242025", or 'now' for current standings. Default is 'now'.
|
464
444
|
season_type (int, optional):
|
465
445
|
Part of season to scrape. If 3 (playoffs) then scrape the playoff bracket for the season implied by arg. When arg = 'now' this is ignored. Default is 2.
|
466
446
|
|
@@ -488,7 +468,11 @@ def nhl_scrape_standings(arg:str = "now", season_type:int = 2):
|
|
488
468
|
else:
|
489
469
|
print(f"Scraping standings for date: {arg}")
|
490
470
|
|
491
|
-
|
471
|
+
season_data = rs.get('https://api.nhle.com/stats/rest/en/season').json()['data']
|
472
|
+
season_data = [s for s in season_data if s['id'] == arg][0]
|
473
|
+
end = season_data['regularSeasonEndDate'][0:10]
|
474
|
+
|
475
|
+
api = f"https://api-web.nhle.com/v1/standings/{end}"
|
492
476
|
data = rs.get(api).json()['standings']
|
493
477
|
|
494
478
|
return pd.json_normalize(data)
|
@@ -649,6 +633,41 @@ def nhl_scrape_draft_rankings(arg:str = 'now', category:int = 0):
|
|
649
633
|
#Return: prospect rankings
|
650
634
|
return data
|
651
635
|
|
636
|
+
def nhl_scrape_game_info(game_ids:list[int]):
    """
    Given a set of game_ids (NHL API), return information for each game.

    Args:
        game_ids (List[int]):
            List of NHL game IDs to look up. Every element is placed directly into the
            gamecenter 'landing' URL, so only real game IDs are valid here.

    Returns:
        pd.DataFrame:
            A DataFrame with one row per requested game. The columns 'id', 'season', 'date',
            'season_type', 'game_title', 'away_team_abbr', 'home_team_abbr' and 'estStartTime'
            come first, followed by every other column of the normalized API response.
            An empty DataFrame holding only those leading columns is returned when game_ids is empty.
    """

    print(f'Finding game information for games: {game_ids}')

    link = 'https://api-web.nhle.com/v1/gamecenter'
    front_col = ['id','season','date','season_type','game_title','away_team_abbr','home_team_abbr','estStartTime']

    #Nothing to request: pd.concat raises ValueError when handed an empty list
    if len(game_ids) == 0:
        return pd.DataFrame(columns=front_col)

    #Scrape information (one single-row frame per game, renumbered so each row has a unique index)
    df = pd.concat(
        [pd.json_normalize(rs.get(f'{link}/{game_id}/landing').json()) for game_id in game_ids],
        ignore_index=True
    )

    #Add extra info
    df['date'] = df['gameDate']
    df['season_type'] = df['gameType']
    df['away_team_abbr'] = df['awayTeam.abbrev']
    df['home_team_abbr'] = df['homeTeam.abbrev']
    df['game_title'] = df['away_team_abbr'] + " @ " + df['home_team_abbr'] + " - " + df['date']
    #Start time is converted from UTC to US/Eastern and shown as a 12-hour clock string
    df['estStartTime'] = pd.to_datetime(df['startTimeUTC']).dt.tz_convert('US/Eastern').dt.strftime("%I:%M %p")

    #Move the summary columns to the front, keeping all remaining columns in their existing order
    df = df[front_col+[col for col in df.columns.to_list() if col not in front_col]]

    #Return: game information
    return df
|
669
|
+
|
670
|
+
|
652
671
|
def nhl_apply_xG(pbp: pd.DataFrame):
|
653
672
|
"""
|
654
673
|
Given play-by-play data, return this data with xG-related columns
|
@@ -1040,7 +1059,7 @@ def nhl_calculate_stats(pbp:pd.DataFrame, type:Literal['skater','goalie','team']
|
|
1040
1059
|
season (int):
|
1041
1060
|
The NHL season formatted such as "20242025".
|
1042
1061
|
season_types (List[int], optional):
|
1043
|
-
List of season_types to include in scraping process. Default is all regular season and playoff games which are 2 and 3
|
1062
|
+
List of season_types to include in scraping process. Default is all regular season and playoff games which are 2 and 3 respectively.
|
1044
1063
|
game_strength (str or list[str]):
|
1045
1064
|
List of game strength states to include (e.g., ['5v5','5v4','4v5']).
|
1046
1065
|
split_game (bool, optional):
|
@@ -1380,19 +1399,37 @@ def nhl_plot_games(pbp:pd.DataFrame, events:list[str], strengths:Union[Literal['
|
|
1380
1399
|
#Return: list of plotted game events
|
1381
1400
|
return game_plots
|
1382
1401
|
|
1383
|
-
def repo_load_rosters(seasons = []):
|
1384
|
-
|
1385
|
-
|
1402
|
+
def repo_load_rosters(seasons:list[int] = []):
    """
    Returns roster data from repository

    Args:
        seasons (list[int], optional):
            Seasons to keep, matched against the 'season' column of the roster file.
            When empty (the default) the roster file is returned unfiltered.

    Returns:
        pd.DataFrame:
            A DataFrame containing roster data for supplied seasons.
    """

    data = pd.read_csv(DEFAULT_ROSTER)
    #Filter only when seasons were requested; filtering on an empty list would drop every row.
    #The default list is never mutated here, so sharing it between calls is harmless.
    if len(seasons)>0:
        data = data.loc[data['season'].isin(seasons)]

    return data
|
1392
1420
|
|
1393
|
-
def repo_load_schedule(seasons = []):
|
1394
|
-
|
1395
|
-
|
1421
|
+
def repo_load_schedule(seasons:list[int] = []):
|
1422
|
+
"""
|
1423
|
+
Returns schedule data from repository
|
1424
|
+
|
1425
|
+
Args:
|
1426
|
+
seasons (list[int], optional):
|
1427
|
+
A DataFrame containing play-by-play event data.
|
1428
|
+
|
1429
|
+
Returns:
|
1430
|
+
pd.DataFrame:
|
1431
|
+
A DataFrame containing the schedule data for the specified season and date range.
|
1432
|
+
"""
|
1396
1433
|
|
1397
1434
|
data = pd.read_csv(SCHEDULE_PATH)
|
1398
1435
|
if len(seasons)>0:
|
@@ -1401,12 +1438,30 @@ def repo_load_schedule(seasons = []):
|
|
1401
1438
|
return data
|
1402
1439
|
|
1403
1440
|
def repo_load_teaminfo():
    """
    Read the team information CSV stored at INFO_PATH.

    Returns:
        pd.DataFrame:
            A DataFrame containing general team information.
    """

    team_info = pd.read_csv(INFO_PATH)
    return team_info
|
1407
1452
|
|
1408
|
-
def repo_load_pbp(seasons = []):
|
1409
|
-
|
1453
|
+
def repo_load_pbp(seasons:list = []):
|
1454
|
+
"""
|
1455
|
+
Returns play-by-play data from repository
|
1456
|
+
|
1457
|
+
Args:
|
1458
|
+
seasons (List[int], optional):
|
1459
|
+
The NHL season formatted such as "20242025".
|
1460
|
+
Returns:
|
1461
|
+
pd.DataFrame:
|
1462
|
+
A DataFrame containing full play-by-play data for the selected season.
|
1463
|
+
"""
|
1464
|
+
#
|
1410
1465
|
# param 'seasons' - list of seasons to include
|
1411
1466
|
|
1412
1467
|
#Add parquet to total
|
@@ -1416,6 +1471,209 @@ def repo_load_pbp(seasons = []):
|
|
1416
1471
|
return pd.concat(dfs)
|
1417
1472
|
|
1418
1473
|
def repo_load_seasons() -> list[int]:
    """
    Returns list of available seasons

    Returns:
        list[int]:
            A new list holding the entries of the module-level SEASONS constant.
    """

    #Hand back a copy so callers cannot alter the module-level SEASONS list by accident
    return list(SEASONS)
|
1485
|
+
|
1486
|
+
## CLASSES ##
|
1487
|
+
class NHL_Database:
    """
    A class for managing and analyzing NHL play-by-play data.

    This class supports game scraping, filtering, stat calculation, and plotting.
    It initializes with either a provided list of game IDs or a default/random set.

    Attributes:
        name (str):
            Designated name of the database.
        pbp (pd.DataFrame):
            Combined play-by-play data for selected games.
        games (list[int]):
            Unique game IDs currently in the dataset.
        stats (dict[str, dict[str, pd.DataFrame]]):
            Dictionary storing calculated stats by type and name.
        plots (dict):
            Plot outputs as returned by nhl_plot_games, keyed by game.
            NOTE(review): export_data saves element [0] of every value, so each value is
            presumably a (figure, ...) sequence - confirm against nhl_plot_games.

    Args:
        name (str):
            Name of database.
        game_ids (list[int], optional):
            List of game IDs to scrape initially.
        pbp (pd.DataFrame, optional):
            Existing PBP DataFrame to load instead of scraping.
    """

    def __init__(self, name:str, game_ids:list[int] = [], pbp:pd.DataFrame = pd.DataFrame()):
        """
        Initialize the NHL_Database with scraped or preloaded PBP data.

        If `game_ids` is given it takes precedence and those games are scraped.
        If no `pbp` is provided and `game_ids` is empty, a random set of games will be scraped.

        Args:
            name (str):
                Name of database.
            game_ids (list[int], optional):
                List of NHL game IDs to scrape in initialization.
            pbp (pd.DataFrame, optional):
                Existing play-by-play data to use when no game_ids are given.
        """

        print('Initializing database...')
        self.name = name

        #The default arguments are only read, never mutated or stored, so sharing them across calls is safe
        if game_ids:
            self.pbp = nhl_scrape_game(game_ids)
        else:
            self.pbp = nhl_scrape_game(['random',3,2010,2024]) if pbp.empty else pbp

        self.games = self.pbp['game_id'].drop_duplicates().to_list()
        self.stats = {}
        self.plots = {}

    def add_games(self, game_ids:list[int]):
        """
        Add additional games to the existing play-by-play dataset.

        Args:
            game_ids (list[int]):
                List of game IDs to scrape and append.

        Returns:
            pd.DataFrame:
                The updated play-by-play dataset.
        """

        print('Adding games...')
        #nhl_scrape_game lives in this module; call it directly, as __init__ does
        self.pbp = pd.concat([self.pbp,nhl_scrape_game(game_ids)])
        #Keep the list of game IDs in step with the data it describes
        self.games = self.pbp['game_id'].drop_duplicates().to_list()

        return self.pbp

    def select_games(self, game_ids:list[int]):
        """
        Return a filtered subset of the PBP data for specific games.

        Args:
            game_ids (list[int]):
                List of game IDs to include.

        Returns:
            pd.DataFrame:
                Filtered PBP data matching the selected games.
        """

        print('Selecting games...')

        df = self.pbp
        return df.loc[df['game_id'].isin(game_ids)]

    def add_stats(self, name:str, type:Literal['skater','goalie','team'], season_types:list[int], game_strength: Union[Literal['all'], list[str]], split_game:bool = False, roster_path:str = DEFAULT_ROSTER, shot_impact:bool = False):
        """
        Calculate and store statistics for the given play-by-play data.

        Args:
            name (str):
                Key name to store the results under.
            type (Literal['skater', 'goalie', 'team']):
                Type of statistics to calculate. Must be one of 'skater', 'goalie', or 'team'.
            season_types (List[int]):
                List of season_types to include (2 is regular season, 3 is playoffs).
            game_strength (str or list[str]):
                List of game strength states to include (e.g., ['5v5','5v4','4v5']).
            split_game (bool, optional):
                If True, aggregates stats separately for each game; otherwise, stats are aggregated across all games. Default is False.
            roster_path (str, optional):
                File path to the roster data used for mapping players and teams.
            shot_impact (bool, optional):
                If True, applies shot impact metrics to the stats DataFrame. Default is False.

        Returns:
            pd.DataFrame:
                The calculated statistics.
        """

        df = nhl_calculate_stats(self.pbp, type, season_types, game_strength, split_game, roster_path, shot_impact)
        #Store under stats[type][name] without discarding results already saved for this type
        self.stats.setdefault(type, {})[name] = df

        return df

    def add_game_plots(self, events:list[str], strengths:Union[Literal['all'], list[str]], game_ids: Union[Literal['all'], list[int]] = 'all', marker_dict:dict = event_markers, team_colors:dict = {'away':'primary','home':'primary'}, legend:bool = False):
        """
        Generate visualizations of game events based on play-by-play data.

        Args:
            events (list[str]):
                List of event types to include in the plot (e.g., ['shot-on-goal', 'goal']).
            strengths (str or list[str]):
                List of game strength states to include (e.g., ['5v5','5v4','4v5']).
            game_ids (str or list[int]):
                List of game IDs to plot. If set to 'all', plots will be generated for all games in the DataFrame.
            marker_dict (dict[str, dict]):
                Dictionary mapping event types to marker styles and/or colors used in plotting.
            team_colors (dict):
                Passed through unchanged to nhl_plot_games; has 'away' and 'home' keys.
            legend (bool):
                Whether to include a legend on the plots.

        Returns:
            dict:
                Every plot stored on the database so far, including the ones just generated.
        """

        self.plots.update(nhl_plot_games(self.pbp, events, strengths, game_ids, marker_dict, team_colors, legend))

        return self.plots

    def export_data(self, path:str = ''):
        """
        Export the data within the object to a specified directory.

        The method writes:
        - The full play-by-play DataFrame to a CSV file.
        - All calculated statistics by type and name to CSV files in subfolders.
        - All stored plots to PNG files.

        If no path is provided, exports to a folder named after the database (`self.name/`).

        Args:
            path (str, optional):
                Root folder to export data into; a subfolder named after the database is created inside it. Defaults to `self.name/`.
        """

        print('Exporting data...')
        start = time.perf_counter()

        # Use default path if none provided
        path = f'{self.name}/' if path == '' else os.path.join(path,f'{self.name}')
        os.makedirs(path, exist_ok=True)

        # Export master PBP
        self.pbp.to_csv(os.path.join(path, 'pbp.csv'), index=False)

        # Export stats
        for stat_type, named_stats in self.stats.items():
            stat_path = os.path.join(path, 'stats', stat_type)
            for name, df in named_stats.items():
                os.makedirs(stat_path, exist_ok=True)
                df.to_csv(os.path.join(stat_path, f'{name}.csv'), index=False)

        # Export plots
        plot_path = os.path.join(path, 'plots')
        os.makedirs(plot_path, exist_ok=True)
        for game_id, plot in self.plots.items():
            plot[0].savefig(os.path.join(plot_path, f'{game_id}.png'))

        # Completion message (convert to minutes before labelling the value as minutes)
        end = time.perf_counter()
        length = end - start
        if length < 60:
            print(f"...finished in {length:.2f} seconds.")
        else:
            print(f"...finished in {length / 60:.2f} minutes.")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: wsba_hockey
|
3
|
-
Version: 1.1.
|
3
|
+
Version: 1.1.9
|
4
4
|
Summary: WeakSide Breakout's complete Python package of access to hockey data, primarily including the scraping of National Hockey League schedule, play-by-play, and shifts information.
|
5
5
|
Author-email: Owen Singh <owenbksingh@gmail.com>
|
6
6
|
Project-URL: Homepage, https://github.com/owensingh38/wsba_hockey/
|
@@ -1,7 +1,7 @@
|
|
1
|
-
wsba_hockey/__init__.py,sha256=
|
2
|
-
wsba_hockey/data_pipelines.py,sha256=
|
3
|
-
wsba_hockey/workspace.py,sha256=
|
4
|
-
wsba_hockey/wsba_main.py,sha256=
|
1
|
+
wsba_hockey/__init__.py,sha256=rXczecTEm4W8F9RbFmIttxpX9aqiq7PF5KNDLQVY35w,539
|
2
|
+
wsba_hockey/data_pipelines.py,sha256=BA6cfCyd1n3INmgqJRMNZteI74lEUEL6Q7ShG-g7G_o,10866
|
3
|
+
wsba_hockey/workspace.py,sha256=qVLyL9lCeaSGn7kc2_nY18e8_EeVZqvBO70xbhUWtdk,946
|
4
|
+
wsba_hockey/wsba_main.py,sha256=2ny9f_x8uOkj_ncPwo0Gd1TDykRVLeOxcUd_1UUQLbo,72155
|
5
5
|
wsba_hockey/api/api/index.py,sha256=r2keq105Ve8V0JAsSZMIPs9geVHX2Fuxyi4MqnzCt48,6537
|
6
6
|
wsba_hockey/evidence/weakside-breakout/node_modules/duckdb/vendor.py,sha256=lmu0TB0rIYkAuV9-csFJgW-1hJojso_-EZpEoorUUKM,4949
|
7
7
|
wsba_hockey/evidence/weakside-breakout/node_modules/flatted/python/flatted.py,sha256=ke8FuEflns-WlphCcQ9CC0qJqWqX3zEEuak74o6rgE8,3879
|
@@ -134,15 +134,15 @@ wsba_hockey/flask/app.py,sha256=J51iA65h9xyJfLgdH0h2sVSbfIR7xgGd2Oy8bJsmpAk,1873
|
|
134
134
|
wsba_hockey/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
135
135
|
wsba_hockey/tools/agg.py,sha256=OkIYd-ApvGVYe2JJLOI21jnDIN5LH8nkeH7eo0reWFI,23364
|
136
136
|
wsba_hockey/tools/plotting.py,sha256=81hBaM7tcwUNB4-tovPn7QreOUz6B2NuI_SR4-djVSk,6029
|
137
|
-
wsba_hockey/tools/scraping.py,sha256
|
137
|
+
wsba_hockey/tools/scraping.py,sha256=rJHmv8hSm5QEGlT3uygF_kgsGK8o-c8gFHpAMjXVbEM,52923
|
138
138
|
wsba_hockey/tools/xg_model.py,sha256=nOr_2RBijLgPmJ0TTs4wbSsORYmRqWCKRjLKDm7sAhI,18342
|
139
139
|
wsba_hockey/tools/archive/old_scraping.py,sha256=hEjMI1RtfeZnf0RBiJFI38oXkLZ3WofeH5xqcF4pzgM,49585
|
140
140
|
wsba_hockey/tools/utils/__init__.py,sha256=vccXhOtzARoR99fmEWU1OEI3qCIdQ9Z42AlRA_BUhrs,114
|
141
141
|
wsba_hockey/tools/utils/config.py,sha256=D3Uk05-YTyrhfReMTTLfNI3HN_rON2uo_CDE9oER3Lg,351
|
142
142
|
wsba_hockey/tools/utils/save_pages.py,sha256=CsyL_0n-b-4pJoUauwU3HpnCO6n69-RlBMJQBd_qGDc,4979
|
143
143
|
wsba_hockey/tools/utils/shared.py,sha256=dH_JwZfia5fib8rksy5sW-mBp0pluBPvw37Vdr8Kap0,14211
|
144
|
-
wsba_hockey-1.1.
|
145
|
-
wsba_hockey-1.1.
|
146
|
-
wsba_hockey-1.1.
|
147
|
-
wsba_hockey-1.1.
|
148
|
-
wsba_hockey-1.1.
|
144
|
+
wsba_hockey-1.1.9.dist-info/licenses/LICENSE,sha256=Nr_Um1Pd5FQJTWWgm7maZArdtYMbDhzXYSwyJIZDGik,1114
|
145
|
+
wsba_hockey-1.1.9.dist-info/METADATA,sha256=l5tHOebzSMuJSrhphvd2AWTItmvHl0oZzzSuK7VifvA,3566
|
146
|
+
wsba_hockey-1.1.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
147
|
+
wsba_hockey-1.1.9.dist-info/top_level.txt,sha256=acU7s3x-RZC1zGiqCOmO0g267iqCg34lzIfdmYxxGmQ,12
|
148
|
+
wsba_hockey-1.1.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|