TopDownHockey-Scraper 6.0.2__tar.gz → 6.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of TopDownHockey-Scraper might be problematic. Click here for more details.
- {topdownhockey_scraper-6.0.2/src/TopDownHockey_Scraper.egg-info → topdownhockey_scraper-6.0.4}/PKG-INFO +1 -1
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/setup.cfg +1 -1
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/setup.py +1 -1
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/src/TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +1 -403
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4/src/TopDownHockey_Scraper.egg-info}/PKG-INFO +1 -1
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/LICENSE +0 -0
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/README.md +0 -0
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/pyproject.toml +0 -0
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/src/TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py +0 -0
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/src/TopDownHockey_Scraper.egg-info/SOURCES.txt +0 -0
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/src/TopDownHockey_Scraper.egg-info/dependency_links.txt +0 -0
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/src/TopDownHockey_Scraper.egg-info/requires.txt +0 -0
- {topdownhockey_scraper-6.0.2 → topdownhockey_scraper-6.0.4}/src/TopDownHockey_Scraper.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = TopDownHockey_Scraper
|
|
3
|
-
version = 6.0.2
|
|
3
|
+
version = 6.0.4
|
|
4
4
|
author = Patrick Bacon
|
|
5
5
|
author_email = patrick.s.bacon@gmail.com
|
|
6
6
|
description = A package built for scraping hockey data from EliteProspects, the NHL's HTML/API reports, and ESPN's XML reports.
|
|
@@ -9,7 +9,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
|
9
9
|
|
|
10
10
|
setup(
|
|
11
11
|
name="TopDownHockey_Scraper", # Replace with your own username
|
|
12
|
-
version="6.0.2",
|
|
12
|
+
version="6.0.4",
|
|
13
13
|
author="Patrick Bacon",
|
|
14
14
|
author_email="patrick.s.bacon@gmail.com",
|
|
15
15
|
description="The TopDownHockey Scraper",
|
|
@@ -62,121 +62,6 @@ team_names = ['ANAHEIM DUCKS',
|
|
|
62
62
|
|
|
63
63
|
ewc = ['SHOT', 'HIT', 'BLOCK', 'MISS', 'GIVE', 'TAKE', 'GOAL']
|
|
64
64
|
|
|
65
|
-
def scrape_schedule_one_week(start_date):
|
|
66
|
-
|
|
67
|
-
url = f'https://api-web.nhle.com/v1/schedule/{start_date}'
|
|
68
|
-
page = requests.get(url, timeout = 500)
|
|
69
|
-
loaddict = json.loads(page.content)
|
|
70
|
-
|
|
71
|
-
game_df = pd.DataFrame()
|
|
72
|
-
|
|
73
|
-
for i in range(0, (len(loaddict['gameWeek']))):
|
|
74
|
-
#print(i)
|
|
75
|
-
game_day = loaddict['gameWeek'][i]
|
|
76
|
-
game_df = game_df._append(pd.DataFrame(game_day['games']).assign(date = game_day['date']).rename(columns = {'id':'ID'}))
|
|
77
|
-
|
|
78
|
-
home_df = pd.DataFrame(game_df['homeTeam'].values.tolist())
|
|
79
|
-
away_df = pd.DataFrame(game_df['awayTeam'].values.tolist())
|
|
80
|
-
|
|
81
|
-
game_df = game_df.assign(
|
|
82
|
-
home_team = game_df.homeTeam.apply(lambda x: x['abbrev']),
|
|
83
|
-
away_team = game_df.awayTeam.apply(lambda x: x['abbrev'])
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
game_df = game_df.assign(state = np.where(game_df.gameState=='OFF', 'Final',
|
|
87
|
-
np.where(game_df.gameState=='FUT', 'Scheduled',
|
|
88
|
-
np.where(game_df.gameState=='LIVE', 'In Progress',
|
|
89
|
-
'Error'))))
|
|
90
|
-
|
|
91
|
-
game_df = game_df.assign(type = np.where(game_df.gameType==2, 'R', 'Error'),
|
|
92
|
-
venue = game_df['venue'].apply(lambda x: x['default']))
|
|
93
|
-
|
|
94
|
-
game_df = game_df.assign(ID = game_df.ID.astype(int), season = game_df.season.astype(int))
|
|
95
|
-
|
|
96
|
-
schedule = game_df.loc[:, ['ID', 'type', 'season', 'date', 'home_team', 'away_team', 'state']]
|
|
97
|
-
|
|
98
|
-
return schedule
|
|
99
|
-
|
|
100
|
-
def scrape_full_schedule(
|
|
101
|
-
start_date = '2023-10-07',
|
|
102
|
-
end_date = '2024-04-18'):
|
|
103
|
-
|
|
104
|
-
full_schedule = pd.DataFrame()
|
|
105
|
-
|
|
106
|
-
scrape_day = start_date
|
|
107
|
-
|
|
108
|
-
while scrape_day <= end_date:
|
|
109
|
-
|
|
110
|
-
print(scrape_day)
|
|
111
|
-
|
|
112
|
-
week_scrape = scrape_schedule_one_week(scrape_day)
|
|
113
|
-
|
|
114
|
-
full_schedule = full_schedule._append(week_scrape)
|
|
115
|
-
|
|
116
|
-
last_day_scraped = max(full_schedule.date)
|
|
117
|
-
|
|
118
|
-
scrape_day = datetime.strftime((datetime.strptime(last_day_scraped, '%Y-%m-%d').date() + timedelta(days = 1)), '%Y-%m-%d')
|
|
119
|
-
|
|
120
|
-
return full_schedule[full_schedule.type=='R']
|
|
121
|
-
|
|
122
|
-
def scrape_standings(season):
|
|
123
|
-
"""
|
|
124
|
-
Takes an integer in "20202021" form and scrapes standings for that season.
|
|
125
|
-
"""
|
|
126
|
-
url = 'https://statsapi.web.nhl.com/api/v1/standings?season=' + str(season)
|
|
127
|
-
page = requests.get(url, timeout = 500)
|
|
128
|
-
loaddict = json.loads(page.content)
|
|
129
|
-
record_df = pd.DataFrame(loaddict['records'])
|
|
130
|
-
team = []
|
|
131
|
-
wins = []
|
|
132
|
-
losses = []
|
|
133
|
-
otl = []
|
|
134
|
-
rw = []
|
|
135
|
-
ga = []
|
|
136
|
-
gf = []
|
|
137
|
-
row = []
|
|
138
|
-
gp = []
|
|
139
|
-
pts = []
|
|
140
|
-
divisions = []
|
|
141
|
-
conferences = []
|
|
142
|
-
|
|
143
|
-
for i in range(0, len(record_df['teamRecords'])):
|
|
144
|
-
div = (record_df['division'].iloc[i]['name'])
|
|
145
|
-
conf = (record_df['conference'].iloc[i]['name'])
|
|
146
|
-
for x in range(0, len((record_df['teamRecords'].iloc[i]))):
|
|
147
|
-
divisions._append(div)
|
|
148
|
-
conferences._append(conf)
|
|
149
|
-
team._append(record_df['teamRecords'].iloc[i][x]['team']['name'])
|
|
150
|
-
wins._append(record_df['teamRecords'].iloc[i][x]['leagueRecord']['wins'])
|
|
151
|
-
losses._append(record_df['teamRecords'].iloc[i][x]['leagueRecord']['losses'])
|
|
152
|
-
otl._append(record_df['teamRecords'].iloc[i][x]['leagueRecord']['ot'])
|
|
153
|
-
gf._append(record_df['teamRecords'].iloc[i][x]['goalsScored'])
|
|
154
|
-
ga._append(record_df['teamRecords'].iloc[i][x]['goalsAgainst'])
|
|
155
|
-
if season>20092010:
|
|
156
|
-
row._append(record_df['teamRecords'].iloc[i][x]['row'])
|
|
157
|
-
gp._append(record_df['teamRecords'].iloc[i][x]['gamesPlayed'])
|
|
158
|
-
pts._append(record_df['teamRecords'].iloc[i][x]['points'])
|
|
159
|
-
if season>20192020:
|
|
160
|
-
rw._append(record_df['teamRecords'].iloc[i][x]['regulationWins'])
|
|
161
|
-
|
|
162
|
-
if season < 20092010:
|
|
163
|
-
stand = pd.DataFrame().assign(Team = team, Division = divisions, Conference = conferences,
|
|
164
|
-
GP = gp, W = wins, L = losses, OTL = otl, PTS = pts, GF = gf, GA = ga)
|
|
165
|
-
stand = stand.assign(GD = stand.GF - stand.GA).sort_values(by = ['PTS', 'GD'], ascending = False)
|
|
166
|
-
return stand.assign(Season = season).loc[:, ['Season', 'Team', 'Division', 'Conference', 'GP', 'W', 'L', 'OTL', 'PTS', 'GF','GA', 'GD']].reset_index(drop = True)
|
|
167
|
-
|
|
168
|
-
if ((season<20202021) & (season>20092010)):
|
|
169
|
-
stand = pd.DataFrame().assign(Team = team, Division = divisions, Conference = conferences,
|
|
170
|
-
GP = gp, W = wins, L = losses, OTL = otl, PTS = pts, GF = gf, GA = ga, ROW = row)
|
|
171
|
-
stand = stand.assign(GD = stand.GF - stand.GA).sort_values(by = ['PTS', 'ROW', 'GD'], ascending = False)
|
|
172
|
-
return stand.assign(Season = season).loc[:, ['Season', 'Team', 'Division', 'Conference', 'GP', 'W', 'L', 'OTL', 'PTS', 'GF','GA', 'ROW', 'GD']].reset_index(drop = True)
|
|
173
|
-
|
|
174
|
-
else:
|
|
175
|
-
stand = pd.DataFrame().assign(Team = team, Division = divisions, Conference = conferences,
|
|
176
|
-
GP = gp, W = wins, L = losses, OTL = otl, PTS = pts, GF = gf, GA = ga, RW = rw, ROW = row)
|
|
177
|
-
stand = stand.assign(GD = stand.GF - stand.GA).sort_values(by = ['PTS', 'RW', 'ROW', 'GD'], ascending = False)
|
|
178
|
-
return stand.assign(Season = season).loc[:, ['Season', 'Team', 'Division', 'Conference', 'GP', 'W', 'L', 'OTL', 'PTS', 'GF','GA', 'RW', 'ROW', 'GD']].reset_index(drop = True)
|
|
179
|
-
|
|
180
65
|
def scrape_schedule(start_date, end_date):
|
|
181
66
|
|
|
182
67
|
"""
|
|
@@ -1517,293 +1402,6 @@ def scrape_api_events(game_id, drop_description = True, shift_to_espn = False):
|
|
|
1517
1402
|
|
|
1518
1403
|
if shift_to_espn == True:
|
|
1519
1404
|
raise KeyError
|
|
1520
|
-
|
|
1521
|
-
page = requests.get(str('https://api-web.nhle.com/v1/gamecenter/' + str(game_id) + '/play-by-play'))
|
|
1522
|
-
|
|
1523
|
-
if str(page) == '<Response [404]>':
|
|
1524
|
-
raise KeyError('You got the 404 error; game data could not be found.')
|
|
1525
|
-
|
|
1526
|
-
loaddict = json.loads(page.content)
|
|
1527
|
-
|
|
1528
|
-
if loaddict['liveData']['plays']['allPlays'] != []:
|
|
1529
|
-
|
|
1530
|
-
eventdf = pd.DataFrame(loaddict['liveData']['plays']['allPlays'])
|
|
1531
|
-
|
|
1532
|
-
coordsdf = pd.DataFrame(eventdf['coordinates'].values.tolist(), index = eventdf.index)
|
|
1533
|
-
resultdf = pd.DataFrame(eventdf['result'].values.tolist(), index = eventdf.index)
|
|
1534
|
-
aboutdf = pd.DataFrame(eventdf['about'].values.tolist(), index = eventdf.index)
|
|
1535
|
-
scoredf = pd.DataFrame(aboutdf['goals'].values.tolist(), index = aboutdf.index)
|
|
1536
|
-
playerdf = pd.DataFrame(eventdf['players'])
|
|
1537
|
-
teamdf = eventdf['team'].apply(pd.Series)
|
|
1538
|
-
clean = playerdf[~pd.isna(playerdf.players)].reset_index()
|
|
1539
|
-
clean_index = clean.loc[:, ['index']]
|
|
1540
|
-
player1 = pd.DataFrame((pd.DataFrame(clean.reset_index()['players'].values.tolist())[0].values.tolist()))
|
|
1541
|
-
player1df = pd.concat([clean_index, pd.DataFrame(player1['player'].values.tolist())], axis = 1).assign(playerType = player1['playerType']).rename(
|
|
1542
|
-
columns = {'id':'player1id', 'fullName':'player1name', 'link':'player1link', 'playerType':'player1type'})
|
|
1543
|
-
player2 = pd.concat([clean_index, pd.DataFrame((pd.DataFrame(clean['players'].values.tolist())[1]))], axis = 1)
|
|
1544
|
-
player2 = player2[player2[1].notnull()]
|
|
1545
|
-
player2df = pd.concat([player2.reset_index(drop = True),
|
|
1546
|
-
(pd.DataFrame(pd.DataFrame(player2[1].values.tolist())['player'].values.tolist()).assign(playerType = (pd.DataFrame(player2[1].values.tolist())).loc[:, ['playerType']]))], axis = 1).drop(
|
|
1547
|
-
columns = 1).rename(
|
|
1548
|
-
columns = {'id':'player2id', 'fullName':'player2name', 'link':'player2link', 'playerType':'player2type'})
|
|
1549
|
-
|
|
1550
|
-
if len((pd.DataFrame(clean['players'].values.tolist())).columns) > 2:
|
|
1551
|
-
|
|
1552
|
-
player3 = pd.concat([clean_index, pd.DataFrame((pd.DataFrame(clean['players'].values.tolist())[2]))], axis = 1)
|
|
1553
|
-
player3 = player3[player3[2].notnull()]
|
|
1554
|
-
player3df = pd.concat([player3.reset_index(drop = True),
|
|
1555
|
-
(pd.DataFrame(pd.DataFrame(player3[2].values.tolist())['player'].values.tolist()).assign(playerType = (pd.DataFrame(player3[2].values.tolist())).loc[:, ['playerType']]))], axis = 1).drop(
|
|
1556
|
-
columns = 2).rename(
|
|
1557
|
-
columns = {'id':'player3id', 'fullName':'player3name', 'link':'player3link', 'playerType':'player3type'})
|
|
1558
|
-
else:
|
|
1559
|
-
player3df = pd.DataFrame(columns = ['index', 'player3id', 'player3name', 'player3link', 'player3type'])
|
|
1560
|
-
|
|
1561
|
-
if len((pd.DataFrame(clean['players'].values.tolist())).columns) > 3:
|
|
1562
|
-
|
|
1563
|
-
player4 = pd.concat([clean_index, pd.DataFrame((pd.DataFrame(clean['players'].values.tolist())[3]))], axis = 1)
|
|
1564
|
-
player4 = player4[player4[3].notnull()]
|
|
1565
|
-
player4df = pd.concat([player4.reset_index(drop = True),
|
|
1566
|
-
(pd.DataFrame(pd.DataFrame(player4[3].values.tolist())['player'].values.tolist()).assign(playerType = (pd.DataFrame(player4[3].values.tolist())).loc[:, ['playerType']]))], axis = 1).drop(
|
|
1567
|
-
columns = 3).rename(
|
|
1568
|
-
columns = {'id':'player4id', 'fullName':'player4name', 'link':'player4link', 'playerType':'player4type'})
|
|
1569
|
-
else:
|
|
1570
|
-
player4df = pd.DataFrame(columns = ['index', 'player4id', 'player4name', 'player4link', 'player4type'])
|
|
1571
|
-
|
|
1572
|
-
finaldf = eventdf.assign(
|
|
1573
|
-
hometeam = loaddict['gameData']['teams']['home']['triCode'],
|
|
1574
|
-
hometeamfull = loaddict['gameData']['teams']['home']['name'],
|
|
1575
|
-
awayteam = loaddict['gameData']['teams']['away']['triCode'],
|
|
1576
|
-
awayteamfull = loaddict['gameData']['teams']['away']['name'],
|
|
1577
|
-
description = resultdf['description'],
|
|
1578
|
-
event = resultdf['eventTypeId'],
|
|
1579
|
-
detail = resultdf['secondaryType'],
|
|
1580
|
-
coords_x = coordsdf['x'],
|
|
1581
|
-
coords_y = coordsdf['y'],
|
|
1582
|
-
period = aboutdf['period'],
|
|
1583
|
-
time = aboutdf['periodTime'],
|
|
1584
|
-
homescore = scoredf['home'],
|
|
1585
|
-
awayscore = scoredf['away'],
|
|
1586
|
-
eventteam = teamdf['triCode'],
|
|
1587
|
-
eventteamfull = teamdf['name'],
|
|
1588
|
-
eventidx = aboutdf['eventIdx'],
|
|
1589
|
-
eventNumber = aboutdf['eventId'],
|
|
1590
|
-
session = loaddict['gameData']['game']['type'])
|
|
1591
|
-
|
|
1592
|
-
finaldf = finaldf.drop(columns = ['result', 'about', 'coordinates', 'players', 'team'])
|
|
1593
|
-
|
|
1594
|
-
finaldf = finaldf.reset_index().merge(
|
|
1595
|
-
player1df, on = 'index', how = 'left').merge(
|
|
1596
|
-
player2df, on = 'index', how = 'left').merge(
|
|
1597
|
-
player3df, on = 'index', how = 'left').merge(
|
|
1598
|
-
player4df, on = 'index', how = 'left')
|
|
1599
|
-
|
|
1600
|
-
finaldf = finaldf.assign(
|
|
1601
|
-
awayteamfull = finaldf.awayteamfull.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'),
|
|
1602
|
-
hometeamfull = finaldf.hometeamfull.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'),
|
|
1603
|
-
eventteamfull = finaldf.eventteamfull.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'))
|
|
1604
|
-
|
|
1605
|
-
finaldf = finaldf.assign(
|
|
1606
|
-
player1name = np.where((finaldf.player1name=='Sebastian Aho') & (finaldf.eventteam=='NYI'),
|
|
1607
|
-
'Sebastian Aho (SWE)',
|
|
1608
|
-
finaldf.player1name
|
|
1609
|
-
))
|
|
1610
|
-
|
|
1611
|
-
api_events = finaldf
|
|
1612
|
-
|
|
1613
|
-
api_events.period = api_events.period.astype(int)
|
|
1614
|
-
api_events.time = api_events.time.astype(str)
|
|
1615
|
-
|
|
1616
|
-
api_events.event = np.where(api_events.event=='BLOCKED_SHOT', 'BLOCK',
|
|
1617
|
-
np.where(api_events.event=='BLOCKEDSHOT', 'BLOCK',
|
|
1618
|
-
np.where(api_events.event=='MISSED_SHOT', 'MISS',
|
|
1619
|
-
np.where(api_events.event=='FACEOFF', 'FAC',
|
|
1620
|
-
np.where(api_events.event=='PENALTY', 'PENL',
|
|
1621
|
-
np.where(api_events.event=='GIVEAWAY', 'GIVE',
|
|
1622
|
-
np.where(api_events.event=='TAKEAWAY', 'TAKE',
|
|
1623
|
-
np.where(api_events.event=='MISSEDSHOT', 'MISS',
|
|
1624
|
-
api_events.event))))))))
|
|
1625
|
-
|
|
1626
|
-
api_events = api_events[api_events.event.isin(['TAKE', 'GIVE', 'MISS', 'HIT', 'SHOT', 'BLOCK', 'GOAL', 'PENL', 'FAC'])]
|
|
1627
|
-
|
|
1628
|
-
api_events['awayteamfull'] = (api_events.awayteamfull.str.upper())
|
|
1629
|
-
api_events['hometeamfull'] = (api_events.hometeamfull.str.upper())
|
|
1630
|
-
api_events['eventteamfull'] = (api_events.eventteamfull.str.upper())
|
|
1631
|
-
|
|
1632
|
-
api_events['period_seconds'] = api_events.time.str.split(':').str[0].astype(int) * 60 + api_events.time.str.split(':').str[1].astype(int)
|
|
1633
|
-
|
|
1634
|
-
api_events['game_seconds'] = (np.where(api_events.period<5,
|
|
1635
|
-
(((api_events.period - 1) * 1200) + api_events.period_seconds),
|
|
1636
|
-
3900))
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
api_events = api_events.loc[:, ['period_seconds', 'game_seconds', 'event', 'session', 'coords_x', 'coords_y', 'description', 'period',
|
|
1640
|
-
'eventteam', 'eventteamfull', 'hometeamfull', 'awayteamfull', 'player1name', 'player2name', 'player3name', 'player4name']].rename(
|
|
1641
|
-
columns = {'eventteamfull':'event_team'})
|
|
1642
|
-
|
|
1643
|
-
api_events = api_events.assign(
|
|
1644
|
-
player1name = api_events.player1name.str.upper(),
|
|
1645
|
-
player2name = api_events.player2name.str.upper(),
|
|
1646
|
-
player3name = api_events.player3name.str.upper()
|
|
1647
|
-
).drop(columns = 'player4name').rename(columns = {'player1name':'ep1_name', 'player2name':'ep2_name', 'player3name':'ep3_name'})
|
|
1648
|
-
|
|
1649
|
-
api_events = api_events.assign(event_team = np.where(api_events.event!='BLOCK', api_events.event_team,
|
|
1650
|
-
np.where(api_events.event_team==api_events.hometeamfull, api_events.awayteamfull, api_events.hometeamfull)))
|
|
1651
|
-
|
|
1652
|
-
api_events = api_events.assign(ep1_name = np.where(api_events.event!='BLOCK', api_events.ep1_name, api_events.ep2_name))
|
|
1653
|
-
|
|
1654
|
-
api_events = api_events.sort_values(by = ['game_seconds', 'event_team', 'ep1_name'])
|
|
1655
|
-
|
|
1656
|
-
api_events = api_events.assign(version =
|
|
1657
|
-
(np.where(
|
|
1658
|
-
(api_events.event==api_events.event.shift()) &
|
|
1659
|
-
(api_events.ep1_name==api_events.ep1_name.shift()) &
|
|
1660
|
-
(api_events.game_seconds==api_events.game_seconds.shift()),
|
|
1661
|
-
1, 0)))
|
|
1662
|
-
|
|
1663
|
-
api_events = api_events.assign(version =
|
|
1664
|
-
(np.where(
|
|
1665
|
-
(api_events.event==api_events.event.shift(2)) &
|
|
1666
|
-
(api_events.ep1_name==api_events.ep1_name.shift(2)) &
|
|
1667
|
-
(api_events.game_seconds==api_events.game_seconds.shift(2) )&
|
|
1668
|
-
(~api_events.description.str.contains('Penalty Shot')),
|
|
1669
|
-
2, api_events.version)))
|
|
1670
|
-
|
|
1671
|
-
api_events = api_events.assign(version =
|
|
1672
|
-
(np.where(
|
|
1673
|
-
(api_events.event==api_events.event.shift(3)) &
|
|
1674
|
-
(api_events.ep1_name==api_events.ep1_name.shift(3)) &
|
|
1675
|
-
(api_events.game_seconds==api_events.game_seconds.shift(3)),
|
|
1676
|
-
3, api_events.version)))#.drop(columns = 'description')
|
|
1677
|
-
|
|
1678
|
-
api_events['ep1_name'] = np.where((api_events.description.str.contains('Too many men')) | (api_events.description.str.contains('unsportsmanlike conduct-bench')), 'BENCH', api_events['ep1_name'])
|
|
1679
|
-
|
|
1680
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'].str.contains('ALEXANDRE '),
|
|
1681
|
-
api_events['ep1_name'].str.replace('ALEXANDRE ', 'ALEX '),
|
|
1682
|
-
api_events['ep1_name'])
|
|
1683
|
-
|
|
1684
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'].str.contains('ALEXANDER '),
|
|
1685
|
-
api_events['ep1_name'].str.replace('ALEXANDER ', 'ALEX '),
|
|
1686
|
-
api_events['ep1_name'])
|
|
1687
|
-
|
|
1688
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'].str.contains('CHRISTOPHER '),
|
|
1689
|
-
api_events['ep1_name'].str.replace('CHRISTOPHER ', 'CHRIS '),
|
|
1690
|
-
api_events['ep1_name'])
|
|
1691
|
-
|
|
1692
|
-
api_events = api_events.assign(
|
|
1693
|
-
ep1_name =
|
|
1694
|
-
(np.where(api_events['ep1_name']=="ALEX PECHURSKIY", "ALEX PECHURSKI",
|
|
1695
|
-
(np.where(api_events['ep1_name']=="BEN ONDRUS", "BENJAMIN ONDRUS",
|
|
1696
|
-
(np.where(api_events['ep1_name']=="BRYCE VAN BRABANT", "BRYCE VAN BRABANT",
|
|
1697
|
-
(np.where(api_events['ep1_name']=="CALVIN DE HAAN", "CALVIN DE HAAN",
|
|
1698
|
-
(np.where(api_events['ep1_name']=="CHASE DE LEO", "CHASE DE LEO",
|
|
1699
|
-
(np.where(api_events['ep1_name']=="CAL PETERSEN", "CALVIN PETERSEN",
|
|
1700
|
-
(np.where(api_events['ep1_name']=="DANIEL CARCILLO", "DAN CARCILLO",
|
|
1701
|
-
(np.where(api_events['ep1_name']=="DANNY O'REGAN", "DANIEL O'REGAN",
|
|
1702
|
-
(np.where(api_events['ep1_name']=="DAVID VAN DER GULIK", "DAVID VAN DER GULIK",
|
|
1703
|
-
(np.where(api_events['ep1_name']=="EVGENII DADONOV", "EVGENY DADONOV",
|
|
1704
|
-
(np.where(api_events['ep1_name']=="FREDDY MODIN", "FREDRIK MODIN",
|
|
1705
|
-
(np.where(api_events['ep1_name']=="GREG DE VRIES", "GREG DE VRIES",
|
|
1706
|
-
(np.where(api_events['ep1_name']=="ILYA ZUBOV", "ILJA ZUBOV",
|
|
1707
|
-
(np.where(api_events['ep1_name']=="JACOB DE LA ROSE", "JACOB DE LA ROSE",
|
|
1708
|
-
(np.where(api_events['ep1_name']=="JAMES VAN RIEMSDYK", "JAMES VAN RIEMSDYK",
|
|
1709
|
-
(np.where(api_events['ep1_name']=="JEAN-FRANCOIS JACQUES", "J-F JACQUES",
|
|
1710
|
-
(np.where(api_events['ep1_name']=="JAKOB FORSBACKA KARLSSON", "JAKOB FORSBACKA KARLSSON",
|
|
1711
|
-
(np.where(api_events['ep1_name']=="JIM DOWD", "JAMES DOWD",
|
|
1712
|
-
(np.where(api_events['ep1_name']=="JEFF HAMILTON", "JEFFREY HAMILTON",
|
|
1713
|
-
(np.where(api_events['ep1_name']=="JEFF PENNER", "JEFFREY PENNER",
|
|
1714
|
-
(np.where(api_events['ep1_name']=="JOEL ERIKSSON EK", "JOEL ERIKSSON EK",
|
|
1715
|
-
(np.where(api_events['ep1_name']=="MARK VAN GUILDER", "MARK VAN GUILDER",
|
|
1716
|
-
(np.where(api_events['ep1_name']=="MARTIN ST LOUIS", "MARTIN ST. LOUIS",
|
|
1717
|
-
(np.where(api_events['ep1_name']=="MARTIN ST PIERRE", "MARTIN ST. PIERRE",
|
|
1718
|
-
(np.where(api_events['ep1_name']=="MARTIN ST PIERRE", "MARTIN ST. PIERRE",
|
|
1719
|
-
(np.where(api_events['ep1_name']=="MICHAEL CAMMALLERI", "MIKE CAMMALLERI",
|
|
1720
|
-
(np.where(api_events['ep1_name']=="MICHAEL DAL COLLE", "MICHAEL DAL COLLE",
|
|
1721
|
-
(np.where(api_events['ep1_name']=="MICHAEL DEL ZOTTO", "MICHAEL DEL ZOTTO",
|
|
1722
|
-
(np.where(api_events['ep1_name']=="MIKE VERNACE", "MICHAEL VERNACE",
|
|
1723
|
-
(np.where(api_events['ep1_name']=="MIKE YORK", "MICHAEL YORK",
|
|
1724
|
-
(np.where(api_events['ep1_name']=="MIKE VAN RYN", "MIKE VAN RYN",
|
|
1725
|
-
(np.where(api_events['ep1_name']=="MITCHELL MARNER", "MITCH MARNER",
|
|
1726
|
-
(np.where(api_events['ep1_name']=="PAT MAROON", "PATRICK MAROON",
|
|
1727
|
-
(np.where(api_events['ep1_name']=="PA PARENTEAU", "P.A. PARENTEAU",
|
|
1728
|
-
(np.where(api_events['ep1_name']=="PHILLIP DI GIUSEPPE", "PHILLIP DI GIUSEPPE",
|
|
1729
|
-
(np.where(api_events['ep1_name']=="STEFAN DELLA ROVERE", "STEFAN DELLA ROVERE",
|
|
1730
|
-
(np.where(api_events['ep1_name']=="STEPHANE DA COSTA", "STEPHANE DA COSTA",
|
|
1731
|
-
(np.where(api_events['ep1_name']=="TJ GALIARDI", "T.J. GALIARDI",
|
|
1732
|
-
(np.where(api_events['ep1_name']=="TOBY ENSTROM", "TOBIAS ENSTROM",
|
|
1733
|
-
(np.where(api_events['ep1_name']=="TREVOR VAN RIEMSDYK", "TREVOR VAN RIEMSDYK",
|
|
1734
|
-
(np.where(api_events['ep1_name']=="ZACK FITZGERALD", "ZACH FITZGERALD",
|
|
1735
|
-
|
|
1736
|
-
## NEW CHANGES
|
|
1737
|
-
(np.where(api_events['ep1_name']=="TIM GETTINGER", "TIMOTHY GETTINGER",
|
|
1738
|
-
(np.where(api_events['ep1_name']=="THOMAS DI PAULI", "THOMAS DI PAULI",
|
|
1739
|
-
(np.where(api_events['ep1_name']=="NICHOLAS SHORE", "NICK SHORE",
|
|
1740
|
-
(np.where(api_events['ep1_name']=="T.J. TYNAN", "TJ TYNAN",
|
|
1741
|
-
|
|
1742
|
-
## '20-21 CHANGES (from HTM update function)
|
|
1743
|
-
(np.where(api_events['ep1_name']=="ALEXIS LAFRENI?RE", "ALEXIS LAFRENIÈRE",
|
|
1744
|
-
(np.where(api_events['ep1_name']=="ALEXIS LAFRENIERE", "ALEXIS LAFRENIÈRE",
|
|
1745
|
-
(np.where(api_events['ep1_name']=="TIM STUTZLE", "TIM STÜTZLE",
|
|
1746
|
-
(np.where(api_events['ep1_name']=="TIM ST?TZLE", "TIM STÜTZLE",
|
|
1747
|
-
(np.where(api_events['ep1_name']== "JANI HAKANPÃ\x84Ã\x84" , "JANI HAKANPAA",
|
|
1748
|
-
(np.where(api_events['ep1_name']=="EGOR SHARANGOVICH", "YEGOR SHARANGOVICH",
|
|
1749
|
-
(np.where(api_events['ep1_name']=="CALLAN FOOTE", "CAL FOOTE",
|
|
1750
|
-
(np.where(api_events['ep1_name']=="JOSH DUNNE", "JOSHUA DUNNE", api_events['ep1_name']
|
|
1751
|
-
))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
|
|
1752
|
-
)))))))))))))))))))))))))))))))))))))))))))))))
|
|
1753
|
-
|
|
1754
|
-
api_events['ep1_name'] = (np.where(api_events['ep1_name']== "JANIS MOSER" , "J.J. MOSER",
|
|
1755
|
-
(np.where(api_events['ep1_name']== "NICHOLAS PAUL" , "NICK PAUL",
|
|
1756
|
-
(np.where(api_events['ep1_name']== "JACOB MIDDLETON" , "JAKE MIDDLETON",
|
|
1757
|
-
(np.where(api_events['ep1_name']== "TOMMY NOVAK" , "THOMAS NOVAK",
|
|
1758
|
-
# New guys from 24-25
|
|
1759
|
-
(np.where(api_events['ep1_name']== "JOSHUA NORRIS" , "JOSH NORRIS",
|
|
1760
|
-
(np.where(api_events['ep1_name']== "P.O JOSEPH" , "PIERRE-OLIVIER JOSEPH",
|
|
1761
|
-
(np.where(api_events['ep1_name']== "MIKEY EYSSIMONT" , "MICHAEL EYSSIMONT",
|
|
1762
|
-
(np.where(api_events['ep1_name']== "MATAJ BLAMEL" , "MATAJ BLAMEL",
|
|
1763
|
-
(np.where(api_events['ep1_name']== "VITTORIO MANCINI" , "VICTOR MANCINI",
|
|
1764
|
-
(np.where(api_events['ep1_name']== "JOSHUA MAHURA" , "JOSH MAHURA",
|
|
1765
|
-
(np.where(api_events['ep1_name']== "JOSEPH VELENO" , "JOE VELENO",
|
|
1766
|
-
(np.where(api_events['ep1_name']== "ZACK BOLDUC" , "ZACHARY BOLDUC",
|
|
1767
|
-
(np.where(api_events['ep1_name']== "JOSHUA BROWN" , "JOSH BROWN",
|
|
1768
|
-
(np.where(api_events['ep1_name']== "JAKE LUCCHINI" , "JACOB LUCCHINI",
|
|
1769
|
-
(np.where(api_events['ep1_name']== "EMIL LILLEBERG" , "EMIL MARTINSEN LILLEBERG",
|
|
1770
|
-
(np.where(api_events['ep1_name']== "CAMERON ATKINSON" , "CAM ATKINSON",
|
|
1771
|
-
(np.where(api_events['ep1_name']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY",
|
|
1772
|
-
api_events['ep1_name']))))))))))))))))))))))))))))))))))
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
# 21-22 CHANGES
|
|
1776
|
-
|
|
1777
|
-
api_events['ep1_name'] = api_events['ep1_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
|
|
1778
|
-
|
|
1779
|
-
# Apply regex to remove (A) and (C) designations at end of names
|
|
1780
|
-
api_events['ep1_name'] = api_events['ep1_name'].apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
|
|
1781
|
-
api_events['ep1_name'] = api_events['ep1_name'].apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
|
|
1782
|
-
|
|
1783
|
-
# Apply specific name corrections
|
|
1784
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'] == "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", api_events['ep1_name']) # Need to do this after normalization, only then he becomes Slafkovska?
|
|
1785
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'] == "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", api_events['ep1_name'])
|
|
1786
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'] == "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", api_events['ep1_name'])
|
|
1787
|
-
|
|
1788
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'] == 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', api_events['ep1_name'])
|
|
1789
|
-
|
|
1790
|
-
api_events['ep1_name'] = np.where(api_events['ep1_name'] == 'MATAJ BLAMEL' , 'MATAJ BLAMEL', api_events['ep1_name'])
|
|
1791
|
-
|
|
1792
|
-
api_events['ep1_name'] = api_events['ep1_name'].str.replace(' ', ' ')
|
|
1793
|
-
|
|
1794
|
-
api_events = api_events.assign(ep1_name = np.where(api_events.ep1_name=='ALEX BARRÃ-BOULET', 'ALEX BARRE_BOULET', api_events.ep1_name))
|
|
1795
|
-
|
|
1796
|
-
if drop_description == True:
|
|
1797
|
-
|
|
1798
|
-
return api_events.loc[:, ['game_seconds', 'event', 'coords_x', 'coords_y', 'ep1_name', 'period', 'version']].rename(columns = {'ep1_name':'event_player_1'})
|
|
1799
|
-
|
|
1800
|
-
else:
|
|
1801
|
-
|
|
1802
|
-
return api_events.loc[:, ['game_seconds', 'event', 'coords_x', 'coords_y', 'ep1_name', 'period', 'version', 'description']].rename(columns = {'ep1_name':'event_player_1'})
|
|
1803
|
-
|
|
1804
|
-
else:
|
|
1805
|
-
print("This game doesn't exist within the API.")
|
|
1806
|
-
raise KeyError
|
|
1807
1405
|
|
|
1808
1406
|
def scrape_html_events(season, game_id):
|
|
1809
1407
|
#global game
|
|
@@ -2930,7 +2528,7 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
|
|
|
2930
2528
|
print('Scraping ESPN IDs')
|
|
2931
2529
|
espn_id = scrape_espn_ids_single_game(str(game_date.date()), espn_home_team, espn_away_team).espn_id.iloc[0]
|
|
2932
2530
|
print('Scraping ESPN Events')
|
|
2933
|
-
print('Here is the ESPN ID'
|
|
2531
|
+
print('Here is the ESPN ID:' espn_id)
|
|
2934
2532
|
event_coords = scrape_espn_events(int(espn_id))
|
|
2935
2533
|
event_coords['coordinate_source'] = 'espn'
|
|
2936
2534
|
events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|