TopDownHockey-Scraper-6.0.0.tar.gz → TopDownHockey-Scraper-6.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of TopDownHockey-Scraper might be problematic. See the registry's advisory page for this release for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TopDownHockey_Scraper
3
- Version: 6.0.0
3
+ Version: 6.0.5
4
4
  Summary: The TopDownHockey Scraper
5
5
  Home-page: https://github.com/TopDownHockey/TopDownHockey_Scraper
6
6
  Author: Patrick Bacon
@@ -17,8 +17,6 @@ Requires-Dist: numpy
17
17
  Requires-Dist: pandas
18
18
  Requires-Dist: bs4
19
19
  Requires-Dist: datetime
20
- Requires-Dist: seaborn
21
- Requires-Dist: matplotlib
22
20
  Requires-Dist: xmltodict
23
21
  Requires-Dist: lxml
24
22
  Requires-Dist: natsort
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = TopDownHockey_Scraper
3
- version = 5.0.2
3
+ version = 6.0.5
4
4
  author = Patrick Bacon
5
5
  author_email = patrick.s.bacon@gmail.com
6
6
  description = A package built for scraping hockey data from EliteProspects, the NHL's HTML/API reports, and ESPN's XML reports.
@@ -25,8 +25,6 @@ install_requires =
25
25
  pandas
26
26
  datetime
27
27
  requests
28
- seasborn
29
- matplotlib
30
28
  xml
31
29
  xmltodict
32
30
  requests
@@ -9,7 +9,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
9
9
 
10
10
  setup(
11
11
  name="TopDownHockey_Scraper", # Replace with your own username
12
- version="6.0.0",
12
+ version="6.0.5",
13
13
  author="Patrick Bacon",
14
14
  author_email="patrick.s.bacon@gmail.com",
15
15
  description="The TopDownHockey Scraper",
@@ -33,8 +33,6 @@ setup(
33
33
  'pandas',
34
34
  'bs4',
35
35
  'datetime',
36
- 'seaborn',
37
- 'matplotlib',
38
36
  'xmltodict',
39
37
  'lxml',
40
38
  'natsort'
@@ -62,121 +62,6 @@ team_names = ['ANAHEIM DUCKS',
62
62
 
63
63
  ewc = ['SHOT', 'HIT', 'BLOCK', 'MISS', 'GIVE', 'TAKE', 'GOAL']
64
64
 
65
- def scrape_schedule_one_week(start_date):
66
-
67
- url = f'https://api-web.nhle.com/v1/schedule/{start_date}'
68
- page = requests.get(url, timeout = 500)
69
- loaddict = json.loads(page.content)
70
-
71
- game_df = pd.DataFrame()
72
-
73
- for i in range(0, (len(loaddict['gameWeek']))):
74
- #print(i)
75
- game_day = loaddict['gameWeek'][i]
76
- game_df = game_df._append(pd.DataFrame(game_day['games']).assign(date = game_day['date']).rename(columns = {'id':'ID'}))
77
-
78
- home_df = pd.DataFrame(game_df['homeTeam'].values.tolist())
79
- away_df = pd.DataFrame(game_df['awayTeam'].values.tolist())
80
-
81
- game_df = game_df.assign(
82
- home_team = game_df.homeTeam.apply(lambda x: x['abbrev']),
83
- away_team = game_df.awayTeam.apply(lambda x: x['abbrev'])
84
- )
85
-
86
- game_df = game_df.assign(state = np.where(game_df.gameState=='OFF', 'Final',
87
- np.where(game_df.gameState=='FUT', 'Scheduled',
88
- np.where(game_df.gameState=='LIVE', 'In Progress',
89
- 'Error'))))
90
-
91
- game_df = game_df.assign(type = np.where(game_df.gameType==2, 'R', 'Error'),
92
- venue = game_df['venue'].apply(lambda x: x['default']))
93
-
94
- game_df = game_df.assign(ID = game_df.ID.astype(int), season = game_df.season.astype(int))
95
-
96
- schedule = game_df.loc[:, ['ID', 'type', 'season', 'date', 'home_team', 'away_team', 'state']]
97
-
98
- return schedule
99
-
100
- def scrape_full_schedule(
101
- start_date = '2023-10-07',
102
- end_date = '2024-04-18'):
103
-
104
- full_schedule = pd.DataFrame()
105
-
106
- scrape_day = start_date
107
-
108
- while scrape_day <= end_date:
109
-
110
- print(scrape_day)
111
-
112
- week_scrape = scrape_schedule_one_week(scrape_day)
113
-
114
- full_schedule = full_schedule._append(week_scrape)
115
-
116
- last_day_scraped = max(full_schedule.date)
117
-
118
- scrape_day = datetime.strftime((datetime.strptime(last_day_scraped, '%Y-%m-%d').date() + timedelta(days = 1)), '%Y-%m-%d')
119
-
120
- return full_schedule[full_schedule.type=='R']
121
-
122
- def scrape_standings(season):
123
- """
124
- Takes an integer in "20202021" form and scrapes standings for that season.
125
- """
126
- url = 'https://statsapi.web.nhl.com/api/v1/standings?season=' + str(season)
127
- page = requests.get(url, timeout = 500)
128
- loaddict = json.loads(page.content)
129
- record_df = pd.DataFrame(loaddict['records'])
130
- team = []
131
- wins = []
132
- losses = []
133
- otl = []
134
- rw = []
135
- ga = []
136
- gf = []
137
- row = []
138
- gp = []
139
- pts = []
140
- divisions = []
141
- conferences = []
142
-
143
- for i in range(0, len(record_df['teamRecords'])):
144
- div = (record_df['division'].iloc[i]['name'])
145
- conf = (record_df['conference'].iloc[i]['name'])
146
- for x in range(0, len((record_df['teamRecords'].iloc[i]))):
147
- divisions._append(div)
148
- conferences._append(conf)
149
- team._append(record_df['teamRecords'].iloc[i][x]['team']['name'])
150
- wins._append(record_df['teamRecords'].iloc[i][x]['leagueRecord']['wins'])
151
- losses._append(record_df['teamRecords'].iloc[i][x]['leagueRecord']['losses'])
152
- otl._append(record_df['teamRecords'].iloc[i][x]['leagueRecord']['ot'])
153
- gf._append(record_df['teamRecords'].iloc[i][x]['goalsScored'])
154
- ga._append(record_df['teamRecords'].iloc[i][x]['goalsAgainst'])
155
- if season>20092010:
156
- row._append(record_df['teamRecords'].iloc[i][x]['row'])
157
- gp._append(record_df['teamRecords'].iloc[i][x]['gamesPlayed'])
158
- pts._append(record_df['teamRecords'].iloc[i][x]['points'])
159
- if season>20192020:
160
- rw._append(record_df['teamRecords'].iloc[i][x]['regulationWins'])
161
-
162
- if season < 20092010:
163
- stand = pd.DataFrame().assign(Team = team, Division = divisions, Conference = conferences,
164
- GP = gp, W = wins, L = losses, OTL = otl, PTS = pts, GF = gf, GA = ga)
165
- stand = stand.assign(GD = stand.GF - stand.GA).sort_values(by = ['PTS', 'GD'], ascending = False)
166
- return stand.assign(Season = season).loc[:, ['Season', 'Team', 'Division', 'Conference', 'GP', 'W', 'L', 'OTL', 'PTS', 'GF','GA', 'GD']].reset_index(drop = True)
167
-
168
- if ((season<20202021) & (season>20092010)):
169
- stand = pd.DataFrame().assign(Team = team, Division = divisions, Conference = conferences,
170
- GP = gp, W = wins, L = losses, OTL = otl, PTS = pts, GF = gf, GA = ga, ROW = row)
171
- stand = stand.assign(GD = stand.GF - stand.GA).sort_values(by = ['PTS', 'ROW', 'GD'], ascending = False)
172
- return stand.assign(Season = season).loc[:, ['Season', 'Team', 'Division', 'Conference', 'GP', 'W', 'L', 'OTL', 'PTS', 'GF','GA', 'ROW', 'GD']].reset_index(drop = True)
173
-
174
- else:
175
- stand = pd.DataFrame().assign(Team = team, Division = divisions, Conference = conferences,
176
- GP = gp, W = wins, L = losses, OTL = otl, PTS = pts, GF = gf, GA = ga, RW = rw, ROW = row)
177
- stand = stand.assign(GD = stand.GF - stand.GA).sort_values(by = ['PTS', 'RW', 'ROW', 'GD'], ascending = False)
178
- return stand.assign(Season = season).loc[:, ['Season', 'Team', 'Division', 'Conference', 'GP', 'W', 'L', 'OTL', 'PTS', 'GF','GA', 'RW', 'ROW', 'GD']].reset_index(drop = True)
179
-
180
65
  def scrape_schedule(start_date, end_date):
181
66
 
182
67
  """
@@ -1517,293 +1402,6 @@ def scrape_api_events(game_id, drop_description = True, shift_to_espn = False):
1517
1402
 
1518
1403
  if shift_to_espn == True:
1519
1404
  raise KeyError
1520
-
1521
- page = requests.get(str('https://api-web.nhle.com/v1/gamecenter/' + str(game_id) + '/play-by-play'))
1522
-
1523
- if str(page) == '<Response [404]>':
1524
- raise KeyError('You got the 404 error; game data could not be found.')
1525
-
1526
- loaddict = json.loads(page.content)
1527
-
1528
- if loaddict['liveData']['plays']['allPlays'] != []:
1529
-
1530
- eventdf = pd.DataFrame(loaddict['liveData']['plays']['allPlays'])
1531
-
1532
- coordsdf = pd.DataFrame(eventdf['coordinates'].values.tolist(), index = eventdf.index)
1533
- resultdf = pd.DataFrame(eventdf['result'].values.tolist(), index = eventdf.index)
1534
- aboutdf = pd.DataFrame(eventdf['about'].values.tolist(), index = eventdf.index)
1535
- scoredf = pd.DataFrame(aboutdf['goals'].values.tolist(), index = aboutdf.index)
1536
- playerdf = pd.DataFrame(eventdf['players'])
1537
- teamdf = eventdf['team'].apply(pd.Series)
1538
- clean = playerdf[~pd.isna(playerdf.players)].reset_index()
1539
- clean_index = clean.loc[:, ['index']]
1540
- player1 = pd.DataFrame((pd.DataFrame(clean.reset_index()['players'].values.tolist())[0].values.tolist()))
1541
- player1df = pd.concat([clean_index, pd.DataFrame(player1['player'].values.tolist())], axis = 1).assign(playerType = player1['playerType']).rename(
1542
- columns = {'id':'player1id', 'fullName':'player1name', 'link':'player1link', 'playerType':'player1type'})
1543
- player2 = pd.concat([clean_index, pd.DataFrame((pd.DataFrame(clean['players'].values.tolist())[1]))], axis = 1)
1544
- player2 = player2[player2[1].notnull()]
1545
- player2df = pd.concat([player2.reset_index(drop = True),
1546
- (pd.DataFrame(pd.DataFrame(player2[1].values.tolist())['player'].values.tolist()).assign(playerType = (pd.DataFrame(player2[1].values.tolist())).loc[:, ['playerType']]))], axis = 1).drop(
1547
- columns = 1).rename(
1548
- columns = {'id':'player2id', 'fullName':'player2name', 'link':'player2link', 'playerType':'player2type'})
1549
-
1550
- if len((pd.DataFrame(clean['players'].values.tolist())).columns) > 2:
1551
-
1552
- player3 = pd.concat([clean_index, pd.DataFrame((pd.DataFrame(clean['players'].values.tolist())[2]))], axis = 1)
1553
- player3 = player3[player3[2].notnull()]
1554
- player3df = pd.concat([player3.reset_index(drop = True),
1555
- (pd.DataFrame(pd.DataFrame(player3[2].values.tolist())['player'].values.tolist()).assign(playerType = (pd.DataFrame(player3[2].values.tolist())).loc[:, ['playerType']]))], axis = 1).drop(
1556
- columns = 2).rename(
1557
- columns = {'id':'player3id', 'fullName':'player3name', 'link':'player3link', 'playerType':'player3type'})
1558
- else:
1559
- player3df = pd.DataFrame(columns = ['index', 'player3id', 'player3name', 'player3link', 'player3type'])
1560
-
1561
- if len((pd.DataFrame(clean['players'].values.tolist())).columns) > 3:
1562
-
1563
- player4 = pd.concat([clean_index, pd.DataFrame((pd.DataFrame(clean['players'].values.tolist())[3]))], axis = 1)
1564
- player4 = player4[player4[3].notnull()]
1565
- player4df = pd.concat([player4.reset_index(drop = True),
1566
- (pd.DataFrame(pd.DataFrame(player4[3].values.tolist())['player'].values.tolist()).assign(playerType = (pd.DataFrame(player4[3].values.tolist())).loc[:, ['playerType']]))], axis = 1).drop(
1567
- columns = 3).rename(
1568
- columns = {'id':'player4id', 'fullName':'player4name', 'link':'player4link', 'playerType':'player4type'})
1569
- else:
1570
- player4df = pd.DataFrame(columns = ['index', 'player4id', 'player4name', 'player4link', 'player4type'])
1571
-
1572
- finaldf = eventdf.assign(
1573
- hometeam = loaddict['gameData']['teams']['home']['triCode'],
1574
- hometeamfull = loaddict['gameData']['teams']['home']['name'],
1575
- awayteam = loaddict['gameData']['teams']['away']['triCode'],
1576
- awayteamfull = loaddict['gameData']['teams']['away']['name'],
1577
- description = resultdf['description'],
1578
- event = resultdf['eventTypeId'],
1579
- detail = resultdf['secondaryType'],
1580
- coords_x = coordsdf['x'],
1581
- coords_y = coordsdf['y'],
1582
- period = aboutdf['period'],
1583
- time = aboutdf['periodTime'],
1584
- homescore = scoredf['home'],
1585
- awayscore = scoredf['away'],
1586
- eventteam = teamdf['triCode'],
1587
- eventteamfull = teamdf['name'],
1588
- eventidx = aboutdf['eventIdx'],
1589
- eventNumber = aboutdf['eventId'],
1590
- session = loaddict['gameData']['game']['type'])
1591
-
1592
- finaldf = finaldf.drop(columns = ['result', 'about', 'coordinates', 'players', 'team'])
1593
-
1594
- finaldf = finaldf.reset_index().merge(
1595
- player1df, on = 'index', how = 'left').merge(
1596
- player2df, on = 'index', how = 'left').merge(
1597
- player3df, on = 'index', how = 'left').merge(
1598
- player4df, on = 'index', how = 'left')
1599
-
1600
- finaldf = finaldf.assign(
1601
- awayteamfull = finaldf.awayteamfull.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'),
1602
- hometeamfull = finaldf.hometeamfull.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'),
1603
- eventteamfull = finaldf.eventteamfull.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'))
1604
-
1605
- finaldf = finaldf.assign(
1606
- player1name = np.where((finaldf.player1name=='Sebastian Aho') & (finaldf.eventteam=='NYI'),
1607
- 'Sebastian Aho (SWE)',
1608
- finaldf.player1name
1609
- ))
1610
-
1611
- api_events = finaldf
1612
-
1613
- api_events.period = api_events.period.astype(int)
1614
- api_events.time = api_events.time.astype(str)
1615
-
1616
- api_events.event = np.where(api_events.event=='BLOCKED_SHOT', 'BLOCK',
1617
- np.where(api_events.event=='BLOCKEDSHOT', 'BLOCK',
1618
- np.where(api_events.event=='MISSED_SHOT', 'MISS',
1619
- np.where(api_events.event=='FACEOFF', 'FAC',
1620
- np.where(api_events.event=='PENALTY', 'PENL',
1621
- np.where(api_events.event=='GIVEAWAY', 'GIVE',
1622
- np.where(api_events.event=='TAKEAWAY', 'TAKE',
1623
- np.where(api_events.event=='MISSEDSHOT', 'MISS',
1624
- api_events.event))))))))
1625
-
1626
- api_events = api_events[api_events.event.isin(['TAKE', 'GIVE', 'MISS', 'HIT', 'SHOT', 'BLOCK', 'GOAL', 'PENL', 'FAC'])]
1627
-
1628
- api_events['awayteamfull'] = (api_events.awayteamfull.str.upper())
1629
- api_events['hometeamfull'] = (api_events.hometeamfull.str.upper())
1630
- api_events['eventteamfull'] = (api_events.eventteamfull.str.upper())
1631
-
1632
- api_events['period_seconds'] = api_events.time.str.split(':').str[0].astype(int) * 60 + api_events.time.str.split(':').str[1].astype(int)
1633
-
1634
- api_events['game_seconds'] = (np.where(api_events.period<5,
1635
- (((api_events.period - 1) * 1200) + api_events.period_seconds),
1636
- 3900))
1637
-
1638
-
1639
- api_events = api_events.loc[:, ['period_seconds', 'game_seconds', 'event', 'session', 'coords_x', 'coords_y', 'description', 'period',
1640
- 'eventteam', 'eventteamfull', 'hometeamfull', 'awayteamfull', 'player1name', 'player2name', 'player3name', 'player4name']].rename(
1641
- columns = {'eventteamfull':'event_team'})
1642
-
1643
- api_events = api_events.assign(
1644
- player1name = api_events.player1name.str.upper(),
1645
- player2name = api_events.player2name.str.upper(),
1646
- player3name = api_events.player3name.str.upper()
1647
- ).drop(columns = 'player4name').rename(columns = {'player1name':'ep1_name', 'player2name':'ep2_name', 'player3name':'ep3_name'})
1648
-
1649
- api_events = api_events.assign(event_team = np.where(api_events.event!='BLOCK', api_events.event_team,
1650
- np.where(api_events.event_team==api_events.hometeamfull, api_events.awayteamfull, api_events.hometeamfull)))
1651
-
1652
- api_events = api_events.assign(ep1_name = np.where(api_events.event!='BLOCK', api_events.ep1_name, api_events.ep2_name))
1653
-
1654
- api_events = api_events.sort_values(by = ['game_seconds', 'event_team', 'ep1_name'])
1655
-
1656
- api_events = api_events.assign(version =
1657
- (np.where(
1658
- (api_events.event==api_events.event.shift()) &
1659
- (api_events.ep1_name==api_events.ep1_name.shift()) &
1660
- (api_events.game_seconds==api_events.game_seconds.shift()),
1661
- 1, 0)))
1662
-
1663
- api_events = api_events.assign(version =
1664
- (np.where(
1665
- (api_events.event==api_events.event.shift(2)) &
1666
- (api_events.ep1_name==api_events.ep1_name.shift(2)) &
1667
- (api_events.game_seconds==api_events.game_seconds.shift(2) )&
1668
- (~api_events.description.str.contains('Penalty Shot')),
1669
- 2, api_events.version)))
1670
-
1671
- api_events = api_events.assign(version =
1672
- (np.where(
1673
- (api_events.event==api_events.event.shift(3)) &
1674
- (api_events.ep1_name==api_events.ep1_name.shift(3)) &
1675
- (api_events.game_seconds==api_events.game_seconds.shift(3)),
1676
- 3, api_events.version)))#.drop(columns = 'description')
1677
-
1678
- api_events['ep1_name'] = np.where((api_events.description.str.contains('Too many men')) | (api_events.description.str.contains('unsportsmanlike conduct-bench')), 'BENCH', api_events['ep1_name'])
1679
-
1680
- api_events['ep1_name'] = np.where(api_events['ep1_name'].str.contains('ALEXANDRE '),
1681
- api_events['ep1_name'].str.replace('ALEXANDRE ', 'ALEX '),
1682
- api_events['ep1_name'])
1683
-
1684
- api_events['ep1_name'] = np.where(api_events['ep1_name'].str.contains('ALEXANDER '),
1685
- api_events['ep1_name'].str.replace('ALEXANDER ', 'ALEX '),
1686
- api_events['ep1_name'])
1687
-
1688
- api_events['ep1_name'] = np.where(api_events['ep1_name'].str.contains('CHRISTOPHER '),
1689
- api_events['ep1_name'].str.replace('CHRISTOPHER ', 'CHRIS '),
1690
- api_events['ep1_name'])
1691
-
1692
- api_events = api_events.assign(
1693
- ep1_name =
1694
- (np.where(api_events['ep1_name']=="ALEX PECHURSKIY", "ALEX PECHURSKI",
1695
- (np.where(api_events['ep1_name']=="BEN ONDRUS", "BENJAMIN ONDRUS",
1696
- (np.where(api_events['ep1_name']=="BRYCE VAN BRABANT", "BRYCE VAN BRABANT",
1697
- (np.where(api_events['ep1_name']=="CALVIN DE HAAN", "CALVIN DE HAAN",
1698
- (np.where(api_events['ep1_name']=="CHASE DE LEO", "CHASE DE LEO",
1699
- (np.where(api_events['ep1_name']=="CAL PETERSEN", "CALVIN PETERSEN",
1700
- (np.where(api_events['ep1_name']=="DANIEL CARCILLO", "DAN CARCILLO",
1701
- (np.where(api_events['ep1_name']=="DANNY O'REGAN", "DANIEL O'REGAN",
1702
- (np.where(api_events['ep1_name']=="DAVID VAN DER GULIK", "DAVID VAN DER GULIK",
1703
- (np.where(api_events['ep1_name']=="EVGENII DADONOV", "EVGENY DADONOV",
1704
- (np.where(api_events['ep1_name']=="FREDDY MODIN", "FREDRIK MODIN",
1705
- (np.where(api_events['ep1_name']=="GREG DE VRIES", "GREG DE VRIES",
1706
- (np.where(api_events['ep1_name']=="ILYA ZUBOV", "ILJA ZUBOV",
1707
- (np.where(api_events['ep1_name']=="JACOB DE LA ROSE", "JACOB DE LA ROSE",
1708
- (np.where(api_events['ep1_name']=="JAMES VAN RIEMSDYK", "JAMES VAN RIEMSDYK",
1709
- (np.where(api_events['ep1_name']=="JEAN-FRANCOIS JACQUES", "J-F JACQUES",
1710
- (np.where(api_events['ep1_name']=="JAKOB FORSBACKA KARLSSON", "JAKOB FORSBACKA KARLSSON",
1711
- (np.where(api_events['ep1_name']=="JIM DOWD", "JAMES DOWD",
1712
- (np.where(api_events['ep1_name']=="JEFF HAMILTON", "JEFFREY HAMILTON",
1713
- (np.where(api_events['ep1_name']=="JEFF PENNER", "JEFFREY PENNER",
1714
- (np.where(api_events['ep1_name']=="JOEL ERIKSSON EK", "JOEL ERIKSSON EK",
1715
- (np.where(api_events['ep1_name']=="MARK VAN GUILDER", "MARK VAN GUILDER",
1716
- (np.where(api_events['ep1_name']=="MARTIN ST LOUIS", "MARTIN ST. LOUIS",
1717
- (np.where(api_events['ep1_name']=="MARTIN ST PIERRE", "MARTIN ST. PIERRE",
1718
- (np.where(api_events['ep1_name']=="MARTIN ST PIERRE", "MARTIN ST. PIERRE",
1719
- (np.where(api_events['ep1_name']=="MICHAEL CAMMALLERI", "MIKE CAMMALLERI",
1720
- (np.where(api_events['ep1_name']=="MICHAEL DAL COLLE", "MICHAEL DAL COLLE",
1721
- (np.where(api_events['ep1_name']=="MICHAEL DEL ZOTTO", "MICHAEL DEL ZOTTO",
1722
- (np.where(api_events['ep1_name']=="MIKE VERNACE", "MICHAEL VERNACE",
1723
- (np.where(api_events['ep1_name']=="MIKE YORK", "MICHAEL YORK",
1724
- (np.where(api_events['ep1_name']=="MIKE VAN RYN", "MIKE VAN RYN",
1725
- (np.where(api_events['ep1_name']=="MITCHELL MARNER", "MITCH MARNER",
1726
- (np.where(api_events['ep1_name']=="PAT MAROON", "PATRICK MAROON",
1727
- (np.where(api_events['ep1_name']=="PA PARENTEAU", "P.A. PARENTEAU",
1728
- (np.where(api_events['ep1_name']=="PHILLIP DI GIUSEPPE", "PHILLIP DI GIUSEPPE",
1729
- (np.where(api_events['ep1_name']=="STEFAN DELLA ROVERE", "STEFAN DELLA ROVERE",
1730
- (np.where(api_events['ep1_name']=="STEPHANE DA COSTA", "STEPHANE DA COSTA",
1731
- (np.where(api_events['ep1_name']=="TJ GALIARDI", "T.J. GALIARDI",
1732
- (np.where(api_events['ep1_name']=="TOBY ENSTROM", "TOBIAS ENSTROM",
1733
- (np.where(api_events['ep1_name']=="TREVOR VAN RIEMSDYK", "TREVOR VAN RIEMSDYK",
1734
- (np.where(api_events['ep1_name']=="ZACK FITZGERALD", "ZACH FITZGERALD",
1735
-
1736
- ## NEW CHANGES
1737
- (np.where(api_events['ep1_name']=="TIM GETTINGER", "TIMOTHY GETTINGER",
1738
- (np.where(api_events['ep1_name']=="THOMAS DI PAULI", "THOMAS DI PAULI",
1739
- (np.where(api_events['ep1_name']=="NICHOLAS SHORE", "NICK SHORE",
1740
- (np.where(api_events['ep1_name']=="T.J. TYNAN", "TJ TYNAN",
1741
-
1742
- ## '20-21 CHANGES (from HTM update function)
1743
- (np.where(api_events['ep1_name']=="ALEXIS LAFRENI?RE", "ALEXIS LAFRENIÈRE",
1744
- (np.where(api_events['ep1_name']=="ALEXIS LAFRENIERE", "ALEXIS LAFRENIÈRE",
1745
- (np.where(api_events['ep1_name']=="TIM STUTZLE", "TIM STÜTZLE",
1746
- (np.where(api_events['ep1_name']=="TIM ST?TZLE", "TIM STÜTZLE",
1747
- (np.where(api_events['ep1_name']== "JANI HAKANPÃ\x84Ã\x84" , "JANI HAKANPAA",
1748
- (np.where(api_events['ep1_name']=="EGOR SHARANGOVICH", "YEGOR SHARANGOVICH",
1749
- (np.where(api_events['ep1_name']=="CALLAN FOOTE", "CAL FOOTE",
1750
- (np.where(api_events['ep1_name']=="JOSH DUNNE", "JOSHUA DUNNE", api_events['ep1_name']
1751
- ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
1752
- )))))))))))))))))))))))))))))))))))))))))))))))
1753
-
1754
- api_events['ep1_name'] = (np.where(api_events['ep1_name']== "JANIS MOSER" , "J.J. MOSER",
1755
- (np.where(api_events['ep1_name']== "NICHOLAS PAUL" , "NICK PAUL",
1756
- (np.where(api_events['ep1_name']== "JACOB MIDDLETON" , "JAKE MIDDLETON",
1757
- (np.where(api_events['ep1_name']== "TOMMY NOVAK" , "THOMAS NOVAK",
1758
- # New guys from 24-25
1759
- (np.where(api_events['ep1_name']== "JOSHUA NORRIS" , "JOSH NORRIS",
1760
- (np.where(api_events['ep1_name']== "P.O JOSEPH" , "PIERRE-OLIVIER JOSEPH",
1761
- (np.where(api_events['ep1_name']== "MIKEY EYSSIMONT" , "MICHAEL EYSSIMONT",
1762
- (np.where(api_events['ep1_name']== "MATAJ BLAMEL" , "MATAJ BLAMEL",
1763
- (np.where(api_events['ep1_name']== "VITTORIO MANCINI" , "VICTOR MANCINI",
1764
- (np.where(api_events['ep1_name']== "JOSHUA MAHURA" , "JOSH MAHURA",
1765
- (np.where(api_events['ep1_name']== "JOSEPH VELENO" , "JOE VELENO",
1766
- (np.where(api_events['ep1_name']== "ZACK BOLDUC" , "ZACHARY BOLDUC",
1767
- (np.where(api_events['ep1_name']== "JOSHUA BROWN" , "JOSH BROWN",
1768
- (np.where(api_events['ep1_name']== "JAKE LUCCHINI" , "JACOB LUCCHINI",
1769
- (np.where(api_events['ep1_name']== "EMIL LILLEBERG" , "EMIL MARTINSEN LILLEBERG",
1770
- (np.where(api_events['ep1_name']== "CAMERON ATKINSON" , "CAM ATKINSON",
1771
- (np.where(api_events['ep1_name']== "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY",
1772
- api_events['ep1_name']))))))))))))))))))))))))))))))))))
1773
-
1774
-
1775
- # 21-22 CHANGES
1776
-
1777
- api_events['ep1_name'] = api_events['ep1_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8').str.upper()
1778
-
1779
- # Apply regex to remove (A) and (C) designations at end of names
1780
- api_events['ep1_name'] = api_events['ep1_name'].apply(lambda x: re.sub(r' \(A\)$', '', x).strip())
1781
- api_events['ep1_name'] = api_events['ep1_name'].apply(lambda x: re.sub(r' \(C\)$', '', x).strip())
1782
-
1783
- # Apply specific name corrections
1784
- api_events['ep1_name'] = np.where(api_events['ep1_name'] == "JURAJ SLAFKOVSKA" , "JURAJ SLAFKOVSKY", api_events['ep1_name']) # Need to do this after normalization, only then he becomes Slafkovska?
1785
- api_events['ep1_name'] = np.where(api_events['ep1_name'] == "JOHN (JACK) ROSLOVIC" , "JACK ROSLOVIC", api_events['ep1_name'])
1786
- api_events['ep1_name'] = np.where(api_events['ep1_name'] == "ANTHONY-JOHN (AJ) GREER" , "A.J. GREER", api_events['ep1_name'])
1787
-
1788
- api_events['ep1_name'] = np.where(api_events['ep1_name'] == 'MARTIN FEHARVARY' , 'MARTIN FEHERVARY', api_events['ep1_name'])
1789
-
1790
- api_events['ep1_name'] = np.where(api_events['ep1_name'] == 'MATAJ BLAMEL' , 'MATAJ BLAMEL', api_events['ep1_name'])
1791
-
1792
- api_events['ep1_name'] = api_events['ep1_name'].str.replace(' ', ' ')
1793
-
1794
- api_events = api_events.assign(ep1_name = np.where(api_events.ep1_name=='ALEX BARRÉ-BOULET', 'ALEX BARRE_BOULET', api_events.ep1_name))
1795
-
1796
- if drop_description == True:
1797
-
1798
- return api_events.loc[:, ['game_seconds', 'event', 'coords_x', 'coords_y', 'ep1_name', 'period', 'version']].rename(columns = {'ep1_name':'event_player_1'})
1799
-
1800
- else:
1801
-
1802
- return api_events.loc[:, ['game_seconds', 'event', 'coords_x', 'coords_y', 'ep1_name', 'period', 'version', 'description']].rename(columns = {'ep1_name':'event_player_1'})
1803
-
1804
- else:
1805
- print("This game doesn't exist within the API.")
1806
- raise KeyError
1807
1405
 
1808
1406
  def scrape_html_events(season, game_id):
1809
1407
  #global game
@@ -2255,6 +1853,7 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
2255
1853
  this_date = (game_date)
2256
1854
  url = 'http://www.espn.com/nhl/scoreboard?date=' + this_date.replace("-", "")
2257
1855
  page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout = 500)
1856
+ print('Request to ESPN IDs successful.')
2258
1857
  soup = BeautifulSoup(page.content, parser = 'lxml')
2259
1858
  soup_found = soup.find_all('a', {'class':['AnchorLink truncate',
2260
1859
  'AnchorLink Button Button--sm Button--anchorLink Button--alt mb4 w-100',
@@ -2866,7 +2465,7 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
2866
2465
  # If all goes well with the HTML scrape:
2867
2466
 
2868
2467
  try:
2869
- event_coords = scrape_api_events(game_id, shift_to_espn = shift_to_espn)
2468
+ event_coords = scrape_api_events(game_id, shift_to_espn = True)
2870
2469
  api_coords = event_coords
2871
2470
  api_coords['coordinate_source'] = 'api'
2872
2471
  if len(event_coords[(event_coords.event.isin(ewc)) & (pd.isna(event_coords.coords_x))]) > 0:
@@ -2926,7 +2525,10 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
2926
2525
  espn_home_team = 'SJS'
2927
2526
  if away_team == 'S.J':
2928
2527
  espn_away_team = 'SJS'
2528
+ print('Scraping ESPN IDs')
2929
2529
  espn_id = scrape_espn_ids_single_game(str(game_date.date()), espn_home_team, espn_away_team).espn_id.iloc[0]
2530
+ print('Scraping ESPN Events')
2531
+ print('Here is the ESPN ID:', espn_id)
2930
2532
  event_coords = scrape_espn_events(int(espn_id))
2931
2533
  event_coords['coordinate_source'] = 'espn'
2932
2534
  events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
@@ -3196,35 +2798,38 @@ def full_scrape(game_id_list, live = True, shift = False):
3196
2798
  df = full_scrape_1by1(game_id_list, live, shift_to_espn = shift)
3197
2799
 
3198
2800
  # Fixing the Pettersson issue for event player. Just going downstream for this.
3199
- df = df.assign(
3200
- event_player_1 = np.where(
3201
- (df.event_player_1 == 'ELIAS PETTERSSON') &
3202
- (df.event_description.str.contains('#', na=False)) &
3203
- (df.event_description.str.contains(' PETTERSSON', na=False)) &
3204
- (df.event_description.str.extract(r'#(\d+) PETTERSSON', expand=False) == '25'),
3205
- 'ELIAS PETTERSSON(D)', df.event_player_1),
3206
- event_player_2 = np.where(
3207
- (df.event_player_2 == 'ELIAS PETTERSSON') &
3208
- (
3209
- # Goal and Petey got A1
3210
- ((df.event_type == 'GOAL') &
3211
- (df.event_description.str.contains(': #', na=False)) &
3212
- (df.event_description.str.contains(' PETTERSSON', na=False)) &
3213
- (df.event_description.str.extract(r': #(\d+) PETTERSSON', expand=False) == '25')) |
3214
- # Not a goal, Petey was EP2
3215
- ((df.event_type != 'GOAL') &
3216
- (df.event_description.str.contains('VAN #', na=False)) &
3217
- (df.event_description.str.contains(' PETTERSSON', na=False)) &
3218
- (df.event_description.str.extract(r'VAN #(\d+) PETTERSSON', expand=False) == '25'))
3219
- ),
3220
- 'ELIAS PETTERSSON(D)', df.event_player_2),
3221
- event_player_3 = np.where(
3222
- (df.event_player_3=='ELIAS PETTERSSON') &
3223
- (df.event_description.str.contains('#', na=False)) &
3224
- (df.event_description.str.contains(' PETTERSSON', na=False)) &
3225
- (df.event_description.str.extract(r'#(\d+) PETTERSSON(?:\s|$)', expand=False) == '25'),
3226
- 'ELIAS PETTERSSON(D)', df.event_player_3)
3227
- )
2801
+ try:
2802
+ df = df.assign(
2803
+ event_player_1 = np.where(
2804
+ (df.event_player_1 == 'ELIAS PETTERSSON') &
2805
+ (df.event_description.str.contains('#', na=False)) &
2806
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
2807
+ (df.event_description.str.extract(r'#(\d+) PETTERSSON', expand=False) == '25'),
2808
+ 'ELIAS PETTERSSON(D)', df.event_player_1),
2809
+ event_player_2 = np.where(
2810
+ (df.event_player_2 == 'ELIAS PETTERSSON') &
2811
+ (
2812
+ # Goal and Petey got A1
2813
+ ((df.event_type == 'GOAL') &
2814
+ (df.event_description.str.contains(': #', na=False)) &
2815
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
2816
+ (df.event_description.str.extract(r': #(\d+) PETTERSSON', expand=False) == '25')) |
2817
+ # Not a goal, Petey was EP2
2818
+ ((df.event_type != 'GOAL') &
2819
+ (df.event_description.str.contains('VAN #', na=False)) &
2820
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
2821
+ (df.event_description.str.extract(r'VAN #(\d+) PETTERSSON', expand=False) == '25'))
2822
+ ),
2823
+ 'ELIAS PETTERSSON(D)', df.event_player_2),
2824
+ event_player_3 = np.where(
2825
+ (df.event_player_3=='ELIAS PETTERSSON') &
2826
+ (df.event_description.str.contains('#', na=False)) &
2827
+ (df.event_description.str.contains(' PETTERSSON', na=False)) &
2828
+ (df.event_description.str.extract(r'#(\d+) PETTERSSON(?:\s|$)', expand=False) == '25'),
2829
+ 'ELIAS PETTERSSON(D)', df.event_player_3)
2830
+ )
2831
+ except Exception as e:
2832
+ print(e)
3228
2833
 
3229
2834
  # Don't even need this, we've had this problem with Stutzle for years, just let it be.
3230
2835
  # df.event_description = df.event_description.str.replace('FEHÃ\x89RVÃ\x81RY', 'FEHERVARY').str.replace('BLÜMEL', 'BLAMEL')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TopDownHockey_Scraper
3
- Version: 6.0.0
3
+ Version: 6.0.5
4
4
  Summary: The TopDownHockey Scraper
5
5
  Home-page: https://github.com/TopDownHockey/TopDownHockey_Scraper
6
6
  Author: Patrick Bacon
@@ -17,8 +17,6 @@ Requires-Dist: numpy
17
17
  Requires-Dist: pandas
18
18
  Requires-Dist: bs4
19
19
  Requires-Dist: datetime
20
- Requires-Dist: seaborn
21
- Requires-Dist: matplotlib
22
20
  Requires-Dist: xmltodict
23
21
  Requires-Dist: lxml
24
22
  Requires-Dist: natsort
@@ -2,8 +2,6 @@ numpy
2
2
  pandas
3
3
  bs4
4
4
  datetime
5
- seaborn
6
- matplotlib
7
5
  xmltodict
8
6
  lxml
9
7
  natsort