topdownhockey_scraper-6.1.30-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py +820 -0
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +3285 -0
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper_OG.py +3224 -0
- TopDownHockey_Scraper/__init__.py +26 -0
- TopDownHockey_Scraper/data/handedness.csv +1276 -0
- TopDownHockey_Scraper/name_corrections.py +302 -0
- TopDownHockey_Scraper/portrait_links.csv +2445 -0
- TopDownHockey_Scraper/scrape_nhl_api_events.py +438 -0
- topdownhockey_scraper-6.1.30.dist-info/METADATA +169 -0
- topdownhockey_scraper-6.1.30.dist-info/RECORD +13 -0
- topdownhockey_scraper-6.1.30.dist-info/WHEEL +5 -0
- topdownhockey_scraper-6.1.30.dist-info/licenses/LICENSE +19 -0
- topdownhockey_scraper-6.1.30.dist-info/top_level.txt +1 -0
@@ -0,0 +1,820 @@
"""

A package built for scraping Elite Prospects

This package is built for personal use. If you are interested in professional use, look into the EliteProspects API.
"""

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
import sys
from requests import ConnectionError, ReadTimeout, ConnectTimeout, HTTPError, Timeout

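# The user-facing entry points in this module are get_skaters(), get_goalies(),
# get_player_information(), and add_player_information(); everything else is a back-end
# helper. A minimal usage sketch follows (the league slug and season are illustrative
# EliteProspects values, and the calls are left commented out so that importing the
# module never starts a scrape):
#
# ohl_skaters = get_skaters("ohl", "2019-2020")          # one league, one season
# ohl_with_bio = add_player_information(ohl_skaters)     # join in biographical info
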
def tableDataText(table):

    """
    A function that is built strictly for the back end and should not be run by the user.
    Function built by Marcus Sjölin
    """

    rows = []
    trs = table.find_all('tr')

    headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')] # header row
    if headerow: # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs: # for every table row
        rows.append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row

    df_rows = pd.DataFrame(rows[1:], columns=rows[0])

    return df_rows

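# A minimal sketch of what tableDataText() expects: a BeautifulSoup <table> tag whose
# first row holds <th> header cells. The markup below is illustrative, not taken from
# EliteProspects, and the lines are commented out so the module has no side effects:
#
# example_html = "<table><tr><th>Player</th><th>GP</th></tr><tr><td>Example Player</td><td>10</td></tr></table>"
# example_table = BeautifulSoup(example_html, "html.parser").find("table")
# example_df = tableDataText(example_table)   # one-row DataFrame with columns Player and GP
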
def getskaters(league, year):
    """
    A function that is built strictly for the back end and should not be run by the user.
    """

    url = 'https://www.eliteprospects.com/league/' + league + '/stats/' + year + '?page='
    # print('Collects data from ' + 'https://www.eliteprospects.com/league/' + league + '/stats/' + year)

    print("Beginning scrape of " + league + " skater data from " + year + ".")

    # Return list with all players for season in link
    players = []

    page = (requests.get(url+str(1), timeout = 500))
    first_page_string = str(page)

    while first_page_string == '<Response [403]>':
        print("Just got a 403 Error before entering the page. Time to Sleep, then re-obtain the link.")
        time.sleep(100)
        page = (requests.get(url+str(1), timeout = 500))
        first_page_string = str(page)
        print("Changed the string before entering the page. Let's try again")

    if (str(first_page_string) == '<Response [404]>'):
        print("ERROR: " + str(first_page_string) + " on league: " + league + " in year: " + year + ". Data doesn't exist for this league in this year.")

    else:

        for i in range(1,99):
            page = requests.get(url+str(i), timeout = 500)
            page_string = str(page)

            while page_string == '<Response [403]>':
                print("Just got a 403 Error within the page. Time to Sleep, then re-obtain the link.")
                time.sleep(100)
                page = requests.get(url+str(i), timeout = 500)
                page_string = str(page)
                print("Changed the string within the page. Let's try again")

            soup = BeautifulSoup(page.content, "html.parser")

            # Get data for players table
            player_table = soup.find( "table", {"class":"table table-striped table-sortable player-stats highlight-stats season"})

            try:
                df_players = tableDataText(player_table)

            except AttributeError:
                print("BREAK: TABLE NONE ERROR: " + str(requests.get(url+str(i), timeout = 500)) + " On League: " + league + " In Year: " + year)
                break

            if len(df_players)>0:

                if df_players['#'].count()>0:
                    # Remove empty rows
                    df_players = df_players[df_players['#']!=''].reset_index(drop=True)

                    # Extract href links in table
                    href_row = []
                    for link in player_table.find_all('a'):
                        href_row.append(link.attrs['href'])

                    # Create data frame, rename and only keep links to players
                    df_links = pd.DataFrame(href_row)
                    df_links.rename(columns={ df_links.columns[0]:"link"}, inplace=True)
                    df_links= df_links[df_links['link'].str.contains("/player/")].reset_index(drop=True)

                    # Add links to players
                    df_players['link']=df_links['link']

                    players.append(df_players)

                    # Wait 3 seconds before going to next
                    #time.sleep(1)
                    #print("Scraped page " + str(i))

                else:
                    #print("Scraped final page of: " + league + " In Year: " + year)
                    break


        if len(players)!=0:
            df_players = pd.concat(players).reset_index()

            df_players.columns = map(str.lower, df_players.columns)

            # Clean up dataset
            df_players['season'] = year
            df_players['league'] = league

            df_players = df_players.drop(['index','#'], axis=1).reset_index(drop=True)

            df_players['playername'] = df_players['player'].str.replace(r"\(.*\)", "", regex=True)
            df_players['position'] = df_players['player'].str.extract(r'.*\((.*)\).*')
            df_players['position'] = np.where(pd.isna(df_players['position']), "F", df_players['position'])

            df_players['fw_def'] = df_players['position'].str.contains('LW|RW|C|F')
            df_players.loc[df_players['position'].str.contains('LW|RW|C'), 'fw_def'] = 'FW'
            df_players.loc[df_players['position'].str.contains('D'), 'fw_def'] = 'DEF'

            # Adjust columns; transform data
            team = df_players['team'].str.split("“", n=1, expand=True)
            df_players['team'] = team[0]

            # drop helper column
            df_players = df_players.drop(columns = ['fw_def'], axis=1)
            print("Successfully scraped all " + league + " skater data from " + year + ".")

            return df_players

        else: print("LENGTH 0 ERROR: " + str(requests.get(url+str(1), timeout = 500)) + " On League: " + league + " In Year: " + year)

def getgoalies(league, year):
    """
    A function that is built strictly for the back end and should not be run by the user.
    """

    url = 'https://www.eliteprospects.com/league/' + league + '/stats/' + year + '?page-goalie='
    # print('Collects data from ' + 'https://www.eliteprospects.com/league/' + league + '/stats/' + year)

    print("Beginning scrape of " + league + " goalie data from " + year + ".")

    # Return list with all players for season in link
    players = []

    page = (requests.get(url + str(1) + "#goalies", timeout = 500))
    first_page_string = str(page)

    while first_page_string == '<Response [403]>':
        print("Just got a 403 Error before entering the page. This means EliteProspects has temporarily blocked your IP address.")
        print("We're going to sleep for 100 seconds, then try again.")
        time.sleep(100)
        page = (requests.get(url + str(1) + "#goalies", timeout = 500))
        first_page_string = str(page)
        print("Okay, let's try this again")

    if (first_page_string) == '<Response [404]>':
        print("ERROR: " + first_page_string + " on league: " + league + " in year: " + year + ". Data doesn't exist for this league and season.")

    else:

        for i in range(1,99):
            page = requests.get(url+str(i), timeout = 500)
            page_string = str(page)

            while page_string == '<Response [403]>':
                print("Just got a 403 Error within the page. Time to Sleep, then re-obtain the link.")
                time.sleep(100)
                page = (requests.get(url+str(i), timeout = 500))
                page_string = str(page)
                print("Changed the string within the page. Let's try again")

            soup = BeautifulSoup(page.content, "html.parser")

            # Get data for players table
            player_table = soup.find("table", {"class":"table table-striped table-sortable goalie-stats highlight-stats season"})

            try:
                df_players = tableDataText(player_table)
            except AttributeError:
                print("BREAK: TABLE NONE ERROR: " + str(requests.get(url+str(i), timeout = 500)) + " On League: " + league + " In Year: " + year)
                break

            if len(df_players)>0:

                if df_players['#'].count()>0:
                    # Remove empty rows
                    df_players = df_players[df_players['#']!=''].reset_index(drop=True)

                    # Extract href links in table
                    href_row = []
                    for link in player_table.find_all('a'):
                        href_row.append(link.attrs['href'])

                    # Create data frame, rename and only keep links to players
                    df_links = pd.DataFrame(href_row)
                    df_links.rename(columns={ df_links.columns[0]:"link"}, inplace=True)
                    df_links= df_links[df_links['link'].str.contains("/player/")].reset_index(drop=True)

                    # Add links to players
                    df_players['link']=df_links['link']

                    players.append(df_players)

                    # Wait 3 seconds before going to next
                    # time.sleep(1)
                    #print("Scraped page " + str(i))

                else:
                    #print("Scraped final page of: " + league + " In Year: " + year)
                    break


        if len(players)!=0:
            df_players = pd.concat(players).reset_index()

            df_players.columns = map(str.lower, df_players.columns)

            # Clean up dataset
            df_players['season'] = year
            df_players['league'] = league

            df_players = df_players.drop(['index','#'], axis=1).reset_index(drop=True)

            print("Successfully scraped all " + league + " goalie data from " + year + ".")

            df_players = df_players.loc[((df_players.gp!=0) & (~pd.isna(df_players.gp)) & (df_players.gp!="0") & (df_players.gaa!="-"))]

            return df_players
        else: print("LENGTH 0 ERROR: " + str(requests.get(url+str(1), timeout = 500)) + " On League: " + league + " In Year: " + year)

def get_info(link):
    """
    A function that is built strictly for the back end and should not be run by the user.
    """

    page = requests.get(link, timeout = 500)
    soup = BeautifulSoup(page.content, "html.parser")

    page_string = str(page)

    while ((page_string == '<Response [403]>') or ("evil" in str(soup.p))):
        print("403 Error. re-obtaining string and re-trying.")
        page = requests.get(link, timeout = 500)
        page_string = str(page)
        soup = BeautifulSoup(page.content, "html.parser")
        time.sleep(60)

    lis = soup.find_all('li')

    relevant_lis = [li for li in lis if li.find('span') is not None]

    # player

    if soup.find("title") != None:
        player = soup.find("title").string.replace(' - Stats, Contract, Salary & More', '')
    else:
        player = '-'

    # rights

    if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
        rights = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' /')[0]
    else:
        rights = '-'

    # status

    if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
        status = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' / ')[1]
    else:
        status = '-'

    # dob

    if [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'] != []:
        dob = [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'][0]
    else:
        dob = '-'

    # height

    if [li for li in relevant_lis if li.find('span').text=='Height'] != []:
        height = [li for li in relevant_lis if li.find('span').text=='Height'][0].text.split('Height')[1].split(' cm')[0]
    else:
        height = '-'

    # weight

    if [li for li in relevant_lis if li.find('span').text=='Weight'] != []:
        weight = [li for li in relevant_lis if li.find('span').text=='Weight'][0].text.split('Weight')[1].split(' cm')[0]
    else:
        weight = '-'

    # birthplace

    if [li for li in relevant_lis if li.find('span').text=='Place of Birth'] != []:
        birthplace = [li for li in relevant_lis if li.find('span').text=='Place of Birth'][0].text.split('Birth')[1]
    else:
        birthplace = '-'

    # nation

    if [li for li in relevant_lis if li.find('span').text=='Nation'] != []:
        nation = [li for li in relevant_lis if li.find('span').text=='Nation'][0].text.split('Nation')[1]
    else:
        nation = '-'

    # shoots

    if [li for li in relevant_lis if li.find('span').text=='Shoots'] != []:
        shoots = [li for li in relevant_lis if li.find('span').text=='Shoots'][0].text.split('Shoots')[1]
    else:
        shoots = '-'

    # draft

    if [li for li in relevant_lis if li.find('span').text=='Drafted'] != []:
        draft = [li for li in relevant_lis if li.find('span').text=='Drafted'][0].text.split('Drafted')[1]
    else:
        draft = '-'

    return(player, rights, status, dob, height, weight, birthplace, nation, shoots, draft, link)

def get_player_information(dataframe):
    '''
    Takes a data frame from the get_skaters or get_goalies function and obtains biographical information for all players in said dataframe, then returns it as a dataframe.
    '''

    myplayer = []
    myrights = []
    mystatus = []
    mydob = []
    myheight = []
    myweight = []
    mybirthplace = []
    mynation = []
    myshot = []
    mydraft = []
    mylink = []

    print("Beginning scrape for " + str(len(list(set(dataframe.link)))) + " players.")

    for i in range(0, len(list(set(dataframe.link)))):
        try:
            myresult = get_info(((list(set(dataframe.link))[i])))
            myplayer.append(myresult[0])
            myrights.append(myresult[1])
            mystatus.append(myresult[2])
            mydob.append(myresult[3])
            myheight.append(myresult[4])
            myweight.append(myresult[5])
            mybirthplace.append(myresult[6])
            mynation.append(myresult[7])
            myshot.append(myresult[8])
            mydraft.append(myresult[9])
            mylink.append(myresult[10])
            print(myresult[0] + " scraped! That's " + str(i + 1) + " down! Only " + str(len(list(set(dataframe.link))) - (i + 1)) + " left to go!")
        except KeyboardInterrupt:
            print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
            break
        except (ConnectionError,
                HTTPError,
                ReadTimeout,
                ConnectTimeout,
                ValueError) as errormessage:
            print("You've been disconnected. Here's the error message:")
            print(errormessage)
            print("Luckily, everything you've scraped up to this point will still be safe.")
            break

    resultdf = pd.DataFrame(columns = ["player", "rights", "status", "dob", "height", "weight", "birthplace", "nation", "shoots", "draft", "link"])

    resultdf.player = myplayer
    resultdf.rights = myrights
    resultdf.status = mystatus
    resultdf.dob = mydob
    resultdf.height = myheight
    resultdf.weight = myweight
    resultdf.birthplace = mybirthplace
    resultdf.nation = mynation
    resultdf.shoots = myshot
    resultdf.draft = mydraft
    resultdf.link = mylink

    print("Your scrape is complete! You've obtained player information for " + str(len(resultdf)) + " players!")

    return resultdf

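# A hedged example of get_player_information(): it expects a dataframe that already has a
# "link" column, such as the output of get_skaters() or get_goalies(). The league and
# season below are illustrative, and the calls are commented out to avoid scraping on import:
#
# mhl_skaters = get_skaters("mhl", "2020-2021")
# mhl_bios = get_player_information(mhl_skaters)   # one row of biographical info per unique link
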
def get_league_skater_boxcars(league, seasons):
    """
    A function that is built strictly for the back end and should not be run by the user.
    """

    if len(set(seasons))==1:
        scraped_season_list = str(seasons)
    elif len(set(seasons))>2:
        scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
    else:
        scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])


    global hidden_patrick
    hidden_patrick = 0
    global error
    error = 0

    output = pd.DataFrame()

    if type(seasons) == str:
        single = getskaters(league, seasons)
        output = output._append(single)
        print("Scraping " + league + " data is complete. You scraped skater data from " + seasons + ".")
        return(output)

    elif ((type(seasons) == tuple) or (type(seasons) == list)):

        for i in range(0, len(seasons)):
            try:
                single = getskaters(league, seasons[i])
                output = output._append(single)
            except KeyboardInterrupt as e:
                hidden_patrick = 4
                error = e
                return output
            except (ConnectionError,
                    HTTPError,
                    ReadTimeout,
                    ConnectTimeout,
                    ValueError) as e:
                hidden_patrick = 5
                error = e
                return output

        print("Scraping " + league + " data is complete. You scraped skater data from " + scraped_season_list + ".")
        return(output)

def get_league_goalie_boxcars(league, seasons):
    """
    A function that is built strictly for the back end and should not be run by the user.
    """

    if len(set(seasons))==1:
        scraped_season_list = str(seasons)
    elif len(set(seasons))>2:
        scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
    else:
        scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])


    global hidden_patrick
    hidden_patrick = 0
    global error
    error = 0

    output = pd.DataFrame()

    if type(seasons) == str:
        single = getgoalies(league, seasons)
        output = output._append(single)
        print("Scraping " + league + " data is complete. You scraped goalie data from " + seasons + ".")
        return(output)

    elif ((type(seasons) == tuple) or (type(seasons) == list)):

        for i in range(0, len(seasons)):
            try:
                single = getgoalies(league, seasons[i])
                output = output._append(single)
            except KeyboardInterrupt as e:
                hidden_patrick = 4
                error = e
                return output
            except (ConnectionError,
                    HTTPError,
                    ReadTimeout,
                    ConnectTimeout) as e:
                hidden_patrick = 5
                error = e
                return output

        print("Scraping " + league + " data is complete. You scraped goalie data from " + scraped_season_list + ".")
        return(output)

def get_goalies(leagues, seasons):
    '''
    Obtains goalie data for at least one season and at least one league. Returns a dataframe.
    '''

    if (len(seasons)==1 or type(seasons)==str):
        season_string = str(seasons)
    elif len(seasons)==2:
        season_string = " and".join(str((tuple(sorted(tuple(seasons))))).replace("'", "").replace("(", "").replace(")", "").split(","))
    else:
        season_string = str(((tuple(sorted(tuple(seasons)))))[:-1]).replace("'", "").replace("(", "").replace(")", "") + " and " + str(((tuple(sorted(tuple(seasons)))))[-1])

    if (len(leagues)==1 or type(leagues)==str):
        league_string = str(leagues)
    elif len(leagues)==2:
        league_string = " and".join(str((tuple(sorted(tuple(leagues))))).replace("'", "").replace("(", "").replace(")", "").split(","))
    else:
        league_string = str(((tuple(sorted(tuple(leagues)))))[:-1]).replace("'", "").replace("(", "").replace(")", "") + " and " + str(((tuple(sorted(tuple(leagues)))))[-1])

    leaguesall = pd.DataFrame()

    if ((type(leagues)==str) and (type(seasons)==str)):
        print("Your scrape request is goalie data from the following league:")
        print(league_string)
        print("In the following season:")
        print(season_string)
        leaguesall = get_league_goalie_boxcars(leagues, seasons)
        print("Completed scraping goalie data from the following league:")
        print(str(leagues))
        print("Over the following season:")
        print(str(seasons))
        return(leaguesall.reset_index().drop(columns = 'index'))

    elif ((type(leagues)==str) and ((type(seasons) == tuple) or (type(seasons) == list))):
        print("Your scrape request is goalie data from the following league:")
        print(league_string)
        print("In the following seasons:")
        print(season_string)
        leaguesall = get_league_goalie_boxcars(leagues, seasons)

        if hidden_patrick == 4:
            print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
            return(leaguesall.reset_index().drop(columns = 'index'))
        if hidden_patrick == 5:
            print("You were disconnected! The output here will be every player you've scraped so far. Here's your error message:")
            print(error)
            return(leaguesall.reset_index().drop(columns = 'index'))

        if len(set(leaguesall.league))==1:
            scraped_league_list = leaguesall.league
        elif len(set(leaguesall.league))>2:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        if len(set(seasons))==1:
            scraped_season_list = seasons
        elif len(set(seasons))>2:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        print("Completed scraping goalie data from the following league:")
        print(str(leagues))
        print("Over the following seasons:")
        print(scraped_season_list)
        return(leaguesall.reset_index().drop(columns = 'index'))

    elif ((type(seasons) == str) and ((type(leagues) == tuple) or (type(leagues) == list))):
        print("Your scrape request is goalie data from the following leagues:")
        print(league_string)
        print("In the following season:")
        print(season_string)

        for i in range (0, len(leagues)):
            try:
                targetleague = get_league_goalie_boxcars(leagues[i], seasons)
                leaguesall = leaguesall._append(targetleague)
                if hidden_patrick == 4:
                    raise KeyboardInterrupt
                if hidden_patrick == 5:
                    raise ConnectionError
            except KeyboardInterrupt:
                print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
                break
            except ConnectionError:
                print("You were disconnected! Let's sleep and try again.")
                print(error)
                time.sleep(100)
                continue

        if len(set(leaguesall.league))==1:
            scraped_league_list = leaguesall.league
        elif len(set(leaguesall.league))>2:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        print("Completed scraping goalie data from the following leagues:")
        print(scraped_league_list)
        print("Over the following season:")
        print((seasons))
        return(leaguesall.reset_index().drop(columns = 'index'))

    elif (((type(seasons) == tuple) or (type(seasons) == list)) and ((type(leagues) == tuple) or (type(leagues) == list))):
        print("Your scrape request is goalie data from the following leagues:")
        print(league_string)
        print("In the following seasons:")
        print(season_string)
        #print("Your scrape request: " + str(leagues[:-1]).replace("'", "").replace("(", "").replace(")", "") + ", and " + (leagues)[-1] + " goalie data from " +str(seasons[:-1]).replace("'", "").replace("(", "").replace(")", "") + ", and " + (seasons)[-1] + ".")
        for i in range (0, len(leagues)):
            try:
                targetleague = get_league_goalie_boxcars(leagues[i], seasons)
                leaguesall = leaguesall._append(targetleague)
                if hidden_patrick == 4:
                    raise KeyboardInterrupt
                if hidden_patrick == 5:
                    raise ConnectionError
            except KeyboardInterrupt:
                print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
                break
            except ConnectionError:
                print("You were disconnected! Let's sleep and try again.")
                print(error)
                time.sleep(100)
                continue

        if len(set(leaguesall.league))==1:
            scraped_league_list = leaguesall.league
        elif len(set(leaguesall.league))>2:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        if len(set(seasons))==1:
            scraped_season_list = seasons
        elif len(set(seasons))>2:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        print("Completed scraping goalie data from the following leagues:")
        print(scraped_league_list)
        print("Over the following seasons:")
        print(scraped_season_list)
        return(leaguesall.reset_index().drop(columns = 'index'))

    else:
        print("There was an issue with the request you made. Please enter a single league or season as a string, and multiple leagues or seasons as either a list or a tuple.")


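# A minimal sketch of get_goalies(): as with get_skaters(), a single league or season is
# passed as a string and multiple leagues or seasons as a list or tuple. "shl" is assumed
# here to be the EliteProspects slug for the Swedish Hockey League; the calls are commented
# out so that importing the module never starts a scrape:
#
# shl_goalies = get_goalies("shl", "2020-2021")
# shl_goalies_multi = get_goalies("shl", ["2019-2020", "2020-2021"])
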
def get_skaters(leagues, seasons):

    '''
    Obtains skater data for at least one season and at least one league. Returns a dataframe.
    '''

    if (len(seasons)==1 or type(seasons)==str):
        season_string = str(seasons)
    elif len(seasons)==2:
        season_string = " and".join(str((tuple(sorted(tuple(seasons))))).replace("'", "").replace("(", "").replace(")", "").split(","))
    else:
        season_string = str(((tuple(sorted(tuple(seasons)))))[:-1]).replace("'", "").replace("(", "").replace(")", "") + " and " + str(((tuple(sorted(tuple(seasons)))))[-1])

    if (len(leagues)==1 or type(leagues)==str):
        league_string = str(leagues)
    elif len(leagues)==2:
        league_string = " and".join(str((tuple(sorted(tuple(leagues))))).replace("'", "").replace("(", "").replace(")", "").split(","))
    else:
        league_string = str(((tuple(sorted(tuple(leagues)))))[:-1]).replace("'", "").replace("(", "").replace(")", "") + " and " + str(((tuple(sorted(tuple(leagues)))))[-1])

    leaguesall = pd.DataFrame()

    if ((type(leagues)==str) and (type(seasons)==str)):
        print("Your scrape request is skater data from the following league:")
        print(league_string)
        print("In the following season:")
        print(season_string)
        leaguesall = get_league_skater_boxcars(leagues, seasons)
        print("Completed scraping skater data from the following league:")
        print(str(leagues))
        print("Over the following season:")
        print(str(seasons))
        return(leaguesall.reset_index().drop(columns = 'index'))

    elif ((type(leagues)==str) and ((type(seasons) == tuple) or (type(seasons) == list))):
        print("Your scrape request is skater data from the following league:")
        print(league_string)
        print("In the following seasons:")
        print(season_string)
        leaguesall = get_league_skater_boxcars(leagues, seasons)

        if hidden_patrick == 4:
            print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
            return(leaguesall.reset_index().drop(columns = 'index'))
        if hidden_patrick == 5:
            print("You were disconnected! The output here will be every player you've scraped so far. Here's your error message:")
            print(error)
            return(leaguesall.reset_index().drop(columns = 'index'))

        if len(set(leaguesall.league))==1:
            scraped_league_list = leaguesall.league
        elif len(set(leaguesall.league))>2:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        if len(set(seasons))==1:
            scraped_season_list = seasons
        elif len(set(seasons))>2:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        print("Completed scraping skater data from the following league:")
        print(str(leagues))
        print("Over the following seasons:")
        print(scraped_season_list)
        return(leaguesall.reset_index().drop(columns = 'index'))

    elif ((type(seasons) == str) and ((type(leagues) == tuple) or (type(leagues) == list))):
        print("Your scrape request is skater data from the following leagues:")
        print(league_string)
        print("In the following season:")
        print(season_string)

        for i in range (0, len(leagues)):
            try:
                targetleague = get_league_skater_boxcars(leagues[i], seasons)
                leaguesall = leaguesall._append(targetleague)
                if hidden_patrick == 4:
                    raise KeyboardInterrupt
                if hidden_patrick == 5:
                    raise ConnectionError
            except KeyboardInterrupt:
                print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
                break
            except ConnectionError:
                print("You were disconnected! Let's sleep and try again.")
                print(error)
                time.sleep(100)
                continue

        if len(set(leaguesall.league))==1:
            scraped_league_list = leaguesall.league
        elif len(set(leaguesall.league))>2:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        print("Completed scraping skater data from the following leagues:")
        print(scraped_league_list)
        print("Over the following season:")
        print((seasons))
        return(leaguesall.reset_index().drop(columns = 'index'))

    elif (((type(seasons) == tuple) or (type(seasons) == list)) and ((type(leagues) == tuple) or (type(leagues) == list))):
        print("Your scrape request is skater data from the following leagues:")
        print(league_string)
        print("In the following seasons:")
        print(season_string)
        #print("Your scrape request: " + str(leagues[:-1]).replace("'", "").replace("(", "").replace(")", "") + ", and " + (leagues)[-1] + " skater data from " +str(seasons[:-1]).replace("'", "").replace("(", "").replace(")", "") + ", and " + (seasons)[-1] + ".")
        for i in range (0, len(leagues)):
            try:
                targetleague = get_league_skater_boxcars(leagues[i], seasons)
                leaguesall = leaguesall._append(targetleague)
                if hidden_patrick == 4:
                    raise KeyboardInterrupt
                if hidden_patrick == 5:
                    raise ConnectionError
            except KeyboardInterrupt:
                print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
                break
            except ConnectionError:
                print("You were disconnected! Let's sleep and try again.")
                print(error)
                time.sleep(100)
                continue

        if len(set(leaguesall.league))==1:
            scraped_league_list = leaguesall.league
        elif len(set(leaguesall.league))>2:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_league_list = str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(list(set(leaguesall.league))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        if len(set(seasons))==1:
            scraped_season_list = seasons
        elif len(set(seasons))>2:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + ", and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])
        else:
            scraped_season_list = str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[:-1]).replace("'", "").replace("[", "").replace("]", "") + " and " + str(((str(tuple(sorted(tuple(set(seasons))))).replace("'", "").replace("(", "").replace(")", "").replace("[", "").replace("]", ""))).split(", ")[-1])

        print("Completed scraping skater data from the following leagues:")
        print(scraped_league_list)
        print("Incorporating the following seasons:")
        print(scraped_season_list)
        return(leaguesall.reset_index().drop(columns = 'index'))

    else:
        print("There was an issue with the request you made. Please enter a single league or season as a string, and multiple leagues or seasons as either a list or a tuple.")

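# A hedged sketch of get_skaters() with multiple leagues and seasons. The slugs and seasons
# are illustrative (any valid EliteProspects league slug and season string should work),
# and the call is commented out so that importing the module never starts a scrape:
#
# skaters_multi = get_skaters(["ohl", "whl"], ["2019-2020", "2020-2021"])
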
def add_player_information(dataframe):
    '''
    Takes a data frame from the get_skaters or get_goalies function and obtains biographical information for all players in said dataframe, then returns it as a dataframe that adds to the other data you've already scraped.
    '''
    with_player_info = get_player_information(dataframe)
    doubledup = dataframe.merge(with_player_info.drop(columns = ['player']), on = 'link', how = 'inner')
    return doubledup

### EXAMPLE ONE: GET ALL SKATERS FROM THE MHL IN 2020-2021 ###

#mhl2021 = get_skaters("mhl", "2020-2021")
print("Welcome to the TopDownHockey EliteProspects Scraper, built by Patrick Bacon.")
print("This scraper is built strictly for personal use. For commercial or professional use, please look into the EliteProspects API.")
print("If you enjoy the scraper and would like to support my work, feel free to follow me on Twitter @TopDownHockey. Have fun!")
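
### EXAMPLE TWO: ADD BIOGRAPHICAL INFORMATION TO A SCRAPED DATAFRAME ###
### (a hedged illustration following EXAMPLE ONE above; left commented out by default) ###

#mhl2021 = get_skaters("mhl", "2020-2021")
#mhl2021_with_bio = add_player_information(mhl2021)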