TopDownHockey-Scraper 3.2.8__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of TopDownHockey-Scraper might be problematic.
- TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py +79 -98
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +18 -6
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.1.dist-info}/METADATA +1 -1
- TopDownHockey_Scraper-4.0.1.dist-info/RECORD +7 -0
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.1.dist-info}/WHEEL +1 -1
- TopDownHockey_Scraper-3.2.8.dist-info/RECORD +0 -7
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.1.dist-info}/LICENSE +0 -0
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.1.dist-info}/top_level.txt +0 -0
TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py CHANGED
@@ -252,7 +252,7 @@ def get_info(link):
     """
     A function that is built strictly for the back end and should not be run by the user.
     """
-
+
     page = requests.get(link, timeout = 500)
     soup = BeautifulSoup(page.content, "html.parser")
 
@@ -265,99 +265,80 @@ def get_info(link):
         soup = BeautifulSoup(page.content, "html.parser")
         time.sleep(60)
 
-
-    player = soup.find("title").string.replace(" - Elite Prospects" ,"")
+    lis = soup.find_all('li')
 
-
-
-    if soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"})!=None:
-        rights = soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find("div", {"class":"col-xs-12 col-18 text-right p-0"}).find("span").string.split("\n")[1].split("/")[0].strip()
-        status = soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find("div", {"class":"col-xs-12 col-18 text-right p-0"}).find("span").string.split("\n")[1].split("/")[1].strip()
-    else:
-        rights = "-"
-        status = "-"
-
-    if (soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}))!= None:
-        if 'dob' in (soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"})).find("a")['href']:
-            dob = soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a")['href'].split("dob=", 1)[1].split("&sort", 1)[0]
-        else:
-            dob = "-"
+    relevant_lis = [li for li in lis if li.find('span') is not None]
 
+    # player
+
+    if soup.find("title") != None:
+        player = soup.find("title").string.replace(' - Stats, Contract, Salary & More', '')
     else:
-
-
-
-
-
-
-
-
-
-
-
-
+        player = '-'
+
+    # status
+
+    if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
+        rights = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' /')[0]
+    else:
+        rights = '-'
+
+    # rights
+
+    if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
+        status = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' / ')[1]
+    else:
+        status = '-'
+
+    # dob
+
+    if [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'] != []:
+        dob = [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'][0]
+    else:
+        dob = '-'
+
+    # height
+
+    if [li for li in relevant_lis if li.find('span').text=='Height'] != []:
+        height = [li for li in relevant_lis if li.find('span').text=='Height'][0].text.split('Height')[1].split(' cm')[0]
     else:
-        height =
-
-
-
-
-
-        weight = "-"
-    else:
-        weight = soup.find("div", {"class":"order-7 order-sm-5 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find(
-        "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).string.split("\n")[1].split("lbs")[0].strip()
-
-    else: weight = "-"
-
-    if soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-    ) != None:
-        if soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find(
-        "div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a") != None:
-
-            birthplace = soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-            ).find(
-            "div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a").string.replace("\n", "").strip()
-
-        else:
-            birthplace = "-"
+        height = '-'
+
+    # weight
+
+    if [li for li in relevant_lis if li.find('span').text=='Weight'] != []:
+        weight = [li for li in relevant_lis if li.find('span').text=='Weight'][0].text.split('Weight')[1].split(' cm')[0]
     else:
-
-
-
-
-
-
-        nation = soup.find("div", {"class":"order-3 order-sm-6 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find(
-        "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).find("a").string.replace("\n", "").strip()
-    else: nation = "-"
-
+        weight = '-'
+
+    # birthplace
+
+    if [li for li in relevant_lis if li.find('span').text=='Place of Birth'] != []:
+        birthplace = [li for li in relevant_lis if li.find('span').text=='Place of Birth'][0].text.split('Birth')[1]
     else:
-
-
-
-
-
-
-
+        birthplace = '-'
+
+    # nation
+
+    if [li for li in relevant_lis if li.find('span').text=='Nation'] != []:
+        nation = [li for li in relevant_lis if li.find('span').text=='Nation'][0].text.split('Nation')[1]
     else:
-
-
-
-
-
-
-    else:
-
-
-        #
-
-
+        nation = '-'
+
+    # shoots
+
+    if [li for li in relevant_lis if li.find('span').text=='Shoots'] != []:
+        shoots = [li for li in relevant_lis if li.find('span').text=='Shoots'][0].text.split('Shoots')[1]
+    else:
+        shoots = '-'
+
+    # draft
+
+    if [li for li in relevant_lis if li.find('span').text=='Drafted'] != []:
+        draft = [li for li in relevant_lis if li.find('span').text=='Drafted'][0].text.split('Drafted')[1]
+    else:
+        draft = '-'
+
     return(player, rights, status, dob, height, weight, birthplace, nation, shoots, draft, link)
 
 def get_player_information(dataframe):
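Every field in the rewritten get_info() follows one pattern: filter the page's <li> elements down to those that contain a <span> label, split the matching item's text on that label, and fall back to '-' when the label is absent. Below is a minimal, runnable sketch of that pattern; the sample HTML and the find_field() helper are illustrative and not part of the package.

from bs4 import BeautifulSoup

html = """
<ul>
  <li><span>Nation</span> Canada</li>
  <li><span>Shoots</span> L</li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
lis = soup.find_all("li")
# Same filter as the diff: keep only list items that carry a <span> label.
relevant_lis = [li for li in lis if li.find("span") is not None]

def find_field(label):
    # Mirrors the repeated list comprehension in the diff: match on the
    # span's label text, then strip the label from the item's full text.
    matches = [li for li in relevant_lis if li.find("span").text == label]
    return matches[0].text.split(label)[1].strip() if matches else "-"

print(find_field("Nation"))  # Canada
print(find_field("Weight"))  # '-' for fields missing from the page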
@@ -382,17 +363,17 @@ def get_player_information(dataframe):
     for i in range(0, len(list(set(dataframe.link)))):
         try:
             myresult = get_info(((list(set(dataframe.link))[i])))
-            myplayer.
-            myrights.
-            mystatus.
-            mydob.
-            myheight.
-            myweight.
-            mybirthplace.
-            mynation.
-            myshot.
-            mydraft.
-            mylink.
+            myplayer.append(myresult[0])
+            myrights.append(myresult[1])
+            mystatus.append(myresult[2])
+            mydob.append(myresult[3])
+            myheight.append(myresult[4])
+            myweight.append(myresult[5])
+            mybirthplace.append(myresult[6])
+            mynation.append(myresult[7])
+            myshot.append(myresult[8])
+            mydraft.append(myresult[9])
+            mylink.append(myresult[10])
             print(myresult[0] + " scraped! That's " + str(i + 1) + " down! Only " + str(len(list(set(dataframe.link))) - (i + 1)) + " left to go!")
         except KeyboardInterrupt:
             print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py CHANGED
@@ -1912,6 +1912,8 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
             game_date = pd.to_datetime(this_date))
 
         gamedays = gamedays._append(fax)
+
+    gamedays = gamedays[gamedays.espn_id!='gameId']
 
     gamedays = gamedays.assign(
         home_team = np.where(gamedays.home_team=='ST LOUIS BLUES', 'ST. LOUIS BLUES', gamedays.home_team),
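The added filter at new line 1916 drops any row whose espn_id is the literal string 'gameId', presumably a stray header row picked up while scraping ESPN's scoreboard tables. A sketch of the same boolean-mask filter on stand-in data:

import pandas as pd

# Stand-in data: one header-like row plus one real ESPN game id.
gamedays = pd.DataFrame({"espn_id": ["gameId", "401559001"],
                         "home_team": ["homeTeam", "JETS"]})

# Same boolean-mask filter as the added line.
gamedays = gamedays[gamedays.espn_id != "gameId"]
print(gamedays)  # only the real game row remains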
@@ -1977,8 +1979,10 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
         np.where(gamedays.away_team=='GOLDEN', 'VGK',
         np.where(gamedays.away_team=='KNIGHTS', 'VGK',
         np.where(gamedays.away_team=='CAPITALS', 'WSH',
-        np.where(gamedays.away_team=='JETS', 'WPG',
-
+        np.where(gamedays.away_team=='JETS', 'WPG',
+        np.where(gamedays.away_team=='CLUB', 'UTA',
+        np.where(gamedays.away_team=='HOCKEY', 'UTA', 'mistake'
+        ))))))))))))))))))))))))))))))))))))))
     )
 
     gamedays = gamedays.assign(
@@ -2017,8 +2021,10 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
         np.where(gamedays.home_team=='GOLDEN', 'VGK',
         np.where(gamedays.home_team=='KNIGHTS', 'VGK',
         np.where(gamedays.home_team=='CAPITALS', 'WSH',
-        np.where(gamedays.home_team=='JETS', 'WPG',
-
+        np.where(gamedays.home_team=='JETS', 'WPG',
+        np.where(gamedays.home_team=='CLUB', 'UTA',
+        np.where(gamedays.home_team=='HOCKEY', 'UTA', 'mistake'
+        ))))))))))))))))))))))))))))))))))))))
     )
 
     gamedays = gamedays[(gamedays.game_date==this_date) & (gamedays.home_team==home_team) & (gamedays.away_team==away_team)]
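Both hunks above splice two new branches ('CLUB' and 'HOCKEY', both mapping to 'UTA') into an already deep chain of nested np.where calls whose final default is 'mistake'. The same token-to-abbreviation lookup can be written as a dict plus Series.map; this is an alternative sketch on stand-in data, not the package's implementation:

import pandas as pd

# Only tokens visible in this diff are listed; 'CLUB' and 'HOCKEY'
# are the 4.0.1 additions (Utah's team-name tokens).
TEAM_TOKENS = {
    "GOLDEN": "VGK", "KNIGHTS": "VGK", "CAPITALS": "WSH",
    "JETS": "WPG", "CLUB": "UTA", "HOCKEY": "UTA",
}

gamedays = pd.DataFrame({"home_team": ["JETS", "CLUB", "UNKNOWN"]})
# Unmatched tokens fall through to 'mistake', matching the np.where default.
gamedays["home_team"] = gamedays["home_team"].map(TEAM_TOKENS).fillna("mistake")
print(gamedays.home_team.tolist())  # ['WPG', 'UTA', 'mistake']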
@@ -2434,7 +2440,7 @@ def fix_missing(single, event_coords, events):
 
     return(events)
 
-def full_scrape_1by1(game_id_list, shift_to_espn = False):
+def full_scrape_1by1(game_id_list, shift_to_espn = True):
 
     global single
     global event_coords
@@ -2446,12 +2452,13 @@ def full_scrape_1by1(game_id_list, shift_to_espn = False):
 
     i = 0
 
-    while i in range(0, len(game_id_list)):
+    while i in range(0, len(game_id_list)) and len(game_id_list)>0:
 
         # First thing to try: Scraping HTML events
 
         try:
             first_time = time.time()
+            print(game_id_list[i])
             game_id = game_id_list[i]
             print('Attempting scrape for: ' + str(game_id))
             season = str(int(str(game_id)[:4])) + str(int(str(game_id)[:4]) + 1)
@@ -2678,6 +2685,11 @@ def full_scrape_1by1(game_id_list, shift_to_espn = False):
             print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
             i = i + 1
             continue
+
+        except KeyError as k:
+            print(str(game_id) + 'gave some kind of Key Error. Here is the error: ' + str(e))
+            i = i + 1
+            continue
 
         except KeyboardInterrupt:
             print('You manually interrupted the scrape. You will get to keep every game you have already completed scraping after just a bit of post-processing. Good bye.')
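The new KeyError handler binds the exception as k but formats str(e), a name bound only inside the separate HTML-report handler, so this branch would most likely raise a NameError of its own if it ever ran. A self-contained sketch of the same skip-and-continue pattern with the binding used consistently; scrape_game() and the ids are stand-ins, not package functions:

def scrape_game(game_id):
    # Stand-in for the real per-game scrape.
    raise KeyError("missing column")

game_id_list = [2023020001, 2023020002]
i = 0
while i in range(0, len(game_id_list)) and len(game_id_list) > 0:
    try:
        scrape_game(game_id_list[i])
    except KeyError as k:
        # Format the name the exception is actually bound to.
        print(str(game_id_list[i]) + ' gave some kind of Key Error. Here is the error: ' + str(k))
        i = i + 1
        continue
    i = i + 1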
TopDownHockey_Scraper-4.0.1.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
+TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=KYIDRpxvZJdLA5AsMGqmxpnCh87LbZJLEIU42i4ULDI,153759
+TopDownHockey_Scraper-4.0.1.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
+TopDownHockey_Scraper-4.0.1.dist-info/METADATA,sha256=jWKaLwhy1rOIth7qe8rcqlqLGBjas2c-sLg_2ljvE2Y,5462
+TopDownHockey_Scraper-4.0.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+TopDownHockey_Scraper-4.0.1.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
+TopDownHockey_Scraper-4.0.1.dist-info/RECORD,,
TopDownHockey_Scraper-3.2.8.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
-TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=KyQjVTUKLDlnwJb9Sdm6jUaCh6ZxJoq2kEXBHFb1PcM,45374
-TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=vumUGPWrtHOTWltSwKZCJNgKzum9UKr_xh7xX0E9_Fo,153213
-TopDownHockey_Scraper-3.2.8.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
-TopDownHockey_Scraper-3.2.8.dist-info/METADATA,sha256=ngmt5EJasFMsJyNmTR7iOSK_2VBk_7bY0l3eefOf1zk,5462
-TopDownHockey_Scraper-3.2.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-TopDownHockey_Scraper-3.2.8.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
-TopDownHockey_Scraper-3.2.8.dist-info/RECORD,,
{TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.1.dist-info}/LICENSE RENAMED
File without changes
{TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.1.dist-info}/top_level.txt RENAMED
File without changes