TopDownHockey-Scraper 3.2.8__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of TopDownHockey-Scraper might be problematic.
- TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py +79 -98
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +16 -6
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.0.dist-info}/METADATA +1 -1
- TopDownHockey_Scraper-4.0.0.dist-info/RECORD +7 -0
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.0.dist-info}/WHEEL +1 -1
- TopDownHockey_Scraper-3.2.8.dist-info/RECORD +0 -7
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.0.dist-info}/LICENSE +0 -0
- {TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.0.dist-info}/top_level.txt +0 -0
TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py CHANGED
@@ -252,7 +252,7 @@ def get_info(link):
     """
     A function that is built strictly for the back end and should not be run by the user.
     """
-
+
     page = requests.get(link, timeout = 500)
     soup = BeautifulSoup(page.content, "html.parser")

@@ -265,99 +265,80 @@ def get_info(link):
     soup = BeautifulSoup(page.content, "html.parser")
     time.sleep(60)

-
-    player = soup.find("title").string.replace(" - Elite Prospects" ,"")
+    lis = soup.find_all('li')

-
-
-    if soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"})!=None:
-        rights = soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find("div", {"class":"col-xs-12 col-18 text-right p-0"}).find("span").string.split("\n")[1].split("/")[0].strip()
-        status = soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find("div", {"class":"col-xs-12 col-18 text-right p-0"}).find("span").string.split("\n")[1].split("/")[1].strip()
-    else:
-        rights = "-"
-        status = "-"
-
-    if (soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}))!= None:
-        if 'dob' in (soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"})).find("a")['href']:
-            dob = soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a")['href'].split("dob=", 1)[1].split("&sort", 1)[0]
-        else:
-            dob = "-"
+    relevant_lis = [li for li in lis if li.find('span') is not None]

+    # player
+
+    if soup.find("title") != None:
+        player = soup.find("title").string.replace(' - Stats, Contract, Salary & More', '')
     else:
-
-
-
-
-
-
-
-
-
-
-
-
+        player = '-'
+
+    # status
+
+    if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
+        rights = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' /')[0]
+    else:
+        rights = '-'
+
+    # rights
+
+    if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
+        status = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' / ')[1]
+    else:
+        status = '-'
+
+    # dob
+
+    if [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'] != []:
+        dob = [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'][0]
+    else:
+        dob = '-'
+
+    # height
+
+    if [li for li in relevant_lis if li.find('span').text=='Height'] != []:
+        height = [li for li in relevant_lis if li.find('span').text=='Height'][0].text.split('Height')[1].split(' cm')[0]
     else:
-        height =
-
-
-
-
-
-        weight = "-"
-    else:
-        weight = soup.find("div", {"class":"order-7 order-sm-5 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find(
-        "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).string.split("\n")[1].split("lbs")[0].strip()
-
-    else: weight = "-"
-
-    if soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-    ) != None:
-        if soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find(
-        "div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a") != None:
-
-            birthplace = soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-            ).find(
-            "div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a").string.replace("\n", "").strip()
-
-        else:
-            birthplace = "-"
+        height = '-'
+
+    # weight
+
+    if [li for li in relevant_lis if li.find('span').text=='Weight'] != []:
+        weight = [li for li in relevant_lis if li.find('span').text=='Weight'][0].text.split('Weight')[1].split(' cm')[0]
     else:
-
-
-
-
-
-
-        nation = soup.find("div", {"class":"order-3 order-sm-6 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
-        ).find(
-        "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).find("a").string.replace("\n", "").strip()
-    else: nation = "-"
-
+        weight = '-'
+
+    # birthplace
+
+    if [li for li in relevant_lis if li.find('span').text=='Place of Birth'] != []:
+        birthplace = [li for li in relevant_lis if li.find('span').text=='Place of Birth'][0].text.split('Birth')[1]
     else:
-
-
-
-
-
-
-
+        birthplace = '-'
+
+    # nation
+
+    if [li for li in relevant_lis if li.find('span').text=='Nation'] != []:
+        nation = [li for li in relevant_lis if li.find('span').text=='Nation'][0].text.split('Nation')[1]
     else:
-
-
-
-
-
-
-    else:
-
-
-    #
-
-
+        nation = '-'
+
+    # shoots
+
+    if [li for li in relevant_lis if li.find('span').text=='Shoots'] != []:
+        shoots = [li for li in relevant_lis if li.find('span').text=='Shoots'][0].text.split('Shoots')[1]
+    else:
+        shoots = '-'
+
+    # draft
+
+    if [li for li in relevant_lis if li.find('span').text=='Drafted'] != []:
+        draft = [li for li in relevant_lis if li.find('span').text=='Drafted'][0].text.split('Drafted')[1]
+    else:
+        draft = '-'
+
     return(player, rights, status, dob, height, weight, birthplace, nation, shoots, draft, link)

 def get_player_information(dataframe):
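The rewritten get_info above stops depending on Elite Prospects' ep-list__item class names and instead walks every <li> on the profile page, keeps the ones that carry a <span> label, and reads the text that follows the label. A minimal sketch of that lookup pattern, using an invented HTML fragment rather than a live page (the field() helper is illustrative, not part of the package):

from bs4 import BeautifulSoup

# Invented fragment standing in for the player page fetched with requests.
html = """
<ul>
  <li><span>Nation</span> Sweden</li>
  <li><span>Shoots</span> L</li>
  <li><span>NHL Rights</span> <a href="#">Signed / Toronto Maple Leafs</a></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
lis = soup.find_all("li")
# Keep only list items that carry a <span> label, as the new code does.
relevant_lis = [li for li in lis if li.find("span") is not None]

def field(label):
    """Return the text that follows the label span, or '-' when the label is absent."""
    matches = [li for li in relevant_lis if li.find("span").text == label]
    if matches != []:
        return matches[0].text.split(label)[1].strip()
    return "-"

print(field("Nation"))   # Sweden
print(field("Shoots"))   # L
print(field("Drafted"))  # '-' because the label is missing, mirroring the fallbacks above

Each block in the new function is this same pattern specialised per field, with the NHL Rights entry additionally split on ' / ' to separate the team from the signing status.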
@@ -382,17 +363,17 @@ def get_player_information(dataframe):
     for i in range(0, len(list(set(dataframe.link)))):
         try:
             myresult = get_info(((list(set(dataframe.link))[i])))
-            myplayer.
-            myrights.
-            mystatus.
-            mydob.
-            myheight.
-            myweight.
-            mybirthplace.
-            mynation.
-            myshot.
-            mydraft.
-            mylink.
+            myplayer.append(myresult[0])
+            myrights.append(myresult[1])
+            mystatus.append(myresult[2])
+            mydob.append(myresult[3])
+            myheight.append(myresult[4])
+            myweight.append(myresult[5])
+            mybirthplace.append(myresult[6])
+            mynation.append(myresult[7])
+            myshot.append(myresult[8])
+            mydraft.append(myresult[9])
+            mylink.append(myresult[10])
             print(myresult[0] + " scraped! That's " + str(i + 1) + " down! Only " + str(len(list(set(dataframe.link))) - (i + 1)) + " left to go!")
         except KeyboardInterrupt:
             print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
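The collection loop keeps its structure from 3.2.8; the change is that each of the eleven positions in the tuple returned by get_info is now appended to its matching list. Condensed into a self-contained sketch (the FIELDS names and the stub scraper are illustrative, not the package's):

import pandas as pd

FIELDS = ["player", "rights", "status", "dob", "height", "weight",
          "birthplace", "nation", "shoots", "draft", "link"]

def collect(links, get_info):
    columns = {name: [] for name in FIELDS}
    for link in links:
        result = get_info(link)                # 11-item tuple in the order above
        for name, value in zip(FIELDS, result):
            columns[name].append(value)
    return pd.DataFrame(columns)

def stub(link):
    # Stand-in for the real scraper so the sketch runs offline.
    return ("Some Player", "-", "-", "1990-01-01", "183", "88",
            "Some Town", "SWE", "L", "-", link)

print(collect(["https://example.com/player/1"], stub))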
TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py CHANGED
@@ -1912,6 +1912,8 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
         game_date = pd.to_datetime(this_date))

         gamedays = gamedays._append(fax)
+
+    gamedays = gamedays[gamedays.espn_id!='gameId']

     gamedays = gamedays.assign(
         home_team = np.where(gamedays.home_team=='ST LOUIS BLUES', 'ST. LOUIS BLUES', gamedays.home_team),
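The added espn_id filter drops rows where the scraped ESPN schedule table leaked its own header back in as data (an espn_id literally equal to the string 'gameId'). In isolation the idea is just the following, with an invented frame in place of the scraped one:

import pandas as pd

# Illustrative frame: the first row is a repeated header, not a real game.
gamedays = pd.DataFrame({
    "espn_id":   ["gameId", "401559001", "401559002"],
    "home_team": ["home_team", "BRUINS", "JETS"],
})

gamedays = gamedays[gamedays.espn_id != "gameId"]
print(gamedays)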
@@ -1977,8 +1979,9 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
         np.where(gamedays.away_team=='GOLDEN', 'VGK',
         np.where(gamedays.away_team=='KNIGHTS', 'VGK',
         np.where(gamedays.away_team=='CAPITALS', 'WSH',
-        np.where(gamedays.away_team=='JETS', 'WPG',
-
+        np.where(gamedays.away_team=='JETS', 'WPG',
+        np.where(gamedays.away_team=='CLUB', 'UTA', 'mistake'
+        )))))))))))))))))))))))))))))))))))))
     )

     gamedays = gamedays.assign(
@@ -2017,8 +2020,9 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
         np.where(gamedays.home_team=='GOLDEN', 'VGK',
         np.where(gamedays.home_team=='KNIGHTS', 'VGK',
         np.where(gamedays.home_team=='CAPITALS', 'WSH',
-        np.where(gamedays.home_team=='JETS', 'WPG',
-
+        np.where(gamedays.home_team=='JETS', 'WPG',
+        np.where(gamedays.home_team=='CLUB', 'UTA', 'mistake'
+        )))))))))))))))))))))))))))))))))))))
     )

     gamedays = gamedays[(gamedays.game_date==this_date) & (gamedays.home_team==home_team) & (gamedays.away_team==away_team)]
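Both hunks above extend the long nested np.where chains that turn ESPN team-name tokens into abbreviations, adding 'CLUB' → 'UTA' (Utah Hockey Club) and keeping 'mistake' as the fall-through value. The same lookup can be pictured as a plain dictionary; a sketch under that assumption, showing only a few of the tokens the real chain covers:

import pandas as pd

token_to_abbrev = {
    "GOLDEN": "VGK",
    "KNIGHTS": "VGK",
    "CAPITALS": "WSH",
    "JETS": "WPG",
    "CLUB": "UTA",   # the token added in this release
}

away = pd.Series(["CAPITALS", "CLUB", "SOMETHING ELSE"])
# Unknown tokens fall through to 'mistake', matching the np.where default above.
print(away.map(token_to_abbrev).fillna("mistake"))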
@@ -2434,7 +2438,7 @@ def fix_missing(single, event_coords, events):

     return(events)

-def full_scrape_1by1(game_id_list, shift_to_espn = False):
+def full_scrape_1by1(game_id_list, shift_to_espn = True):

     global single
     global event_coords
@@ -2446,12 +2450,13 @@ def full_scrape_1by1(game_id_list, shift_to_espn = False):

     i = 0

-    while i in range(0, len(game_id_list)):
+    while i in range(0, len(game_id_list)) and len(game_id_list)>0:

         # First thing to try: Scraping HTML events

         try:
             first_time = time.time()
+            print(game_id_list[i])
             game_id = game_id_list[i]
             print('Attempting scrape for: ' + str(game_id))
             season = str(int(str(game_id)[:4])) + str(int(str(game_id)[:4]) + 1)
@@ -2678,6 +2683,11 @@ def full_scrape_1by1(game_id_list, shift_to_espn = False):
             print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
             i = i + 1
             continue
+
+        except KeyError as k:
+            print(str(game_id) + 'gave some kind of Key Error. Here is the error: ' + str(e))
+            i = i + 1
+            continue

         except KeyboardInterrupt:
             print('You manually interrupted the scrape. You will get to keep every game you have already completed scraping after just a bit of post-processing. Good bye.')
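The new except KeyError branch gives full_scrape_1by1 the same skip-and-continue behaviour it already had for HTML-report failures: report the game id, advance the loop index, and keep going rather than aborting the run. A stripped-down sketch of that control flow (scrape_one is a hypothetical stand-in, not a function from the package):

def scrape_all(game_id_list, scrape_one):
    results = []
    i = 0
    while i in range(0, len(game_id_list)) and len(game_id_list) > 0:
        game_id = game_id_list[i]
        try:
            print('Attempting scrape for: ' + str(game_id))
            results.append(scrape_one(game_id))
        except KeyError as k:
            # The failing game is skipped instead of ending the whole scrape.
            print(str(game_id) + ' gave some kind of Key Error. Here is the error: ' + str(k))
        except Exception as e:
            print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
        i = i + 1
    return results

def stub(gid):
    if gid == 2023020002:
        raise KeyError("coordinates missing")   # simulate one bad game
    return {"game_id": gid}

print(scrape_all([2023020001, 2023020002, 2023020003], stub))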
TopDownHockey_Scraper-4.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
+TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=jPYnjZMTN6tvQgAvQo9mmFSVQmc4-fEra1jLeuFRkpA,153624
+TopDownHockey_Scraper-4.0.0.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
+TopDownHockey_Scraper-4.0.0.dist-info/METADATA,sha256=5CD1aQY7EMR8wuQcRmItqnY9Uk9Ews8P0Sx_b3sJFUI,5462
+TopDownHockey_Scraper-4.0.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+TopDownHockey_Scraper-4.0.0.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
+TopDownHockey_Scraper-4.0.0.dist-info/RECORD,,
TopDownHockey_Scraper-3.2.8.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
-TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=KyQjVTUKLDlnwJb9Sdm6jUaCh6ZxJoq2kEXBHFb1PcM,45374
-TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=vumUGPWrtHOTWltSwKZCJNgKzum9UKr_xh7xX0E9_Fo,153213
-TopDownHockey_Scraper-3.2.8.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
-TopDownHockey_Scraper-3.2.8.dist-info/METADATA,sha256=ngmt5EJasFMsJyNmTR7iOSK_2VBk_7bY0l3eefOf1zk,5462
-TopDownHockey_Scraper-3.2.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-TopDownHockey_Scraper-3.2.8.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
-TopDownHockey_Scraper-3.2.8.dist-info/RECORD,,
{TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.0.dist-info}/LICENSE RENAMED
File without changes

{TopDownHockey_Scraper-3.2.8.dist-info → TopDownHockey_Scraper-4.0.0.dist-info}/top_level.txt RENAMED
File without changes