TopDownHockey-Scraper 3.2.7__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of TopDownHockey-Scraper might be problematic.

@@ -28,10 +28,10 @@ def tableDataText(table):
 
  headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')] # header row
  if headerow: # if there is a header row include first
- rows._append(headerow)
+ rows.append(headerow)
  trs = trs[1:]
  for tr in trs: # for every table row
- rows._append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row
+ rows.append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row
 
  df_rows = pd.DataFrame(rows[1:], columns=rows[0])
 
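Note on the hunk above: `rows` is a plain Python list, so the built-in `list.append` is the correct call here; `._append` is not a list method (it only appears in recent pandas as a private DataFrame method). Below is a minimal, self-contained sketch of the same table-to-DataFrame pattern, with the sample HTML invented purely for illustration:

from bs4 import BeautifulSoup
import pandas as pd

# Sample HTML invented for illustration only.
html = "<table><tr><th>Player</th><th>G</th></tr><tr><td>Example Skater</td><td>3</td></tr></table>"
trs = BeautifulSoup(html, "html.parser").find("table").find_all("tr")

rows = []
headerow = [th.get_text(strip=True) for th in trs[0].find_all("th")]  # header row
if headerow:
    rows.append(headerow)  # plain list, so list.append
    trs = trs[1:]
for tr in trs:
    rows.append([td.get_text(strip=True) for td in tr.find_all("td")])  # data rows

df_rows = pd.DataFrame(rows[1:], columns=rows[0])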
@@ -97,7 +97,7 @@ def getskaters(league, year):
  # Extract href links in table
  href_row = []
  for link in player_table.find_all('a'):
- href_row._append(link.attrs['href'])
+ href_row.append(link.attrs['href'])
 
  # Create data frame, rename and only keep links to players
  df_links = pd.DataFrame(href_row)
@@ -107,7 +107,7 @@ def getskaters(league, year):
  # Add links to players
  df_players['link']=df_links['link']
 
- players._append(df_players)
+ players.append(df_players)
 
  # Wait 3 seconds before going to next
  #time.sleep(1)
@@ -209,7 +209,7 @@ def getgoalies(league, year):
  # Extract href links in table
  href_row = []
  for link in player_table.find_all('a'):
- href_row._append(link.attrs['href'])
+ href_row.append(link.attrs['href'])
 
  # Create data frame, rename and only keep links to players
  df_links = pd.DataFrame(href_row)
@@ -219,7 +219,7 @@ def getgoalies(league, year):
  # Add links to players
  df_players['link']=df_links['link']
 
- players._append(df_players)
+ players.append(df_players)
 
  # Wait 3 seconds before going to next
  # time.sleep(1)
@@ -252,7 +252,7 @@ def get_info(link):
  """
  A function that is built strictly for the back end and should not be run by the user.
  """
-
+
  page = requests.get(link, timeout = 500)
  soup = BeautifulSoup(page.content, "html.parser")
 
@@ -265,99 +265,80 @@ def get_info(link):
  soup = BeautifulSoup(page.content, "html.parser")
  time.sleep(60)
 
- if soup.find("title") != None:
- player = soup.find("title").string.replace(" - Elite Prospects" ,"")
+ lis = soup.find_all('li')
 
- else: player = "-"
-
- if soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"})!=None:
- rights = soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find("div", {"class":"col-xs-12 col-18 text-right p-0"}).find("span").string.split("\n")[1].split("/")[0].strip()
- status = soup.find("div", {"class":"order-11 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find("div", {"class":"col-xs-12 col-18 text-right p-0"}).find("span").string.split("\n")[1].split("/")[1].strip()
- else:
- rights = "-"
- status = "-"
-
- if (soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}))!= None:
- if 'dob' in (soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"})).find("a")['href']:
- dob = soup.find("div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a")['href'].split("dob=", 1)[1].split("&sort", 1)[0]
- else:
- dob = "-"
+ relevant_lis = [li for li in lis if li.find('span') is not None]
 
+ # player
+
+ if soup.find("title") != None:
+ player = soup.find("title").string.replace(' - Stats, Contract, Salary & More', '')
  else:
- dob = "-"
-
- if soup.find("div", {"class":"order-6 order-sm-3 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}) != None:
- if "cm" in soup.find("div", {"class":"order-6 order-sm-3 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).string:
- height = soup.find("div", {"class":"order-6 order-sm-3 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).string.split(" / ")[1].split("cm")[0].strip()
- else:
- height = "-"
-
+ player = '-'
+
+ # status
+
+ if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
+ rights = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' /')[0]
+ else:
+ rights = '-'
+
+ # rights
+
+ if [li for li in relevant_lis if li.find('span').text=='NHL Rights'] != []:
+ status = [li for li in relevant_lis if li.find('span').text=='NHL Rights'][0].find('a').text.split(' / ')[1]
+ else:
+ status = '-'
+
+ # dob
+
+ if [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'] != []:
+ dob = [li.find('a')['href'].split('dob=', 1)[1].split('&sort', 1)[0] for li in relevant_lis if li.span.text == 'Date of Birth'][0]
+ else:
+ dob = '-'
+
+ # height
+
+ if [li for li in relevant_lis if li.find('span').text=='Height'] != []:
+ height = [li for li in relevant_lis if li.find('span').text=='Height'][0].text.split('Height')[1].split(' cm')[0]
  else:
- height = "-"
-
- if soup.find("div", {"class":"order-7 order-sm-5 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}) != None:
- if soup.find("div", {"class":"order-7 order-sm-5 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).string.split("\n")[1].split("lbs")[0].strip() == '- / -':
- weight = "-"
- else:
- weight = soup.find("div", {"class":"order-7 order-sm-5 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).string.split("\n")[1].split("lbs")[0].strip()
-
- else: weight = "-"
-
- if soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ) != None:
- if soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a") != None:
-
- birthplace = soup.find("div", {"class":"order-2 order-sm-4 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-17 text-right p-0 ep-text-color--black"}).find("a").string.replace("\n", "").strip()
-
- else:
- birthplace = "-"
+ height = '-'
+
+ # weight
+
+ if [li for li in relevant_lis if li.find('span').text=='Weight'] != []:
+ weight = [li for li in relevant_lis if li.find('span').text=='Weight'][0].text.split('Weight')[1].split(' cm')[0]
  else:
- birthplace = "-"
-
- if soup.find("div", {"class":"order-3 order-sm-6 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}) != None:
- if soup.find("div", {"class":"order-3 order-sm-6 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).find("a") != None:
- nation = soup.find("div", {"class":"order-3 order-sm-6 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).find("a").string.replace("\n", "").strip()
- else: nation = "-"
-
+ weight = '-'
+
+ # birthplace
+
+ if [li for li in relevant_lis if li.find('span').text=='Place of Birth'] != []:
+ birthplace = [li for li in relevant_lis if li.find('span').text=='Place of Birth'][0].text.split('Birth')[1]
  else:
- nation = "-"
-
- if soup.find("div", {"class":"order-8 order-sm-7 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}) !=None:
- shoots = soup.find("div", {"class":"order-8 order-sm-7 ep-list__item ep-list__item--col-2 ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0 ep-text-color--black"}).string.replace("\n", "").strip()
-
+ birthplace = '-'
+
+ # nation
+
+ if [li for li in relevant_lis if li.find('span').text=='Nation'] != []:
+ nation = [li for li in relevant_lis if li.find('span').text=='Nation'][0].text.split('Nation')[1]
  else:
- shoots = "-"
-
- if soup.find("div", {"class":"order-12 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}) != None:
- draft = soup.find("div", {"class":"order-12 ep-list__item ep-list__item--in-card-body ep-list__item--is-compact"}
- ).find(
- "div", {"class":"col-xs-12 col-18 text-right p-0"}).find("a").string.replace("\n", "").strip()
- else:
- draft = "-"
-
- #height = np.where(height=="- / -", "-", height)
-
- #print(player + " scraped!")
+ nation = '-'
+
+ # shoots
+
+ if [li for li in relevant_lis if li.find('span').text=='Shoots'] != []:
+ shoots = [li for li in relevant_lis if li.find('span').text=='Shoots'][0].text.split('Shoots')[1]
+ else:
+ shoots = '-'
+
+ # draft
+
+ if [li for li in relevant_lis if li.find('span').text=='Drafted'] != []:
+ draft = [li for li in relevant_lis if li.find('span').text=='Drafted'][0].text.split('Drafted')[1]
+ else:
+ draft = '-'
+
  return(player, rights, status, dob, height, weight, birthplace, nation, shoots, draft, link)
 
  def get_player_information(dataframe):
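The rewritten get_info above reads each player attribute from the page's <li> elements, keeping only those whose <span> carries a field label ('NHL Rights', 'Height', 'Nation', and so on) and then splitting the label off the item's text. A hedged, standalone sketch of that label-lookup pattern; the sample markup and the helper name field_from_lis are invented for illustration, and the real Elite Prospects markup may differ:

from bs4 import BeautifulSoup

# Sample markup invented for illustration; real pages may differ.
html = """
<ul>
  <li><span>Height</span> 185 cm</li>
  <li><span>Nation</span> Canada</li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")
relevant_lis = [li for li in soup.find_all("li") if li.find("span") is not None]

def field_from_lis(label, default="-"):
    """Return the text that follows the <span> label, or a default if the label is absent."""
    matches = [li for li in relevant_lis if li.find("span").text == label]
    if matches:
        # li.text includes the label itself, so split it off (mirrors the pattern above)
        return matches[0].text.split(label)[1].strip()
    return default

print(field_from_lis("Height"))  # '185 cm'
print(field_from_lis("Shoots"))  # '-' (label not present)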
@@ -382,17 +363,17 @@ def get_player_information(dataframe):
  for i in range(0, len(list(set(dataframe.link)))):
  try:
  myresult = get_info(((list(set(dataframe.link))[i])))
- myplayer._append(myresult[0])
- myrights._append(myresult[1])
- mystatus._append(myresult[2])
- mydob._append(myresult[3])
- myheight._append(myresult[4])
- myweight._append(myresult[5])
- mybirthplace._append(myresult[6])
- mynation._append(myresult[7])
- myshot._append(myresult[8])
- mydraft._append(myresult[9])
- mylink._append(myresult[10])
+ myplayer.append(myresult[0])
+ myrights.append(myresult[1])
+ mystatus.append(myresult[2])
+ mydob.append(myresult[3])
+ myheight.append(myresult[4])
+ myweight.append(myresult[5])
+ mybirthplace.append(myresult[6])
+ mynation.append(myresult[7])
+ myshot.append(myresult[8])
+ mydraft.append(myresult[9])
+ mylink.append(myresult[10])
  print(myresult[0] + " scraped! That's " + str(i + 1) + " down! Only " + str(len(list(set(dataframe.link))) - (i + 1)) + " left to go!")
  except KeyboardInterrupt:
  print("You interrupted this one manually. The output here will be every player you've scraped so far. Good bye!")
@@ -1912,6 +1912,8 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
  game_date = pd.to_datetime(this_date))
 
  gamedays = gamedays._append(fax)
+
+ gamedays = gamedays[gamedays.espn_id!='gameId']
 
  gamedays = gamedays.assign(
  home_team = np.where(gamedays.home_team=='ST LOUIS BLUES', 'ST. LOUIS BLUES', gamedays.home_team),
@@ -1977,8 +1979,9 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
  np.where(gamedays.away_team=='GOLDEN', 'VGK',
  np.where(gamedays.away_team=='KNIGHTS', 'VGK',
  np.where(gamedays.away_team=='CAPITALS', 'WSH',
- np.where(gamedays.away_team=='JETS', 'WPG', 'mistake'
- ))))))))))))))))))))))))))))))))))))
+ np.where(gamedays.away_team=='JETS', 'WPG',
+ np.where(gamedays.away_team=='CLUB', 'UTA', 'mistake'
+ )))))))))))))))))))))))))))))))))))))
  )
 
  gamedays = gamedays.assign(
@@ -2017,8 +2020,9 @@ def scrape_espn_ids_single_game(game_date, home_team, away_team):
  np.where(gamedays.home_team=='GOLDEN', 'VGK',
  np.where(gamedays.home_team=='KNIGHTS', 'VGK',
  np.where(gamedays.home_team=='CAPITALS', 'WSH',
- np.where(gamedays.home_team=='JETS', 'WPG', 'mistake'
- ))))))))))))))))))))))))))))))))))))
+ np.where(gamedays.home_team=='JETS', 'WPG',
+ np.where(gamedays.home_team=='CLUB', 'UTA', 'mistake'
+ )))))))))))))))))))))))))))))))))))))
  )
 
  gamedays = gamedays[(gamedays.game_date==this_date) & (gamedays.home_team==home_team) & (gamedays.away_team==away_team)]
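The nested np.where chains above map team-name fragments (e.g. 'JETS', 'CLUB') to abbreviations, falling back to 'mistake'. Purely as an illustrative alternative, not the package's code, the same lookup can be expressed with a dict and pandas Series.map; the fragment table below lists only a few made-up entries, not the full mapping:

import pandas as pd

# Illustrative fragment-to-abbreviation table (only a few entries shown).
team_abbrevs = {"CAPITALS": "WSH", "JETS": "WPG", "CLUB": "UTA", "GOLDEN": "VGK", "KNIGHTS": "VGK"}

gamedays = pd.DataFrame({"home_team": ["JETS", "CLUB", "SOMETHING ELSE"]})
# Unmapped fragments become NaN, so fillna supplies the 'mistake' fallback.
gamedays["home_team"] = gamedays["home_team"].map(team_abbrevs).fillna("mistake")
print(gamedays)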
@@ -2434,7 +2438,7 @@ def fix_missing(single, event_coords, events):
 
  return(events)
 
- def full_scrape_1by1(game_id_list, shift_to_espn = False):
+ def full_scrape_1by1(game_id_list, shift_to_espn = True):
 
  global single
  global event_coords
@@ -2446,12 +2450,13 @@ def full_scrape_1by1(game_id_list, shift_to_espn = False):
 
  i = 0
 
- while i in range(0, len(game_id_list)):
+ while i in range(0, len(game_id_list)) and len(game_id_list)>0:
 
  # First thing to try: Scraping HTML events
 
  try:
  first_time = time.time()
+ print(game_id_list[i])
  game_id = game_id_list[i]
  print('Attempting scrape for: ' + str(game_id))
  season = str(int(str(game_id)[:4])) + str(int(str(game_id)[:4]) + 1)
@@ -2678,6 +2683,11 @@ def full_scrape_1by1(game_id_list, shift_to_espn = False):
  print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
  i = i + 1
  continue
+
+ except KeyError as k:
+ print(str(game_id) + 'gave some kind of Key Error. Here is the error: ' + str(e))
+ i = i + 1
+ continue
 
  except KeyboardInterrupt:
  print('You manually interrupted the scrape. You will get to keep every game you have already completed scraping after just a bit of post-processing. Good bye.')
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: TopDownHockey_Scraper
- Version: 3.2.7
+ Version: 4.0.0
  Summary: The TopDownHockey Scraper
  Home-page: https://github.com/TopDownHockey/TopDownHockey_Scraper
  Author: Patrick Bacon
@@ -0,0 +1,7 @@
+ TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
+ TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=jPYnjZMTN6tvQgAvQo9mmFSVQmc4-fEra1jLeuFRkpA,153624
+ TopDownHockey_Scraper-4.0.0.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
+ TopDownHockey_Scraper-4.0.0.dist-info/METADATA,sha256=5CD1aQY7EMR8wuQcRmItqnY9Uk9Ews8P0Sx_b3sJFUI,5462
+ TopDownHockey_Scraper-4.0.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ TopDownHockey_Scraper-4.0.0.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
+ TopDownHockey_Scraper-4.0.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (75.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
@@ -1,7 +0,0 @@
- TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=-EPVHQc06W8OcpVoTQvpUH40sjLj9Nwsv1-y3ANrOiQ,45380
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=vumUGPWrtHOTWltSwKZCJNgKzum9UKr_xh7xX0E9_Fo,153213
- TopDownHockey_Scraper-3.2.7.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
- TopDownHockey_Scraper-3.2.7.dist-info/METADATA,sha256=MhGH9lavzF59_ILyFKpgWil9x8IIGTw4zi_bJ1PB0DU,5462
- TopDownHockey_Scraper-3.2.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- TopDownHockey_Scraper-3.2.7.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
- TopDownHockey_Scraper-3.2.7.dist-info/RECORD,,