TopDownHockey-Scraper 6.0.5-py3-none-any.whl → 6.0.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of TopDownHockey-Scraper might be problematic.

@@ -18,6 +18,7 @@ import xml.etree.ElementTree as ET
  import xmltodict
  from xml.parsers.expat import ExpatError
  from requests.exceptions import ChunkedEncodingError
+ import traceback

  print('Successfully did local install plus update')

@@ -834,8 +835,6 @@ def scrape_html_shifts(season, game_id, live = True):

  home_shifts = alldf

- home_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/home_shifts.csv', index = False)
-
  if live == True:

  home_shifts = home_shifts.assign(shift_number = home_shifts.shift_number.astype(int))
@@ -956,8 +955,6 @@ def scrape_html_shifts(season, game_id, live = True):

  away_shifts = alldf

- away_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/away_shifts.csv', index = False)
-
  if live == True:

  away_shifts = away_shifts.assign(shift_number = away_shifts.shift_number.astype(int))
@@ -1039,7 +1036,7 @@ def scrape_html_shifts(season, game_id, live = True):

  # Additional logic to handle period 1 scrape when we don't have goalie shifts yet.

- if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0 or len(away_shifts[(away_shifts.name.isin(goalie_names))]):
+ if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0 or len(away_shifts[(away_shifts.name.isin(goalie_names))]) == 0:

  pbp_html_url = f'https://www.nhl.com/scores/htmlreports/{season}/GS0{game_id}.HTM'
  pbp_soup = BeautifulSoup(requests.get(pbp_html_url).content)
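
The added `== 0` matters because `len(...)` on its own is truthy whenever the away frame has any goalie rows, so the goalie-TOI fallback fired in exactly the wrong case. A minimal, self-contained sketch of the corrected check (the frames and names below are hypothetical stand-ins for the scraper's intermediate data):

    import pandas as pd

    # Hypothetical stand-ins for the scraper's intermediate frames.
    goalie_names = ['GOALIE ONE', 'GOALIE TWO']
    home_shifts = pd.DataFrame({'name': ['GOALIE ONE', 'SKATER A']})
    away_shifts = pd.DataFrame({'name': ['SKATER B']})  # no goalie shifts yet

    # Fall back to the game-summary report only when either side lacks goalie shifts.
    home_missing = len(home_shifts[home_shifts.name.isin(goalie_names)]) == 0
    away_missing = len(away_shifts[away_shifts.name.isin(goalie_names)]) == 0
    if home_missing or away_missing:
        print('Need goalie TOI from the HTML game summary')
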
@@ -1050,26 +1047,27 @@ def scrape_html_shifts(season, game_id, live = True):

  away_teams = pd.read_html(str(goalie_table))[0][:2]
  away_team = away_teams[0].iloc[0]
-
- away_goalies = pd.read_html(str(goalie_table))[0][2:4]
+
+ away_goalies = pd.read_html(str(goalie_table))[0][2:4][
+ ~pd.isna(pd.read_html(str(goalie_table))[0][2:4])[0]
+ ]
  away_goalies = away_goalies[~pd.isna(away_goalies[6])]
-
+
  away_goalies = away_goalies.assign(team = away_team).rename(columns = {0:'number', 2:'name', 6:'TOI'}).loc[:, ['number', 'name', 'TOI', 'team']]

- home_teams = pd.read_html(str(goalie_table))[0][6:7]
+ away_goalies = away_goalies[away_goalies.TOI!='TOT']
+
+ home_teams = pd.read_html(str(goalie_table))[0][6:8][
+ ~pd.isna(pd.read_html(str(goalie_table))[0][6:8])[0]
+ ]
  home_team = home_teams[0].iloc[0]
-
+
  home_goalies = pd.read_html(str(goalie_table))[0][8:10]
  home_goalies = home_goalies[~pd.isna(home_goalies[6])]
-
+
  home_goalies = home_goalies.assign(team = home_team).rename(columns = {0:'number', 2:'name', 6:'TOI'}).loc[:, ['number', 'name', 'TOI', 'team']]

- home_goalies = pd.read_html(str(goalie_table))[0][8:9]
-
- # Temporary to test. Will fix later.
-
- # home_goalies = home_goalies.assign(TOI = '11:26')
- # away_goalies = away_goalies.assign(TOI = '11:26')
+ home_goalies = home_goalies[home_goalies.TOI!='TOT']

  if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0:

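The new slicing and filtering steps in this hunk all serve one goal: keep only real goalie rows from the positional table that pd.read_html returns, dropping padding rows without a TOI and the per-team 'TOT' totals row. A rough sketch of that cleanup on a hypothetical table (column positions and values are illustrative, not the actual report layout):

    import pandas as pd

    # Hypothetical positional table, shaped like a pd.read_html result.
    raw = pd.DataFrame({
        0: ['31', '35', None, 'TEAM'],
        2: ['GOALIE ONE', 'GOALIE TWO', None, None],
        6: ['40:10', '19:50', None, 'TOT'],
    })

    goalies = raw[~pd.isna(raw[6])]                      # keep rows that have a TOI value
    goalies = goalies.assign(team = 'AAA').rename(
        columns = {0: 'number', 2: 'name', 6: 'TOI'}
    ).loc[:, ['number', 'name', 'TOI', 'team']]
    goalies = goalies[goalies.TOI != 'TOT']              # drop the totals row
    print(goalies)
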
@@ -1108,8 +1106,6 @@ def scrape_html_shifts(season, game_id, live = True):
  global all_shifts

  all_shifts = pd.concat([home_shifts, away_shifts])
-
- #all_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/all_shifts.csv', index = False)

  all_shifts = all_shifts.assign(start_time = all_shifts.shift_start.str.split('/').str[0])

@@ -1359,7 +1355,6 @@ def scrape_html_shifts(season, game_id, live = True):
  global changes_on
  global changes_off
  myshifts = all_shifts
- #myshifts.to_csv('/Users/patrickbacon/compact_topdownhockey/tmp.csv', index = False)
  #print('Printing my shifts')

  #print(myshifts)
@@ -1416,7 +1411,8 @@ def scrape_html_events(season, game_id):
  #global stripped_html
  #global eventdf
  stripped_html = hs_strip_html(tds)
- length = int(len(stripped_html)/8)
+ length = (len(stripped_html) // 8) * 8
+ stripped_html = stripped_html[:length]
  eventdf = pd.DataFrame(np.array(stripped_html).reshape(length, 8)).rename(
  columns = {0:'index', 1:'period', 2:'strength', 3:'time', 4:'event', 5:'description', 6:'away_skaters', 7:'home_skaters'})
  split = eventdf.time.str.split(':')
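
For context, the pattern behind the two added lines is to make the flat list of HTML cells rectangular before reshaping it into an 8-column events frame, so a ragged tail cannot raise a reshape error. A minimal sketch with hypothetical cell values, where the row count is computed explicitly for the reshape:

    import numpy as np
    import pandas as pd

    # Hypothetical flat cell strings: 8 per event row, plus one dangling cell.
    cells = ['1', '1', 'EV', '0:00', 'FAC', 'faceoff won', '5', '5',
             '2', '1', 'EV', '0:14', 'SHOT', 'wrist shot', '5', '5',
             'dangling']

    rows = len(cells) // 8           # number of complete 8-cell rows
    cells = cells[:rows * 8]         # drop the incomplete tail

    eventdf = pd.DataFrame(np.array(cells).reshape(rows, 8)).rename(
        columns = {0:'index', 1:'period', 2:'strength', 3:'time', 4:'event',
                   5:'description', 6:'away_skaters', 7:'home_skaters'})
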
@@ -2080,8 +2076,6 @@ def merge_and_prepare(events, shifts):
  np.where(merged.event.isin(['PGSTR', 'PGEND', 'PSTR', 'PEND', 'ANTHEM']), -1, 1))).sort_values(
  by = ['game_seconds', 'period', 'event_index'])

- merged.to_csv('/Users/patrickbacon/compact_topdownhockey/first_merged.csv', index = False)
-
  merged = merged.assign(change_before_event = np.where(
  (
  (merged.away_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.away_on_ice!=merged.away_on_ice.shift()) |
@@ -2471,7 +2465,9 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
  if len(event_coords[(event_coords.event.isin(ewc)) & (pd.isna(event_coords.coords_x))]) > 0:
  raise ExpatError('Bad takes, dude!')
  event_coords['game_id'] = int(game_id)
+ print('Attempting to merge events again')
  events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'version', 'period', 'game_id', 'event'], how = 'left')
+ print('Merged events again, we have this many rows:', len(events))
  try:
  events = fix_missing(single, event_coords, events)
  except IndexError as e:
@@ -2530,8 +2526,11 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
  print('Scraping ESPN Events')
  print('Here is the ESPN ID:', espn_id)
  event_coords = scrape_espn_events(int(espn_id))
+ print('Scraped ESPN Events, we have this many rows:', len(event_coords))
  event_coords['coordinate_source'] = 'espn'
+ print('Attempting to merge events')
  events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
+ print('Merged events, we have this many rows:', len(events))
  try:
  events = fix_missing(single, event_coords, events)
  except IndexError as e:
@@ -2690,16 +2689,19 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):

  except AttributeError as e:
  print(str(game_id) + ' does not have an HTML report. Here is the error: ' + str(e))
+ print(traceback.format_exc())
  i = i + 1
  continue

  except IndexError as e:
  print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
+ print(traceback.format_exc())
  i = i + 1
  continue

  except ValueError as e:
  print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
+ print(traceback.format_exc())
  i = i + 1
  continue

@@ -2796,6 +2798,7 @@ def full_scrape(game_id_list, live = True, shift = False):
  hidden_patrick = 0

  df = full_scrape_1by1(game_id_list, live, shift_to_espn = shift)
+ print('Full scrape complete, we have this many rows:', len(df))

  # Fixing the Pettersson issue for event player. Just going downstream for this.
  try:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: TopDownHockey_Scraper
- Version: 6.0.5
+ Version: 6.0.9
  Summary: The TopDownHockey Scraper
  Home-page: https://github.com/TopDownHockey/TopDownHockey_Scraper
  Author: Patrick Bacon
@@ -0,0 +1,7 @@
+ TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
+ TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=rg_7RJo1eWL9dWlRweCat2v21fWX1Z45olcZN859BN4,163992
+ topdownhockey_scraper-6.0.9.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
+ topdownhockey_scraper-6.0.9.dist-info/METADATA,sha256=nM1FoBq-lslyopV84S10eukQQ4pPMSyFGA2S_5xfa8g,5670
+ topdownhockey_scraper-6.0.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ topdownhockey_scraper-6.0.9.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
+ topdownhockey_scraper-6.0.9.dist-info/RECORD,,
@@ -1,7 +0,0 @@
- TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=kRZPB-pfRCDn6K2UK4ZHYlo09bDHxF7B34w8VE59GoI,163837
- topdownhockey_scraper-6.0.5.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
- topdownhockey_scraper-6.0.5.dist-info/METADATA,sha256=yvjnUIQ66Z80Oi02-mWvV2GdFMvvBk2O-lgoGOB4kx0,5670
- topdownhockey_scraper-6.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- topdownhockey_scraper-6.0.5.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
- topdownhockey_scraper-6.0.5.dist-info/RECORD,,