TopDownHockey-Scraper 6.0.5-py3-none-any.whl → 6.0.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of TopDownHockey-Scraper might be problematic.
- TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +26 -23
- {topdownhockey_scraper-6.0.5.dist-info → topdownhockey_scraper-6.0.9.dist-info}/METADATA +1 -1
- topdownhockey_scraper-6.0.9.dist-info/RECORD +7 -0
- topdownhockey_scraper-6.0.5.dist-info/RECORD +0 -7
- {topdownhockey_scraper-6.0.5.dist-info → topdownhockey_scraper-6.0.9.dist-info}/WHEEL +0 -0
- {topdownhockey_scraper-6.0.5.dist-info → topdownhockey_scraper-6.0.9.dist-info}/licenses/LICENSE +0 -0
- {topdownhockey_scraper-6.0.5.dist-info → topdownhockey_scraper-6.0.9.dist-info}/top_level.txt +0 -0
TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py
CHANGED
@@ -18,6 +18,7 @@ import xml.etree.ElementTree as ET
 import xmltodict
 from xml.parsers.expat import ExpatError
 from requests.exceptions import ChunkedEncodingError
+import traceback

 print('Successfully did local install plus update')

@@ -834,8 +835,6 @@ def scrape_html_shifts(season, game_id, live = True):

 home_shifts = alldf

-home_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/home_shifts.csv', index = False)
-
 if live == True:

     home_shifts = home_shifts.assign(shift_number = home_shifts.shift_number.astype(int))
@@ -956,8 +955,6 @@ def scrape_html_shifts(season, game_id, live = True):

 away_shifts = alldf

-away_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/away_shifts.csv', index = False)
-
 if live == True:

     away_shifts = away_shifts.assign(shift_number = away_shifts.shift_number.astype(int))
@@ -1039,7 +1036,7 @@ def scrape_html_shifts(season, game_id, live = True):

 # Additional logic to handle period 1 scrape when we don't have goalie shifts yet.

-if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0 or len(away_shifts[(away_shifts.name.isin(goalie_names))]):
+if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0 or len(away_shifts[(away_shifts.name.isin(goalie_names))]) == 0:

     pbp_html_url = f'https://www.nhl.com/scores/htmlreports/{season}/GS0{game_id}.HTM'
     pbp_soup = BeautifulSoup(requests.get(pbp_html_url).content)
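Worth noting on the fix above: the old condition dropped the == 0 from the away-side check, so the bare len(...) was truthy whenever away goalie shifts were present, the opposite of the intended "no goalie shifts yet" test. A minimal standalone sketch of the difference, on hypothetical data rather than the package's real frames:

import pandas as pd

goalie_names = ['GOALIE A', 'GOALIE B']
home_shifts = pd.DataFrame({'name': ['GOALIE A', 'SKATER X']})
away_shifts = pd.DataFrame({'name': ['GOALIE B', 'SKATER Y']})

# Old condition: the second operand is a bare len(...), truthy for any
# nonzero count, so the fallback branch ran whenever the away goalie
# HAD shifts on record.
old = len(home_shifts[home_shifts.name.isin(goalie_names)]) == 0 or \
    len(away_shifts[away_shifts.name.isin(goalie_names)])

# Fixed condition: fires only when either side is actually missing
# goalie shifts.
new = len(home_shifts[home_shifts.name.isin(goalie_names)]) == 0 or \
    len(away_shifts[away_shifts.name.isin(goalie_names)]) == 0

print(bool(old), bool(new))  # True False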
@@ -1050,26 +1047,27 @@

 away_teams = pd.read_html(str(goalie_table))[0][:2]
 away_team = away_teams[0].iloc[0]
-
-away_goalies = pd.read_html(str(goalie_table))[0][2:4]
+
+away_goalies = pd.read_html(str(goalie_table))[0][2:4][
+    ~pd.isna(pd.read_html(str(goalie_table))[0][2:4])[0]
+]
 away_goalies = away_goalies[~pd.isna(away_goalies[6])]
-
+
 away_goalies = away_goalies.assign(team = away_team).rename(columns = {0:'number', 2:'name', 6:'TOI'}).loc[:, ['number', 'name', 'TOI', 'team']]

-
+away_goalies = away_goalies[away_goalies.TOI!='TOT']
+
+home_teams = pd.read_html(str(goalie_table))[0][6:8][
+    ~pd.isna(pd.read_html(str(goalie_table))[0][6:8])[0]
+]
 home_team = home_teams[0].iloc[0]
-
+
 home_goalies = pd.read_html(str(goalie_table))[0][8:10]
 home_goalies = home_goalies[~pd.isna(home_goalies[6])]
-
+
 home_goalies = home_goalies.assign(team = home_team).rename(columns = {0:'number', 2:'name', 6:'TOI'}).loc[:, ['number', 'name', 'TOI', 'team']]

-home_goalies =
-
-# Temporary to test. Will fix later.
-
-# home_goalies = home_goalies.assign(TOI = '11:26')
-# away_goalies = away_goalies.assign(TOI = '11:26')
+home_goalies = home_goalies[home_goalies.TOI!='TOT']

 if len(home_shifts[(home_shifts.name.isin(goalie_names))]) == 0:

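The rewritten slices above filter the parsed goalie summary table down to rows whose first column is non-NaN before the existing TOI filter, and both sides now drop the TOT totals row instead of relying on the abandoned hard-coded TOI workaround (6.0.5 also shipped a dangling "home_goalies =" here, a syntax error). A minimal sketch of the same masking pattern, on a hypothetical stand-in for the pd.read_html output:

import numpy as np
import pandas as pd

# Hypothetical stand-in for one team's slice of the parsed goalie table;
# columns are positional integers, as with pd.read_html.
table = pd.DataFrame({
    0: ['31', np.nan, '31'],        # jersey number; NaN marks a padding row
    2: ['GOALIE A', np.nan, np.nan],
    6: ['58:32', np.nan, 'TOT'],    # TOI; 'TOT' marks a totals row
})

goalies = table[~pd.isna(table[0])]      # drop padding rows with no number
goalies = goalies[~pd.isna(goalies[6])]  # drop rows with no TOI
goalies = goalies.rename(columns = {0:'number', 2:'name', 6:'TOI'})
goalies = goalies[goalies.TOI != 'TOT']  # drop the totals row
print(goalies)                           # one real goalie row remains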
@@ -1108,8 +1106,6 @@ def scrape_html_shifts(season, game_id, live = True):
 global all_shifts

 all_shifts = pd.concat([home_shifts, away_shifts])
-
-#all_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/all_shifts.csv', index = False)

 all_shifts = all_shifts.assign(start_time = all_shifts.shift_start.str.split('/').str[0])

@@ -1359,7 +1355,6 @@ def scrape_html_shifts(season, game_id, live = True):
 global changes_on
 global changes_off
 myshifts = all_shifts
-#myshifts.to_csv('/Users/patrickbacon/compact_topdownhockey/tmp.csv', index = False)
 #print('Printing my shifts')

 #print(myshifts)
@@ -1416,7 +1411,8 @@ def scrape_html_events(season, game_id):
 #global stripped_html
 #global eventdf
 stripped_html = hs_strip_html(tds)
-length =
+length = (len(stripped_html) // 8) * 8
+stripped_html = stripped_html[:length]
 eventdf = pd.DataFrame(np.array(stripped_html).reshape(length, 8)).rename(
     columns = {0:'index', 1:'period', 2:'strength', 3:'time', 4:'event', 5:'description', 6:'away_skaters', 7:'home_skaters'})
 split = eventdf.time.str.split(':')
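In 6.0.5 the assignment above was published cut off at "length =", a syntax error; 6.0.9 completes it by flooring the parsed cell count to a multiple of 8 and truncating, so a partially written event row at the end of a live report cannot break the 8-column reshape. A minimal sketch of the idea, with a hypothetical cell list standing in for hs_strip_html output (the sketch reshapes to length // 8 rows of 8):

import numpy as np
import pandas as pd

cells = [str(i) for i in range(19)]  # 19 cells: two full 8-cell events plus 3 stragglers

length = (len(cells) // 8) * 8       # floor to a multiple of 8 -> 16
cells = cells[:length]               # discard the incomplete trailing event

eventdf = pd.DataFrame(np.array(cells).reshape(length // 8, 8))
print(eventdf.shape)                 # (2, 8)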
@@ -2080,8 +2076,6 @@ def merge_and_prepare(events, shifts):
 np.where(merged.event.isin(['PGSTR', 'PGEND', 'PSTR', 'PEND', 'ANTHEM']), -1, 1))).sort_values(
     by = ['game_seconds', 'period', 'event_index'])

-merged.to_csv('/Users/patrickbacon/compact_topdownhockey/first_merged.csv', index = False)
-
 merged = merged.assign(change_before_event = np.where(
     (
         (merged.away_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.away_on_ice!=merged.away_on_ice.shift()) |
@@ -2471,7 +2465,9 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
 if len(event_coords[(event_coords.event.isin(ewc)) & (pd.isna(event_coords.coords_x))]) > 0:
     raise ExpatError('Bad takes, dude!')
 event_coords['game_id'] = int(game_id)
+print('Attempting to merge events again')
 events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'version', 'period', 'game_id', 'event'], how = 'left')
+print('Merged events again, we have this many rows:', len(events))
 try:
     events = fix_missing(single, event_coords, events)
 except IndexError as e:
@@ -2530,8 +2526,11 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
 print('Scraping ESPN Events')
 print('Here is the ESPN ID:', espn_id)
 event_coords = scrape_espn_events(int(espn_id))
+print('Scraped ESPN Events, we have this many rows:', len(event_coords))
 event_coords['coordinate_source'] = 'espn'
+print('Attempting to merge events')
 events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
+print('Merged events, we have this many rows:', len(events))
 try:
     events = fix_missing(single, event_coords, events)
 except IndexError as e:
@@ -2690,16 +2689,19 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):

 except AttributeError as e:
     print(str(game_id) + ' does not have an HTML report. Here is the error: ' + str(e))
+    print(traceback.format_exc())
     i = i + 1
     continue

 except IndexError as e:
     print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
+    print(traceback.format_exc())
     i = i + 1
     continue

 except ValueError as e:
     print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
+    print(traceback.format_exc())
     i = i + 1
     continue

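These three additions are where the traceback import from the first hunk gets used: each handler now prints the full stack trace alongside the existing one-line message, which identifies the failing frame rather than just the exception text. A minimal standalone sketch of the pattern; the failing function here is hypothetical:

import traceback

def scrape_one(game_id):
    raise IndexError('list index out of range')  # stand-in for a parse failure

game_id = 2023020001
try:
    scrape_one(game_id)
except IndexError as e:
    print(str(game_id) + ' has an issue with the HTML Report. Here is the error: ' + str(e))
    print(traceback.format_exc())  # full traceback, including the failing line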
@@ -2796,6 +2798,7 @@ def full_scrape(game_id_list, live = True, shift = False):
 hidden_patrick = 0

 df = full_scrape_1by1(game_id_list, live, shift_to_espn = shift)
+print('Full scrape complete, we have this many rows:', len(df))

 # Fixing the Pettersson issue for event player. Just going downstream for this.
 try:
topdownhockey_scraper-6.0.9.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
+TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=rg_7RJo1eWL9dWlRweCat2v21fWX1Z45olcZN859BN4,163992
+topdownhockey_scraper-6.0.9.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
+topdownhockey_scraper-6.0.9.dist-info/METADATA,sha256=nM1FoBq-lslyopV84S10eukQQ4pPMSyFGA2S_5xfa8g,5670
+topdownhockey_scraper-6.0.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+topdownhockey_scraper-6.0.9.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
+topdownhockey_scraper-6.0.9.dist-info/RECORD,,
topdownhockey_scraper-6.0.5.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py,sha256=j-7gTk-cp_0LyZihNxm67xH9KdA3Fx4xrFKKu3-9-rU,42245
-TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py,sha256=kRZPB-pfRCDn6K2UK4ZHYlo09bDHxF7B34w8VE59GoI,163837
-topdownhockey_scraper-6.0.5.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
-topdownhockey_scraper-6.0.5.dist-info/METADATA,sha256=yvjnUIQ66Z80Oi02-mWvV2GdFMvvBk2O-lgoGOB4kx0,5670
-topdownhockey_scraper-6.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-topdownhockey_scraper-6.0.5.dist-info/top_level.txt,sha256=PBd96GLGFq97ZDLd7_4ZCx8_ZFr_wdWKs5SIpGl5xCs,22
-topdownhockey_scraper-6.0.5.dist-info/RECORD,,
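Each RECORD row has the form path,sha256=<digest>,size, where the digest is the urlsafe base64 of the file's SHA-256 with trailing padding stripped, per the wheel RECORD convention. A minimal sketch for recomputing a row's hash to verify it against the listing above:

import base64
import hashlib

def record_hash(path):
    # Compute the sha256= value as it appears in a wheel RECORD file.
    with open(path, 'rb') as f:
        digest = hashlib.sha256(f.read()).digest()
    return 'sha256=' + base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')

# e.g. compare against the RECORD entry for the scraper module:
print(record_hash('TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py'))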
{topdownhockey_scraper-6.0.5.dist-info → topdownhockey_scraper-6.0.9.dist-info}/WHEEL
RENAMED
File without changes

{topdownhockey_scraper-6.0.5.dist-info → topdownhockey_scraper-6.0.9.dist-info}/licenses/LICENSE
RENAMED
File without changes

{topdownhockey_scraper-6.0.5.dist-info → topdownhockey_scraper-6.0.9.dist-info}/top_level.txt
RENAMED
File without changes