TopDownHockey-Scraper 6.0.4.tar.gz → 6.0.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of TopDownHockey-Scraper might be problematic.
- {topdownhockey_scraper-6.0.4/src/TopDownHockey_Scraper.egg-info → topdownhockey_scraper-6.0.7}/PKG-INFO +1 -1
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/setup.cfg +1 -1
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/setup.py +1 -1
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/src/TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py +11 -13
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7/src/TopDownHockey_Scraper.egg-info}/PKG-INFO +1 -1
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/LICENSE +0 -0
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/README.md +0 -0
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/pyproject.toml +0 -0
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/src/TopDownHockey_Scraper/TopDownHockey_EliteProspects_Scraper.py +0 -0
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/src/TopDownHockey_Scraper.egg-info/SOURCES.txt +0 -0
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/src/TopDownHockey_Scraper.egg-info/dependency_links.txt +0 -0
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/src/TopDownHockey_Scraper.egg-info/requires.txt +0 -0
- {topdownhockey_scraper-6.0.4 → topdownhockey_scraper-6.0.7}/src/TopDownHockey_Scraper.egg-info/top_level.txt +0 -0
setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = TopDownHockey_Scraper
-version = 6.0.4
+version = 6.0.7
 author = Patrick Bacon
 author_email = patrick.s.bacon@gmail.com
 description = A package built for scraping hockey data from EliteProspects, the NHL's HTML/API reports, and ESPN's XML reports.
setup.py
@@ -9,7 +9,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="TopDownHockey_Scraper", # Replace with your own username
-    version="6.0.4",
+    version="6.0.7",
     author="Patrick Bacon",
     author_email="patrick.s.bacon@gmail.com",
     description="The TopDownHockey Scraper",
src/TopDownHockey_Scraper/TopDownHockey_NHL_Scraper.py
@@ -18,6 +18,7 @@ import xml.etree.ElementTree as ET
 import xmltodict
 from xml.parsers.expat import ExpatError
 from requests.exceptions import ChunkedEncodingError
+import traceback
 
 print('Successfully did local install plus update')
 
@@ -834,8 +835,6 @@ def scrape_html_shifts(season, game_id, live = True):
 
 home_shifts = alldf
 
-home_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/home_shifts.csv', index = False)
-
 if live == True:
 
     home_shifts = home_shifts.assign(shift_number = home_shifts.shift_number.astype(int))
@@ -956,8 +955,6 @@ def scrape_html_shifts(season, game_id, live = True):
 
 away_shifts = alldf
 
-away_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/away_shifts.csv', index = False)
-
 if live == True:
 
     away_shifts = away_shifts.assign(shift_number = away_shifts.shift_number.astype(int))
@@ -1064,7 +1061,7 @@ def scrape_html_shifts(season, game_id, live = True):
 
 home_goalies = home_goalies.assign(team = home_team).rename(columns = {0:'number', 2:'name', 6:'TOI'}).loc[:, ['number', 'name', 'TOI', 'team']]
 
-home_goalies = pd.read_html(str(goalie_table))[0][8:9]
+# home_goalies = pd.read_html(str(goalie_table))[0][8:9]
 
 # Temporary to test. Will fix later.
 
@@ -1108,8 +1105,6 @@ def scrape_html_shifts(season, game_id, live = True):
 global all_shifts
 
 all_shifts = pd.concat([home_shifts, away_shifts])
-
-#all_shifts.to_csv('/Users/patrickbacon/compact_topdownhockey/all_shifts.csv', index = False)
 
 all_shifts = all_shifts.assign(start_time = all_shifts.shift_start.str.split('/').str[0])
 
@@ -1359,7 +1354,6 @@ def scrape_html_shifts(season, game_id, live = True):
 global changes_on
 global changes_off
 myshifts = all_shifts
-#myshifts.to_csv('/Users/patrickbacon/compact_topdownhockey/tmp.csv', index = False)
 #print('Printing my shifts')
 
 #print(myshifts)
@@ -2080,8 +2074,6 @@ def merge_and_prepare(events, shifts):
 np.where(merged.event.isin(['PGSTR', 'PGEND', 'PSTR', 'PEND', 'ANTHEM']), -1, 1))).sort_values(
 by = ['game_seconds', 'period', 'event_index'])
 
-merged.to_csv('/Users/patrickbacon/compact_topdownhockey/first_merged.csv', index = False)
-
 merged = merged.assign(change_before_event = np.where(
 (
 (merged.away_on_ice!='') & (merged.event.shift()=='CHANGE') & (merged.away_on_ice!=merged.away_on_ice.shift()) |
@@ -2465,13 +2457,15 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
 # If all goes well with the HTML scrape:
 
 try:
-    event_coords = scrape_api_events(game_id, shift_to_espn =
+    event_coords = scrape_api_events(game_id, shift_to_espn = True)
     api_coords = event_coords
     api_coords['coordinate_source'] = 'api'
     if len(event_coords[(event_coords.event.isin(ewc)) & (pd.isna(event_coords.coords_x))]) > 0:
         raise ExpatError('Bad takes, dude!')
     event_coords['game_id'] = int(game_id)
+    print('Attempting to merge events again')
     events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'version', 'period', 'game_id', 'event'], how = 'left')
+    print('Merged events again, we have this many rows:', len(events))
     try:
         events = fix_missing(single, event_coords, events)
     except IndexError as e:
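For context on the hunk above: the existing logic scrapes API coordinates first and deliberately raises an ExpatError when any coordinate-bearing event (the ewc list) comes back without coords_x, which drops the scrape into the ESPN fallback branch handled further down. A minimal sketch of that raise-to-fall-back pattern, with placeholder fetch_api_coords / fetch_espn_coords helpers and an illustrative event list standing in for the package's own code:

import pandas as pd
from xml.parsers.expat import ExpatError

# Illustrative stand-in for the package's ewc list of coordinate-bearing events.
COORD_EVENTS = ['SHOT', 'GOAL', 'MISS', 'BLOCK', 'HIT', 'GIVE', 'TAKE', 'FAC']

def get_event_coords(game_id, fetch_api_coords, fetch_espn_coords):
    """Try the API source first; fall back to ESPN when coordinates are incomplete."""
    try:
        coords = fetch_api_coords(game_id)
        missing = coords[coords.event.isin(COORD_EVENTS) & pd.isna(coords.coords_x)]
        if len(missing) > 0:
            # Same trick as the scraper: raise so the except clause switches sources.
            raise ExpatError('API coordinates incomplete')
        coords['coordinate_source'] = 'api'
    except ExpatError:
        coords = fetch_espn_coords(game_id)
        coords['coordinate_source'] = 'espn'
    return coords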
@@ -2528,10 +2522,13 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
 print('Scraping ESPN IDs')
 espn_id = scrape_espn_ids_single_game(str(game_date.date()), espn_home_team, espn_away_team).espn_id.iloc[0]
 print('Scraping ESPN Events')
-print('Here is the ESPN ID:' espn_id)
+print('Here is the ESPN ID:', espn_id)
 event_coords = scrape_espn_events(int(espn_id))
+print('Scraped ESPN Events, we have this many rows:', len(event_coords))
 event_coords['coordinate_source'] = 'espn'
+print('Attempting to merge events')
 events = single.merge(event_coords, on = ['event_player_1', 'game_seconds', 'period', 'version', 'event'], how = 'left').drop(columns = ['espn_id'])
+print('Merged events, we have this many rows:', len(events))
 try:
     events = fix_missing(single, event_coords, events)
 except IndexError as e:
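The print statements added in this hunk log the row count right after the left merge, which is a cheap way to notice when a merge on the join keys silently drops or duplicates events. A small self-contained sketch of that check with toy data (the column names mirror the diff; the values are made up):

import pandas as pd

events = pd.DataFrame({'event_player_1': ['A', 'B'], 'game_seconds': [10, 200],
                       'period': [1, 1], 'version': [1, 1], 'event': ['SHOT', 'GOAL']})
coords = pd.DataFrame({'event_player_1': ['A', 'B'], 'game_seconds': [10, 200],
                       'period': [1, 1], 'version': [1, 1], 'event': ['SHOT', 'GOAL'],
                       'coords_x': [55, -60], 'coords_y': [10, -5]})

print('Events before merge:', len(events))
merged = events.merge(coords, on=['event_player_1', 'game_seconds', 'period', 'version', 'event'], how='left')
print('Merged events, we have this many rows:', len(merged))
# A left merge keeps the original row count unless the join keys are duplicated on the right.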
@@ -2690,6 +2687,7 @@ def full_scrape_1by1(game_id_list, live = False, shift_to_espn = True):
 
 except AttributeError as e:
     print(str(game_id) + ' does not have an HTML report. Here is the error: ' + str(e))
+    print(traceback.format_exc())
     i = i + 1
     continue
 
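Paired with the import traceback added near the top of the module, this hunk prints the full stack trace rather than only str(e), so the line that actually failed inside the HTML scrape is visible instead of just the AttributeError message. A minimal sketch of the pattern (the failing function and its error message are stand-ins, not part of the package):

import traceback

def scrape_one(game_id):
    # Stand-in for the real HTML scrape failing partway through.
    raise AttributeError("'NoneType' object has no attribute 'find_all'")

game_id = 2023020001
try:
    scrape_one(game_id)
except AttributeError as e:
    print(str(game_id) + ' does not have an HTML report. Here is the error: ' + str(e))
    print(traceback.format_exc())  # full stack trace, including the line that raised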
@@ -2796,6 +2794,7 @@ def full_scrape(game_id_list, live = True, shift = False):
 hidden_patrick = 0
 
 df = full_scrape_1by1(game_id_list, live, shift_to_espn = shift)
+print('Full scrape complete, we have this many rows:', len(df))
 
 # Fixing the Pettersson issue for event player. Just going downstream for this.
 try:
@@ -2830,7 +2829,6 @@ def full_scrape(game_id_list, live = True, shift = False):
 )
 except Exception as e:
     print(e)
-    continue
 
 # Don't even need this, we've had this problem with Stutzle for years, just let it be.
 # df.event_description = df.event_description.str.replace('FEHÃ\x89RVÃ\x81RY', 'FEHERVARY').str.replace('BLÃMEL', 'BLAMEL')
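One note on the continue removed in this last hunk: a bare continue is only legal inside a for or while loop, so if that except block does not sit inside a loop the statement would be rejected at compile time. A quick way to confirm the rule, independent of the package's code:

# Compiling an except block that uses continue outside any loop fails immediately.
snippet = """
try:
    pass
except Exception as e:
    print(e)
    continue
"""
try:
    compile(snippet, '<demo>', 'exec')
except SyntaxError as err:
    print('SyntaxError:', err.msg)  # along the lines of "'continue' not properly in loop"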