wsba-hockey 0.1.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- wsba_hockey/__init__.py +1 -1
- wsba_hockey/stats/calculate_viz/shot_impact.py +2 -0
- wsba_hockey/tools/__init__.py +0 -0
- wsba_hockey/tools/agg.py +185 -0
- wsba_hockey/tools/archive/old_scraping.py +1104 -0
- wsba_hockey/tools/plotting.py +113 -0
- wsba_hockey/tools/scraping.py +836 -369
- wsba_hockey/tools/utils/__init__.py +1 -0
- wsba_hockey/tools/utils/config.py +14 -0
- wsba_hockey/tools/utils/save_pages.py +133 -0
- wsba_hockey/tools/utils/shared.py +450 -0
- wsba_hockey/tools/xg_model.py +275 -47
- wsba_hockey/wsba_main.py +699 -132
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/METADATA +42 -11
- wsba_hockey-1.0.0.dist-info/RECORD +18 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/WHEEL +1 -1
- wsba_hockey-0.1.2.dist-info/RECORD +0 -9
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {wsba_hockey-0.1.2.dist-info → wsba_hockey-1.0.0.dist-info}/top_level.txt +0 -0

wsba_hockey/tools/utils/__init__.py
@@ -0,0 +1 @@
+### CODE IN THIS DIRECTORY ORIGINALLY (FULLY OR PARITALLY) WRITTEN BY HARRY SHOMER IN THE "hockey_scraper" PACKAGE

wsba_hockey/tools/utils/config.py
@@ -0,0 +1,14 @@
+"""
+Basic configurations
+"""
+
+# Directory where to save pages
+# When True assumes ~/hockey_scraper_data
+# Otherwise can take str to `existing` directory
+DOCS_DIR = False
+
+# Boolean that tells us whether or not we should re-scrape a given page if it's already saved
+RESCRAPE = False
+
+# Whether to log verbose errors to log file
+LOG = False
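
These three flags are plain module-level globals consumed by the scraping helpers added below; `add_dir()` and `if_rescrape()` in `shared.py` are the validated setters for the first two. A minimal, hypothetical sketch of how a caller might set them, assuming the modules install under `wsba_hockey.tools.utils` as the file list above suggests:

```python
# Hypothetical configuration sketch; the import path is assumed from the file list above.
from wsba_hockey.tools.utils import config

config.DOCS_DIR = "/tmp/hockey_scraper_data"  # str path to an existing directory (shared.add_dir() validates this)
config.RESCRAPE = False                       # reuse pages already saved under DOCS_DIR instead of re-scraping
config.LOG = True                             # lets log_error() in shared.py write to a timestamped log file
```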

wsba_hockey/tools/utils/save_pages.py
@@ -0,0 +1,133 @@
+#
+#Saves the scraped docs so you don't have to re-scrape them every time you want to parse the docs.
+#
+#\**** Don't mess with this unless you know what you're doing \****
+#
+import os
+import gzip
+
+
+def create_base_file_path(file_info):
+    """
+    Creates the base file path for a given file
+
+    :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                      we want to deposit any data in.
+
+    :return: path
+    """
+    # Shitty fix for when you already have it saved but don't have nwhl folders
+    if 'nwhl' in file_info['type']:
+        if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])):
+            os.mkdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type']))
+
+    return os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'], file_info['name'] + ".txt")
+
+
+def is_compressed(file_info):
+    """
+    Check if stored file is compressed as we used to not save them as compressed.
+
+    :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                      we want to deposit any data in.
+
+    return Boolean
+    """
+    return os.path.isfile(create_base_file_path(file_info) + ".gz")
+
+
+def create_dir_structure(dir_name):
+    """
+    Create the basic directory structure for docs_dir if not done yet.
+    Creates the docs and csvs subdir if it doesn't exist
+
+    :param dir_name: Name of dir to create
+
+    :return None
+    """
+    if not os.path.isdir(os.path.join(dir_name, 'docs')):
+        os.mkdir(os.path.join(dir_name, 'docs'))
+
+    if not os.path.isdir(os.path.join(dir_name, 'csvs')):
+        os.mkdir(os.path.join(dir_name, 'csvs'))
+
+
+
+def create_season_dirs(file_info):
+    """
+    Creates the infrastructure to hold all the scraped docs for a season
+
+    :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                      we want to deposit any data in.
+
+    :return: None
+    """
+    sub_folders = ["html_pbp", "json_pbp", "espn_pbp", "html_shifts_home", "html_shifts_away",
+                   "json_shifts", "html_roster", "json_schedule", "espn_scoreboard"]
+
+    season_path = os.path.join(file_info['dir'], 'docs', str(file_info['season']))
+    os.mkdir(season_path)
+
+    for sub_f in sub_folders:
+        os.mkdir(os.path.join(season_path, sub_f))
+
+
+def check_file_exists(file_info):
+    """
+    Checks if the file exists. Also check if structure for holding scraped file exists to. If not, it creates it.
+
+    :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                      we want to deposit any data in.
+
+    :return: Boolean - True if it exists
+    """
+    create_dir_structure(file_info['dir'])
+
+    # Check if the folder for the season for the given game was created yet...if not create it
+    if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']))):
+        create_season_dirs(file_info)
+
+    # May or may not be compressed due to file saved under older versions
+    non_compressed_file = os.path.isfile(create_base_file_path(file_info))
+    compressed_file = is_compressed(file_info)
+
+    return compressed_file or non_compressed_file
+
+
+def get_page(file_info):
+    """
+    Get the file so we don't need to re-scrape.
+
+    Try both compressed and non-compressed for backwards compatability issues (formerly non-compressed)
+
+    :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                      we want to deposit any data in.
+
+    :return: Response or None
+    """
+    base_file = create_base_file_path(file_info)
+
+    if is_compressed(file_info):
+        with gzip.open(base_file + ".gz", 'rb') as my_file:
+            return my_file.read().decode("utf-8").replace('\n', '')
+    else:
+        with open(base_file, 'r') as my_file:
+            return my_file.read().replace('\n', '')
+
+
+def save_page(page, file_info):
+    """
+    Save the page we just scraped.
+
+    Note: It'll only get saved if the directory already exists!!!!!!. I'm not dealing with any fuck ups. That would
+    involve checking if it's even a valid path and creating it. Make sure you get it right.
+
+    :param page: File scraped
+    :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                      we want to deposit any data in.
+
+    :return: None
+    """
+    if file_info['dir'] and page is not None and page != '':
+        with gzip.open(create_base_file_path(file_info) + ".gz", 'wb') as file:
+            file.write(page.encode())
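
Every helper above keys off the same `file_info` dict. Only the keys (`name`, `season`, `type`, `dir`) come from the docstrings; the values in this sketch are made up, and the import path is assumed from the file list at the top of the diff:

```python
import os
from wsba_hockey.tools.utils import save_pages as sp  # import path assumed

# Illustrative file_info; only the keys are taken from the docstrings above.
file_info = {
    "name": "2023020001",                                # file name without extension (hypothetical game id)
    "season": 2023,                                      # season the document belongs to
    "type": "json_pbp",                                  # one of the sub_folders from create_season_dirs()
    "dir": os.path.expanduser("~/hockey_scraper_data"),  # root docs directory; must already exist
}

if not sp.check_file_exists(file_info):            # also builds docs/<season>/<type> dirs as needed
    sp.save_page("<html>...</html>", file_info)    # gzipped to docs/2023/json_pbp/2023020001.txt.gz

page = sp.get_page(file_info)                      # reads the .txt or .txt.gz copy transparently
```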

wsba_hockey/tools/utils/shared.py
@@ -0,0 +1,450 @@
+### CODE IN THIS DIRECTORY ORIGINALLY (FULLY OR PARITALLY) WRITTEN BY HARRY SHOMER IN THE "hockey_scraper" PACKAGE
+
+## shared.py ##
+"""
+This file is a bunch of the shared functions or just general stuff used by the different scrapers in the package.
+"""
+import os
+import time
+import json
+import logging
+import warnings
+import requests
+from datetime import datetime, timedelta
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+from . import save_pages as sp
+from . import config
+import inspect
+
+# Directory where this file lives
+FILE_DIR = os.path.dirname(os.path.realpath(__file__))
+
+# Name and Team fixes used
+with open(os.path.join(FILE_DIR, "player_name_fixes.json"), "r" ,encoding="utf-8") as f:
+    Names = json.load(f)['fixes']
+
+with open(os.path.join(FILE_DIR, "team_tri_codes.json"), "r" ,encoding="utf-8") as f:
+    TEAMS = json.load(f)['teams']
+
+
+def fix_name(name):
+    """
+    Check if a name falls under those that need fixing. If it does...fix it.
+
+    :param name: name in pbp
+
+    :return: Either the given parameter or the fixed name
+    """
+    return Names.get(name.upper(), name.upper()).upper()
+
+
+def get_team(team):
+    """
+    Get the fucking team
+    """
+    return TEAMS.get(team.upper(), team.upper()).upper()
+
+
+def custom_formatwarning(msg, *args, **kwargs):
+    """
+    Override format for standard wanings
+    """
+    ansi_no_color = '\033[0m'
+    return "{msg}\n{no_color}".format(no_color=ansi_no_color, msg=msg)
+
+warnings.formatwarning = custom_formatwarning
+
+
+def print_error(msg):
+    """
+    Implement own custom error using warning module. Prints in red
+
+    Reason why i still use warning for errors is so i can set to ignore them if i want to (e.g. live_scrape line 200).
+
+    :param msg: Str to print
+
+    :return: None
+    """
+    ansi_red_code = '\033[0;31m'
+    warning_msg = "{}Error: {}".format(ansi_red_code, msg)
+
+    # if config.LOG:
+    #     caller_file = os.path.basename(inspect.stack()[1].filename)
+    #     get_logger(caller_file).error(msg + " " + verbose)
+
+    warnings.warn(warning_msg)
+
+
+def print_warning(msg):
+    """
+    Implement own custom warning using warning module. Prints in Orange.
+
+    :param msg: Str to print
+
+    :return: None
+    """
+    ansi_yellow_code = '\033[0;33m'
+    warning_msg = "{}Warning: {}".format(ansi_yellow_code, msg)
+
+    warnings.warn(warning_msg)
+
+
+def get_logger(python_file):
+    """
+    Create a basic logger to a log file
+
+    :param python_file: File that instantiates the logger instance
+
+    :return: logger
+    """
+    base_py_file = os.path.basename(python_file)
+
+    # If already exists we don't try to recreate it
+    if base_py_file in logging.Logger.manager.loggerDict.keys():
+        return logging.getLogger(base_py_file)
+
+    logger = logging.getLogger(base_py_file)
+    logger.setLevel(logging.INFO)
+
+    fh = logging.FileHandler("hockey_scraper_errors_{}.log".format(datetime.now().strftime("%Y-%m-%dT%H:%M:%S")))
+    fh.setFormatter(logging.Formatter('%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s', datefmt='%Y-%m-%d %I:%M:%S'))
+    logger.addHandler(fh)
+
+    return logger
+
+
+def log_error(err, py_file):
+    """
+    Log error when Logging is specified
+
+    :param err: Error to log
+    :param python_file: File that instantiates the logger instance
+
+    :return: None
+    """
+    if config.LOG:
+        get_logger(py_file).error(err)
+
+
+def get_season(date):
+    """
+    Get Season based on from_date
+
+    There is an exception for the 2019-2020 pandemic season. Accoding to the below url:
+    - 2019-2020 season ends in Oct. 2020
+    - 2020-2021 season begins in November 2020
+    - https://nhl.nbcsports.com/2020/07/10/new-nhl-critical-dates-calendar-means-an-october-free-agent-frenzy/
+
+    :param date: date
+
+    :return: season -> ex: 2016 for 2016-2017 season
+    """
+    year = date[:4]
+    date = datetime.strptime(date, "%Y-%m-%d")
+    initial_bound = datetime.strptime('-'.join([year, '01-01']), "%Y-%m-%d")
+
+    # End bound for year1-year2 season is later for pandemic year
+    if initial_bound <= date <= season_end_bound(year):
+        return int(year) - 1
+
+    return int(year)
+
+
+def season_start_bound(year):
+    """
+    Get start bound for a season.
+
+    Notes:
+    - There is a bug in the schedule API for 2016 that causes the pushback to 09-30
+    - Pandemic season started in January
+
+    :param year: str of year for given date
+
+    :return: str of first date in season
+    """
+    if int(year) == 2016:
+        return "2016-09-30"
+
+    if int(year) == 2020:
+        return '2021-01-01'
+
+    return "{}-09-01".format(str(year))
+
+
+
+def season_end_bound(year):
+    """
+    Determine the end bound of a given season. Changes depending on if it's the pandemic season or not
+
+    :param year: str of year for given date
+
+    :return: Datetime obj of last date in season
+    """
+    normal_end_bound = datetime.strptime('-'.join([str(year), '08-31']), "%Y-%m-%d")
+    pandemic_end_bound = datetime.strptime('-'.join([str(year), '10-31']), "%Y-%m-%d")
+
+    if int(year) == 2020:
+        return pandemic_end_bound
+
+    return normal_end_bound
+
+
+def convert_to_seconds(minutes):
+    """
+    Return minutes elapsed in time format to seconds elapsed
+
+    :param minutes: time elapsed
+
+    :return: time elapsed in seconds
+    """
+    if minutes == '-16:0-':
+        return '1200' # Sometimes in the html at the end of the game the time is -16:0-
+
+    # If the time is junk not much i can do
+    try:
+        x = time.strptime(minutes.strip(' '), '%M:%S')
+    except ValueError:
+        return None
+
+    return timedelta(hours=x.tm_hour, minutes=x.tm_min, seconds=x.tm_sec).total_seconds()
+
+
+def if_rescrape(user_rescrape):
+    """
+    If you want to re_scrape. If someone is a dumbass and feeds it a non-boolean it terminates the program
+
+    Note: Only matters when you have a directory specified
+
+    :param user_rescrape: Boolean
+
+    :return: None
+    """
+    if isinstance(user_rescrape, bool):
+        config.RESCRAPE = user_rescrape
+    else:
+        raise ValueError("Error: 'if_rescrape' must be a boolean. Not a {}".format(type(user_rescrape)))
+
+
+def add_dir(user_dir):
+    """
+    Add directory to store scraped docs if valid. Or create in the home dir
+
+    NOTE: After this functions docs_dir is either None or a valid directory
+
+    :param user_dir: If bool=True create in the home dire or if user provided directory on their machine
+
+    :return: None
+    """
+    # False so they don't want it
+    if not user_dir:
+        config.DOCS_DIR = False
+        return
+
+    # Something was given
+    # Either True or string to directory
+    # If boolean refer to the home directory
+    if isinstance(user_dir, bool):
+        config.DOCS_DIR = os.path.join(os.path.expanduser('~'), "hockey_scraper_data")
+        # Create if needed
+        if not os.path.isdir(config.DOCS_DIR):
+            print_warning("Creating the hockey_scraper_data directory in the home directory")
+            os.mkdir(config.DOCS_DIR)
+    elif isinstance(user_dir, str) and os.path.isdir(user_dir):
+        config.DOCS_DIR = user_dir
+    elif not (isinstance(user_dir, str) and isinstance(user_dir, bool)):
+        config.DOCS_DIR = False
+        print_error("The docs_dir argument provided is invalid")
+    else:
+        config.DOCS_DIR = False
+        print_error("The directory specified for the saving of scraped docs doesn't exist. Therefore:"
+                    "\n1. All specified games will be scraped from their appropriate sources (NHL or ESPN)."
+                    "\n2. All scraped files will NOT be saved at all. Please either create the directory you want them to be "
+                    "deposited in or recheck the directory you typed in and start again.\n")
+
+
+def scrape_page(url):
+    """
+    Scrape a given url
+
+    :param url: url for page
+
+    :return: response object
+    """
+    response = requests.Session()
+    retries = Retry(total=10, backoff_factor=.1)
+    response.mount('http://', HTTPAdapter(max_retries=retries))
+
+    try:
+        response = response.get(url, timeout=5)
+        response.raise_for_status()
+        page = response.text
+    except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError):
+        page = None
+    except requests.exceptions.ReadTimeout:
+        # If it times out and it's the schedule print an error message...otherwise just make the page = None
+        if "schedule" in url:
+            raise Exception("Timeout Error: The NHL API took too long to respond to our request. "
+                            "Please Try Again (you may need to try a few times before it works). ")
+        else:
+            print_error("Timeout Error: The server took too long to respond to our request.")
+            page = None
+
+    # Pause for 1 second - make it more if you want
+    time.sleep(1)
+
+    return page
+
+
+
+def get_file(file_info, force=False):
+    """
+    Get the specified file.
+
+    If a docs_dir is provided we check if it exists. If it does we see if it contains that page (and saves if it
+    doesn't). If the docs_dir doesn't exist we just scrape from the source and not save.
+
+    :param file_info: Dictionary containing the info for the file.
+                      Contains the url, name, type, and season
+    :param force: Force a rescrape. Default is False
+
+    :return: page
+    """
+    file_info['dir'] = config.DOCS_DIR
+
+    # If everything checks out we'll retrieve it, otherwise we scrape it
+    if file_info['dir'] and sp.check_file_exists(file_info) and not config.RESCRAPE and not force:
+        page = sp.get_page(file_info)
+    else:
+        page = scrape_page(file_info['url'])
+        sp.save_page(page, file_info)
+
+    return page
+
+
+def check_data_format(data_format):
+    """
+    Checks if data_format specified (if it is at all) is either None, 'Csv', or 'pandas'.
+    It exits program with error message if input isn't good.
+
+    :param data_format: data_format provided
+
+    :return: Boolean - True if good
+    """
+    if not data_format or data_format.lower() not in ['csv', 'pandas']:
+        raise ValueError('{} is an unspecified data format. The two options are Csv and Pandas '
+                         '(Csv is default)\n'.format(data_format))
+
+
+def check_valid_dates(from_date, to_date):
+    """
+    Check if it's a valid date range
+
+    :param from_date: date should scrape from
+    :param to_date: date should scrape to
+
+    :return: None
+    """
+    try:
+        if time.strptime(to_date, "%Y-%m-%d") < time.strptime(from_date, "%Y-%m-%d"):
+            raise ValueError("Error: The second date input is earlier than the first one")
+    except ValueError:
+        raise ValueError("Error: Incorrect format given for dates. They must be given like 'yyyy-mm-dd' "
+                         "(ex: '2016-10-01').")
+
+
+def to_csv(base_file_name, df, league, file_type):
+    """
+    Write DataFrame to csv file
+
+    :param base_file_name: name of file
+    :param df: DataFrame
+    :param league: nhl or nwhl
+    :param file_type: type of file despoiting
+
+    :return: None
+    """
+    docs_dir = config.DOCS_DIR
+
+    # This was a late addition so we add support here
+    if isinstance(docs_dir, str) and not os.path.isdir(os.path.join(docs_dir, "csvs")):
+        os.mkdir(os.path.join(docs_dir, "csvs"))
+
+    if df is not None:
+        if isinstance(docs_dir, str):
+            file_name = os.path.join(docs_dir, "csvs", '{}_{}_{}.csv'.format(league, file_type, base_file_name))
+        else:
+            file_name = '{}_{}_{}.csv'.format(league, file_type, base_file_name)
+
+        print("---> {} {} data deposited in file - {}".format(league, file_type, file_name))
+        df.to_csv(file_name, sep=',', encoding='utf-8')
+
+import re
+from bs4 import BeautifulSoup, SoupStrainer
+
+## html_pbp.py ##
+def get_contents(game_html):
+    """
+    Uses Beautiful soup to parses the html document.
+    Some parsers work for some pages but don't work for others....I'm not sure why so I just try them all here in order
+
+    :param game_html: html doc
+
+    :return: "soupified" html
+    """
+    parsers = ["html5lib", "lxml", "html.parser"]
+    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
+
+    for parser in parsers:
+        # parse_only only works with lxml for some reason
+        if parser == "lxml":
+            soup = BeautifulSoup(game_html, parser, parse_only=strainer)
+        else:
+            soup = BeautifulSoup(game_html, parser)
+
+        tds = soup.find_all("td", {"class": re.compile('.*bborder.*')})
+
+        if len(tds) > 0:
+            break
+
+    return tds
+
+## html_shifts.py ##
+def get_soup(shifts_html):
+    """
+    Uses Beautiful soup to parses the html document.
+    Some parsers work for some pages but don't work for others....I'm not sure why so I just try them all here in order
+
+    :param shifts_html: html doc
+
+    :return: "soupified" html and player_shifts portion of html (it's a bunch of td tags)
+    """
+    parsers = ["lxml", "html.parser", "html5lib"]
+
+    for parser in parsers:
+        soup = BeautifulSoup(shifts_html, parser)
+        td = soup.findAll(True, {'class': ['playerHeading + border', 'lborder + bborder']})
+
+        if len(td) > 0:
+            break
+
+    return td, get_teams(soup)
+
+
+def get_teams(soup):
+    """
+    Return the team for the TOI tables and the home team
+
+    :param soup: souped up html
+
+    :return: list with team and home team
+    """
+    team = soup.find('td', class_='teamHeading + border') # Team for shifts
+    team = team.get_text()
+
+    # Get Home Team
+    teams = soup.find_all('td', {'align': 'center', 'style': 'font-size: 10px;font-weight:bold'})
+    regex = re.compile(r'>(.*)<br/?>')
+    home_team = regex.findall(str(teams[7]))
+
+    return [team, home_team[0]]
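
Taken together, `get_file()` is the piece that ties the three new modules above: it consults `config.DOCS_DIR` and `config.RESCRAPE`, reads from the `save_pages` cache when it can, and otherwise scrapes and saves. A rough usage sketch, assuming the `wsba_hockey.tools.utils` layout from the file list and that the two JSON fix files `shared.py` opens at import time ship with the package; the URL and game id below are placeholders:

```python
from wsba_hockey.tools.utils import shared

shared.add_dir(True)        # cache scraped pages under ~/hockey_scraper_data (created if missing)
shared.if_rescrape(False)   # prefer a cached copy when one exists

# Placeholder values; get_file() fills in 'dir' from config.DOCS_DIR itself.
file_info = {
    "url": "https://example.com/game/2023020001",  # placeholder URL for the page to fetch
    "name": "2023020001",
    "type": "json_pbp",
    "season": 2023,
}

page = shared.get_file(file_info)        # cached read, or scrape_page() followed by save_page()
print(shared.get_season("2021-02-15"))   # -> 2020, i.e. the 2020-21 season
```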