wsba-hockey 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ ### CODE IN THIS DIRECTORY ORIGINALLY (FULLY OR PARTIALLY) WRITTEN BY HARRY SHOMER IN THE "hockey_scraper" PACKAGE
@@ -0,0 +1,14 @@
+ """
+ Basic configurations
+ """
+
+ # Directory in which to save pages
+ # When True assumes ~/hockey_scraper_data
+ # Otherwise can be a str path to an existing directory
+ DOCS_DIR = False
+
+ # Boolean that tells us whether we should re-scrape a given page if it's already saved
+ RESCRAPE = False
+
+ # Whether to log verbose errors to a log file
+ LOG = False
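These flags are normally set through the helpers in shared.py below rather than edited by hand. A minimal sketch, assuming the module is importable as `shared` (the import path is hypothetical):

    import shared

    shared.add_dir(True)       # DOCS_DIR -> ~/hockey_scraper_data (created if missing)
    shared.if_rescrape(False)  # RESCRAPE -> False; raises ValueError on non-boolean input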
@@ -0,0 +1,133 @@
+ #
+ # Saves the scraped docs so you don't have to re-scrape them every time you want to parse the docs.
+ #
+ # **** Don't mess with this unless you know what you're doing ****
+ #
+ import os
+ import gzip
+
+
+ def create_base_file_path(file_info):
+     """
+     Creates the base file path for a given file
+
+     :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                       we want to deposit any data in.
+
+     :return: path
+     """
+     # Workaround for pages saved before the nwhl folders existed
+     if 'nwhl' in file_info['type']:
+         if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])):
+             os.mkdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type']))
+
+     return os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'], file_info['name'] + ".txt")
+
+
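For intuition, a hedged example of the path this builds (the field values are hypothetical):

    file_info = {'name': '2016020001', 'season': 2016, 'type': 'html_pbp', 'dir': '/data'}
    create_base_file_path(file_info)
    # -> '/data/docs/2016/html_pbp/2016020001.txt'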
+ def is_compressed(file_info):
+     """
+     Check if the stored file is compressed, as older versions did not save files compressed.
+
+     :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                       we want to deposit any data in.
+
+     :return: Boolean
+     """
+     return os.path.isfile(create_base_file_path(file_info) + ".gz")
+
+
+ def create_dir_structure(dir_name):
+     """
+     Create the basic directory structure for docs_dir if not done yet.
+     Creates the docs and csvs subdirs if they don't exist.
+
+     :param dir_name: Name of dir to create
+
+     :return: None
+     """
+     if not os.path.isdir(os.path.join(dir_name, 'docs')):
+         os.mkdir(os.path.join(dir_name, 'docs'))
+
+     if not os.path.isdir(os.path.join(dir_name, 'csvs')):
+         os.mkdir(os.path.join(dir_name, 'csvs'))
+
+
+ def create_season_dirs(file_info):
+     """
+     Creates the infrastructure to hold all the scraped docs for a season
+
+     :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                       we want to deposit any data in.
+
+     :return: None
+     """
+     sub_folders = ["html_pbp", "json_pbp", "espn_pbp", "html_shifts_home", "html_shifts_away",
+                    "json_shifts", "html_roster", "json_schedule", "espn_scoreboard"]
+
+     season_path = os.path.join(file_info['dir'], 'docs', str(file_info['season']))
+     os.mkdir(season_path)
+
+     for sub_f in sub_folders:
+         os.mkdir(os.path.join(season_path, sub_f))
+
+
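Taken together, the two helpers above lay out the cache tree. A short sketch (directory and season values hypothetical):

    create_dir_structure('/data')   # ensures /data/docs and /data/csvs exist
    create_season_dirs({'dir': '/data', 'season': 2016})
    # /data/docs/2016/ now holds: html_pbp, json_pbp, espn_pbp, html_shifts_home,
    # html_shifts_away, json_shifts, html_roster, json_schedule, espn_scoreboard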
+ def check_file_exists(file_info):
+     """
+     Checks if the file exists. Also checks if the structure for holding the scraped file exists too. If not, it
+     creates it.
+
+     :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                       we want to deposit any data in.
+
+     :return: Boolean - True if it exists
+     """
+     create_dir_structure(file_info['dir'])
+
+     # Check if the folder for the season for the given game was created yet...if not create it
+     if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']))):
+         create_season_dirs(file_info)
+
+     # May or may not be compressed due to files saved under older versions
+     non_compressed_file = os.path.isfile(create_base_file_path(file_info))
+     compressed_file = is_compressed(file_info)
+
+     return compressed_file or non_compressed_file
+
+
+ def get_page(file_info):
+     """
+     Get the file so we don't need to re-scrape.
+
+     Try both compressed and non-compressed for backwards compatibility (files were formerly saved non-compressed).
+
+     :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                       we want to deposit any data in.
+
+     :return: Contents of the page as a str, or None
+     """
+     base_file = create_base_file_path(file_info)
+
+     if is_compressed(file_info):
+         with gzip.open(base_file + ".gz", 'rb') as my_file:
+             return my_file.read().decode("utf-8").replace('\n', '')
+     else:
+         with open(base_file, 'r') as my_file:
+             return my_file.read().replace('\n', '')
+
+
+ def save_page(page, file_info):
+     """
+     Save the page we just scraped.
+
+     Note: It'll only get saved if the directory already exists! This function does not validate the path or
+     create it, so make sure the directory is right before calling.
+
+     :param page: File scraped
+     :param file_info: Dictionary containing the info on the file. Includes the name, season, file type, and the dir
+                       we want to deposit any data in.
+
+     :return: None
+     """
+     if file_info['dir'] and page is not None and page != '':
+         with gzip.open(create_base_file_path(file_info) + ".gz", 'wb') as file:
+             file.write(page.encode())
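A hedged round-trip sketch of this module's caching (field values hypothetical; the base dir must already exist):

    file_info = {'name': '2016020001', 'season': 2016, 'type': 'html_pbp', 'dir': '/data'}
    if not check_file_exists(file_info):          # also creates the docs/csvs and season folders
        save_page("<html>...</html>", file_info)  # written gzip-compressed as ...txt.gz
    page = get_page(file_info)                    # newline-stripped text of the saved page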
@@ -0,0 +1,450 @@
+ ### CODE IN THIS DIRECTORY ORIGINALLY (FULLY OR PARTIALLY) WRITTEN BY HARRY SHOMER IN THE "hockey_scraper" PACKAGE
+
+ ## shared.py ##
+ """
+ Shared functions and general utilities used by the different scrapers in the package.
+ """
+ import os
+ import time
+ import json
+ import logging
+ import warnings
+ import requests
+ from datetime import datetime, timedelta
+ from requests.adapters import HTTPAdapter
+ from urllib3.util import Retry
+ from . import save_pages as sp
+ from . import config
+ import inspect
+
+ # Directory where this file lives
+ FILE_DIR = os.path.dirname(os.path.realpath(__file__))
+
+ # Name and team fixes used
+ with open(os.path.join(FILE_DIR, "player_name_fixes.json"), "r", encoding="utf-8") as f:
+     Names = json.load(f)['fixes']
+
+ with open(os.path.join(FILE_DIR, "team_tri_codes.json"), "r", encoding="utf-8") as f:
+     TEAMS = json.load(f)['teams']
+
+
+ def fix_name(name):
+     """
+     Check if a name falls under those that need fixing. If it does...fix it.
+
+     :param name: name in pbp
+
+     :return: Either the given parameter or the fixed name
+     """
+     return Names.get(name.upper(), name.upper()).upper()
+
+
+ def get_team(team):
+     """
+     Get the team tri-code, applying any fixes from team_tri_codes.json
+     """
+     return TEAMS.get(team.upper(), team.upper()).upper()
+
+
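Both helpers fall back to the upper-cased input when no fix is listed. A hedged example (the mapping entry is hypothetical; the real entries live in the JSON files):

    # Suppose player_name_fixes.json contained {"fixes": {"ALEXANDRE BURROWS": "ALEX BURROWS"}}
    fix_name("Alexandre Burrows")  # -> "ALEX BURROWS"
    fix_name("Sidney Crosby")      # -> "SIDNEY CROSBY" (no fix listed, just upper-cased)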
+ def custom_formatwarning(msg, *args, **kwargs):
+     """
+     Override the format for standard warnings
+     """
+     ansi_no_color = '\033[0m'
+     return "{msg}\n{no_color}".format(no_color=ansi_no_color, msg=msg)
+
+ warnings.formatwarning = custom_formatwarning
+
+
+ def print_error(msg):
+     """
+     Implement a custom error using the warnings module. Prints in red.
+
+     Warnings are used for errors so that they can be ignored when desired (e.g. live_scrape line 200).
+
+     :param msg: Str to print
+
+     :return: None
+     """
+     ansi_red_code = '\033[0;31m'
+     warning_msg = "{}Error: {}".format(ansi_red_code, msg)
+
+     # if config.LOG:
+     #     caller_file = os.path.basename(inspect.stack()[1].filename)
+     #     get_logger(caller_file).error(msg + " " + verbose)
+
+     warnings.warn(warning_msg)
+
+
+ def print_warning(msg):
+     """
+     Implement a custom warning using the warnings module. Prints in orange.
+
+     :param msg: Str to print
+
+     :return: None
+     """
+     ansi_yellow_code = '\033[0;33m'
+     warning_msg = "{}Warning: {}".format(ansi_yellow_code, msg)
+
+     warnings.warn(warning_msg)
+
+
+ def get_logger(python_file):
+     """
+     Create a basic logger to a log file
+
+     :param python_file: File that instantiates the logger instance
+
+     :return: logger
+     """
+     base_py_file = os.path.basename(python_file)
+
+     # If it already exists we don't try to recreate it
+     if base_py_file in logging.Logger.manager.loggerDict.keys():
+         return logging.getLogger(base_py_file)
+
+     logger = logging.getLogger(base_py_file)
+     logger.setLevel(logging.INFO)
+
+     # Hyphens in the timestamp keep the file name valid on Windows (':' is not allowed there)
+     fh = logging.FileHandler("hockey_scraper_errors_{}.log".format(datetime.now().strftime("%Y-%m-%dT%H-%M-%S")))
+     fh.setFormatter(logging.Formatter('%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s', datefmt='%Y-%m-%d %I:%M:%S'))
+     logger.addHandler(fh)
+
+     return logger
+
+
+ def log_error(err, py_file):
+     """
+     Log error when logging is specified
+
+     :param err: Error to log
+     :param py_file: File that instantiates the logger instance
+
+     :return: None
+     """
+     if config.LOG:
+         get_logger(py_file).error(err)
+
+
+ def get_season(date):
+     """
+     Get the season based on the given date
+
+     There is an exception for the 2019-2020 pandemic season. According to the below url:
+     - The 2019-2020 season ends in Oct. 2020
+     - The 2020-2021 season begins in November 2020
+     - https://nhl.nbcsports.com/2020/07/10/new-nhl-critical-dates-calendar-means-an-october-free-agent-frenzy/
+
+     :param date: date str - 'yyyy-mm-dd'
+
+     :return: season -> ex: 2016 for the 2016-2017 season
+     """
+     year = date[:4]
+     date = datetime.strptime(date, "%Y-%m-%d")
+     initial_bound = datetime.strptime('-'.join([year, '01-01']), "%Y-%m-%d")
+
+     # The end bound for a year1-year2 season is later in the pandemic year
+     if initial_bound <= date <= season_end_bound(year):
+         return int(year) - 1
+
+     return int(year)
+
+
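A few worked examples of the boundary logic above:

    get_season("2017-03-01")  # -> 2016 (on or before 2017-08-31, so still the 2016-2017 season)
    get_season("2016-10-12")  # -> 2016 (past the end bound, so the 2016-2017 season)
    get_season("2020-10-15")  # -> 2019 (the pandemic season ran until 2020-10-31)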
+ def season_start_bound(year):
+     """
+     Get the start bound for a season.
+
+     Notes:
+     - A bug in the schedule API for 2016 pushes the start back to 09-30
+     - The pandemic 2020-2021 season started in January
+
+     :param year: str of year for given date
+
+     :return: str of first date in season
+     """
+     if int(year) == 2016:
+         return "2016-09-30"
+
+     if int(year) == 2020:
+         return '2021-01-01'
+
+     return "{}-09-01".format(str(year))
+
+
+ def season_end_bound(year):
+     """
+     Determine the end bound of a given season. Changes depending on whether it's the pandemic season or not.
+
+     :param year: str of year for given date
+
+     :return: Datetime obj of last date in season
+     """
+     normal_end_bound = datetime.strptime('-'.join([str(year), '08-31']), "%Y-%m-%d")
+     pandemic_end_bound = datetime.strptime('-'.join([str(year), '10-31']), "%Y-%m-%d")
+
+     if int(year) == 2020:
+         return pandemic_end_bound
+
+     return normal_end_bound
+
+
+ def convert_to_seconds(minutes):
+     """
+     Convert minutes elapsed in time format to seconds elapsed
+
+     :param minutes: time elapsed
+
+     :return: time elapsed in seconds
+     """
+     if minutes == '-16:0-':
+         return 1200  # Sometimes in the html at the end of the game the time is -16:0-
+
+     # If the time is malformed there's not much we can do
+     try:
+         x = time.strptime(minutes.strip(' '), '%M:%S')
+     except ValueError:
+         return None
+
+     return timedelta(hours=x.tm_hour, minutes=x.tm_min, seconds=x.tm_sec).total_seconds()
+
+
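Example conversions:

    convert_to_seconds("5:30")    # -> 330.0
    convert_to_seconds("junk")    # -> None
    convert_to_seconds("-16:0-")  # -> 1200 (end-of-game quirk handled above)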
+ def if_rescrape(user_rescrape):
+     """
+     Set whether to re-scrape. Raises a ValueError if given a non-boolean.
+
+     Note: Only matters when you have a directory specified
+
+     :param user_rescrape: Boolean
+
+     :return: None
+     """
+     if isinstance(user_rescrape, bool):
+         config.RESCRAPE = user_rescrape
+     else:
+         raise ValueError("Error: 'if_rescrape' must be a boolean. Not a {}".format(type(user_rescrape)))
+
+
+ def add_dir(user_dir):
+     """
+     Add directory to store scraped docs if valid. Or create it in the home dir.
+
+     NOTE: After this function config.DOCS_DIR is either False or a valid directory
+
+     :param user_dir: If True, create in the home directory; otherwise a user-provided directory on their machine
+
+     :return: None
+     """
+     # False means they don't want it
+     if not user_dir:
+         config.DOCS_DIR = False
+         return
+
+     # Something was given
+     # Either True or a string to a directory
+     # If boolean, refer to the home directory
+     if isinstance(user_dir, bool):
+         config.DOCS_DIR = os.path.join(os.path.expanduser('~'), "hockey_scraper_data")
+         # Create if needed
+         if not os.path.isdir(config.DOCS_DIR):
+             print_warning("Creating the hockey_scraper_data directory in the home directory")
+             os.mkdir(config.DOCS_DIR)
+     elif isinstance(user_dir, str) and os.path.isdir(user_dir):
+         config.DOCS_DIR = user_dir
+     elif not (isinstance(user_dir, str) or isinstance(user_dir, bool)):
+         config.DOCS_DIR = False
+         print_error("The docs_dir argument provided is invalid")
+     else:
+         config.DOCS_DIR = False
+         print_error("The directory specified for the saving of scraped docs doesn't exist. Therefore:"
+                     "\n1. All specified games will be scraped from their appropriate sources (NHL or ESPN)."
+                     "\n2. All scraped files will NOT be saved at all. Please either create the directory you want them to be "
+                     "deposited in or recheck the directory you typed in and start again.\n")
+
+
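Usage, as implied by the branches above:

    add_dir(True)            # use (and if needed create) ~/hockey_scraper_data
    add_dir("/tmp")          # use an existing directory as-is
    add_dir("/no/such/dir")  # prints an error and leaves config.DOCS_DIR = False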
+ def scrape_page(url):
+     """
+     Scrape a given url
+
+     :param url: url for page
+
+     :return: page text or None
+     """
+     session = requests.Session()
+     retries = Retry(total=10, backoff_factor=.1)
+     # Mount the retry adapter for both schemes so https requests get retried too
+     session.mount('http://', HTTPAdapter(max_retries=retries))
+     session.mount('https://', HTTPAdapter(max_retries=retries))
+
+     try:
+         response = session.get(url, timeout=5)
+         response.raise_for_status()
+         page = response.text
+     except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError):
+         page = None
+     except requests.exceptions.ReadTimeout:
+         # If it times out and it's the schedule, raise an error...otherwise just make the page None
+         if "schedule" in url:
+             raise Exception("Timeout Error: The NHL API took too long to respond to our request. "
+                             "Please try again (you may need to try a few times before it works). ")
+         else:
+             print_error("Timeout Error: The server took too long to respond to our request.")
+             page = None
+
+     # Pause for 1 second - make it more if you want
+     time.sleep(1)
+
+     return page
+
+
+ def get_file(file_info, force=False):
+     """
+     Get the specified file.
+
+     If a docs_dir is provided we check if it exists. If it does, we see if it contains that page (and save it if it
+     doesn't). If the docs_dir doesn't exist we just scrape from the source and don't save.
+
+     :param file_info: Dictionary containing the info for the file.
+                       Contains the url, name, type, and season
+     :param force: Force a rescrape. Default is False
+
+     :return: page
+     """
+     file_info['dir'] = config.DOCS_DIR
+
+     # If everything checks out we'll retrieve it, otherwise we scrape it
+     if file_info['dir'] and sp.check_file_exists(file_info) and not config.RESCRAPE and not force:
+         page = sp.get_page(file_info)
+     else:
+         page = scrape_page(file_info['url'])
+         sp.save_page(page, file_info)
+
+     return page
+
+
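The caching decision in one place: a saved copy is used only when a docs dir is set, the file exists, and neither RESCRAPE nor force is on. A hedged sketch (url and name hypothetical):

    add_dir(True)
    file_info = {'url': 'https://example.com/pbp.json', 'name': '2016020001',
                 'type': 'json_pbp', 'season': 2016}
    page = get_file(file_info)              # first call scrapes and saves
    page = get_file(file_info)              # second call reads the saved copy
    page = get_file(file_info, force=True)  # bypasses the cache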
+ def check_data_format(data_format):
+     """
+     Checks that the data_format specified is either 'csv' or 'pandas' (case-insensitive).
+     Raises a ValueError if the input isn't good.
+
+     :param data_format: data_format provided
+
+     :return: None
+     """
+     if not data_format or data_format.lower() not in ['csv', 'pandas']:
+         raise ValueError('{} is an unspecified data format. The two options are Csv and Pandas '
+                          '(Csv is default)\n'.format(data_format))
+
+
+ def check_valid_dates(from_date, to_date):
+     """
+     Check that it's a valid date range
+
+     :param from_date: date we should scrape from
+     :param to_date: date we should scrape to
+
+     :return: None
+     """
+     # Parse first so a bad format raises its own error instead of masking the range check
+     try:
+         parsed_from = time.strptime(from_date, "%Y-%m-%d")
+         parsed_to = time.strptime(to_date, "%Y-%m-%d")
+     except ValueError:
+         raise ValueError("Error: Incorrect format given for dates. They must be given like 'yyyy-mm-dd' "
+                          "(ex: '2016-10-01').")
+
+     if parsed_to < parsed_from:
+         raise ValueError("Error: The second date input is earlier than the first one")
+
+
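Behavior sketch:

    check_valid_dates("2016-10-01", "2017-04-10")  # passes silently
    check_valid_dates("2017-04-10", "2016-10-01")  # ValueError: second date earlier than the first
    check_valid_dates("10/01/2016", "2017-04-10")  # ValueError: incorrect format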
+ def to_csv(base_file_name, df, league, file_type):
+     """
+     Write DataFrame to csv file
+
+     :param base_file_name: name of file
+     :param df: DataFrame
+     :param league: nhl or nwhl
+     :param file_type: type of file being deposited
+
+     :return: None
+     """
+     docs_dir = config.DOCS_DIR
+
+     # This was a late addition so we add support here
+     if isinstance(docs_dir, str) and not os.path.isdir(os.path.join(docs_dir, "csvs")):
+         os.mkdir(os.path.join(docs_dir, "csvs"))
+
+     if df is not None:
+         if isinstance(docs_dir, str):
+             file_name = os.path.join(docs_dir, "csvs", '{}_{}_{}.csv'.format(league, file_type, base_file_name))
+         else:
+             file_name = '{}_{}_{}.csv'.format(league, file_type, base_file_name)
+
+         print("---> {} {} data deposited in file - {}".format(league, file_type, file_name))
+         df.to_csv(file_name, sep=',', encoding='utf-8')
+
+ import re
+ from bs4 import BeautifulSoup, SoupStrainer
+
+ ## html_pbp.py ##
+ def get_contents(game_html):
+     """
+     Uses BeautifulSoup to parse the html document.
+     Some parsers work for some pages but not others. It's not clear why, so each is tried in order until one works.
+
+     :param game_html: html doc
+
+     :return: "soupified" html
+     """
+     parsers = ["html5lib", "lxml", "html.parser"]
+     strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
+
+     for parser in parsers:
+         # parse_only only works with lxml for some reason
+         if parser == "lxml":
+             soup = BeautifulSoup(game_html, parser, parse_only=strainer)
+         else:
+             soup = BeautifulSoup(game_html, parser)
+
+         tds = soup.find_all("td", {"class": re.compile('.*bborder.*')})
+
+         if len(tds) > 0:
+             break
+
+     return tds
+
+ ## html_shifts.py ##
+ def get_soup(shifts_html):
+     """
+     Uses BeautifulSoup to parse the html document.
+     Some parsers work for some pages but not others. It's not clear why, so each is tried in order until one works.
+
+     :param shifts_html: html doc
+
+     :return: "soupified" html and the player_shifts portion of the html (it's a bunch of td tags)
+     """
+     parsers = ["lxml", "html.parser", "html5lib"]
+
+     for parser in parsers:
+         soup = BeautifulSoup(shifts_html, parser)
+         td = soup.find_all(True, {'class': ['playerHeading + border', 'lborder + bborder']})
+
+         if len(td) > 0:
+             break
+
+     return td, get_teams(soup)
+
+
+ def get_teams(soup):
+     """
+     Return the team for the TOI tables and the home team
+
+     :param soup: souped up html
+
+     :return: list with team and home team
+     """
+     team = soup.find('td', class_='teamHeading + border')  # Team for shifts
+     team = team.get_text()
+
+     # Get the home team - it sits in a fixed position (the 8th matching cell) in the header table
+     teams = soup.find_all('td', {'align': 'center', 'style': 'font-size: 10px;font-weight:bold'})
+     regex = re.compile(r'>(.*)<br/?>')
+     home_team = regex.findall(str(teams[7]))
+
+     return [team, home_team[0]]