basketball-reference-webscrapper 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.1
2
+ Name: basketball-reference-webscrapper
3
+ Version: 0.1.3
4
+ Summary: Python package for Basketball Reference that gathers data by scraping the website
5
+ Author: Yannick Flores
6
+ Author-email: yannick.flores1992@gmail.com
7
+ Requires-Python: >=3.11,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Dist: beautifulsoup4 (==4.12.*)
12
+ Requires-Dist: numpy (==1.26.*)
13
+ Requires-Dist: pandas (==2.*)
14
+ Requires-Dist: pylint (>=2.15.10,<3.0.0)
15
+ Requires-Dist: pytest
16
+ Requires-Dist: requests (==2.32.*)
17
+ Description-Content-Type: text/markdown
18
+
19
+ # basketball-reference-webscrapper
20
+ A Python package for Basketball Reference that gathers data by scraping the website.
21
+
@@ -0,0 +1,2 @@
1
+ # basketball-reference-webscrapper
2
+ A Python package for Basketball Reference that gathers data by scraping the website.
@@ -0,0 +1,38 @@
1
+ team_abrev;city;team_name
2
+ ATL;Atlanta;Atlanta Hawks
3
+ BOS;Boston;Boston Celtics
4
+ BRK;Brooklyn;Brooklyn Nets
5
+ CHA;Charlotte;Charlotte Bobcats
6
+ CHH;Charlotte;Charlotte Hornets
7
+ CHI;Chicago;Chicago Bulls
8
+ CHO;Charlotte;Charlotte Hornets
9
+ CLE;Cleveland;Cleveland Cavaliers
10
+ DAL;Dallas;Dallas Mavericks
11
+ DEN;Denver;Denver Nuggets
12
+ DET;Detroit;Detroit Pistons
13
+ GSW;Golden State;Golden State Warriors
14
+ HOU;Houston;Houston Rockets
15
+ IND;Indiana;Indiana Pacers
16
+ LAC;Los Angeles;Los Angeles Clippers
17
+ LAL;Los Angeles;Los Angeles Lakers
18
+ MEM;Memphis;Memphis Grizzlies
19
+ MIA;Miami;Miami Heat
20
+ MIL;Milwaukee;Milwaukee Bucks
21
+ MIN;Minnesota;Minnesota Timberwolves
22
+ NJN;New Jersey;New Jersey Nets
23
+ NOH;New Orleans;New Orleans Hornets
24
+ NOK;New Orleans;New Orleans/Oklahoma City Hornets
25
+ NOP;New Orleans;New Orleans Pelicans
26
+ NYK;New York;New York Knicks
27
+ OKC;Oklahoma City;Oklahoma City Thunder
28
+ ORL;Orlando;Orlando Magic
29
+ PHI;Philadelphia;Philadelphia 76ers
30
+ PHO;Phoenix;Phoenix Suns
31
+ POR;Portland;Portland Trail Blazers
32
+ SAC;Sacramento;Sacramento Kings
33
+ SAS;San Antonio;San Antonio Spurs
34
+ SEA;Seattle;Seattle SuperSonics
35
+ TOR;Toronto;Toronto Raptors
36
+ UTA;Utah;Utah Jazz
37
+ VAN;Vancouver;Vancouver Grizzlies
38
+ WAS;Washington D.C;Washington Wizards
@@ -0,0 +1,44 @@
1
+ """Provides functions to create loggers."""
2
+
3
+ import logging
4
+ from typing import Text, Union
5
+ import sys
6
+
7
+
8
def get_console_handler() -> logging.StreamHandler:
    """Build a handler that writes formatted log records to stdout.

    Returns:
        logging.StreamHandler attached to ``sys.stdout`` whose formatter
        emits the timestamp, logger name, level name and message.
    """
    log_format = "%(asctime)s — %(name)s — %(levelname)s — %(message)s"

    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    stdout_handler.setFormatter(logging.Formatter(fmt=log_format))
    return stdout_handler
21
+
22
+
23
def get_logger(
    name: Text = __name__, log_level: Union[Text, int] = logging.DEBUG
) -> logging.Logger:
    """Return a logger that writes only to the console handler.

    Args:
        name {Text}: logger name
        log_level {Text or int}: logging level; can be string name or integer value
    Returns:
        logging.Logger instance with a single freshly created console
        handler and propagation to ancestor loggers switched off
    """
    configured = logging.getLogger(name)

    # Prevent duplicate outputs in Jupyter Notebook: discard handlers that an
    # earlier call attached to this same logger before adding a new one.
    if configured.hasHandlers():
        configured.handlers.clear()

    configured.propagate = False
    configured.setLevel(log_level)
    configured.addHandler(get_console_handler())

    return configured
@@ -0,0 +1,115 @@
1
+ from dataclasses import dataclass
2
+ from urllib.request import urlopen
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ import requests
6
+ import time
7
+ import importlib_resources
8
+
9
+ from basketball_reference_webscrapper.utils.logs import get_logger
10
+
11
+ logger = get_logger("WEB_SCRAPPING_EXECUTION", log_level="INFO")
12
+
13
@dataclass
class WebScrapBasketballReference:
    """Scrape per-game team statistics from Basketball Reference for one season.

    Attributes:
        url: Value supplied by the caller. It is stored on the instance but is
            not read by ``webscrappe_nba_games_data``, which builds every
            team's game-log URL itself.
        season: Season identifier inserted into each game-log URL and written
            to the ``id_season`` column of the result.
    """

    url: str
    season: int

    # Address of one team's game log; filled in per team and season.
    _GAMELOG_URL = (
        "https://www.basketball-reference.com/teams/{team}/{season}/gamelog/"
    )
    # Seconds to wait for the site before giving up on a request.
    _REQUEST_TIMEOUT_SECONDS = 30
    # Pause after each team's request (presumably to stay polite to the site).
    _REQUEST_DELAY_SECONDS = 5

    # Labels applied to the scraped table, in page order. "nc" labels a column
    # that is not part of the returned frame.
    _GAMELOG_COLUMNS = (
        "game_nb", "game_date", "extdom", "opp", "results",
        "pts_tm", "pts_opp",
        "fg_tm", "fga_tm", "fg_prct_tm",
        "3p_tm", "3pa_tm", "3p_prct_tm", "ft_tm", "fta_tm", "ft_prct_tm",
        "orb_tm", "trb_tm", "ast_tm", "stl_tm", "blk_tm", "tov_tm", "pf_tm",
        "nc",
        "fg_opp", "fga_opp", "fg_prct_opp",
        "3p_opp", "3pa_opp", "3p_prct_opp", "ft_opp", "fta_opp", "ft_prct_opp",
        "orb_opp", "trb_opp", "ast_opp", "stl_opp", "blk_opp", "tov_opp", "pf_opp",
    )

    # Columns of the frame handed back to the caller, in output order.
    _OUTPUT_COLUMNS = (
        "id_season", "game_nb", "game_date", "extdom", "tm", "opp", "results",
        "pts_tm", "pts_opp",
        "fg_tm", "fga_tm", "fg_prct_tm", "3p_tm", "3pa_tm", "3p_prct_tm",
        "ft_tm", "fta_tm", "ft_prct_tm", "orb_tm", "trb_tm", "ast_tm", "stl_tm",
        "blk_tm", "tov_tm", "pf_tm", "fg_opp", "fga_opp", "fg_prct_opp",
        "3p_opp", "3pa_opp", "3p_prct_opp", "ft_opp", "fta_opp", "ft_prct_opp",
        "orb_opp", "trb_opp", "ast_opp", "stl_opp", "blk_opp", "tov_opp",
        "pf_opp",
    )

    def webscrappe_nba_games_data(self) -> pd.DataFrame:
        """Scrape the game log of every reference team for ``self.season``.

        Each team abbreviation listed in the packaged
        ``constants/team_city_refdata.csv`` is requested in turn. Pages that
        do not answer with HTTP 200 are skipped.

        Returns:
            A DataFrame holding one row per scraped game with the columns of
            ``_OUTPUT_COLUMNS``. It is empty (but has those columns) when no
            page yielded any game row.

        Raises:
            ValueError: if a scraped table does not have as many columns as
                ``_GAMELOG_COLUMNS``.

        Network errors raised by ``requests.get`` are not caught here.
        """
        # Team reference data shipped inside the package.
        ref = (
            importlib_resources.files("basketball_reference_webscrapper")
            / "constants/team_city_refdata.csv"
        )
        # as_file may materialise a temporary file; read it before the
        # with-block ends and the file is cleaned up.
        with importlib_resources.as_file(ref) as path:
            team_city_refdata = pd.read_csv(path, sep=";")

        # Collect per-team frames and concatenate once at the end.
        frames = []
        for team in team_city_refdata["team_abrev"]:
            logger.info("Execution for %s", team)

            url = self._GAMELOG_URL.format(team=team, season=self.season)
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT_SECONDS)

            if response.status_code == 200:
                games_tmp = self._parse_gamelog(response.content, team)
                if games_tmp is not None:
                    frames.append(games_tmp)

            time.sleep(self._REQUEST_DELAY_SECONDS)

        output_columns = list(self._OUTPUT_COLUMNS)
        if not frames:
            # Nothing scraped: hand back an empty frame with the usual columns
            # instead of failing on the column selection.
            return pd.DataFrame(columns=output_columns)

        games = pd.concat(frames, axis=0)
        return games[output_columns]

    def _parse_gamelog(self, html, team):
        """Turn one team's game-log page into a frame of complete game rows.

        Returns ``None`` when the page contains no table cell at all.
        """
        soup = BeautifulSoup(html, "html.parser")

        # The first two <tr> elements are skipped (presumably header rows).
        rows_data = [
            [cell.get_text() for cell in table_row.find_all("td")]
            for table_row in soup.find_all("tr")[2:]
        ]
        if not any(rows_data):
            return None

        games_tmp = pd.DataFrame(rows_data)
        games_tmp.columns = list(self._GAMELOG_COLUMNS)
        # Rows with fewer cells than the widest row were padded with missing
        # values by the DataFrame constructor; drop them.
        games_tmp = games_tmp.dropna()
        return games_tmp.assign(id_season=self.season, tm=team)
107
+
108
if __name__ == "__main__":
    # Manual run: scrape the 2022 season when this module is executed directly.
    webscrapping_class = WebScrapBasketballReference(url="test", season=2022)
    nba_games = webscrapping_class.webscrappe_nba_games_data()
115
+
@@ -0,0 +1,39 @@
1
+ [tool.poetry]
2
+ name = "basketball-reference-webscrapper"
3
+ version = "0.1.3"
4
+ description = "Python package for Basketball Reference that gathers data by scraping the website"
5
+ authors = ["Yannick Flores <yannick.flores1992@gmail.com>"]
6
+ readme = "README.md"
7
+ exclude = ["tests", "docs"]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.11"
11
+ pandas = "2.*"
12
+ numpy = "1.26.*"
13
+ pytest = "*"
14
+ pylint = "^2.15.10"
15
+ beautifulsoup4 = "4.12.*"
16
+ requests = "2.32.*"
17
+
18
+ [tool.poetry.dev-dependencies]
19
+ coverage = {extras = ["toml"], version = "6.5.*"}
20
+ black = {version = "22.10.*", allow-prereleases = true}
21
+ pytest = "7.2.*"
22
+ pytest-html = "3.1.*"
23
+ pytest-cov = "2.12.*"
24
+ py = "1.11.*"
25
+ pylint = "2.15.*"
26
+ python-decouple = "3.6"
27
+ mkdocs = "1.4.*"
28
+ importlib-resources="6.0.1"
29
+
30
+ [tool.pytest.ini_options]
31
+ pythonpath = "."
32
+ testpaths = "tests"
33
+ addopts = [
34
+ "--import-mode=importlib",
35
+ ]
36
+
37
+ [build-system]
38
+ requires = ["poetry-core>=1.0.0"]
39
+ build-backend = "poetry.core.masonry.api"