basketball-reference-webscrapper 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- basketball_reference_webscrapper-0.1.3/PKG-INFO +21 -0
- basketball_reference_webscrapper-0.1.3/README.md +2 -0
- basketball_reference_webscrapper-0.1.3/basketball_reference_webscrapper/__init__.py +0 -0
- basketball_reference_webscrapper-0.1.3/basketball_reference_webscrapper/constants/__init__.py +0 -0
- basketball_reference_webscrapper-0.1.3/basketball_reference_webscrapper/constants/team_city_refdata.csv +38 -0
- basketball_reference_webscrapper-0.1.3/basketball_reference_webscrapper/utils/__init__.py +0 -0
- basketball_reference_webscrapper-0.1.3/basketball_reference_webscrapper/utils/logs.py +44 -0
- basketball_reference_webscrapper-0.1.3/basketball_reference_webscrapper/webscrapping_basketball_reference.py +115 -0
- basketball_reference_webscrapper-0.1.3/pyproject.toml +39 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: basketball-reference-webscrapper
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Python package for Basketball Reference that gathers data by scraping the website
|
|
5
|
+
Author: Yannick Flores
|
|
6
|
+
Author-email: yannick.flores1992@gmail.com
|
|
7
|
+
Requires-Python: >=3.11,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Requires-Dist: beautifulsoup4 (==4.12.*)
|
|
12
|
+
Requires-Dist: numpy (==1.26.*)
|
|
13
|
+
Requires-Dist: pandas (==2.*)
|
|
14
|
+
Requires-Dist: pylint (>=2.15.10,<3.0.0)
|
|
15
|
+
Requires-Dist: pytest
|
|
16
|
+
Requires-Dist: requests (==2.32.*)
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# basketball-reference-webscrapper
|
|
20
|
+
A Python package for Basketball Reference that gathers data by scraping the website.
|
|
21
|
+
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
team_abrev;city;team_name
|
|
2
|
+
ATL;Atlanta;Atlanta Hawks
|
|
3
|
+
BOS;Boston;Boston Celtics
|
|
4
|
+
BRK;Brooklyn;Brooklyn Nets
|
|
5
|
+
CHA;Charlotte;Charlotte Bobcats
|
|
6
|
+
CHH;Charlotte;Charlotte Hornets
|
|
7
|
+
CHI;Chicago;Chicago Bulls
|
|
8
|
+
CHO;Charlotte;Charlotte Hornets
|
|
9
|
+
CLE;Cleveland;Cleveland Cavaliers
|
|
10
|
+
DAL;Dallas;Dallas Mavericks
|
|
11
|
+
DEN;Denver;Denver Nuggets
|
|
12
|
+
DET;Detroit;Detroit Pistons
|
|
13
|
+
GSW;Golden State;Golden State Warriors
|
|
14
|
+
HOU;Houston;Houston Rockets
|
|
15
|
+
IND;Indiana;Indiana Pacers
|
|
16
|
+
LAC;Los Angeles;Los Angeles Clippers
|
|
17
|
+
LAL;Los Angeles;Los Angeles Lakers
|
|
18
|
+
MEM;Memphis;Memphis Grizzlies
|
|
19
|
+
MIA;Miami;Miami Heat
|
|
20
|
+
MIL;Milwaukee;Milwaukee Bucks
|
|
21
|
+
MIN;Minnesota;Minnesota Timberwolves
|
|
22
|
+
NJN;New Jersey;New Jersey Nets
|
|
23
|
+
NOH;New Orleans;New Orleans Hornets
|
|
24
|
+
NOK;New Orleans;New Orleans/Oklahoma City Hornets
|
|
25
|
+
NOP;New Orleans;New Orleans Pelicans
|
|
26
|
+
NYK;New York;New York Knicks
|
|
27
|
+
OKC;Oklahoma;Oklahoma City Thunder
|
|
28
|
+
ORL;Orlando;Orlando Magic
|
|
29
|
+
PHI;Philadelphia;Philadelphia 76ers
|
|
30
|
+
PHO;Phoenix;Phoenix Suns
|
|
31
|
+
POR;Portland;Portland Trail Blazers
|
|
32
|
+
SAC;Sacramento;Sacramento Kings
|
|
33
|
+
SAS;San Antonio;San Antonio Spurs
|
|
34
|
+
SEA;Seattle;Seattle SuperSonics
|
|
35
|
+
TOR;Toronto;Toronto Raptors
|
|
36
|
+
UTA;Utah;Utah Jazz
|
|
37
|
+
VAN;Vancouver;Vancouver Grizzlies
|
|
38
|
+
WAS;Washington D.C;Washington Wizards
|
|
File without changes
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Provides functions to create loggers."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Text, Union
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_console_handler() -> logging.StreamHandler:
    """Build a stream handler that writes formatted records to stdout.

    Returns:
        logging.StreamHandler bound to ``sys.stdout`` whose records are
        rendered as "time — logger name — level — message"
    """
    log_format = "%(asctime)s — %(name)s — %(levelname)s — %(message)s"

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter(log_format))
    return stdout_handler
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_logger(
    name: Text = __name__, log_level: Union[Text, int] = logging.DEBUG
) -> logging.Logger:
    """Return a named logger that writes only to the console.

    Args:
        name {Text}: name of the logger to fetch or create
        log_level {Text or int}: threshold level, given either as a level
            name or as its integer value
    Returns:
        logging.Logger carrying a single stdout handler, with propagation
        to ancestor loggers switched off
    """
    configured = logging.getLogger(name)
    configured.setLevel(log_level)

    # Asking twice for the same name (typical when a Jupyter Notebook cell is
    # re-run) must not stack handlers and print every message several times.
    # Emptying the list is a no-op when the logger owns no handler yet.
    configured.handlers.clear()
    configured.addHandler(get_console_handler())
    configured.propagate = False

    return configured
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from urllib.request import urlopen
|
|
3
|
+
from bs4 import BeautifulSoup
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import requests
|
|
6
|
+
import time
|
|
7
|
+
import importlib_resources
|
|
8
|
+
|
|
9
|
+
from basketball_reference_webscrapper.utils.logs import get_logger
|
|
10
|
+
|
|
11
|
+
logger = get_logger("WEB_SCRAPPING_EXECUTION", log_level="INFO")
|
|
12
|
+
|
|
13
|
+
@dataclass
class WebScrapBasketballReference:
    """Scrape per-game team statistics from Basketball Reference.

    For every team abbreviation listed in the packaged
    ``constants/team_city_refdata.csv`` file, the team's game log page for
    ``season`` is downloaded and its table rows are stacked into a single
    DataFrame.

    Attributes:
        url: value supplied by the caller. It is stored on the instance but
            is not read when the game log URLs are built.
        season: season identifier inserted in the game log URL and written
            to the ``id_season`` column of the result.
    """

    url: str
    season: int

    # The names below carry no annotation on purpose, so that the dataclass
    # machinery does not turn them into constructor fields.
    _BASE_URL = "https://www.basketball-reference.com"
    _REQUEST_TIMEOUT = 30  # seconds before an unanswered request is abandoned
    _PAUSE_BETWEEN_TEAMS = 5  # seconds slept after each team's request

    # Names given to the <td> cells of one game log row, in page order.
    _GAMELOG_COLUMNS = (
        "game_nb", "game_date", "extdom", "opp", "results",
        "pts_tm", "pts_opp",
        "fg_tm", "fga_tm", "fg_prct_tm",
        "3p_tm", "3pa_tm", "3p_prct_tm", "ft_tm", "fta_tm", "ft_prct_tm",
        "orb_tm", "trb_tm", "ast_tm", "stl_tm", "blk_tm", "tov_tm", "pf_tm",
        "nc",
        "fg_opp", "fga_opp", "fg_prct_opp",
        "3p_opp", "3pa_opp", "3p_prct_opp", "ft_opp", "fta_opp", "ft_prct_opp",
        "orb_opp", "trb_opp", "ast_opp", "stl_opp", "blk_opp", "tov_opp",
        "pf_opp",
    )

    # Columns of the returned DataFrame, in order ("nc" is left out).
    _OUTPUT_COLUMNS = (
        "id_season", "game_nb", "game_date", "extdom", "tm", "opp", "results",
        "pts_tm", "pts_opp",
        "fg_tm", "fga_tm", "fg_prct_tm", "3p_tm", "3pa_tm", "3p_prct_tm",
        "ft_tm", "fta_tm", "ft_prct_tm", "orb_tm", "trb_tm", "ast_tm",
        "stl_tm", "blk_tm", "tov_tm", "pf_tm",
        "fg_opp", "fga_opp", "fg_prct_opp",
        "3p_opp", "3pa_opp", "3p_prct_opp", "ft_opp", "fta_opp", "ft_prct_opp",
        "orb_opp", "trb_opp", "ast_opp", "stl_opp", "blk_opp", "tov_opp",
        "pf_opp",
    )

    def webscrappe_nba_games_data(self):
        """Webscrappe NBA games data for every referenced team.

        Teams whose game log page does not answer with HTTP 200 are skipped
        and reported through the logger.

        Returns:
            pandas.DataFrame with one row per scraped game and the columns
            of ``_OUTPUT_COLUMNS``. The frame is empty, but still carries
            those columns, when no page yielded any row.

        Raises:
            requests.RequestException: when a request fails or times out.
            ValueError: when a page's rows do not have the expected number
                of cells.
        """
        team_city_refdata = self._load_team_refdata()

        frames = []
        for team in team_city_refdata["team_abrev"]:
            logger.info("Execution for %s", team)

            url = f"{self._BASE_URL}/teams/{team}/{self.season}/gamelog/"
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT)

            if response.status_code == 200:
                # The body of this single response is parsed directly, so the
                # page is not downloaded a second time.
                rows_data = self._extract_rows(response.content)
                team_games = self._build_team_frame(rows_data, team)
                if not team_games.empty:
                    frames.append(team_games)
            else:
                logger.warning(
                    "Skipping %s: HTTP status %s for %s",
                    team, response.status_code, url,
                )

            # NOTE(review): presumably a politeness delay between requests to
            # the site — confirm it should also apply after skipped teams.
            time.sleep(self._PAUSE_BETWEEN_TEAMS)

        if not frames:
            return pd.DataFrame(columns=list(self._OUTPUT_COLUMNS))

        return pd.concat(frames, axis=0)

    @staticmethod
    def _load_team_refdata():
        """Read the packaged team reference CSV into a DataFrame."""
        # NOTE(review): importlib_resources is listed only under the
        # dev-dependencies of pyproject.toml — confirm it is available to
        # installed users.
        ref = (
            importlib_resources.files("basketball_reference_webscrapper")
            / "constants/team_city_refdata.csv"
        )
        # as_file may materialise a temporary copy; it is cleaned up as soon
        # as the with-block exits, so the CSV is fully read inside it.
        with importlib_resources.as_file(ref) as path:
            return pd.read_csv(path, sep=";")

    @staticmethod
    def _extract_rows(html):
        """Return the text of the <td> cells of each table row in ``html``."""
        soup = BeautifulSoup(html, "html.parser")
        # The first two <tr> elements of the page are skipped (presumably the
        # table's header rows — TODO confirm). Rows without any <td> come out
        # as empty lists.
        return [
            [cell.get_text() for cell in table_row.find_all("td")]
            for table_row in soup.find_all("tr")[2:]
        ]

    def _build_team_frame(self, rows_data, team):
        """Turn the cell texts of one team's page into a labelled DataFrame.

        Args:
            rows_data: list of rows, each a list of cell texts.
            team: team abbreviation written to the ``tm`` column.

        Returns:
            pandas.DataFrame with the columns of ``_OUTPUT_COLUMNS``; empty
            when ``rows_data`` holds no row with cells.

        Raises:
            ValueError: when the widest row does not have exactly as many
                cells as there are names in ``_GAMELOG_COLUMNS``.
        """
        filled_rows = [cells for cells in rows_data if cells]
        if not filled_rows:
            return pd.DataFrame(columns=list(self._OUTPUT_COLUMNS))

        frame = pd.DataFrame(filled_rows)
        if frame.shape[1] != len(self._GAMELOG_COLUMNS):
            raise ValueError(
                f"Expected {len(self._GAMELOG_COLUMNS)} cells per game row "
                f"for {team}, found {frame.shape[1]}"
            )
        frame.columns = list(self._GAMELOG_COLUMNS)

        # Rows shorter than the widest one were padded with missing values
        # when the frame was built; dropna discards them.
        frame = frame.dropna().assign(id_season=self.season, tm=team)
        return frame[list(self._OUTPUT_COLUMNS)]
|
|
107
|
+
|
|
108
|
+
if __name__ == "__main__":
    # Manual run: scrape the 2022 season when the module is executed as a
    # script. The result stays in ``nba_games`` and is not written anywhere.
    webscrapping_class = WebScrapBasketballReference(url="test", season=2022)
    nba_games = webscrapping_class.webscrappe_nba_games_data()
|
|
115
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "basketball-reference-webscrapper"
|
|
3
|
+
version = "0.1.3"
|
|
4
|
+
description = "Python package for Basketball Reference that gathers data by scraping the website"
|
|
5
|
+
authors = ["Yannick Flores <yannick.flores1992@gmail.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
exclude = ["tests", "docs"]
|
|
8
|
+
|
|
9
|
+
[tool.poetry.dependencies]
|
|
10
|
+
python = "^3.11"
|
|
11
|
+
pandas = "2.*"
|
|
12
|
+
numpy = "1.26.*"
|
|
13
|
+
pytest = "*"
|
|
14
|
+
pylint = "^2.15.10"
|
|
15
|
+
beautifulsoup4 = "4.12.*"
|
|
16
|
+
requests = "2.32.*"
|
|
17
|
+
|
|
18
|
+
[tool.poetry.dev-dependencies]
|
|
19
|
+
coverage = {extras = ["toml"], version = "6.5.*"}
|
|
20
|
+
black = {version = "22.10.*", allow-prereleases = true}
|
|
21
|
+
pytest = "7.2.*"
|
|
22
|
+
pytest-html = "3.1.*"
|
|
23
|
+
pytest-cov = "2.12.*"
|
|
24
|
+
py = "1.11.*"
|
|
25
|
+
pylint = "2.15.*"
|
|
26
|
+
python-decouple = "3.6"
|
|
27
|
+
mkdocs = "1.4.*"
|
|
28
|
+
importlib-resources="6.0.1"
|
|
29
|
+
|
|
30
|
+
[tool.pytest.ini_options]
|
|
31
|
+
pythonpath = "."
|
|
32
|
+
testpaths = "tests"
|
|
33
|
+
addopts = [
|
|
34
|
+
"--import-mode=importlib",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["poetry-core>=1.0.0"]
|
|
39
|
+
build-backend = "poetry.core.masonry.api"
|