datafc 0.1.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datafc-0.1.0 → datafc-1.0.0}/PKG-INFO +3 -3
- {datafc-0.1.0 → datafc-1.0.0}/README.md +2 -2
- datafc-1.0.0/datafc/__init__.py +3 -0
- datafc-1.0.0/datafc/sofascore/__init__.py +23 -0
- datafc-1.0.0/datafc/sofascore/fetch_coordinates_data.py +116 -0
- datafc-1.0.0/datafc/sofascore/fetch_goal_networks_data.py +151 -0
- datafc-1.0.0/datafc/sofascore/fetch_lineups_data.py +146 -0
- datafc-1.0.0/datafc/sofascore/fetch_match_data.py +102 -0
- datafc-1.0.0/datafc/sofascore/fetch_match_odds_data.py +111 -0
- datafc-1.0.0/datafc/sofascore/fetch_match_stats_data.py +109 -0
- datafc-1.0.0/datafc/sofascore/fetch_momentum_data.py +104 -0
- datafc-1.0.0/datafc/sofascore/fetch_shots_data.py +140 -0
- datafc-1.0.0/datafc/sofascore/fetch_standings_data.py +108 -0
- datafc-1.0.0/datafc/sofascore/fetch_substitutions_data.py +108 -0
- datafc-1.0.0/datafc/utils/__init__.py +0 -0
- datafc-1.0.0/datafc/utils/_config.py +6 -0
- datafc-1.0.0/datafc/utils/_save_files.py +50 -0
- datafc-1.0.0/datafc/utils/_setup_webdriver.py +49 -0
- {datafc-0.1.0 → datafc-1.0.0}/datafc.egg-info/PKG-INFO +3 -3
- datafc-1.0.0/datafc.egg-info/SOURCES.txt +24 -0
- datafc-1.0.0/datafc.egg-info/top_level.txt +1 -0
- {datafc-0.1.0 → datafc-1.0.0}/setup.py +4 -4
- datafc-0.1.0/datafc.egg-info/SOURCES.txt +0 -8
- datafc-0.1.0/datafc.egg-info/top_level.txt +0 -1
- {datafc-0.1.0 → datafc-1.0.0}/LICENSE +0 -0
- {datafc-0.1.0 → datafc-1.0.0}/datafc.egg-info/dependency_links.txt +0 -0
- {datafc-0.1.0 → datafc-1.0.0}/datafc.egg-info/requires.txt +0 -0
- {datafc-0.1.0 → datafc-1.0.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datafc
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: A scalable Python library for fetching, processing, and exporting structured football match data.
|
|
5
5
|
Home-page: https://github.com/urazakgul/datafc
|
|
6
6
|
Author: Uraz Akgül
|
|
@@ -14,7 +14,7 @@ Requires-Python: >=3.8
|
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
License-File: LICENSE
|
|
16
16
|
|
|
17
|
-
# datafc
|
|
17
|
+
# datafc v1.0.0
|
|
18
18
|
|
|
19
19
|
## Overview
|
|
20
20
|
|
|
@@ -53,7 +53,7 @@ pip install git+https://github.com/urazakgul/datafc.git
|
|
|
53
53
|
To install a specific version of `datafc`, use:
|
|
54
54
|
|
|
55
55
|
```bash
|
|
56
|
-
pip install datafc==
|
|
56
|
+
pip install datafc==1.0.0
|
|
57
57
|
```
|
|
58
58
|
|
|
59
59
|
## Why Selenium?
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# datafc
|
|
1
|
+
# datafc v1.0.0
|
|
2
2
|
|
|
3
3
|
## Overview
|
|
4
4
|
|
|
@@ -37,7 +37,7 @@ pip install git+https://github.com/urazakgul/datafc.git
|
|
|
37
37
|
To install a specific version of `datafc`, use:
|
|
38
38
|
|
|
39
39
|
```bash
|
|
40
|
-
pip install datafc==
|
|
40
|
+
pip install datafc==1.0.0
|
|
41
41
|
```
|
|
42
42
|
|
|
43
43
|
## Why Selenium?
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from .fetch_match_data import match_data
|
|
2
|
+
from .fetch_match_stats_data import match_stats_data
|
|
3
|
+
from .fetch_standings_data import standings_data
|
|
4
|
+
from .fetch_shots_data import shots_data
|
|
5
|
+
from .fetch_goal_networks_data import goal_networks_data
|
|
6
|
+
from .fetch_lineups_data import lineups_data
|
|
7
|
+
from .fetch_coordinates_data import coordinates_data
|
|
8
|
+
from .fetch_substitutions_data import substitutions_data
|
|
9
|
+
from .fetch_match_odds_data import match_odds_data
|
|
10
|
+
from .fetch_momentum_data import momentum_data
|
|
11
|
+
|
|
12
|
+
# Names exported as this package's public API; this list is what
# `from <package> import *` binds. Each entry corresponds to one of the
# fetch functions imported above.
__all__ = [
    "match_data",
    "match_stats_data",
    "standings_data",
    "shots_data",
    "goal_networks_data",
    "lineups_data",
    "coordinates_data",
    "substitutions_data",
    "match_odds_data",
    "momentum_data"
]
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
5
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
6
|
+
from selenium.common.exceptions import TimeoutException, WebDriverException
|
|
7
|
+
from datafc.utils._setup_webdriver import setup_webdriver
|
|
8
|
+
from datafc.utils._save_files import save_json, save_excel
|
|
9
|
+
from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
|
|
10
|
+
|
|
11
|
+
def coordinates_data(
    lineups_df: pd.DataFrame,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches coordinate data for each player in the provided lineup dataset.

    Args:
        lineups_df (pd.DataFrame): A DataFrame containing player and match metadata,
            which should be generated by the `lineups_data` function.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, exports the fetched coordinate data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, exports the fetched coordinate data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: One row per heatmap point with the columns `country`, `tournament`,
            `season`, `week`, `game_id`, `team`, `player_id`, `player_name`, `x` and `y`.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`, or `lineups_df` is `None` or empty.
        RuntimeError: For any failure after validation (WebDriver errors, timeouts, undecodable
            responses, or no data found). The original exception is attached as `__cause__`.
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    if lineups_df is None or lineups_df.empty:
        raise ValueError("Lineups dataframe must be provided and cannot be empty.")

    # Bound before the try block so the finally clause can test it even when
    # setup_webdriver() itself raises (otherwise an UnboundLocalError would
    # replace the real error).
    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        heatmap_data = []

        # One request is made per distinct (match, player) combination.
        unique_players = lineups_df[[
            "country", "tournament", "season", "week", "game_id", "team", "player_id", "player_name"
        ]].drop_duplicates()

        if unique_players.empty:
            raise ValueError("No unique players found in lineup data.")

        def process_heatmap_points(data, row):
            # Turn the raw list of points into flat records tagged with the
            # player's metadata; entries without both coordinates are skipped.
            if not data:
                return []
            return [
                {
                    "country": row["country"],
                    "tournament": row["tournament"],
                    "season": row["season"],
                    "week": row["week"],
                    "game_id": row["game_id"],
                    "team": row["team"],
                    "player_id": row["player_id"],
                    "player_name": row["player_name"],
                    "x": point.get("x"),
                    "y": point.get("y")
                }
                for point in data if isinstance(point, dict) and "x" in point and "y" in point
            ]

        for _, row in unique_players.iterrows():
            try:
                url = f"{API_BASE_URLS[data_source]}/api/v1/event/{row['game_id']}/player/{row['player_id']}/heatmap"
                webdriver_instance.get(url)

                # The JSON payload is read from the <pre> element of the rendered page.
                pre_tag = WebDriverWait(webdriver_instance, element_load_timeout).until(
                    EC.visibility_of_element_located((By.TAG_NAME, "pre"))
                )

                heatmap_json = json.loads(pre_tag.text).get("heatmap", [])
                heatmap_data.extend(process_heatmap_points(heatmap_json, row))

            except (TimeoutException, json.JSONDecodeError, WebDriverException) as e:
                raise RuntimeError(
                    f"Error fetching heatmap data for player {row['player_name']} (ID: {row['player_id']}): {str(e)}"
                ) from e

        heatmap_df = pd.DataFrame(heatmap_data)

        if heatmap_df.empty:
            raise ValueError("No heatmap data extracted.")

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first record of the result.
            first_row = heatmap_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=heatmap_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=heatmap_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return heatmap_df

    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching heatmap data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance is not None:
            webdriver_instance.quit()
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
5
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
6
|
+
from selenium.common.exceptions import TimeoutException, WebDriverException
|
|
7
|
+
from datafc.utils._setup_webdriver import setup_webdriver
|
|
8
|
+
from datafc.utils._save_files import save_json, save_excel
|
|
9
|
+
from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
|
|
10
|
+
|
|
11
|
+
def goal_networks_data(
    match_df: pd.DataFrame,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches goal network data for each match in the provided match dataset.

    Args:
        match_df (pd.DataFrame): A DataFrame containing match metadata,
            which should be generated by the `match_data` function.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, exports the fetched goal network data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, exports the fetched goal network data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: One row per passing-network action with the columns `country`, `tournament`,
            `season`, `week`, `game_id`, `player_name`, `player_id`, `event_type`, `player_x`,
            `player_y`, `pass_end_x`, `pass_end_y`, `is_assist`, `id`, `goalkeeper_x`, `goalkeeper_y`,
            `goal_shot_x`, `goal_shot_y`, `goal_mouth_x`, `goal_mouth_y`, `goalkeeper_name`
            and `goalkeeper_id`.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`, or `match_df` is `None` or empty.
        RuntimeError: For any failure after validation (WebDriver errors, timeouts, undecodable
            responses, or no data found). The original exception is attached as `__cause__`.
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    if match_df is None or match_df.empty:
        raise ValueError("Match dataframe must be provided and cannot be empty.")

    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        goal_networks_list = []

        for _, row in match_df.iterrows():
            country, tournament, season, week, game_id = row[
                ["country", "tournament", "season", "week", "game_id"]
            ]

            api_request_url = f"{API_BASE_URLS[data_source]}/api/v1/event/{game_id}/incidents"
            webdriver_instance.get(api_request_url)

            try:
                # The JSON payload is read from the <pre> element of the rendered page.
                response_element = WebDriverWait(webdriver_instance, element_load_timeout).until(
                    EC.visibility_of_element_located((By.TAG_NAME, "pre"))
                )
                json_data = json.loads(response_element.text).get("incidents", [])

                if isinstance(json_data, list):
                    goal_networks_df = pd.DataFrame(json_data)
                    goal_networks_df["country"] = country
                    goal_networks_df["tournament"] = tournament
                    goal_networks_df["season"] = season
                    goal_networks_df["week"] = week
                    goal_networks_df["game_id"] = game_id
                    goal_networks_list.append(goal_networks_df)

            except TimeoutException as e:
                raise RuntimeError(f"Timeout while fetching goal network data for game {game_id}.") from e
            except json.JSONDecodeError as e:
                raise RuntimeError(f"Failed to decode goal network data for game {game_id}.") from e

        # pd.concat raises an opaque "No objects to concatenate" on an empty
        # list, so report the missing data explicitly instead.
        if not goal_networks_list:
            raise ValueError("No goal network data found for the specified parameters.")

        goal_networks_df = pd.concat(goal_networks_list, ignore_index=True)

        if goal_networks_df.empty:
            raise ValueError("No goal network data found for the specified parameters.")

        # When no incident carries a passing network the column is absent
        # altogether; selecting it would fail with a bare KeyError.
        if "footballPassingNetworkAction" not in goal_networks_df.columns:
            raise ValueError("No passing network data found.")

        actions_list = []
        for _, row in goal_networks_df[["id", "footballPassingNetworkAction", "country", "tournament", "season", "week", "game_id"]].iterrows():
            if isinstance(row["footballPassingNetworkAction"], list):
                for event in row["footballPassingNetworkAction"]:
                    # Build a new record rather than mutating the parsed payload in place.
                    actions_list.append({
                        **event,
                        "id": row["id"],
                        "country": row["country"],
                        "tournament": row["tournament"],
                        "season": row["season"],
                        "week": row["week"],
                        "game_id": row["game_id"]
                    })

        actions_list_df = pd.DataFrame(actions_list)

        if actions_list_df.empty:
            raise ValueError("No passing network data found.")

        def extract_field(value, key):
            # Nested objects may be missing for an action; yield None in that case.
            return value.get(key) if isinstance(value, dict) else None

        def column_field(column, key):
            # Pull `key` out of every dict stored in `column`.
            return actions_list_df[column].apply(lambda value: extract_field(value, key))

        actions_list_df = actions_list_df.assign(
            player_name=column_field("player", "name"),
            player_id=column_field("player", "id"),
            event_type=actions_list_df["eventType"],
            player_x=column_field("playerCoordinates", "x"),
            player_y=column_field("playerCoordinates", "y"),
            pass_end_x=column_field("passEndCoordinates", "x"),
            pass_end_y=column_field("passEndCoordinates", "y"),
            id=actions_list_df["id"],
            is_assist=actions_list_df["isAssist"],
            goalkeeper_x=column_field("gkCoordinates", "x"),
            goalkeeper_y=column_field("gkCoordinates", "y"),
            goal_shot_x=column_field("goalShotCoordinates", "x"),
            goal_shot_y=column_field("goalShotCoordinates", "y"),
            goal_mouth_x=column_field("goalMouthCoordinates", "x"),
            goal_mouth_y=column_field("goalMouthCoordinates", "y"),
            goalkeeper_name=column_field("goalkeeper", "name"),
            goalkeeper_id=column_field("goalkeeper", "id")
        )[
            [
                "country", "tournament", "season", "week", "game_id", "player_name", "player_id", "event_type",
                "player_x", "player_y", "pass_end_x", "pass_end_y", "is_assist", "id", "goalkeeper_x", "goalkeeper_y",
                "goal_shot_x", "goal_shot_y", "goal_mouth_x", "goal_mouth_y", "goalkeeper_name", "goalkeeper_id"
            ]
        ]

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first record of the result.
            first_row = actions_list_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=actions_list_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=actions_list_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return actions_list_df

    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching goal network data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance is not None:
            webdriver_instance.quit()
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
5
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
6
|
+
from selenium.common.exceptions import TimeoutException, WebDriverException
|
|
7
|
+
from datafc.utils._setup_webdriver import setup_webdriver
|
|
8
|
+
from datafc.utils._save_files import save_json, save_excel
|
|
9
|
+
from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
|
|
10
|
+
|
|
11
|
+
def lineups_data(
    match_df: pd.DataFrame,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches lineup data for each match in the provided match dataset.

    Args:
        match_df (pd.DataFrame): A DataFrame containing match metadata,
            which should be generated by the `match_data` function.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, saves the fetched data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, saves the fetched data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: One row per player statistic with the columns `country`, `tournament`,
            `season`, `week`, `game_id`, `team` ("home" or "away"), `player_name`, `player_id`,
            `stat_name` and `stat_value`.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`, or `match_df` is `None` or empty.
        RuntimeError: For any failure after validation (WebDriver errors, timeouts, undecodable
            responses, or no data found). The original exception is attached as `__cause__`.
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    if match_df is None or match_df.empty:
        raise ValueError("Match dataframe must be provided and cannot be empty.")

    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        # Per-match frames are collected and concatenated once after the loop;
        # concatenating inside the loop re-copies all earlier rows every time.
        lineup_frames = []

        def process_player_statistics(lineups_frame, team_key):
            # Flatten each player's "statistics" mapping into one record per statistic.
            extracted_data = []
            for _, row in lineups_frame.iterrows():
                for player in row[team_key]:
                    for stat_name, stat_value in player.get("statistics", {}).items():
                        extracted_data.append({
                            "country": row["country"],
                            "tournament": row["tournament"],
                            "season": row["season"],
                            "week": row["week"],
                            "game_id": row["game_id"],
                            "team": team_key,
                            "player_name": player["player"]["name"],
                            "player_id": player["player"]["id"],
                            "stat_name": stat_name,
                            "stat_value": stat_value
                        })
            return extracted_data

        for _, row in match_df.iterrows():
            country = row["country"]
            tournament = row["tournament"]
            game_id = row["game_id"]
            season = row["season"]
            week = row["week"]

            url = f"{API_BASE_URLS[data_source]}/api/v1/event/{game_id}/lineups"
            webdriver_instance.get(url)

            try:
                # The JSON payload is read from the <pre> element of the rendered page.
                pre_tag = WebDriverWait(webdriver_instance, element_load_timeout).until(
                    EC.visibility_of_element_located((By.TAG_NAME, "pre"))
                )

                lineups_json = json.loads(pre_tag.text)
                game_lineups = pd.DataFrame.from_dict(lineups_json)
                game_lineups["country"] = country
                game_lineups["tournament"] = tournament
                game_lineups["season"] = season
                game_lineups["week"] = week
                game_lineups["game_id"] = game_id

                # Keep only the row that holds the player lists.
                game_lineups = game_lineups[game_lineups.index == "players"]
                lineup_frames.append(game_lineups)

            except TimeoutException as e:
                raise RuntimeError(f"Timeout while fetching lineup data for game_id {game_id}.") from e
            except json.JSONDecodeError as e:
                raise RuntimeError(f"Failed to decode lineup data for game_id {game_id}.") from e
            except WebDriverException as e:
                raise RuntimeError(f"Selenium WebDriver error while fetching lineup data for game_id {game_id}: {str(e)}") from e
            except Exception as e:
                raise RuntimeError(f"Unexpected error while fetching lineup data for game_id {game_id}: {e.__class__.__name__} - {e}") from e

        lineups_data_df = pd.concat(lineup_frames, ignore_index=True) if lineup_frames else pd.DataFrame()

        if lineups_data_df.empty:
            raise ValueError("No lineup data found for the specified parameters.")

        extracted_lineups_data = []
        for team_key in ["home", "away"]:
            extracted_lineups_data.extend(process_player_statistics(lineups_data_df, team_key))

        extracted_lineups_data_df = pd.DataFrame(extracted_lineups_data)

        # Check for emptiness before touching columns: an empty frame has no
        # "stat_name"/"stat_value" columns to transform.
        if extracted_lineups_data_df.empty:
            raise ValueError("No extracted lineup data available.")

        def extract_original_rating(rating_versions):
            # "ratingVersions" is a mapping; only its "original" entry is kept.
            if isinstance(rating_versions, dict) and "original" in rating_versions:
                return rating_versions["original"]
            return rating_versions

        extracted_lineups_data_df["stat_value"] = extracted_lineups_data_df.apply(
            lambda row: extract_original_rating(row["stat_value"]) if row["stat_name"] == "ratingVersions" else row["stat_value"],
            axis=1
        )

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first record of the result.
            first_row = extracted_lineups_data_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=extracted_lineups_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=extracted_lineups_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return extracted_lineups_data_df

    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching lineup data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance is not None:
            webdriver_instance.quit()
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
5
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
6
|
+
from selenium.common.exceptions import TimeoutException, WebDriverException
|
|
7
|
+
from datafc.utils._setup_webdriver import setup_webdriver
|
|
8
|
+
from datafc.utils._save_files import save_json, save_excel
|
|
9
|
+
from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
|
|
10
|
+
|
|
11
|
+
def match_data(
    tournament_id: int,
    season_id: int,
    week_number: int,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches match data for a specified tournament, season, and matchweek.

    Args:
        tournament_id (int): The unique identifier for the tournament.
        season_id (int): The unique identifier for the season.
        week_number (int): The matchweek number within the season.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, exports the fetched data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, exports the fetched data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: One row per match with the columns `country`, `tournament`, `season`,
            `week`, `game_id`, `home_team`, `away_team`, `start_timestamp` and `status`.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`.
        RuntimeError: For any failure after validation (WebDriver errors, timeouts, an empty or
            undecodable response, an unexpected response format, or no matches found). The
            original exception is attached as `__cause__`.
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    api_request_url = f"{API_BASE_URLS[data_source]}/api/v1/unique-tournament/{tournament_id}/season/{season_id}/events/round/{week_number}"

    # Bound before the try block so the finally clause can test it even when
    # setup_webdriver() itself raises (otherwise an UnboundLocalError would
    # replace the real error).
    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        webdriver_instance.get(api_request_url)

        # The JSON payload is read from the <pre> element of the rendered page.
        response_element = WebDriverWait(webdriver_instance, element_load_timeout).until(
            EC.visibility_of_element_located((By.TAG_NAME, "pre"))
        )
        response_text = response_element.text.strip()
        if not response_text:
            raise RuntimeError("API response is empty.")

        api_response_data = json.loads(response_text)
        if "events" not in api_response_data or not isinstance(api_response_data["events"], list):
            raise ValueError("Invalid API response format: 'events' key is missing or not a list.")

        events_df = pd.DataFrame(api_response_data.get("events", []))
        if events_df.empty:
            raise ValueError("No match data found for the specified parameters.")

        def nested_field(value, *keys):
            # Walk `keys` through nested dicts, yielding "" as soon as a level
            # is missing or not a dict (e.g. NaN where an event lacks the
            # object), instead of failing on `.get`.
            for key in keys:
                if not isinstance(value, dict):
                    return ""
                value = value.get(key, "")
            return value

        match_data_df = pd.DataFrame({
            "country": events_df["tournament"].apply(lambda x: nested_field(x, "category", "name")),
            "tournament": events_df["tournament"].apply(lambda x: nested_field(x, "name")),
            "season": events_df["season"].apply(lambda x: nested_field(x, "year")),
            "week": events_df["roundInfo"].apply(lambda x: nested_field(x, "round")),
            "game_id": events_df["id"],
            "home_team": events_df["homeTeam"].apply(lambda x: nested_field(x, "name")),
            "away_team": events_df["awayTeam"].apply(lambda x: nested_field(x, "name")),
            "start_timestamp": events_df["startTimestamp"],
            "status": events_df["status"].apply(lambda x: nested_field(x, "description"))
        })

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first match of the result.
            first_row = match_data_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=match_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=match_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return match_data_df

    # TimeoutException subclasses WebDriverException, so it must be handled first.
    except TimeoutException as e:
        raise RuntimeError("Timeout occurred while waiting for the page or API response.") from e
    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except json.JSONDecodeError as e:
        raise RuntimeError("Failed to decode API response as JSON.") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching match data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance is not None:
            webdriver_instance.quit()
|