datafc 0.1.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {datafc-0.1.0 → datafc-1.0.0}/PKG-INFO +3 -3
  2. {datafc-0.1.0 → datafc-1.0.0}/README.md +2 -2
  3. datafc-1.0.0/datafc/__init__.py +3 -0
  4. datafc-1.0.0/datafc/sofascore/__init__.py +23 -0
  5. datafc-1.0.0/datafc/sofascore/fetch_coordinates_data.py +116 -0
  6. datafc-1.0.0/datafc/sofascore/fetch_goal_networks_data.py +151 -0
  7. datafc-1.0.0/datafc/sofascore/fetch_lineups_data.py +146 -0
  8. datafc-1.0.0/datafc/sofascore/fetch_match_data.py +102 -0
  9. datafc-1.0.0/datafc/sofascore/fetch_match_odds_data.py +111 -0
  10. datafc-1.0.0/datafc/sofascore/fetch_match_stats_data.py +109 -0
  11. datafc-1.0.0/datafc/sofascore/fetch_momentum_data.py +104 -0
  12. datafc-1.0.0/datafc/sofascore/fetch_shots_data.py +140 -0
  13. datafc-1.0.0/datafc/sofascore/fetch_standings_data.py +108 -0
  14. datafc-1.0.0/datafc/sofascore/fetch_substitutions_data.py +108 -0
  15. datafc-1.0.0/datafc/utils/__init__.py +0 -0
  16. datafc-1.0.0/datafc/utils/_config.py +6 -0
  17. datafc-1.0.0/datafc/utils/_save_files.py +50 -0
  18. datafc-1.0.0/datafc/utils/_setup_webdriver.py +49 -0
  19. {datafc-0.1.0 → datafc-1.0.0}/datafc.egg-info/PKG-INFO +3 -3
  20. datafc-1.0.0/datafc.egg-info/SOURCES.txt +24 -0
  21. datafc-1.0.0/datafc.egg-info/top_level.txt +1 -0
  22. {datafc-0.1.0 → datafc-1.0.0}/setup.py +4 -4
  23. datafc-0.1.0/datafc.egg-info/SOURCES.txt +0 -8
  24. datafc-0.1.0/datafc.egg-info/top_level.txt +0 -1
  25. {datafc-0.1.0 → datafc-1.0.0}/LICENSE +0 -0
  26. {datafc-0.1.0 → datafc-1.0.0}/datafc.egg-info/dependency_links.txt +0 -0
  27. {datafc-0.1.0 → datafc-1.0.0}/datafc.egg-info/requires.txt +0 -0
  28. {datafc-0.1.0 → datafc-1.0.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datafc
3
- Version: 0.1.0
3
+ Version: 1.0.0
4
4
  Summary: A scalable Python library for fetching, processing, and exporting structured football match data.
5
5
  Home-page: https://github.com/urazakgul/datafc
6
6
  Author: Uraz Akgül
@@ -14,7 +14,7 @@ Requires-Python: >=3.8
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE
16
16
 
17
- # datafc v0.1.0
17
+ # datafc v1.0.0
18
18
 
19
19
  ## Overview
20
20
 
@@ -53,7 +53,7 @@ pip install git+https://github.com/urazakgul/datafc.git
53
53
  To install a specific version of `datafc`, use:
54
54
 
55
55
  ```bash
56
- pip install datafc==0.1.0
56
+ pip install datafc==1.0.0
57
57
  ```
58
58
 
59
59
  ## Why Selenium?
@@ -1,4 +1,4 @@
1
- # datafc v0.1.0
1
+ # datafc v1.0.0
2
2
 
3
3
  ## Overview
4
4
 
@@ -37,7 +37,7 @@ pip install git+https://github.com/urazakgul/datafc.git
37
37
  To install a specific version of `datafc`, use:
38
38
 
39
39
  ```bash
40
- pip install datafc==0.1.0
40
+ pip install datafc==1.0.0
41
41
  ```
42
42
 
43
43
  ## Why Selenium?
@@ -0,0 +1,3 @@
1
+ from .sofascore import *
2
+
3
+ __all__ = ["sofascore"]
@@ -0,0 +1,23 @@
1
+ from .fetch_match_data import match_data
2
+ from .fetch_match_stats_data import match_stats_data
3
+ from .fetch_standings_data import standings_data
4
+ from .fetch_shots_data import shots_data
5
+ from .fetch_goal_networks_data import goal_networks_data
6
+ from .fetch_lineups_data import lineups_data
7
+ from .fetch_coordinates_data import coordinates_data
8
+ from .fetch_substitutions_data import substitutions_data
9
+ from .fetch_match_odds_data import match_odds_data
10
+ from .fetch_momentum_data import momentum_data
11
+
12
+ __all__ = [
13
+ "match_data",
14
+ "match_stats_data",
15
+ "standings_data",
16
+ "shots_data",
17
+ "goal_networks_data",
18
+ "lineups_data",
19
+ "coordinates_data",
20
+ "substitutions_data",
21
+ "match_odds_data",
22
+ "momentum_data"
23
+ ]
@@ -0,0 +1,116 @@
1
+ import json
2
+ import pandas as pd
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.support.ui import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.common.exceptions import TimeoutException, WebDriverException
7
+ from datafc.utils._setup_webdriver import setup_webdriver
8
+ from datafc.utils._save_files import save_json, save_excel
9
+ from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
10
+
11
def coordinates_data(
    lineups_df: pd.DataFrame,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches heatmap coordinate data for each player in the provided lineup dataset.

    One request is issued per unique (match, player) combination found in
    `lineups_df`; every returned heatmap point becomes one output row.

    Args:
        lineups_df (pd.DataFrame): A DataFrame containing player and match metadata,
            which should be generated by the `lineups_data` function. It must provide the
            columns country, tournament, season, week, game_id, team, player_id and player_name.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, exports the fetched coordinate data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, exports the fetched coordinate data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: One row per heatmap point, carrying the player/match metadata
        columns plus the point's `x` and `y` values.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`, or `lineups_df` is
            `None` or empty.
        RuntimeError: For any failure after validation (driver start-up, timeouts,
            undecodable responses, no data extracted, export errors).
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    if lineups_df is None or lineups_df.empty:
        raise ValueError("Match dataframe must be provided and cannot be empty.")

    # Bind the name before entering the try block: if setup_webdriver() itself
    # raises, the finally clause must still be able to test it without hitting
    # an unbound local (which would mask the real error).
    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        heatmap_data = []

        unique_players = lineups_df[[
            "country", "tournament", "season", "week", "game_id", "team", "player_id", "player_name"
        ]].drop_duplicates()

        if unique_players.empty:
            raise ValueError("No unique players found in lineup data.")

        def process_heatmap_points(data, row):
            """Turn the raw heatmap point list into flat records tagged with the player's metadata."""
            if not data:
                return []
            return [
                {
                    "country": row["country"],
                    "tournament": row["tournament"],
                    "season": row["season"],
                    "week": row["week"],
                    "game_id": row["game_id"],
                    "team": row["team"],
                    "player_id": row["player_id"],
                    "player_name": row["player_name"],
                    "x": point.get("x"),
                    "y": point.get("y")
                }
                # Entries that are not dicts or lack either coordinate are skipped.
                for point in data if isinstance(point, dict) and "x" in point and "y" in point
            ]

        for _, row in unique_players.iterrows():
            try:
                url = f"{API_BASE_URLS[data_source]}/api/v1/event/{row['game_id']}/player/{row['player_id']}/heatmap"
                webdriver_instance.get(url)

                # The JSON payload is read from the text of the page's <pre> element.
                pre_tag = WebDriverWait(webdriver_instance, element_load_timeout).until(
                    EC.visibility_of_element_located((By.TAG_NAME, "pre"))
                )

                heatmap_json = json.loads(pre_tag.text).get("heatmap", [])
                heatmap_data.extend(process_heatmap_points(heatmap_json, row))

            except (TimeoutException, json.JSONDecodeError, WebDriverException) as e:
                raise RuntimeError(
                    f"Error fetching heatmap data for player {row['player_name']} (ID: {row['player_id']}): {str(e)}"
                ) from e

        heatmap_df = pd.DataFrame(heatmap_data)

        if heatmap_df.empty:
            raise ValueError("No heatmap data extracted.")

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first row of the result.
            first_row = heatmap_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=heatmap_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=heatmap_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return heatmap_df

    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching heatmap data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance:
            webdriver_instance.quit()
@@ -0,0 +1,151 @@
1
+ import json
2
+ import pandas as pd
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.support.ui import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.common.exceptions import TimeoutException, WebDriverException
7
+ from datafc.utils._setup_webdriver import setup_webdriver
8
+ from datafc.utils._save_files import save_json, save_excel
9
+ from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
10
+
11
def goal_networks_data(
    match_df: pd.DataFrame,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches goal network data for each match in the provided match dataset.

    For every match the incidents endpoint is queried; each incident's
    `footballPassingNetworkAction` list is then flattened into one row per action.

    Args:
        match_df (pd.DataFrame): A DataFrame containing match metadata,
            which should be generated by the `match_data` function. It must provide the
            columns country, tournament, season, week and game_id.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, exports the fetched goal network data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, exports the fetched goal network data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: One row per passing-network action with match metadata, player and
        goalkeeper identity, event type, assist flag, incident id and the flattened
        x/y coordinate columns.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`, or `match_df` is
            `None` or empty.
        RuntimeError: For any failure after validation (driver errors, timeouts,
            undecodable responses, no incidents or no passing network data found).
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    if match_df is None or match_df.empty:
        raise ValueError("Match dataframe must be provided and cannot be empty.")

    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        goal_networks_list = []

        for _, row in match_df.iterrows():
            country, tournament, season, week, game_id = row[
                ["country", "tournament", "season", "week", "game_id"]
            ]

            api_request_url = f"{API_BASE_URLS[data_source]}/api/v1/event/{game_id}/incidents"
            webdriver_instance.get(api_request_url)

            try:
                # The JSON payload is read from the text of the page's <pre> element.
                response_element = WebDriverWait(webdriver_instance, element_load_timeout).until(
                    EC.visibility_of_element_located((By.TAG_NAME, "pre"))
                )
                json_data = json.loads(response_element.text).get("incidents", [])

                if isinstance(json_data, list):
                    goal_networks_df = pd.DataFrame(json_data)
                    goal_networks_df["country"] = country
                    goal_networks_df["tournament"] = tournament
                    goal_networks_df["season"] = season
                    goal_networks_df["week"] = week
                    goal_networks_df["game_id"] = game_id
                    goal_networks_list.append(goal_networks_df)

            except TimeoutException as e:
                raise RuntimeError(f"Timeout while fetching goal network data for game {game_id}.") from e
            except json.JSONDecodeError as e:
                raise RuntimeError(f"Failed to decode goal network data for game {game_id}.") from e

        # pd.concat raises an opaque "No objects to concatenate" on an empty list;
        # report the condition with the function's own message instead.
        if not goal_networks_list:
            raise ValueError("No goal network data found for the specified parameters.")

        goal_networks_df = pd.concat(goal_networks_list, ignore_index=True)

        if goal_networks_df.empty:
            raise ValueError("No goal network data found for the specified parameters.")

        # If no incident carried a passing network, the column does not exist at all;
        # surface that as "no data" rather than a bare KeyError.
        if "footballPassingNetworkAction" not in goal_networks_df.columns:
            raise ValueError("No passing network data found.")

        actions_list = []
        incident_columns = ["id", "footballPassingNetworkAction", "country", "tournament", "season", "week", "game_id"]
        for _, row in goal_networks_df[incident_columns].iterrows():
            # Incidents without a passing network hold a non-list value here and are skipped.
            if isinstance(row["footballPassingNetworkAction"], list):
                for event in row["footballPassingNetworkAction"]:
                    # Tag each action with its parent incident id and match metadata.
                    event["id"] = row["id"]
                    event.update({
                        "country": row["country"],
                        "tournament": row["tournament"],
                        "season": row["season"],
                        "week": row["week"],
                        "game_id": row["game_id"]
                    })
                    actions_list.append(event)

        actions_list_df = pd.DataFrame(actions_list)

        if actions_list_df.empty:
            raise ValueError("No passing network data found.")

        def extract_coordinates(coords, coord_type):
            """Return one axis of a coordinate dict, or None when the value is not a dict."""
            return coords.get(coord_type) if isinstance(coords, dict) else None

        actions_list_df = actions_list_df.assign(
            player_name=actions_list_df["player"].apply(lambda p: p.get("name") if isinstance(p, dict) else None),
            player_id=actions_list_df["player"].apply(lambda p: p.get("id") if isinstance(p, dict) else None),
            event_type=actions_list_df["eventType"],
            player_x=actions_list_df["playerCoordinates"].apply(lambda coord: extract_coordinates(coord, "x")),
            player_y=actions_list_df["playerCoordinates"].apply(lambda coord: extract_coordinates(coord, "y")),
            pass_end_x=actions_list_df["passEndCoordinates"].apply(lambda coord: extract_coordinates(coord, "x")),
            pass_end_y=actions_list_df["passEndCoordinates"].apply(lambda coord: extract_coordinates(coord, "y")),
            id=actions_list_df["id"],
            is_assist=actions_list_df["isAssist"],
            goalkeeper_x=actions_list_df["gkCoordinates"].apply(lambda coord: extract_coordinates(coord, "x")),
            goalkeeper_y=actions_list_df["gkCoordinates"].apply(lambda coord: extract_coordinates(coord, "y")),
            goal_shot_x=actions_list_df["goalShotCoordinates"].apply(lambda coord: extract_coordinates(coord, "x")),
            goal_shot_y=actions_list_df["goalShotCoordinates"].apply(lambda coord: extract_coordinates(coord, "y")),
            goal_mouth_x=actions_list_df["goalMouthCoordinates"].apply(lambda coord: extract_coordinates(coord, "x")),
            goal_mouth_y=actions_list_df["goalMouthCoordinates"].apply(lambda coord: extract_coordinates(coord, "y")),
            goalkeeper_name=actions_list_df["goalkeeper"].apply(lambda gk: gk.get("name") if isinstance(gk, dict) else None),
            goalkeeper_id=actions_list_df["goalkeeper"].apply(lambda gk: gk.get("id") if isinstance(gk, dict) else None)
        )[
            [
                "country", "tournament", "season", "week", "game_id", "player_name", "player_id", "event_type",
                "player_x", "player_y", "pass_end_x", "pass_end_y", "is_assist", "id", "goalkeeper_x", "goalkeeper_y",
                "goal_shot_x", "goal_shot_y", "goal_mouth_x", "goal_mouth_y", "goalkeeper_name", "goalkeeper_id"
            ]
        ]

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first row of the result.
            first_row = actions_list_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=actions_list_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=actions_list_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return actions_list_df

    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching goal network data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance:
            webdriver_instance.quit()
@@ -0,0 +1,146 @@
1
+ import json
2
+ import pandas as pd
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.support.ui import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.common.exceptions import TimeoutException, WebDriverException
7
+ from datafc.utils._setup_webdriver import setup_webdriver
8
+ from datafc.utils._save_files import save_json, save_excel
9
+ from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
10
+
11
def lineups_data(
    match_df: pd.DataFrame,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches lineup data for each match in the provided match dataset.

    The lineups endpoint is queried once per match; the per-player statistics of
    both teams are then flattened into one row per (player, statistic).

    Args:
        match_df (pd.DataFrame): A DataFrame containing match metadata,
            which should be generated by the `match_data` function. It must provide the
            columns country, tournament, season, week and game_id.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, saves the fetched data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, saves the fetched data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: Long-format table with columns country, tournament, season, week,
        game_id, team ('home' or 'away'), player_name, player_id, stat_name and stat_value.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`, or `match_df` is
            `None` or empty.
        RuntimeError: For any failure after validation (driver errors, timeouts,
            undecodable responses, no lineup data found or extracted).
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    if match_df is None or match_df.empty:
        raise ValueError("Match dataframe must be provided and cannot be empty.")

    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        # Per-match frames are collected and concatenated once after the loop,
        # instead of re-copying a growing DataFrame on every iteration.
        lineup_frames = []

        def process_player_statistics(lineups_frame, team_key):
            """Flatten one team's player statistics into one record per (player, statistic)."""
            extracted_data = []
            for _, row in lineups_frame.iterrows():
                for player in row[team_key]:
                    for stat_name, stat_value in player.get("statistics", {}).items():
                        extracted_data.append({
                            "country": row["country"],
                            "tournament": row["tournament"],
                            "season": row["season"],
                            "week": row["week"],
                            "game_id": row["game_id"],
                            "team": team_key,
                            "player_name": player["player"]["name"],
                            "player_id": player["player"]["id"],
                            "stat_name": stat_name,
                            "stat_value": stat_value
                        })
            return extracted_data

        for _, row in match_df.iterrows():
            country = row["country"]
            tournament = row["tournament"]
            game_id = row["game_id"]
            season = row["season"]
            week = row["week"]

            url = f"{API_BASE_URLS[data_source]}/api/v1/event/{game_id}/lineups"
            webdriver_instance.get(url)

            try:
                # The JSON payload is read from the text of the page's <pre> element.
                pre_tag = WebDriverWait(webdriver_instance, element_load_timeout).until(
                    EC.visibility_of_element_located((By.TAG_NAME, "pre"))
                )

                lineups_json = json.loads(pre_tag.text)
                match_lineups = pd.DataFrame.from_dict(lineups_json)
                match_lineups["country"] = country
                match_lineups["tournament"] = tournament
                match_lineups["season"] = season
                match_lineups["week"] = week
                match_lineups["game_id"] = game_id

                # Only the row indexed "players" (the per-team player lists) is kept.
                match_lineups = match_lineups[match_lineups.index == "players"]
                lineup_frames.append(match_lineups)

            except TimeoutException as e:
                raise RuntimeError(f"Timeout while fetching lineup data for game_id {game_id}.") from e
            except json.JSONDecodeError as e:
                raise RuntimeError(f"Failed to decode lineup data for game_id {game_id}.") from e
            except WebDriverException as e:
                raise RuntimeError(
                    f"Selenium WebDriver error while fetching lineup data for game_id {game_id}: {str(e)}"
                ) from e
            except Exception as e:
                raise RuntimeError(
                    f"Unexpected error while fetching lineup data for game_id {game_id}: {e.__class__.__name__} - {e}"
                ) from e

        if not lineup_frames:
            raise ValueError("No lineup data found for the specified parameters.")

        lineups_data_df = pd.concat(lineup_frames, ignore_index=True)

        if lineups_data_df.empty:
            raise ValueError("No lineup data found for the specified parameters.")

        extracted_lineups_data = []
        for team_key in ["home", "away"]:
            extracted_lineups_data.extend(process_player_statistics(lineups_data_df, team_key))

        extracted_lineups_data_df = pd.DataFrame(extracted_lineups_data)

        # Check for emptiness before touching columns: an empty frame has no
        # "stat_name"/"stat_value" columns, so the transformation below would fail
        # with an unrelated error instead of this message.
        if extracted_lineups_data_df.empty:
            raise ValueError("No extracted lineup data available.")

        def extract_original_rating(rating_versions):
            """Return the 'original' entry of a ratingVersions dict, or the value unchanged."""
            if isinstance(rating_versions, dict) and "original" in rating_versions:
                return rating_versions["original"]
            return rating_versions

        extracted_lineups_data_df["stat_value"] = extracted_lineups_data_df.apply(
            lambda row: extract_original_rating(row["stat_value"]) if row["stat_name"] == "ratingVersions" else row["stat_value"],
            axis=1
        )

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first row of the result.
            first_row = extracted_lineups_data_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=extracted_lineups_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=extracted_lineups_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return extracted_lineups_data_df

    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching lineup data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance:
            webdriver_instance.quit()
@@ -0,0 +1,102 @@
1
+ import json
2
+ import pandas as pd
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.support.ui import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.common.exceptions import TimeoutException, WebDriverException
7
+ from datafc.utils._setup_webdriver import setup_webdriver
8
+ from datafc.utils._save_files import save_json, save_excel
9
+ from datafc.utils._config import ALLOWED_SOURCES, API_BASE_URLS
10
+
11
def match_data(
    tournament_id: int,
    season_id: int,
    week_number: int,
    data_source: str = "sofascore",
    element_load_timeout: int = 10,
    enable_json_export: bool = False,
    enable_excel_export: bool = False
) -> pd.DataFrame:
    """
    Fetches match data for a specified tournament, season, and matchweek.

    Args:
        tournament_id (int): The unique identifier for the tournament.
        season_id (int): The unique identifier for the season.
        week_number (int): The matchweek number within the season.
        data_source (str): The data source ('sofavpn' or 'sofascore'). Defaults to 'sofascore'.
        element_load_timeout (int): The maximum time (in seconds) to wait for the API response. Defaults to 10.
        enable_json_export (bool): If `True`, exports the fetched data as a JSON file. Defaults to `False`.
        enable_excel_export (bool): If `True`, exports the fetched data as an Excel file. Defaults to `False`.

    Returns:
        pd.DataFrame: One row per event with columns country, tournament, season, week,
        game_id, home_team, away_team, start_timestamp and status.

    Raises:
        ValueError: If `data_source` is not in `ALLOWED_SOURCES`.
        RuntimeError: For any failure after validation (driver start-up, timeout,
            empty or undecodable response, unexpected response format, no events).
    """
    if data_source not in ALLOWED_SOURCES:
        raise ValueError(f"Invalid data source: {data_source}. Must be one of {ALLOWED_SOURCES}")

    api_request_url = f"{API_BASE_URLS[data_source]}/api/v1/unique-tournament/{tournament_id}/season/{season_id}/events/round/{week_number}"

    # Bind the name before entering the try block: if setup_webdriver() itself
    # raises, the finally clause must still be able to test it without hitting
    # an unbound local (which would mask the real error).
    webdriver_instance = None
    try:
        webdriver_instance = setup_webdriver()
        webdriver_instance.get(api_request_url)

        # The JSON payload is read from the text of the page's <pre> element.
        response_element = WebDriverWait(webdriver_instance, element_load_timeout).until(
            EC.visibility_of_element_located((By.TAG_NAME, "pre"))
        )
        response_text = response_element.text.strip()
        if not response_text:
            raise RuntimeError("API response is empty.")

        api_response_data = json.loads(response_text)
        if "events" not in api_response_data or not isinstance(api_response_data["events"], list):
            raise ValueError("Invalid API response format: 'events' key is missing or not a list.")

        events_df = pd.DataFrame(api_response_data.get("events", []))
        if events_df.empty:
            raise ValueError("No match data found for the specified parameters.")

        # Pull the needed scalar fields out of the nested per-event dicts.
        match_data_df = pd.DataFrame({
            "country": events_df["tournament"].apply(lambda x: x.get("category", {}).get("name", "")),
            "tournament": events_df["tournament"].apply(lambda x: x.get("name", "")),
            "season": events_df["season"].apply(lambda x: x.get("year", "")),
            "week": events_df["roundInfo"].apply(lambda x: x.get("round", "")),
            "game_id": events_df["id"],
            "home_team": events_df["homeTeam"].apply(lambda x: x.get("name", "")),
            "away_team": events_df["awayTeam"].apply(lambda x: x.get("name", "")),
            "start_timestamp": events_df["startTimestamp"],
            "status": events_df["status"].apply(lambda x: x.get("description", ""))
        })

        if enable_json_export or enable_excel_export:
            # Export metadata is taken from the first row of the result.
            first_row = match_data_df.iloc[0]

            if enable_json_export:
                save_json(
                    data=match_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

            if enable_excel_export:
                save_excel(
                    data=match_data_df,
                    country=first_row["country"],
                    tournament=first_row["tournament"],
                    season=first_row["season"],
                    week_number=first_row["week"]
                )

        return match_data_df

    # TimeoutException is handled before the broader WebDriverException so the
    # more specific message wins.
    except TimeoutException as e:
        raise RuntimeError("Timeout occurred while waiting for the page or API response.") from e
    except WebDriverException as e:
        raise RuntimeError(f"Selenium WebDriver error: {str(e)}") from e
    except json.JSONDecodeError as e:
        raise RuntimeError("Failed to decode API response as JSON.") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error while fetching match data: {e.__class__.__name__} - {e}") from e

    finally:
        if webdriver_instance:
            webdriver_instance.quit()