openmatchkit 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openmatchkit/export.py ADDED
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import json
5
+ from io import StringIO
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from pydantic import BaseModel
10
+
11
+ from openmatchkit.models import Match
12
+
13
+
14
def matches_to_rows(matches: list[Match]) -> list[dict[str, object]]:
    """Flatten matches into one plain dictionary per match.

    Every row has the same fifteen keys in a fixed order. Nested objects are
    reduced to scalars: teams to their ``name``, the score to its ``home`` /
    ``away`` values, the status to its enum ``value``, and the two timestamps
    to ISO-8601 strings (``None`` when the timestamp is not set).

    Args:
        matches: Matches to flatten.

    Returns:
        One dictionary per input match, in input order.
    """

    def _iso(stamp):
        # Unset timestamps stay None instead of becoming a string.
        return stamp.isoformat() if stamp else None

    return [
        {
            "match_id": item.match_id,
            "competition": item.competition,
            "season": item.season,
            "round": item.round,
            "group": item.group,
            "kickoff": _iso(item.kickoff),
            "home": item.home.name,
            "away": item.away.name,
            "home_score": item.score.home,
            "away_score": item.score.away,
            "status": item.status.value,
            "venue": item.venue,
            "source": item.source,
            "source_url": item.source_url,
            "fetched_at": _iso(item.fetched_at),
        }
        for item in matches
    ]
39
+
40
+
41
def to_json_string(matches: list[Match]) -> str:
    """Serialise matches to a pretty-printed JSON array.

    Each match is dumped with ``model_dump(mode="json")`` and the resulting
    list is encoded with a two-space indent.

    Args:
        matches: Matches to serialise.

    Returns:
        The JSON document as a string.
    """
    dumped = []
    for entry in matches:
        dumped.append(entry.model_dump(mode="json"))
    return json.dumps(dumped, indent=2)
44
+
45
+
46
def _to_jsonable(value: Any) -> Any:
    """Recursively replace pydantic models inside *value* with JSON-ready data."""
    if isinstance(value, BaseModel):
        return value.model_dump(mode="json")
    if isinstance(value, dict):
        return {key: _to_jsonable(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        # json.dumps already encodes tuples as arrays, so a list is equivalent.
        return [_to_jsonable(item) for item in value]
    return value


def to_model_json_string(value: BaseModel | list[BaseModel] | dict[str, Any] | list[Any]) -> str:
    """Serialise a model, or a container holding models, to indented JSON.

    Models are converted with ``model_dump(mode="json")`` wherever they
    appear: at the top level, as list items, as dictionary values, or nested
    deeper inside lists and dictionaries. Previously only top-level list items
    were converted, so a model stored in a dictionary value or a nested list
    made ``json.dumps`` raise ``TypeError``.

    Args:
        value: A pydantic model, or a list / dictionary that may contain
            models alongside plain JSON-compatible values.

    Returns:
        The JSON document as a string, indented by two spaces.

    Raises:
        TypeError: If the payload still contains a value that ``json.dumps``
            cannot encode (for example a ``set``).
    """
    return json.dumps(_to_jsonable(value), indent=2)
57
+
58
+
59
def to_csv_string(matches: list[Match]) -> str:
    """Render matches as CSV text with a header row.

    Rows come from ``matches_to_rows``; the column order below is the order
    written to the header and to every data row.

    Args:
        matches: Matches to export.

    Returns:
        The complete CSV document, header included.
    """
    columns = [
        "match_id",
        "competition",
        "season",
        "round",
        "group",
        "kickoff",
        "home",
        "away",
        "home_score",
        "away_score",
        "status",
        "venue",
        "source",
        "source_url",
        "fetched_at",
    ]
    buffer = StringIO()
    sheet = csv.DictWriter(buffer, fieldnames=columns)
    sheet.writeheader()
    for record in matches_to_rows(matches):
        sheet.writerow(record)
    return buffer.getvalue()
83
+
84
+
85
def to_json(matches: list[Match], path: str | Path) -> Path:
    """Write the JSON rendering of *matches* to *path* as UTF-8.

    Args:
        matches: Matches to export.
        path: Destination file; an existing file is overwritten.

    Returns:
        The destination as a ``Path``.
    """
    destination = Path(path)
    document = to_json_string(matches)
    destination.write_text(document, encoding="utf-8")
    return destination
89
+
90
+
91
def to_csv(matches: list[Match], path: str | Path) -> Path:
    """Write the CSV rendering of *matches* to *path* as UTF-8.

    Args:
        matches: Matches to export.
        path: Destination file; an existing file is overwritten.

    Returns:
        The destination as a ``Path``.
    """
    destination = Path(path)
    document = to_csv_string(matches)
    # newline="" keeps the csv module's own line terminators untouched.
    destination.write_text(document, encoding="utf-8", newline="")
    return destination
openmatchkit/http.py ADDED
@@ -0,0 +1,181 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime, timedelta, timezone
7
+ from urllib.parse import urlparse
8
+ from urllib.robotparser import RobotFileParser
9
+
10
+ import httpx
11
+
12
+ from openmatchkit.exceptions import SourceFetchError, SourceNotAllowedError
13
+
14
+
15
@dataclass
class _RobotsEntry:
    """A robots.txt parser together with the time it was obtained."""

    # Parser holding the rules that apply to one origin.
    parser: RobotFileParser
    # Moment the entry was created; compared against the robots TTL.
    fetched_at: datetime
19
+
20
+
21
@dataclass
class _CacheEntry:
    """A cached response body together with the time it was stored."""

    # Response text exactly as it was returned to the caller.
    body: str
    # Moment the entry was stored; compared against the cache TTL.
    fetched_at: datetime
25
+
26
+
27
@dataclass
class SafeHttpClient:
    """HTTP client with conservative scraping defaults.

    It respects robots.txt, keeps an in-memory cache, applies per-origin delays,
    and never attempts to bypass login, CAPTCHA, paywalls, or other protections.

    Attributes:
        user_agent: Sent as the ``User-Agent`` header and used when matching
            robots.txt rules.
        min_delay_seconds: Minimum gap between two ``get_text`` requests to
            the same origin.
        timeout_seconds: Timeout passed to the underlying ``httpx.Client``.
        respect_robots: When false, robots.txt is neither fetched nor enforced.
        cache_ttl_seconds: Lifetime of cached response bodies; a value
            ``<= 0`` disables the response cache.
        robots_ttl_seconds: Lifetime of a cached robots.txt decision.
        transport: Optional transport passed to ``httpx.Client``.
        robots_error_ttl_seconds: Upper bound on how long a fail-closed
            decision caused by a temporary problem (network error, 5xx or
            429 while fetching robots.txt) is kept before robots.txt is
            requested again.
    """

    user_agent: str = "openmatchkit/0.2.1 (+https://github.com/patilprashan246/openmatchkit)"
    min_delay_seconds: float = 2.0
    timeout_seconds: float = 20.0
    respect_robots: bool = True
    cache_ttl_seconds: float = 300.0
    robots_ttl_seconds: float = 24 * 60 * 60
    transport: httpx.BaseTransport | None = field(default=None, repr=False)
    # Added after the existing fields so positional construction is unchanged.
    robots_error_ttl_seconds: float = 300.0

    _client: httpx.Client = field(init=False, repr=False)
    _last_request_at: dict[str, float] = field(default_factory=dict, init=False, repr=False)
    _robots_cache: dict[str, _RobotsEntry] = field(default_factory=dict, init=False, repr=False)
    _response_cache: dict[str, _CacheEntry] = field(default_factory=dict, init=False, repr=False)
    # Origins whose cached robots entry is a temporary fail-closed fallback.
    _robots_failed: set[str] = field(default_factory=set, init=False, repr=False)

    def __post_init__(self) -> None:
        """Create the shared ``httpx.Client`` from the configured options."""
        self._client = httpx.Client(
            timeout=self.timeout_seconds,
            headers={"User-Agent": self.user_agent},
            follow_redirects=True,
            transport=self.transport,
        )

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()

    def __enter__(self) -> SafeHttpClient:
        return self

    def __exit__(self, *_args: object) -> None:
        self.close()

    def _origin(self, url: str) -> str:
        """Return ``scheme://netloc`` for *url*."""
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}"

    def _robots_url(self, url: str) -> str:
        """Return the robots.txt location for the origin of *url*."""
        return f"{self._origin(url)}/robots.txt"

    def _allow_all_robots(self, robots_url: str) -> RobotFileParser:
        """Build a parser with no rules, so every URL is allowed."""
        parser = RobotFileParser(robots_url)
        parser.parse([])
        return parser

    def _block_all_robots(self, robots_url: str) -> RobotFileParser:
        """Build a parser that disallows every path for every user agent."""
        parser = RobotFileParser(robots_url)
        parser.parse(["User-agent: *", "Disallow: /"])
        return parser

    def _get_robots(self, url: str) -> RobotFileParser:
        """Return the robots.txt rules for the origin of *url*.

        A cached decision is reused while it is fresh. Otherwise robots.txt
        is fetched and mapped to a parser:

        * transport error, 5xx or 429: block everything, but only for the
          shorter error TTL, so a single timeout does not lock the origin out
          for the whole ``robots_ttl_seconds``;
        * 401 / 403: block everything;
        * any other 4xx (including 404): allow everything;
        * otherwise: parse the response body.

        Raises:
            SourceFetchError: If the robots.txt URL itself is malformed.
        """
        origin = self._origin(url)
        now = datetime.now(timezone.utc)
        cached = self._robots_cache.get(origin)

        if cached is not None:
            ttl = self.robots_ttl_seconds
            if origin in self._robots_failed:
                ttl = min(ttl, self.robots_error_ttl_seconds)
            if now - cached.fetched_at < timedelta(seconds=ttl):
                return cached.parser

        robots_url = self._robots_url(url)
        transient_failure = False

        try:
            response = self._client.get(robots_url)
        except httpx.InvalidURL as exc:
            # InvalidURL is not an httpx.HTTPError; a malformed origin is a
            # fetch problem, not a robots.txt decision, so report it as such.
            raise SourceFetchError(f"Failed to fetch {robots_url}: {exc}") from exc
        except httpx.HTTPError:
            parser = self._block_all_robots(robots_url)
            transient_failure = True
        else:
            status = response.status_code
            if status == 429 or status >= 500:
                # The server could not give a definite answer (or asked us to
                # back off): fail closed, but retry after the error TTL.
                parser = self._block_all_robots(robots_url)
                transient_failure = True
            elif status in {401, 403}:
                parser = self._block_all_robots(robots_url)
            elif status >= 400:
                parser = self._allow_all_robots(robots_url)
            else:
                parser = RobotFileParser(robots_url)
                parser.parse(response.text.splitlines())

        if transient_failure:
            self._robots_failed.add(origin)
        else:
            self._robots_failed.discard(origin)

        self._robots_cache[origin] = _RobotsEntry(parser=parser, fetched_at=now)
        return parser

    def _ensure_allowed(self, url: str) -> None:
        """Raise ``SourceNotAllowedError`` when robots.txt forbids *url*.

        Does nothing when ``respect_robots`` is false.
        """
        if not self.respect_robots:
            return

        robots = self._get_robots(url)
        if not robots.can_fetch(self.user_agent, url):
            raise SourceNotAllowedError(f"Blocked by robots.txt: {url}")

    def _rate_limit(self, url: str) -> None:
        """Sleep until ``min_delay_seconds`` has passed since the last request
        to the same origin, then record the current time for that origin."""
        origin = self._origin(url)
        now = time.monotonic()
        last = self._last_request_at.get(origin)

        if last is not None:
            elapsed = now - last
            wait_time = self.min_delay_seconds - elapsed
            if wait_time > 0:
                time.sleep(wait_time)

        # Read the clock again so time spent sleeping is not counted twice.
        self._last_request_at[origin] = time.monotonic()

    def _cache_key(self, url: str) -> str:
        """Return the response-cache key for a GET of *url*."""
        return f"GET {url}"

    def _cache_get(self, url: str) -> str | None:
        """Return the cached body for *url*, or ``None`` if absent, expired,
        or caching is disabled. Expired entries are evicted."""
        if self.cache_ttl_seconds <= 0:
            return None

        entry = self._response_cache.get(self._cache_key(url))
        if entry is None:
            return None

        now = datetime.now(timezone.utc)
        if now - entry.fetched_at >= timedelta(seconds=self.cache_ttl_seconds):
            self._response_cache.pop(self._cache_key(url), None)
            return None

        return entry.body

    def _cache_set(self, url: str, body: str) -> None:
        """Store *body* for *url* unless caching is disabled."""
        if self.cache_ttl_seconds <= 0:
            return

        self._response_cache[self._cache_key(url)] = _CacheEntry(
            body=body,
            fetched_at=datetime.now(timezone.utc),
        )

    def get_text(self, url: str) -> str:
        """Fetch *url* and return the response body as text.

        A fresh cache entry is returned without any network access. Otherwise
        the robots.txt check and the per-origin delay run before the request.

        Raises:
            SourceNotAllowedError: If robots.txt disallows the URL.
            SourceFetchError: If the URL is malformed, the request fails, or
                the response has an error status.
        """
        cached = self._cache_get(url)
        if cached is not None:
            return cached

        self._ensure_allowed(url)
        self._rate_limit(url)

        try:
            response = self._client.get(url)
            response.raise_for_status()
        except (httpx.HTTPError, httpx.InvalidURL) as exc:
            # InvalidURL sits outside the HTTPError hierarchy; include it so
            # callers only ever have to handle SourceFetchError.
            raise SourceFetchError(f"Failed to fetch {url}: {exc}") from exc

        self._cache_set(url, response.text)
        return response.text

    def get_json(self, url: str) -> object:
        """Fetch *url* via ``get_text`` and decode the body as JSON.

        Raises:
            SourceFetchError: If the body is not valid JSON, in addition to
                everything ``get_text`` can raise.
        """
        text = self.get_text(url)

        try:
            return json.loads(text)
        except json.JSONDecodeError as exc:
            raise SourceFetchError(f"Failed to parse JSON from {url}: {exc}") from exc
openmatchkit/models.py ADDED
@@ -0,0 +1,198 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
+
10
class MatchStatus(str, Enum):
    """Closed set of match states, stored as lowercase strings.

    The class also derives from ``str``, so each member compares equal to its
    raw string value.
    """

    SCHEDULED = "scheduled"
    LIVE = "live"
    HALF_TIME = "half_time"
    FULL_TIME = "full_time"
    POSTPONED = "postponed"
    CANCELLED = "cancelled"
    UNKNOWN = "unknown"
18
+
19
+
20
class Team(BaseModel):
    """A team as referenced by a match.

    String values are stripped of surrounding whitespace during validation,
    and ``name`` must contain at least one character.
    """

    model_config = ConfigDict(str_strip_whitespace=True)

    # Required, non-empty.
    name: str = Field(min_length=1)
    # Optional identifiers.
    code: str | None = None
    country: str | None = None
26
+
27
+
28
class TeamInfo(BaseModel):
    """Team record carrying an appearance counter and a list of source strings."""

    name: str
    code: str | None = None
    country: str | None = None
    # Counter starts at zero; each instance gets its own empty list.
    appearances: int = 0
    sources: list[str] = Field(default_factory=list)
34
+
35
+
36
class Player(BaseModel):
    """A player reference.

    String values are stripped of surrounding whitespace during validation,
    and ``name`` must contain at least one character. Every other field is
    optional.
    """

    model_config = ConfigDict(str_strip_whitespace=True)

    # Required, non-empty.
    name: str = Field(min_length=1)
    # Optional details.
    player_id: str | None = None
    team: str | None = None
    position: str | None = None
    shirt_number: int | None = None
    country: str | None = None
45
+
46
+
47
class Score(BaseModel):
    """Home and away score values; each is ``None`` until a value is known."""

    home: int | None = None
    away: int | None = None
50
+
51
+
52
class Match(BaseModel):
    """A single match between a home and an away team.

    ``match_id``, ``competition``, ``home``, ``away`` and ``source`` are
    required. The score defaults to an empty ``Score`` and the status to
    ``MatchStatus.UNKNOWN``; all remaining fields default to ``None``.
    """

    # Identification.
    match_id: str
    competition: str
    season: str | None = None
    round: str | None = None
    group: str | None = None
    kickoff: datetime | None = None
    # Participants and result.
    home: Team
    away: Team
    score: Score = Field(default_factory=Score)
    status: MatchStatus = MatchStatus.UNKNOWN
    venue: str | None = None
    # Provenance.
    source: str
    source_url: str | None = None
    fetched_at: datetime | None = None
67
+
68
+
69
class StandingRow(BaseModel):
    """One team's row in a standings table; every counter defaults to zero."""

    team: str
    group: str | None = None
    # Match counts.
    played: int = 0
    won: int = 0
    drawn: int = 0
    lost: int = 0
    # Goal tallies and points.
    goals_for: int = 0
    goals_against: int = 0
    goal_difference: int = 0
    points: int = 0
80
+
81
+
82
class MatchClock(BaseModel):
    """Clock information for a match; every field is optional."""

    minute: int | None = None
    added_time: int | None = None
    period: str | None = None
    # Free-form text field. NOTE(review): presumably a ready-to-show clock
    # label; confirm against the sources that populate it.
    display: str | None = None
87
+
88
+
89
class MatchEvent(BaseModel):
    """A single event within a match.

    Only ``event_type`` is required; all other fields default to ``None``.
    """

    event_type: str
    # Who was involved.
    team: str | None = None
    player: str | None = None
    assist: str | None = None
    # When it happened.
    minute: int | None = None
    added_time: int | None = None
    detail: str | None = None
    # Score fields attached to the event.
    home_score: int | None = None
    away_score: int | None = None
99
+
100
+
101
class TeamMatchStats(BaseModel):
    """Per-team statistics for one match.

    ``team`` is required; every statistic is optional and defaults to
    ``None``. Units are not fixed by this model (for example whether
    ``possession`` and ``pass_accuracy`` are fractions or percentages).
    """

    team: str
    possession: float | None = None
    shots: int | None = None
    shots_on_target: int | None = None
    corners: int | None = None
    fouls: int | None = None
    offsides: int | None = None
    yellow_cards: int | None = None
    red_cards: int | None = None
    saves: int | None = None
    passes: int | None = None
    pass_accuracy: float | None = None
    expected_goals: float | None = None
115
+
116
+
117
class PlayerMatchStats(BaseModel):
    """Per-player statistics for one match.

    Goals, assists and cards default to zero; the remaining statistics are
    optional and default to ``None``.
    """

    player: Player
    started: bool | None = None
    minutes_played: int | None = None
    # Counters that default to zero.
    goals: int = 0
    assists: int = 0
    yellow_cards: int = 0
    red_cards: int = 0
    # Optional statistics.
    shots: int | None = None
    shots_on_target: int | None = None
    saves: int | None = None
    passes: int | None = None
    tackles: int | None = None
    rating: float | None = None
    source: str | None = None
132
+
133
+
134
class TeamLineup(BaseModel):
    """A team's lineup: formation, coach, starters and substitutes.

    Both player lists default to a fresh empty list per instance.
    """

    team: Team
    formation: str | None = None
    coach: str | None = None
    starters: list[Player] = Field(default_factory=list)
    substitutes: list[Player] = Field(default_factory=list)
140
+
141
+
142
class Scoreboard(BaseModel):
    """A match bundled with its clock, statistics, lineups and events.

    Only ``match`` is required. The clock defaults to an empty ``MatchClock``
    and every list defaults to a fresh empty list per instance.
    """

    match: Match
    clock: MatchClock = Field(default_factory=MatchClock)
    team_stats: list[TeamMatchStats] = Field(default_factory=list)
    lineups: list[TeamLineup] = Field(default_factory=list)
    events: list[MatchEvent] = Field(default_factory=list)
    player_stats: list[PlayerMatchStats] = Field(default_factory=list)
    source_notes: list[str] = Field(default_factory=list)
150
+
151
+
152
class PlayerTotals(BaseModel):
    """Integer totals for a player; every counter defaults to zero."""

    appearances: int = 0
    starts: int = 0
    minutes_played: int = 0
    goals: int = 0
    assists: int = 0
    yellow_cards: int = 0
    red_cards: int = 0
160
+
161
+
162
class PlayerMatchAppearance(BaseModel):
    """One player's record for one match.

    ``match_id``, ``competition``, ``team`` and ``source`` are required.
    ``home_away`` accepts only ``"home"``, ``"away"``, ``"neutral"`` or
    ``"unknown"`` and defaults to ``"unknown"``.
    """

    # Match identification.
    match_id: str
    competition: str
    season: str | None = None
    kickoff: datetime | None = None
    # Sides.
    team: str
    opponent: str | None = None
    home_away: Literal["home", "away", "neutral", "unknown"] = "unknown"
    status: MatchStatus = MatchStatus.UNKNOWN
    # Player contribution.
    minutes_played: int | None = None
    started: bool | None = None
    goals: int = 0
    assists: int = 0
    yellow_cards: int = 0
    red_cards: int = 0
    # Provenance.
    source: str
    source_url: str | None = None
179
+
180
+
181
class PlayerHistory(BaseModel):
    """A player together with totals, per-match appearances and source notes.

    Only ``player`` is required. Totals default to an all-zero
    ``PlayerTotals`` and both lists default to a fresh empty list.
    """

    player: Player
    totals: PlayerTotals = Field(default_factory=PlayerTotals)
    appearances: list[PlayerMatchAppearance] = Field(default_factory=list)
    source_notes: list[str] = Field(default_factory=list)
186
+
187
+
188
class Prediction(BaseModel):
    """Outcome probabilities and expected goals for one fixture.

    The model does not itself validate that the three probabilities lie in
    ``[0, 1]`` or sum to one; that is left to whatever produces the values.
    """

    # Fixture.
    home: str
    away: str
    # Outcome probabilities.
    home_win_probability: float
    draw_probability: float
    away_win_probability: float
    # Expected goals per side.
    expected_home_goals: float
    expected_away_goals: float
    # Name of the producing model and the size of its training set.
    model: str
    training_matches: int = 0
    note: str = "Educational baseline only. Not betting, financial, or professional advice."
@@ -0,0 +1,7 @@
1
+ from openmatchkit.prediction.elo import EloRatings
2
+ from openmatchkit.prediction.poisson import SimplePoissonPredictor
3
+
4
+ __all__ = [
5
+ "EloRatings",
6
+ "SimplePoissonPredictor",
7
+ ]
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ from openmatchkit.models import Match
4
+
5
+
6
class EloRatings:
    """Small Elo baseline for team strength summaries.

    Ratings are kept per team name in ``ratings``; a team that has not been
    rated yet is treated as having ``default_rating``.
    """

    def __init__(self, default_rating: float = 1500.0, k_factor: float = 24.0) -> None:
        """Start with no rated teams.

        Args:
            default_rating: Rating assumed for teams not seen so far.
            k_factor: Multiplier applied to (actual - expected) on each update.
        """
        self.ratings: dict[str, float] = {}
        self.default_rating = default_rating
        self.k_factor = k_factor

    def rating(self, team: str) -> float:
        """Return the current rating of *team*, or the default if unrated."""
        try:
            return self.ratings[team]
        except KeyError:
            return self.default_rating

    def _expected(self, team: str, opponent: str) -> float:
        """Return the expected score of *team* against *opponent* (logistic, base 10, scale 400)."""
        gap = self.rating(opponent) - self.rating(team)
        return 1 / (1 + 10 ** (gap / 400))

    def _actual_scores(self, match: Match) -> tuple[float, float] | None:
        """Map a result to (home, away) points: win 1.0, draw 0.5, loss 0.0.

        Returns ``None`` when either side of the score is missing.
        """
        home_goals = match.score.home
        away_goals = match.score.away
        if home_goals is None or away_goals is None:
            return None

        if home_goals == away_goals:
            return 0.5, 0.5
        return (1.0, 0.0) if home_goals > away_goals else (0.0, 1.0)

    def fit(self, matches: list[Match]) -> None:
        """Update ratings from *matches*, processed in the order given.

        Matches without a complete score are skipped. Both expectations are
        computed from the ratings held before either team is updated.
        """
        for fixture in matches:
            outcome = self._actual_scores(fixture)
            if outcome is None:
                continue

            home_result, away_result = outcome
            home_name = fixture.home.name
            away_name = fixture.away.name
            expected_home = self._expected(home_name, away_name)
            expected_away = self._expected(away_name, home_name)

            home_shift = self.k_factor * (home_result - expected_home)
            self.ratings[home_name] = self.rating(home_name) + home_shift
            away_shift = self.k_factor * (away_result - expected_away)
            self.ratings[away_name] = self.rating(away_name) + away_shift
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from collections import defaultdict
5
+
6
+ from openmatchkit.models import Match, Prediction
7
+
8
+
9
class SimplePoissonPredictor:
    """Educational Poisson baseline for win/draw/loss probabilities.

    ``fit`` learns each team's average goals scored and conceded plus the
    overall home and away averages. ``predict`` turns the resulting expected
    goals into outcome probabilities by summing independent Poisson
    probabilities over every scoreline up to ``max_goals`` per side.
    """

    # Averages used before any scored match has been seen.
    _DEFAULT_HOME_GOALS = 1.35
    _DEFAULT_AWAY_GOALS = 1.10

    def __init__(self, max_goals: int = 8) -> None:
        """Create an untrained predictor.

        Args:
            max_goals: Highest goal count per side included in the scoreline grid.
        """
        self.max_goals = max_goals
        self.team_for_goals: dict[str, float] = {}
        self.team_against_goals: dict[str, float] = {}
        self.global_home_goals = self._DEFAULT_HOME_GOALS
        self.global_away_goals = self._DEFAULT_AWAY_GOALS
        self.training_matches = 0

    def fit(self, matches: list[Match]) -> None:
        """Replace the learned averages with ones computed from *matches*.

        Only matches with both scores present are used. If none qualify, the
        predictor is reset to its untrained defaults, so ``training_matches``
        never reports zero while averages from an earlier fit are still in
        use.
        """
        scored = [m for m in matches if m.score.home is not None and m.score.away is not None]
        self.training_matches = len(scored)

        if not scored:
            self.team_for_goals = {}
            self.team_against_goals = {}
            self.global_home_goals = self._DEFAULT_HOME_GOALS
            self.global_away_goals = self._DEFAULT_AWAY_GOALS
            return

        self.global_home_goals = sum(m.score.home for m in scored) / len(scored)
        self.global_away_goals = sum(m.score.away for m in scored) / len(scored)

        goals_for: dict[str, list[int]] = defaultdict(list)
        goals_against: dict[str, list[int]] = defaultdict(list)

        for match in scored:
            home = match.home.name
            away = match.away.name
            hg = match.score.home
            ag = match.score.away

            # Each match contributes to both teams, whichever side they played on.
            goals_for[home].append(hg)
            goals_against[home].append(ag)
            goals_for[away].append(ag)
            goals_against[away].append(hg)

        self.team_for_goals = {
            team: sum(values) / len(values) for team, values in goals_for.items()
        }
        self.team_against_goals = {
            team: sum(values) / len(values) for team, values in goals_against.items()
        }

    def _poisson(self, goals: int, expected: float) -> float:
        """Return the Poisson probability of exactly *goals* given mean *expected*."""
        return (expected**goals) * math.exp(-expected) / math.factorial(goals)

    def _expected_goals(self, home: str, away: str) -> tuple[float, float]:
        """Return (home, away) expected goals, each at least 0.2.

        Each side's value is the mean of its own scoring average and the
        opponent's conceding average. Teams missing from the training data
        fall back to the global home or away average.
        """
        home_attack = self.team_for_goals.get(home, self.global_home_goals)
        away_defense = self.team_against_goals.get(away, self.global_home_goals)

        away_attack = self.team_for_goals.get(away, self.global_away_goals)
        home_defense = self.team_against_goals.get(home, self.global_away_goals)

        expected_home = max(0.2, (home_attack + away_defense) / 2)
        expected_away = max(0.2, (away_attack + home_defense) / 2)

        return expected_home, expected_away

    def predict(self, home: str, away: str) -> Prediction:
        """Return outcome probabilities and expected goals for *home* vs *away*.

        The three probabilities are divided by their sum, so they add up to
        one even though the scoreline grid is cut off at ``max_goals``.
        """
        expected_home, expected_away = self._expected_goals(home, away)

        # Each side's distribution is the same for every opposing goal count,
        # so compute it once rather than inside the double loop.
        goal_counts = range(self.max_goals + 1)
        home_pmf = [self._poisson(goals, expected_home) for goals in goal_counts]
        away_pmf = [self._poisson(goals, expected_away) for goals in goal_counts]

        home_win = 0.0
        draw = 0.0
        away_win = 0.0

        for hg, home_probability in enumerate(home_pmf):
            for ag, away_probability in enumerate(away_pmf):
                probability = home_probability * away_probability

                if hg > ag:
                    home_win += probability
                elif hg == ag:
                    draw += probability
                else:
                    away_win += probability

        total = home_win + draw + away_win

        return Prediction(
            home=home,
            away=away,
            home_win_probability=home_win / total,
            draw_probability=draw / total,
            away_win_probability=away_win / total,
            expected_home_goals=expected_home,
            expected_away_goals=expected_away,
            model="simple_poisson",
            training_matches=self.training_matches,
        )
@@ -0,0 +1,11 @@
1
+ from openmatchkit.sources.football_data_uk import FootballDataUkSource
2
+ from openmatchkit.sources.json_file import JsonFileSource
3
+ from openmatchkit.sources.openfootball import OpenFootballSource
4
+ from openmatchkit.sources.public_html import PublicHtmlSource
5
+
6
+ __all__ = [
7
+ "FootballDataUkSource",
8
+ "JsonFileSource",
9
+ "OpenFootballSource",
10
+ "PublicHtmlSource",
11
+ ]
@@ -0,0 +1,29 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol
4
+
5
+ from openmatchkit.models import Match, PlayerHistory, Scoreboard
6
+
7
+
8
class MatchSource(Protocol):
    """Structural interface for match data sources.

    Any object with a ``name`` attribute and the methods below satisfies the
    protocol; no inheritance is required.
    """

    name: str

    def fixtures(self, competition: str, season: str | None = None) -> list[Match]:
        """Return matches for *competition*, optionally limited to *season*."""
        ...

    def live_scores(self) -> list[Match]:
        """Return a list of matches (no filter arguments)."""
        ...

    def scoreboard(
        self,
        match_id: str,
        competition: str | None = None,
        season: str | None = None,
    ) -> Scoreboard | None:
        """Return the scoreboard for *match_id*, or ``None``."""
        ...

    def live_scoreboards(self) -> list[Scoreboard]:
        """Return a list of scoreboards (no filter arguments)."""
        ...

    def player_history(
        self,
        player: str,
        competition: str | None = None,
        season: str | None = None,
    ) -> PlayerHistory:
        """Return the history for *player*, optionally narrowed by competition and season."""
        ...