hkjc 0.3.10__tar.gz → 0.3.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hkjc-0.3.10 → hkjc-0.3.12}/PKG-INFO +2 -1
- {hkjc-0.3.10 → hkjc-0.3.12}/pyproject.toml +2 -1
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/__init__.py +4 -3
- hkjc-0.3.12/src/hkjc/historical.py +147 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/live_odds.py +7 -8
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/processing.py +66 -1
- hkjc-0.3.12/src/hkjc/utils.py +189 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/uv.lock +34 -1
- hkjc-0.3.10/src/hkjc/historical.py +0 -13
- hkjc-0.3.10/src/hkjc/optimization.py +0 -106
- {hkjc-0.3.10 → hkjc-0.3.12}/.python-version +0 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/README.md +0 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/harville_model.py +0 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/py.typed +0 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/speedpro.py +0 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/strategy/place_only.py +0 -0
- {hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/strategy/qpbanker.py +0 -0
{hkjc-0.3.10 → hkjc-0.3.12}/PKG-INFO
@@ -1,8 +1,9 @@
 Metadata-Version: 2.4
 Name: hkjc
-Version: 0.3.10
+Version: 0.3.12
 Summary: Library for scrapping HKJC data and perform basic analysis
 Requires-Python: >=3.11
+Requires-Dist: beautifulsoup4>=4.14.2
 Requires-Dist: cachetools>=6.2.0
 Requires-Dist: fastexcel>=0.16.0
 Requires-Dist: numba>=0.62.1
{hkjc-0.3.10 → hkjc-0.3.12}/pyproject.toml
@@ -1,10 +1,11 @@
 [project]
 name = "hkjc"
-version = "0.3.10"
+version = "0.3.12"
 description = "Library for scrapping HKJC data and perform basic analysis"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
+    "beautifulsoup4>=4.14.2",
     "cachetools>=6.2.0",
     "fastexcel>=0.16.0",
     "numba>=0.62.1",
{hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/__init__.py
@@ -6,7 +6,8 @@ from importlib.metadata import version as _version
 
 __all__ = ["live_odds", "qpbanker",
            "generate_all_qp_trades", "generate_all_pla_trades", "pareto_filter",
-           "speedpro_energy", "speedmap", "harveille_model"]
+           "speedpro_energy", "speedmap", "harveille_model",
+           "generate_historical_data"]
 
 try:
     __version__ = _version(__name__)
@@ -14,7 +15,7 @@ except Exception: # pragma: no cover - best-effort version resolution
     __version__ = "0.0.0"
 
 from .live_odds import live_odds
-from .processing import generate_all_qp_trades, generate_all_pla_trades
-from .optimization import pareto_filter
+from .processing import generate_all_qp_trades, generate_all_pla_trades, generate_historical_data
+from .utils import pareto_filter
 from .speedpro import speedmap, speedpro_energy
 from . import harville_model
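For orientation, a minimal sketch of the package-level names visible in the hunks above (only what the diff shows; the rest of __init__.py is assumed unchanged):

    # Public surface re-exported by hkjc.__init__ after this change
    from hkjc import (
        live_odds,
        generate_all_qp_trades,
        generate_all_pla_trades,
        generate_historical_data,  # new in 0.3.12
        pareto_filter,             # now imported from hkjc.utils instead of hkjc.optimization
        speedmap,
        speedpro_energy,
    )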
hkjc-0.3.12/src/hkjc/historical.py (new file)
@@ -0,0 +1,147 @@
+"""Functions to fetch and process historical race and horse data from HKJC
+"""
+from __future__ import annotations
+
+import requests
+import polars as pl
+from bs4 import BeautifulSoup
+from cachetools.func import ttl_cache
+
+from .utils import _validate_date, _validate_venue_code, _parse_html_table
+
+HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
+HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId={horse_id}"
+
+
+@ttl_cache(maxsize=100, ttl=3600)
+def _soupify(url: str) -> BeautifulSoup:
+    """Fetch and parse a webpage and return BeautifulSoup object
+    """
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+    return BeautifulSoup(response.content, 'html.parser')
+
+
+def _soupify_race_page(date: str, venue_code: str, race_number: int) -> BeautifulSoup:
+    """Fetch and parse HKJC race results page and return BeautifulSoup object
+    """
+    url = HKJC_RACE_URL_TEMPLATE.format(
+        date=date, venue_code=venue_code, race_number=race_number)
+    return _soupify(url)
+
+
+def _soupify_horse_page(horse_id: str) -> BeautifulSoup:
+    """Fetch and parse HKJC race results page and return BeautifulSoup object
+    """
+    url = HKJC_HORSE_URL_TEMPLATE.format(horse_id=horse_id)
+    return _soupify(url)
+
+
+
+def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition") -> pl.DataFrame:
+    """Classify running style based on RunningPosition column
+    """
+    # Split the RunningPosition column into separate columns and convert to integers
+    df = df.with_columns(
+        pl.col(running_pos_col)
+        .str.split_exact(" ", n=3)
+        .struct.rename_fields(["StartPosition", "Position2", "Position3", "FinishPosition"])
+        # Give an alias to the struct for easier selection
+        .alias("split_data").cast(pl.Int64, strict=False)
+    ).unnest("split_data")
+
+    df = df.with_columns([
+        (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
+        pl.mean_horizontal("StartPosition", "Position2",
+                           "Position3", "FinishPosition").alias("AvgPosition"),
+    ]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
+                    .when((pl.col("PositionChange") <= 0) & pl.col("StartPosition") <= 3).then(pl.lit("FrontRunner"))
+                    .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
+                    .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
+
+    recent_style = df['RunningStyle'][:10].mode()[0]
+    df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))
+
+    return df
+
+
+def _extract_horse_data(horse_id: str) -> pl.DataFrame:
+    """Extract horse info and history from horse page
+    """
+    soup = _soupify_horse_page(horse_id)
+    table = soup.find('table', class_='bigborder')
+    horse_data = _parse_html_table(table).filter(
+        pl.col('Date') != '')  # Remove empty rows
+    horse_data = _classify_running_style(horse_data)
+
+    # Extract horse profile info
+    table = soup.find_all('table', class_='table_eng_text')
+    profile_data = _parse_html_table(table[0], skip_header=True)
+    country, age = profile_data.filter(pl.col("column_0").str.starts_with("Country"))['column_2'].item(0).split('/')
+    profile_data = _parse_html_table(table[1], skip_header=True)
+    current_rating = profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0)
+    season_start_rating = profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0)
+
+    horse_info = {
+        'HorseID': horse_id,
+        'OriginCountry': country.strip(),
+        'Age': int(age),
+        'CurrentRating': int(current_rating),
+        'SeasonStartRating': int(season_start_rating)
+    }
+    horse_data = (horse_data.with_columns([
+        pl.lit(value).alias(key) for key, value in horse_info.items()
+    ])
+    )
+    return horse_data
+
+
+def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
+    soup = _soupify_race_page(date, venue_code, race_number)
+    table = soup.find('div', class_='race_tab').find('table')
+    race_data = _parse_html_table(table)
+
+    # Extract the relevant race information
+    race_class = race_data.item(1, 0).split('-')[0].strip()
+    race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
+    race_name = race_data.item(2, 0).strip()
+    going = race_data.item(1, 2).strip()
+    course = race_data.item(2, 2).strip()
+
+    race_info = {'RaceDate': date,
+                 'Venue': venue_code,
+                 'RaceNumber': race_number,
+                 'RaceClass': race_class,
+                 'RaceDistance': race_dist,
+                 'RaceName': race_name,
+                 'Going': going,
+                 'Course': course}
+
+    # Extract the results table
+    table = soup.find('div', class_='performance').find('table')
+    race_data = (_parse_html_table(table)
+                 .with_columns([
+                     pl.lit(value).alias(key) for key, value in race_info.items()
+                 ])
+                 )
+
+    # Extract horse IDs from links
+    horse_ids = []
+    rows = table.find_all('tr')[1:]  # Skip header row
+    for row in rows:
+        horse_id = 'UNKNOWN'  # Horse link not found
+        links = row.find_all('a')
+        for link in links:
+            if 'href' in link.attrs and 'HorseId=' in link['href']:
+                horse_id = link['href'].split('HorseId=')[1]
+                break
+        horse_ids.append(horse_id)
+
+    race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
+
+    # Join with horse data
+    horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
+    horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
+    race_data = race_data.join(horse_data_df, on='HorseID', how='left')
+
+    return race_data
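A hedged usage sketch of the new module (requires network access to racing.hkjc.com; the date, venue and race number are placeholders, and _extract_race_data is a private helper rather than documented API):

    from hkjc.historical import _extract_race_data

    # Race dates are formatted YYYY/MM/DD in the results URL template above
    race_df = _extract_race_data("2025/09/07", "ST", 1)
    print(race_df.columns)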
{hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/live_odds.py
@@ -6,9 +6,10 @@ from typing import Tuple, List
 import requests
 from cachetools.func import ttl_cache
 import numpy as np
-from datetime import datetime as dt
 
-
+from .utils import _validate_date, _validate_venue_code
+
+HKJC_LIVEODDS_ENDPOINT = "https://info.cld.hkjc.com/graphql/base/"
 
 LIVEODDS_PAYLOAD = {
     "operationName": "racing",
@@ -70,7 +71,8 @@ def _fetch_live_odds(date: str, venue_code: str, race_number: int, odds_type: Tu
         "User-Agent": "python-hkjc-fetch/0.1",
     }
 
-    r = requests.post(
+    r = requests.post(HKJC_LIVEODDS_ENDPOINT, json=payload,
+                      headers=headers, timeout=10)
     if r.status_code != 200:
         raise RuntimeError(f"Request failed: {r.status_code} - {r.text}")
 
@@ -104,11 +106,8 @@ def live_odds(date: str, venue_code: str, race_number: int, odds_type: List[str]
     If odds_type is 'WIN','PLA', returns a 1D array of place odds.
     If odds_type is 'QIN','QPL', returns a 2D array of quinella place odds.
     """
-
-
-        dt.strptime(date, "%Y-%m-%d")
-    except Exception:
-        raise ValueError("Date must be in 'YYYY-MM-DD' format")
+    _validate_date(date)
+    _validate_venue_code(venue_code)
 
     mandatory_types = ['PLA']
 
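A short sketch of the call path after this refactor, assuming the rest of live_odds is unchanged: malformed dates or venue codes now raise ValueError from the shared validators in hkjc.utils, and the request is posted to the new HKJC_LIVEODDS_ENDPOINT constant.

    from hkjc import live_odds

    # Placeholder fixture; raises ValueError for a bad date or venue code
    odds = live_odds("2025-09-07", "ST", 1, ["PLA", "QPL"])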
{hkjc-0.3.10 → hkjc-0.3.12}/src/hkjc/processing.py
@@ -1,22 +1,87 @@
 """Functions to batch process trades into dataframes for analysis.
 """
 from __future__ import annotations
-from typing import Tuple, List
+from typing import Tuple, List, Union
 
 from .live_odds import live_odds
 from .strategy import qpbanker, place_only
 from .harville_model import fit_harville_to_odds
+from .historical import _extract_race_data
+from .utils import _validate_date
 
 import polars as pl
 import numpy as np
 from itertools import combinations
 from tqdm import tqdm
+from datetime import datetime as dt
 
 
 def _all_subsets(lst): return [list(x) for r in range(
     1, len(lst)+1) for x in combinations(lst, r)]  # list subsets of a list
 
 
+# ======================================
+# Historical data processing functions
+# ======================================
+incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
+             'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
+
+
+def _historical_process_single_date_venue(date: str, venue_code: str) -> Union[pl.DataFrame, None]:
+    for race_number in range(1, 12):
+        try:
+            _extract_race_data(date.strftime('%Y/%m/%d'),
+                               venue_code, race_number)
+        except:
+            return None
+
+
+def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
+    """Generate historical race dataset from start_date to end_date"""
+    _validate_date(start_date)
+    _validate_date(end_date)
+    start_dt = dt.strptime(start_date, '%Y-%m-%d')
+    end_dt = dt.strptime(end_date, '%Y-%m-%d')
+
+    dfs = []
+
+    for date in pl.date_range(start_dt, end_dt, interval='1d'):
+        for venue_code in ['ST', 'HV']:
+            df = _historical_process_single_date_venue(date, venue_code)
+            if df is None:
+                continue
+            dfs.append(df)
+
+    df = (pl.concat(dfs)
+          .filter(~pl.col('Pla').is_in(incidents))
+          .with_columns(
+              pl.col('Pla').str.split(' ').list.first().alias('Pla')
+          )
+          )
+
+    df = df.with_columns([
+        pl.col('Pla').cast(pl.Int64, strict=False),
+        pl.col('HorseNo').cast(pl.Int64, strict=False),
+        pl.col('ActWt').cast(pl.Int64, strict=False),
+        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+        pl.col('Dr').cast(pl.Int64, strict=False),
+        pl.col('RaceDistance').cast(pl.Int64, strict=False),
+        pl.col('WinOdds').cast(pl.Float64, strict=False)
+    ])
+
+    df = df.with_columns(pl.col('Finish Time')
+                         .str.strptime(pl.Duration, format='%M:%S.%f', strict=False)
+                         .dt.total_seconds()
+                         .alias('Finish Time')
+                         )
+
+    return df
+
+
+# ==========================
+# Trade processing functions
+# ==========================
+
 def _process_single_qp_trade(banker: int, covered: List[int], pla_odds: np.ndarray, qpl_odds: np.ndarray, rebate: float) -> Tuple[int, List, float, float, float]:
     """Process a single qp trade.
     """
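A hedged sketch of the new entry point (the dates are arbitrary examples; the function walks every day between the two bounds for both venue codes and race numbers 1-11, so a real run is slow and needs network access):

    from hkjc import generate_historical_data

    # Inclusive YYYY-MM-DD bounds, checked by _validate_date before scraping
    history = generate_historical_data("2025-09-01", "2025-09-07")
    print(history.head())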
hkjc-0.3.12/src/hkjc/utils.py (new file)
@@ -0,0 +1,189 @@
+import polars as pl
+from typing import List, Union
+from datetime import datetime as dt
+import bs4
+import re
+
+
+def _validate_date(date_str: str) -> bool:
+    # validate date format
+    try:
+        dt.strptime(date_str, "%Y-%m-%d")
+    except Exception:
+        raise ValueError("Date must be in 'YYYY-MM-DD' format")
+    return True
+
+
+def _validate_venue_code(venue_code: str) -> bool:
+    if venue_code not in ['HV', 'ST']:
+        raise ValueError(
+            "Venue code must be 'HV' (Happy Valley) or 'ST' (Sha Tin)")
+    return True
+
+
+def _parse_html_table(table: bs4.element.Tag, skip_header=False) -> pl.DataFrame:
+    """Parse an HTML table (HKJC format) into a Polars DataFrame
+    """
+    if table is None:
+        raise ValueError("No table found in HTML tag")
+
+    # Extract headers
+    headers = []
+    if not skip_header:
+        header_row = table.find('thead')
+        if header_row:
+            headers = [td.get_text(strip=True) for td in header_row.find_all('td')]
+        else:
+            # If no thead, try first tr
+            first_row = table.find('tr')
+            if first_row:
+                headers = [th.get_text(strip=True)
+                           for th in first_row.find_all('th')]
+                if not headers:
+                    # If no th tags, use td tags from first row as headers
+                    headers = [td.get_text(strip=True)
+                               for td in first_row.find_all('td')]
+
+    # Extract data rows
+    data = []
+    tbody = table.find('tbody')
+    rows = tbody.find_all('tr') if tbody else table.find_all('tr')
+
+    # Skip first row if it was used for headers
+    start_idx = 1 if not tbody and headers else 0
+
+    for row in rows[start_idx:]:
+        cells = row.find_all(['td', 'th'])
+        row_data = [cell.get_text(separator=' ', strip=True) for cell in cells]
+        if row_data:  # Skip empty rows
+            data.append(row_data)
+
+    # Create DataFrame
+    if not headers:
+        # Generate default column names if no headers found
+        headers = [f"column_{i}" for i in range(len(data[0]))] if data else []
+
+    # Ensure all rows have the same number of columns
+    if data:
+        max_cols = len(headers)
+        data = [row + [''] * (max_cols - len(row)) if len(row)
+                < max_cols else row[:max_cols] for row in data]
+
+    df = pl.DataFrame(data, schema=headers, orient='row')
+    # Clean column names by removing special characters
+    df.columns = [re.sub(r'[^\w]', '', col)
+                  for col in df.columns]
+    return df
+
+
+def pareto_filter(
+    df: pl.DataFrame,
+    groupby: List[str],
+    by: List[str],
+    maximize: Union[bool, List[bool]] = True
+) -> pl.DataFrame:
+    """
+    Filter dataframe to only include Pareto optimal rows within each group.
+
+    Args:
+        df: Input dataframe
+        groupby: Columns to group by (empty list for global filter)
+        by: Columns to consider for Pareto optimality
+        maximize: Whether to maximize (True) or minimize (False) each 'by' column
+
+    Returns:
+        DataFrame containing only Pareto optimal rows
+    """
+    if df.is_empty() or not by:
+        return df
+
+    # Normalize maximize to list
+    maximize_list = [maximize] * \
+        len(by) if isinstance(maximize, bool) else maximize
+
+    if len(maximize_list) != len(by):
+        raise ValueError(
+            f"Length of 'maximize' ({len(maximize_list)}) must equal length of 'by' ({len(by)})")
+
+    # Single objective: simple min/max filter
+    if len(by) == 1:
+        opt_expr = pl.col(by[0]).max(
+        ) if maximize_list[0] else pl.col(by[0]).min()
+        if groupby:
+            opt_expr = opt_expr.over(groupby)
+        return df.filter(pl.col(by[0]) == opt_expr)
+
+    # Two objectives: efficient skyline algorithm
+    if len(by) == 2:
+        temp_cols = ["__obj_0", "__obj_1"]
+
+        # Transform to maximization problem
+        df_temp = df.with_columns([
+            (pl.col(by[i]) * (1 if maximize_list[i] else -1)
+             ).alias(temp_cols[i])
+            for i in range(2)
+        ])
+
+        # Sort by first objective descending, then second descending (for stability)
+        groupby = groupby or []
+        sort_cols = (groupby if groupby else []) + temp_cols
+        sorted_df = df_temp.sort(sort_cols, descending=[
+            False] * len(groupby) + [True, True])
+
+        # Keep rows where second objective is not dominated by any previous row in group
+        if groupby:
+            max_so_far = pl.col(temp_cols[1]).cum_max().shift(
+                1, fill_value=float("-inf")).over(groupby)
+        else:
+            max_so_far = pl.col(temp_cols[1]).cum_max().shift(
+                1, fill_value=float("-inf"))
+
+        mask = pl.col(temp_cols[1]) > max_so_far
+        return sorted_df.filter(mask).drop(temp_cols)
+
+    # N objectives (N > 2): pairwise dominance check
+    df_with_id = df.with_row_index("__id")
+
+    # Self-join to compare all pairs
+    left = df_with_id.lazy()
+    right = df_with_id.lazy()
+
+    if groupby:
+        pairs = left.join(right, on=groupby, suffix="_r")
+    else:
+        pairs = left.join(right, how="cross", suffix="_r")
+
+    # Only compare different rows
+    pairs = pairs.filter(pl.col("__id") != pl.col("__id_r"))
+
+    # Build dominance conditions
+    dominance_conditions = []
+    for col, is_max in zip(by, maximize_list):
+        if is_max:
+            # right dominates left if right[col] >= left[col] for all cols
+            dominance_conditions.append(pl.col(f"{col}_r") >= pl.col(col))
+        else:
+            dominance_conditions.append(pl.col(f"{col}_r") <= pl.col(col))
+
+    # Strict dominance: all >= and at least one >
+    strict_conditions = []
+    for col, is_max in zip(by, maximize_list):
+        if is_max:
+            strict_conditions.append(pl.col(f"{col}_r") > pl.col(col))
+        else:
+            strict_conditions.append(pl.col(f"{col}_r") < pl.col(col))
+
+    is_dominated = pl.all_horizontal(
+        dominance_conditions) & pl.any_horizontal(strict_conditions)
+
+    # Find IDs of dominated rows
+    dominated_ids = (
+        pairs.filter(is_dominated)
+        .select("__id")
+        .unique()
+        .collect()
+        .get_column("__id")
+    )
+
+    # Return non-dominated rows
+    return df_with_id.filter(~pl.col("__id").is_in(dominated_ids)).drop("__id")
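A minimal, self-contained sketch of pareto_filter on made-up data (the column names and values are illustrative, not part of the package):

    import polars as pl
    from hkjc import pareto_filter

    trades = pl.DataFrame({
        "strategy": ["qp", "qp", "qp", "pla", "pla"],
        "ev":       [1.10, 1.05, 1.20, 0.95, 1.02],
        "winprob":  [0.30, 0.40, 0.25, 0.50, 0.45],
    })

    # Within each strategy, keep only rows not dominated on (ev, winprob),
    # maximizing both -- this exercises the two-objective skyline branch above
    frontier = pareto_filter(trades, groupby=["strategy"], by=["ev", "winprob"], maximize=True)
    print(frontier)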
{hkjc-0.3.10 → hkjc-0.3.12}/uv.lock
@@ -2,6 +2,19 @@ version = 1
 revision = 2
 requires-python = ">=3.11"
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.14.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "soupsieve" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/77/e9/df2358efd7659577435e2177bfa69cba6c33216681af51a707193dec162a/beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e", size = 625822, upload-time = "2025-09-29T10:05:42.613Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/94/fe/3aed5d0be4d404d12d36ab97e2f1791424d9ca39c2f754a6285d59a3b01d/beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515", size = 106392, upload-time = "2025-09-29T10:05:43.771Z" },
+]
+
 [[package]]
 name = "cachetools"
 version = "6.2.0"
@@ -97,9 +110,10 @@ wheels = [
 
 [[package]]
 name = "hkjc"
-version = "0.3.10"
+version = "0.3.12"
 source = { editable = "." }
 dependencies = [
+    { name = "beautifulsoup4" },
     { name = "cachetools" },
     { name = "fastexcel" },
     { name = "numba" },
@@ -113,6 +127,7 @@ dependencies = [
 
 [package.metadata]
 requires-dist = [
+    { name = "beautifulsoup4", specifier = ">=4.14.2" },
    { name = "cachetools", specifier = ">=6.2.0" },
     { name = "fastexcel", specifier = ">=0.16.0" },
     { name = "numba", specifier = ">=0.62.1" },
@@ -400,6 +415,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/97/30/2f9a5243008f76dfc5dee9a53dfb939d9b31e16ce4bd4f2e628bfc5d89d2/scipy-1.16.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d2a4472c231328d4de38d5f1f68fdd6d28a615138f842580a8a321b5845cf779", size = 26448374, upload-time = "2025-09-11T17:45:03.45Z" },
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.8"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
+]
+
 [[package]]
 name = "tqdm"
 version = "4.67.1"
@@ -412,6 +436,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
 ]
 
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
+]
+
 [[package]]
 name = "urllib3"
 version = "2.5.0"
hkjc-0.3.10/src/hkjc/historical.py (removed)
@@ -1,13 +0,0 @@
-"""Functions to fetch and process historical race and horse data from HKJC
-"""
-from __future__ import annotations
-
-import requests
-
-# TODO read and process all races from start date to end date
-
-# TODO query all basic info and race history for a specific horse
-
-# TODO classify running style & draw to determine blocking probability
-
-#
hkjc-0.3.10/src/hkjc/optimization.py (removed)
@@ -1,106 +0,0 @@
-import polars as pl
-from typing import List, Union
-
-def pareto_filter(
-    df: pl.DataFrame,
-    groupby: List[str],
-    by: List[str],
-    maximize: Union[bool, List[bool]] = True
-) -> pl.DataFrame:
-    """
-    Filter dataframe to only include Pareto optimal rows within each group.
-
-    Args:
-        df: Input dataframe
-        groupby: Columns to group by (empty list for global filter)
-        by: Columns to consider for Pareto optimality
-        maximize: Whether to maximize (True) or minimize (False) each 'by' column
-
-    Returns:
-        DataFrame containing only Pareto optimal rows
-    """
-    if df.is_empty() or not by:
-        return df
-
-    # Normalize maximize to list
-    maximize_list = [maximize] * len(by) if isinstance(maximize, bool) else maximize
-
-    if len(maximize_list) != len(by):
-        raise ValueError(f"Length of 'maximize' ({len(maximize_list)}) must equal length of 'by' ({len(by)})")
-
-    # Single objective: simple min/max filter
-    if len(by) == 1:
-        opt_expr = pl.col(by[0]).max() if maximize_list[0] else pl.col(by[0]).min()
-        if groupby:
-            opt_expr = opt_expr.over(groupby)
-        return df.filter(pl.col(by[0]) == opt_expr)
-
-    # Two objectives: efficient skyline algorithm
-    if len(by) == 2:
-        temp_cols = ["__obj_0", "__obj_1"]
-
-        # Transform to maximization problem
-        df_temp = df.with_columns([
-            (pl.col(by[i]) * (1 if maximize_list[i] else -1)).alias(temp_cols[i])
-            for i in range(2)
-        ])
-
-        # Sort by first objective descending, then second descending (for stability)
-        groupby = groupby or []
-        sort_cols = (groupby if groupby else []) + temp_cols
-        sorted_df = df_temp.sort(sort_cols, descending=[False] * len(groupby) + [True, True])
-
-        # Keep rows where second objective is not dominated by any previous row in group
-        if groupby:
-            max_so_far = pl.col(temp_cols[1]).cum_max().shift(1, fill_value=float("-inf")).over(groupby)
-        else:
-            max_so_far = pl.col(temp_cols[1]).cum_max().shift(1, fill_value=float("-inf"))
-
-        mask = pl.col(temp_cols[1]) > max_so_far
-        return sorted_df.filter(mask).drop(temp_cols)
-
-    # N objectives (N > 2): pairwise dominance check
-    df_with_id = df.with_row_index("__id")
-
-    # Self-join to compare all pairs
-    left = df_with_id.lazy()
-    right = df_with_id.lazy()
-
-    if groupby:
-        pairs = left.join(right, on=groupby, suffix="_r")
-    else:
-        pairs = left.join(right, how="cross", suffix="_r")
-
-    # Only compare different rows
-    pairs = pairs.filter(pl.col("__id") != pl.col("__id_r"))
-
-    # Build dominance conditions
-    dominance_conditions = []
-    for col, is_max in zip(by, maximize_list):
-        if is_max:
-            # right dominates left if right[col] >= left[col] for all cols
-            dominance_conditions.append(pl.col(f"{col}_r") >= pl.col(col))
-        else:
-            dominance_conditions.append(pl.col(f"{col}_r") <= pl.col(col))
-
-    # Strict dominance: all >= and at least one >
-    strict_conditions = []
-    for col, is_max in zip(by, maximize_list):
-        if is_max:
-            strict_conditions.append(pl.col(f"{col}_r") > pl.col(col))
-        else:
-            strict_conditions.append(pl.col(f"{col}_r") < pl.col(col))
-
-    is_dominated = pl.all_horizontal(dominance_conditions) & pl.any_horizontal(strict_conditions)
-
-    # Find IDs of dominated rows
-    dominated_ids = (
-        pairs.filter(is_dominated)
-        .select("__id")
-        .unique()
-        .collect()
-        .get_column("__id")
-    )
-
-    # Return non-dominated rows
-    return df_with_id.filter(~pl.col("__id").is_in(dominated_ids)).drop("__id")