hkjc 0.3.14__tar.gz → 0.3.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hkjc-0.3.16/2024-2025-hkjc.parquet (binary, added) +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/PKG-INFO +2 -1
- hkjc-0.3.16/process.py +4 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/pyproject.toml +2 -1
- hkjc-0.3.16/src/hkjc/analysis.py +3 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/historical.py +57 -36
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/processing.py +35 -20
- {hkjc-0.3.14 → hkjc-0.3.16}/uv.lock +12 -1
- {hkjc-0.3.14 → hkjc-0.3.16}/.python-version +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/README.md +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/__init__.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/harville_model.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/live_odds.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/py.typed +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/speedpro.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/strategy/place_only.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/strategy/qpbanker.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/utils.py +0 -0
{hkjc-0.3.14 → hkjc-0.3.16}/PKG-INFO

```diff
@@ -1,11 +1,12 @@
 Metadata-Version: 2.4
 Name: hkjc
-Version: 0.3.14
+Version: 0.3.16
 Summary: Library for scrapping HKJC data and perform basic analysis
 Requires-Python: >=3.11
 Requires-Dist: beautifulsoup4>=4.14.2
 Requires-Dist: cachetools>=6.2.0
 Requires-Dist: fastexcel>=0.16.0
+Requires-Dist: joblib>=1.5.2
 Requires-Dist: numba>=0.62.1
 Requires-Dist: numpy>=2.3.3
 Requires-Dist: polars>=1.33.1
```
hkjc-0.3.16/process.py (ADDED, +4 lines; contents not rendered in this view)

{hkjc-0.3.14 → hkjc-0.3.16}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "hkjc"
-version = "0.3.14"
+version = "0.3.16"
 description = "Library for scrapping HKJC data and perform basic analysis"
 readme = "README.md"
 requires-python = ">=3.11"
@@ -8,6 +8,7 @@ dependencies = [
     "beautifulsoup4>=4.14.2",
     "cachetools>=6.2.0",
     "fastexcel>=0.16.0",
+    "joblib>=1.5.2",
     "numba>=0.62.1",
     "numpy>=2.3.3",
     "polars>=1.33.1",
```
{hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/historical.py

```diff
@@ -7,10 +7,10 @@ import polars as pl
 from bs4 import BeautifulSoup
 from cachetools.func import ttl_cache
 
-from .utils import
+from .utils import _parse_html_table
 
 HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
-HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?
+HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo={horse_no}"
 
 
 @ttl_cache(maxsize=100, ttl=3600)
```
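The horse page is now addressed by a `HorseNo` query parameter. The removed template is truncated in this rendering, but the old link-scraping code deleted further down splits hrefs on `HorseId=`, so this looks like a switch from the internal `HorseId` to the public horse number. Formatting the new template:

```python
# Hypothetical horse number "G123"; real values come from the race pages.
url = HKJC_HORSE_URL_TEMPLATE.format(horse_no="G123")
# https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo=G123
```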
```diff
@@ -30,10 +30,10 @@ def _soupify_race_page(date: str, venue_code: str, race_number: int) -> Beautifu
     return _soupify(url)
 
 
-def _soupify_horse_page(
+def _soupify_horse_page(horse_no: str) -> BeautifulSoup:
     """Fetch and parse HKJC race results page and return BeautifulSoup object
     """
-    url = HKJC_HORSE_URL_TEMPLATE.format(
+    url = HKJC_HORSE_URL_TEMPLATE.format(horse_no=horse_no)
     return _soupify(url)
```
```diff
@@ -50,6 +50,8 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
         .alias("split_data").cast(pl.Int64, strict=False)
     ).unnest("split_data")
 
+    df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
+
     df = df.with_columns([
         (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
         pl.mean_horizontal("StartPosition", "Position2",
```
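One caveat on the added `fill_null` line: polars DataFrames are immutable and `with_columns` returns a new frame, so a call whose result is not assigned back to `df` has no effect. If the intent is to backfill a missing `FinishPosition` from `Position3`, the assigned form would be:

```python
# Assigned form of the added line; without `df = ...` the call is a no-op.
df = df.with_columns(pl.col("FinishPosition").fill_null(pl.col("Position3")))
```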
```diff
@@ -59,16 +61,16 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
     .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
     .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
 
-    recent_style = df['RunningStyle'][:
+    recent_style = df['RunningStyle'][:5].mode()[0]
     df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))
 
     return df
 
 
-def _extract_horse_data(horse_id: str) -> pl.DataFrame:
+def get_horse_data(horse_no: str) -> pl.DataFrame:
     """Extract horse info and history from horse page
     """
-    soup = _soupify_horse_page(
+    soup = _soupify_horse_page(horse_no)
     table = soup.find('table', class_='bigborder')
     horse_data = _parse_html_table(table).filter(
         pl.col('Date') != '') # Remove empty rows
```
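`FavoriteRunningStyle` is now the modal `RunningStyle` over the five most recent rows (`[:5].mode()[0]`). Note that polars' `Series.mode` returns all tied values in no guaranteed order, so indexing `[0]` breaks ties arbitrarily. A toy illustration with the style labels used above:

```python
import polars as pl

styles = pl.Series("RunningStyle", ["Pacer", "Closer", "Pacer", "Closer", "Pacer"])
print(styles[:5].mode()[0])  # "Pacer", the most frequent style in the last five
```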
```diff
@@ -77,39 +79,73 @@ def _extract_horse_data(horse_id: str) -> pl.DataFrame:
     # Extract horse profile info
     table = soup.find_all('table', class_='table_eng_text')
     profile_data = _parse_html_table(table[0], skip_header=True)
-    country, age = profile_data.filter(pl.col("column_0").str.starts_with("Country"))['column_2'].item(0).split('/')
     profile_data = _parse_html_table(table[1], skip_header=True)
-
-
+
+    try:
+        current_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
+        season_start_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
+    except:
+        current_rating, season_start_rating = 0, 0
+
+    try:
+        last_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
+    except:
+        last_rating = 0
 
     horse_info = {
-        'HorseID':
-        '
-        '
-        '
-        'SeasonStartRating': int(season_start_rating)
+        'HorseID': horse_no,
+        'CurrentRating': current_rating,
+        'SeasonStartRating': season_start_rating,
+        'LastRating' : last_rating if current_rating==0 else current_rating
     }
     horse_data = (horse_data.with_columns([
         pl.lit(value).alias(key) for key, value in horse_info.items()
     ])
     )
+
+    horse_data = horse_data.with_columns([
+        pl.col('Pla').cast(pl.Int64, strict=False),
+        pl.col('WinOdds').cast(pl.Int64, strict=False),
+        pl.col('ActWt').cast(pl.Int64, strict=False),
+        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+        pl.col('Dr').cast(pl.Int64, strict=False),
+        pl.col('Rtg').cast(pl.Int64, strict=False),
+        pl.col('RaceIndex').cast(pl.Int64, strict=False),
+        pl.col('Dist').cast(pl.Int64, strict=False)
+    ])
+
+    horse_data = horse_data.with_columns(
+        (
+            pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
+        ).cast(pl.Float64).alias("FinishTime")
+    )
+
+    horse_data = horse_data.with_columns(
+        pl.col('RCTrackCourse').str.split_exact(' / ', 2)
+        .struct.rename_fields(['Venue', 'Track', 'Course'])
+        .alias('RCTrackCourse')
+    ).unnest('RCTrackCourse')
+
     return horse_data
 
 
-def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
+def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
     soup = _soupify_race_page(date, venue_code, race_number)
     table = soup.find('div', class_='race_tab').find('table')
     race_data = _parse_html_table(table)
 
     # Extract the relevant race information
+    race_id = race_data.columns[0].replace(f'RACE{race_number}','')
     race_class = race_data.item(1, 0).split('-')[0].strip()
     race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
     race_name = race_data.item(2, 0).strip()
     going = race_data.item(1, 2).strip()
     course = race_data.item(2, 2).strip()
 
-    race_info = {'
+    race_info = {'Date': date,
                  'Venue': venue_code,
+                 'RaceIndex': int(race_id),
                  'RaceNumber': race_number,
                  'RaceClass': race_class,
                  'RaceDistance': race_dist,
```
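Two of the new `get_horse_data` steps are worth unpacking. `FinishTime` strings like "1:09.54" become seconds (1 × 60 + 9.54 = 69.54), and the combined `RCTrackCourse` field is split into three columns through a struct. (Incidentally, `WinOdds` is cast to Int64 here but to Float64 in processing.py below; fractional odds survive only the latter.) A small sketch of the struct split on made-up values:

```python
import polars as pl

df = pl.DataFrame({"RCTrackCourse": ["ST / Turf / A", "HV / Turf / B"]})  # hypothetical
df = df.with_columns(
    pl.col("RCTrackCourse").str.split_exact(" / ", 2)            # struct with 3 fields
      .struct.rename_fields(["Venue", "Track", "Course"])
      .alias("RCTrackCourse")
).unnest("RCTrackCourse")                                        # 3 flat columns
# Venue: ["ST", "HV"], Track: ["Turf", "Turf"], Course: ["A", "B"]
```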
```diff
@@ -123,25 +159,10 @@ def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataF
     .with_columns([
         pl.lit(value).alias(key) for key, value in race_info.items()
     ])
+    .with_columns(
+        pl.col("Horse").str.extract(r"\((.*?)\)")
+        .alias("HorseID")
+    )
     )
-
-    # Extract horse IDs from links
-    horse_ids = []
-    rows = table.find_all('tr')[1:]  # Skip header row
-    for row in rows:
-        horse_id = 'UNKNOWN'  # Horse link not found
-        links = row.find_all('a')
-        for link in links:
-            if 'href' in link.attrs and 'HorseId=' in link['href']:
-                horse_id = link['href'].split('HorseId=')[1]
-                break
-        horse_ids.append(horse_id)
-
-    race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
-
-    # Join with horse data
-    horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
-    horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
-    race_data = race_data.join(horse_data_df, on='HorseID', how='left')
 
     return race_data
```
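The row-by-row `<a href>` scraping (splitting each link on `HorseId=`) is gone; `HorseID` is now pulled from the `Horse` column with one vectorized regex, since result tables render runners as "NAME (ID)". The expensive per-horse join has also moved out of `get_race_data` into `generate_historical_data` below. A minimal sketch of the extraction with made-up rows:

```python
import polars as pl

df = pl.DataFrame({"Horse": ["HAPPY HORSE (G123)", "LUCKY STAR (H456)"]})  # hypothetical
df = df.with_columns(
    pl.col("Horse").str.extract(r"\((.*?)\)").alias("HorseID")  # first capture group
)
# HorseID -> ["G123", "H456"]
```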
{hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/processing.py

```diff
@@ -6,7 +6,7 @@ from typing import Tuple, List, Union
 from .live_odds import live_odds
 from .strategy import qpbanker, place_only
 from .harville_model import fit_harville_to_odds
-from .historical import
+from .historical import get_race_data, get_horse_data
 from .utils import _validate_date
 
 import polars as pl
```
```diff
@@ -27,13 +27,19 @@ incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
              'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
 
 
-def _historical_process_single_date_venue(date: str, venue_code: str) ->
-
+def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl.DataFrame]:
+    dfs = []
+    iter_date = tqdm(
+        range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
+    for race_number in iter_date:
         try:
-
-
+            dfs.append(get_race_data(date.strftime('%Y/%m/%d'),
+                                     venue_code, race_number))
         except:
-
+            if race_number == 1:
+                iter_date.close()
+                return []
+    return dfs
 
 
 def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
```
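The helper now probes race numbers 1-11 for each date/venue; if race 1 already fails, the venue is assumed to have no meeting that day and the function bails out with an empty list, while failures on later races are simply skipped. One thing to watch: the bare `except:` also swallows `KeyboardInterrupt`. A variant sketch with a narrower clause (the reshaped function is mine, not the shipped code):

```python
from typing import List
import polars as pl
from tqdm import tqdm
from hkjc.historical import get_race_data

def _process_date_venue(date, venue_code: str) -> List[pl.DataFrame]:
    """Same early-exit logic, but catching Exception instead of a bare
    except, so Ctrl-C still interrupts a long scrape."""
    dfs: List[pl.DataFrame] = []
    races = tqdm(range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
    for race_number in races:
        try:
            dfs.append(get_race_data(date.strftime("%Y/%m/%d"), venue_code, race_number))
        except Exception:
            if race_number == 1:      # race 1 missing => no meeting at this venue
                races.close()
                return []
    return dfs
```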
```diff
@@ -47,16 +53,24 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
 
     for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True)):
         for venue_code in ['ST', 'HV']:
-
-
-
-
-
-
-
-
-
-            )
+            dfs += _historical_process_single_date_venue(date, venue_code)
+
+    if dfs == []:
+        raise ValueError(
+            "Failed to obtain any race data. This could be due to invalid date range, or server requests limit. Please try again later.")
+
+    horse_ids = pl.concat(dfs)['HorseID'].unique()
+
+    # Use horse track records
+    dfs = [get_horse_data(horse_id) for horse_id in horse_ids]
+    df = (
+        pl.concat(dfs).with_columns(
+            pl.col('Date').str.strptime(pl.Date, '%m/%d/%y')
+        ).filter(pl.col('Date').is_between(start_dt, end_dt))
+        .filter(~pl.col('Pla').is_in(incidents))
+        .with_columns(
+            pl.col('Pla').str.split(' ').list.first().alias('Pla')
+        )
     )
 
     df = df.with_columns([
```
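`generate_historical_data` now rebuilds the dataset from each horse's own track record: collect the `HorseID`s seen in the scraped races, pull every horse's history, then parse the `%m/%d/%y` dates, clip to the requested range, drop rows whose place is an incident code, and keep only the leading token of dead-heat places like "1 DH". A toy run of those last three steps:

```python
import polars as pl
from datetime import date

df = pl.DataFrame({"Date": ["09/15/24", "10/01/24"], "Pla": ["1 DH", "WV"]})  # made-up
incidents = ["WV"]  # abbreviated; the full code list appears above

out = (
    df.with_columns(pl.col("Date").str.strptime(pl.Date, "%m/%d/%y"))
      .filter(pl.col("Date").is_between(date(2024, 9, 1), date(2024, 12, 31)))
      .filter(~pl.col("Pla").is_in(incidents))                    # drop withdrawn row
      .with_columns(pl.col("Pla").str.split(" ").list.first())    # "1 DH" -> "1"
)
# one row left: Date 2024-09-15, Pla "1"
```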
```diff
@@ -69,10 +83,11 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
         pl.col('WinOdds').cast(pl.Float64, strict=False)
     ])
 
-    df = df.with_columns(
-
-
-
+    df = df.with_columns(
+        (
+            pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
+        ).cast(pl.Float64).alias("FinishTime")
     )
 
     return df
```
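This finish-time conversion is now duplicated verbatim here and in `get_horse_data` in historical.py; if the format handling ever changes, a shared expression helper would keep the two call sites in sync. A sketch (the helper name is hypothetical, not part of the package):

```python
import polars as pl

def finish_time_seconds(col: str = "FinishTime") -> pl.Expr:
    """'m:ss.SS' -> total seconds, e.g. '1:09.54' -> 69.54."""
    parts = pl.col(col).str.split(":")
    return (
        parts.list.get(0).cast(pl.Int64) * 60
        + parts.list.get(1).cast(pl.Float64)
    ).cast(pl.Float64).alias(col)

# usage: df = df.with_columns(finish_time_seconds())
```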
{hkjc-0.3.14 → hkjc-0.3.16}/uv.lock

```diff
@@ -110,12 +110,13 @@ wheels = [
 
 [[package]]
 name = "hkjc"
-version = "0.3.14"
+version = "0.3.16"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },
     { name = "cachetools" },
     { name = "fastexcel" },
+    { name = "joblib" },
     { name = "numba" },
     { name = "numpy" },
     { name = "polars" },
@@ -130,6 +131,7 @@ requires-dist = [
     { name = "beautifulsoup4", specifier = ">=4.14.2" },
     { name = "cachetools", specifier = ">=6.2.0" },
     { name = "fastexcel", specifier = ">=0.16.0" },
+    { name = "joblib", specifier = ">=1.5.2" },
     { name = "numba", specifier = ">=0.62.1" },
     { name = "numpy", specifier = ">=2.3.3" },
     { name = "polars", specifier = ">=1.33.1" },
@@ -148,6 +150,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },
 ]
 
+[[package]]
+name = "joblib"
+version = "1.5.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/5d/447af5ea094b9e4c4054f82e223ada074c552335b9b4b2d14bd9b35a67c4/joblib-1.5.2.tar.gz", hash = "sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55", size = 331077, upload-time = "2025-08-27T12:15:46.575Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" },
+]
+
 [[package]]
 name = "llvmlite"
 version = "0.45.1"
```