hkjc 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
hkjc/analysis.py
ADDED
hkjc/historical.py
CHANGED
@@ -10,7 +10,10 @@ from cachetools.func import ttl_cache
 from .utils import _parse_html_table
 
 HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
-HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?
+HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo={horse_no}"
+
+incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
+             'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
 
 
 @ttl_cache(maxsize=100, ttl=3600)
@@ -30,14 +33,13 @@ def _soupify_race_page(date: str, venue_code: str, race_number: int) -> Beautifu
     return _soupify(url)
 
 
-def _soupify_horse_page(
+def _soupify_horse_page(horse_no: str) -> BeautifulSoup:
     """Fetch and parse HKJC race results page and return BeautifulSoup object
     """
-    url = HKJC_HORSE_URL_TEMPLATE.format(
+    url = HKJC_HORSE_URL_TEMPLATE.format(horse_no=horse_no)
     return _soupify(url)
 
 
-
 def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition") -> pl.DataFrame:
     """Classify running style based on RunningPosition column
     """
@@ -50,25 +52,27 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
         .alias("split_data").cast(pl.Int64, strict=False)
     ).unnest("split_data")
 
+    df = df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
+
     df = df.with_columns([
         (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
         pl.mean_horizontal("StartPosition", "Position2",
                            "Position3", "FinishPosition").alias("AvgPosition"),
     ]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
-                    .when((pl.col("
+                    .when((pl.col("AvgPosition") <= 3.5) & (pl.col("StartPosition") <= 3)).then(pl.lit("FrontRunner"))
                     .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
                     .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
 
-    recent_style = df['RunningStyle'][:
+    recent_style = df['RunningStyle'][:5].mode()[0]
     df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))
 
     return df
 
 
-def _extract_horse_data(
+def _extract_horse_data(horse_no: str) -> pl.DataFrame:
     """Extract horse info and history from horse page
     """
-    soup = _soupify_horse_page(
+    soup = _soupify_horse_page(horse_no)
     table = soup.find('table', class_='bigborder')
     horse_data = _parse_html_table(table).filter(
         pl.col('Date') != '')  # Remove empty rows
@@ -78,35 +82,113 @@ def _extract_horse_data(horse_id: str) -> pl.DataFrame:
     table = soup.find_all('table', class_='table_eng_text')
     profile_data = _parse_html_table(table[0], skip_header=True)
     profile_data = _parse_html_table(table[1], skip_header=True)
-
-
+
+    try:
+        current_rating = int(profile_data.filter(
+            pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
+        season_start_rating = int(profile_data.filter(pl.col(
+            "column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
+    except:
+        current_rating, season_start_rating = 0, 0
+
+    try:
+        last_rating = int(profile_data.filter(
+            pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
+    except:
+        last_rating = 0
 
     horse_info = {
-        'HorseID':
-        'CurrentRating':
-        'SeasonStartRating':
+        'HorseID': horse_no,
+        'CurrentRating': current_rating,
+        'SeasonStartRating': season_start_rating,
+        'LastRating': last_rating if current_rating == 0 else current_rating
     }
     horse_data = (horse_data.with_columns([
         pl.lit(value).alias(key) for key, value in horse_info.items()
     ])
     )
+
     return horse_data
 
 
+def _clean_horse_data(df: pl.DataFrame) -> pl.DataFrame:
+    """ Clean and convert horse data to suitable data types
+    """
+    df = df.with_columns(
+        pl.col('Pla').str.split(' ').list.first().alias('Pla')
+    ).filter(~pl.col('Pla').is_in(incidents))
+
+    df = df.with_columns([
+        pl.col('Pla').cast(pl.Int64, strict=False),
+        pl.col('ActWt').cast(pl.Int64, strict=False),
+        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+        pl.col('Dr').cast(pl.Int64, strict=False),
+        pl.col('Rtg').cast(pl.Int64, strict=False),
+        pl.col('Dist').cast(pl.Int64, strict=False),
+        pl.col('WinOdds').cast(pl.Float64, strict=False),
+        pl.col('RaceIndex').cast(pl.Int64, strict=False)
+    ])
+
+    df = df.with_columns(
+        (
+            pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_0").cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_1").cast(pl.Int64)
+        ).cast(pl.Float64).alias("FinishTime")
+    )
+
+    df = df.with_columns(
+        pl.col('RCTrackCourse').str.split_exact(' / ', 2)
+        .struct.rename_fields(['Venue', 'Track', 'Course'])
+        .alias('RCTrackCourse')
+    ).unnest('RCTrackCourse')
+
+    return df
+
+def get_horse_data(horse_no: str) -> pl.DataFrame:
+    df = _extract_horse_data(horse_no)
+    return _clean_horse_data(df)
+
+def _clean_race_data(df: pl.DataFrame) -> pl.DataFrame:
+    """ Clean and convert horse data to suitable data types
+    """
+    df = df.with_columns(
+        pl.col('Pla').str.split(' ').list.first().alias('Pla')
+    ).filter(~pl.col('Pla').is_in(incidents))
+
+    df = df.with_columns([
+        pl.col('Pla').cast(pl.Int64, strict=False),
+        pl.col('HorseNo').cast(pl.Int64, strict=False),
+        pl.col('ActWt').cast(pl.Int64, strict=False),
+        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+        pl.col('Dr').cast(pl.Int64, strict=False),
+        pl.col('WinOdds').cast(pl.Float64, strict=False)
+    ])
+
+    df = df.with_columns(
+        (
+            pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_0").cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_1").cast(pl.Int64)
+        ).cast(pl.Float64).alias("FinishTime")
+    )
+
+    return df
+
 def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
     soup = _soupify_race_page(date, venue_code, race_number)
     table = soup.find('div', class_='race_tab').find('table')
     race_data = _parse_html_table(table)
 
     # Extract the relevant race information
+    race_id = race_data.columns[0].replace(f'RACE{race_number}', '')
     race_class = race_data.item(1, 0).split('-')[0].strip()
     race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
     race_name = race_data.item(2, 0).strip()
     going = race_data.item(1, 2).strip()
     course = race_data.item(2, 2).strip()
 
-    race_info = {'
+    race_info = {'Date': date,
                  'Venue': venue_code,
+                 'RaceIndex': int(race_id),
                  'RaceNumber': race_number,
                  'RaceClass': race_class,
                  'RaceDistance': race_dist,
@@ -120,25 +202,15 @@ def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataF
         .with_columns([
             pl.lit(value).alias(key) for key, value in race_info.items()
         ])
-
-
-
-
-
-
-
-
-
-
-
-
-        horse_ids.append(horse_id)
-
-    race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
-
-    # Join with horse data
-    horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
-    horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
-    race_data = race_data.join(horse_data_df, on='HorseID', how='left')
-
-    return race_data
+        .with_columns(
+            pl.col("Horse").str.extract(r"\((.*?)\)")
+            .alias("HorseID")
+        )
+    )
+
+    return race_data
+
+
+def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
+    df = _extract_race_data(date,venue_code,race_number)
+    return _clean_race_data(df)
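The version adds public wrappers get_race_data and get_horse_data that pair each extraction step with its matching cleaning step. Below is a minimal usage sketch, not taken from the package source: the date, venue code, horse number, and selected column names are placeholder assumptions (the '%Y/%m/%d' date format and the 'ST'/'HV' venue codes follow the ones used in processing.py, and the columns are those referenced in the cleaning code above).

# Hedged usage sketch; all argument values are placeholders.
from hkjc.historical import get_race_data, get_horse_data

# Race results for race 1 at Sha Tin ('ST') on a placeholder date.
race_df = get_race_data("2024/01/01", "ST", 1)

# Per-horse track record; "E100" is a placeholder brand number of the kind
# _extract_race_data pulls out of the parentheses in the Horse column.
horse_df = get_horse_data("E100")

print(race_df.select(["HorseID", "Pla", "WinOdds"]).head())
print(horse_df.select(["HorseID", "Pla", "Venue", "Track", "Course"]).head())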
hkjc/processing.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Tuple, List, Union
 from .live_odds import live_odds
 from .strategy import qpbanker, place_only
 from .harville_model import fit_harville_to_odds
-from .historical import _extract_race_data
+from .historical import _extract_horse_data, _extract_race_data, _clean_horse_data
 from .utils import _validate_date
 
 import polars as pl
@@ -14,7 +14,6 @@ import numpy as np
 from itertools import combinations
 from tqdm import tqdm
 from datetime import datetime as dt
-from joblib import delayed, Parallel
 
 
 def _all_subsets(lst): return [list(x) for r in range(
@@ -24,17 +23,17 @@ def _all_subsets(lst): return [list(x) for r in range(
 # ======================================
 # Historical data processing functions
 # ======================================
-
-             'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
+
 
 
 def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl.DataFrame]:
     dfs = []
-    iter_date = tqdm(
+    iter_date = tqdm(
+        range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
     for race_number in iter_date:
         try:
             dfs.append(_extract_race_data(date.strftime('%Y/%m/%d'),
-
+                                          venue_code, race_number))
         except:
             if race_number == 1:
                 iter_date.close()
@@ -51,38 +50,20 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
 
     dfs = []
 
-    for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True)):
+    for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True), leave=False, desc='Scanning for horse IDs ...'):
         for venue_code in ['ST', 'HV']:
             dfs += _historical_process_single_date_venue(date, venue_code)
 
     if dfs == []:
-        raise ValueError(
-
-    df = (pl.concat(dfs)
-          .filter(~pl.col('Pla').is_in(incidents))
-          .with_columns(
-              pl.col('Pla').str.split(' ').list.first().alias('Pla')
-          )
-          )
-
-    df = df.with_columns([
-        pl.col('Pla').cast(pl.Int64, strict=False),
-        pl.col('HorseNo').cast(pl.Int64, strict=False),
-        pl.col('ActWt').cast(pl.Int64, strict=False),
-        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
-        pl.col('Dr').cast(pl.Int64, strict=False),
-        pl.col('RaceDistance').cast(pl.Int64, strict=False),
-        pl.col('WinOdds').cast(pl.Float64, strict=False)
-    ])
-
-    df = df.with_columns(
-        (
-            pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
-            pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
-        ).cast(pl.Float64).alias("FinishTime")
-    )
+        raise ValueError(
+            "Failed to obtain any race data. This could be due to invalid date range, or server requests limit. Please try again later.")
 
-
+    horse_ids = pl.concat(dfs)['HorseID'].unique()
+
+    # Use horse track records
+    dfs = [_extract_horse_data(horse_id) for horse_id in tqdm(horse_ids, desc='Processing horses ...', leave=False)]
+    df = pl.concat(dfs)
+    return _clean_horse_data(df)
 
 
 # ==========================
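With this change, generate_historical_data scans each race day only to collect HorseID values, then rebuilds the dataset from per-horse track records via _extract_horse_data and _clean_horse_data. A minimal call sketch, assuming the start/end dates are strings in the same '%Y/%m/%d' form used elsewhere in the package (the exact format accepted by _validate_date is not shown in this diff):

# Hedged sketch; the date strings below are placeholders and their accepted
# format is an assumption.
from hkjc.processing import generate_historical_data

# Scans ST and HV meetings in the range, collects unique HorseIDs,
# then fetches and cleans each horse's track record.
df = generate_historical_data("2024/01/01", "2024/01/31")
print(df.head())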
hkjc-0.3.15.dist-info/RECORD → hkjc-0.3.17.dist-info/RECORD
CHANGED
@@ -1,13 +1,14 @@
 hkjc/__init__.py,sha256=TI7PVhmoWSvYX-xdTEdaT3jfY99LiYQFRQZaIwBhJd8,785
+hkjc/analysis.py,sha256=0042_NMIkQCl0J6B0P4TFfrBDCnm2B6jsCZKOEO30yI,108
 hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
-hkjc/historical.py,sha256=
+hkjc/historical.py,sha256=yQQAx8vlr2EqcPazpYp1x2ku7dy3imQoDWImHCRv1QA,8330
 hkjc/live_odds.py,sha256=G4ELBBp1d2prxye9kKzu2pwtS4vSfRPOmEuT7-Nd-3A,4741
-hkjc/processing.py,sha256=
+hkjc/processing.py,sha256=xrvEUgu_jz8ZxevOsRsYz0T7pWyNtSCMI6LUYByOLOw,6812
 hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
 hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
 hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
 hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
-hkjc-0.3.
-hkjc-0.3.
-hkjc-0.3.
+hkjc-0.3.17.dist-info/METADATA,sha256=gKSkXKYo_HCg2S4ZeAjnqZniWV0V2kGpRH_g25K9Rmo,481
+hkjc-0.3.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hkjc-0.3.17.dist-info/RECORD,,