hkjc 0.3.16__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hkjc/historical.py
CHANGED
@@ -12,6 +12,9 @@ from .utils import _parse_html_table
|
|
12
12
|
HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
|
13
13
|
HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo={horse_no}"
|
14
14
|
|
15
|
+
incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
|
16
|
+
'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
|
17
|
+
|
15
18
|
|
16
19
|
@ttl_cache(maxsize=100, ttl=3600)
|
17
20
|
def _soupify(url: str) -> BeautifulSoup:
|
@@ -37,7 +40,6 @@ def _soupify_horse_page(horse_no: str) -> BeautifulSoup:
|
|
37
40
|
return _soupify(url)
|
38
41
|
|
39
42
|
|
40
|
-
|
41
43
|
def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition") -> pl.DataFrame:
|
42
44
|
"""Classify running style based on RunningPosition column
|
43
45
|
"""
|
@@ -50,14 +52,14 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
|
|
50
52
|
.alias("split_data").cast(pl.Int64, strict=False)
|
51
53
|
).unnest("split_data")
|
52
54
|
|
53
|
-
df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
|
55
|
+
df = df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
|
54
56
|
|
55
57
|
df = df.with_columns([
|
56
58
|
(pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
|
57
59
|
pl.mean_horizontal("StartPosition", "Position2",
|
58
60
|
"Position3", "FinishPosition").alias("AvgPosition"),
|
59
61
|
]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
|
60
|
-
.when((pl.col("
|
62
|
+
.when((pl.col("AvgPosition") <= 3.5) & (pl.col("StartPosition") <= 3)).then(pl.lit("FrontRunner"))
|
61
63
|
.when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
|
62
64
|
.otherwise(pl.lit("Pacer")).alias("RunningStyle"))
|
63
65
|
|
@@ -67,7 +69,7 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
|
|
67
69
|
return df
|
68
70
|
|
69
71
|
|
70
|
-
def
|
72
|
+
def _extract_horse_data(horse_no: str) -> pl.DataFrame:
|
71
73
|
"""Extract horse info and history from horse page
|
72
74
|
"""
|
73
75
|
soup = _soupify_horse_page(horse_no)
|
@@ -82,13 +84,16 @@ def get_horse_data(horse_no: str) -> pl.DataFrame:
|
|
82
84
|
profile_data = _parse_html_table(table[1], skip_header=True)
|
83
85
|
|
84
86
|
try:
|
85
|
-
current_rating = int(profile_data.filter(
|
86
|
-
|
87
|
+
current_rating = int(profile_data.filter(
|
88
|
+
pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
|
89
|
+
season_start_rating = int(profile_data.filter(pl.col(
|
90
|
+
"column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
|
87
91
|
except:
|
88
92
|
current_rating, season_start_rating = 0, 0
|
89
|
-
|
93
|
+
|
90
94
|
try:
|
91
|
-
last_rating = int(profile_data.filter(
|
95
|
+
last_rating = int(profile_data.filter(
|
96
|
+
pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
|
92
97
|
except:
|
93
98
|
last_rating = 0
|
94
99
|
|
@@ -96,47 +101,85 @@ def get_horse_data(horse_no: str) -> pl.DataFrame:
|
|
96
101
|
'HorseID': horse_no,
|
97
102
|
'CurrentRating': current_rating,
|
98
103
|
'SeasonStartRating': season_start_rating,
|
99
|
-
'LastRating'
|
104
|
+
'LastRating': last_rating if current_rating == 0 else current_rating
|
100
105
|
}
|
101
106
|
horse_data = (horse_data.with_columns([
|
102
107
|
pl.lit(value).alias(key) for key, value in horse_info.items()
|
103
108
|
])
|
104
109
|
)
|
105
110
|
|
106
|
-
|
111
|
+
return horse_data
|
112
|
+
|
113
|
+
|
114
|
+
def _clean_horse_data(df: pl.DataFrame) -> pl.DataFrame:
|
115
|
+
""" Clean and convert horse data to suitable data types
|
116
|
+
"""
|
117
|
+
df = df.with_columns(
|
118
|
+
pl.col('Pla').str.split(' ').list.first().alias('Pla')
|
119
|
+
).filter(~pl.col('Pla').is_in(incidents))
|
120
|
+
|
121
|
+
df = df.with_columns([
|
107
122
|
pl.col('Pla').cast(pl.Int64, strict=False),
|
108
|
-
pl.col('WinOdds').cast(pl.Int64, strict=False),
|
109
123
|
pl.col('ActWt').cast(pl.Int64, strict=False),
|
110
124
|
pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
|
111
125
|
pl.col('Dr').cast(pl.Int64, strict=False),
|
112
126
|
pl.col('Rtg').cast(pl.Int64, strict=False),
|
113
|
-
pl.col('
|
114
|
-
pl.col('
|
127
|
+
pl.col('Dist').cast(pl.Int64, strict=False),
|
128
|
+
pl.col('WinOdds').cast(pl.Float64, strict=False),
|
129
|
+
pl.col('RaceIndex').cast(pl.Int64, strict=False)
|
115
130
|
])
|
116
131
|
|
117
|
-
|
132
|
+
df = df.with_columns(
|
118
133
|
(
|
119
|
-
pl.col("FinishTime").str.
|
120
|
-
pl.col("FinishTime").str.
|
134
|
+
pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_0").cast(pl.Int64) * 60 +
|
135
|
+
pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_1").cast(pl.Int64)
|
121
136
|
).cast(pl.Float64).alias("FinishTime")
|
122
137
|
)
|
123
138
|
|
124
|
-
|
139
|
+
df = df.with_columns(
|
125
140
|
pl.col('RCTrackCourse').str.split_exact(' / ', 2)
|
126
141
|
.struct.rename_fields(['Venue', 'Track', 'Course'])
|
127
142
|
.alias('RCTrackCourse')
|
128
143
|
).unnest('RCTrackCourse')
|
129
144
|
|
130
|
-
return
|
145
|
+
return df
|
131
146
|
|
147
|
+
def get_horse_data(horse_no: str) -> pl.DataFrame:
|
148
|
+
df = _extract_horse_data(horse_no)
|
149
|
+
return _clean_horse_data(df)
|
132
150
|
|
133
|
-
def
|
151
|
+
def _clean_race_data(df: pl.DataFrame) -> pl.DataFrame:
|
152
|
+
""" Clean and convert race data to suitable data types
|
153
|
+
"""
|
154
|
+
df = df.with_columns(
|
155
|
+
pl.col('Pla').str.split(' ').list.first().alias('Pla')
|
156
|
+
).filter(~pl.col('Pla').is_in(incidents))
|
157
|
+
|
158
|
+
df = df.with_columns([
|
159
|
+
pl.col('Pla').cast(pl.Int64, strict=False),
|
160
|
+
pl.col('HorseNo').cast(pl.Int64, strict=False),
|
161
|
+
pl.col('ActWt').cast(pl.Int64, strict=False),
|
162
|
+
pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
|
163
|
+
pl.col('Dr').cast(pl.Int64, strict=False),
|
164
|
+
pl.col('WinOdds').cast(pl.Float64, strict=False)
|
165
|
+
])
|
166
|
+
|
167
|
+
df = df.with_columns(
|
168
|
+
(
|
169
|
+
pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_0").cast(pl.Int64) * 60 +
|
170
|
+
pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_1").cast(pl.Int64)
|
171
|
+
).cast(pl.Float64).alias("FinishTime")
|
172
|
+
)
|
173
|
+
|
174
|
+
return df
|
175
|
+
|
176
|
+
def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
|
134
177
|
soup = _soupify_race_page(date, venue_code, race_number)
|
135
178
|
table = soup.find('div', class_='race_tab').find('table')
|
136
179
|
race_data = _parse_html_table(table)
|
137
180
|
|
138
181
|
# Extract the relevant race information
|
139
|
-
race_id = race_data.columns[0].replace(f'RACE{race_number}','')
|
182
|
+
race_id = race_data.columns[0].replace(f'RACE{race_number}', '')
|
140
183
|
race_class = race_data.item(1, 0).split('-')[0].strip()
|
141
184
|
race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
|
142
185
|
race_name = race_data.item(2, 0).strip()
|
@@ -162,7 +205,12 @@ def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
|
|
162
205
|
.with_columns(
|
163
206
|
pl.col("Horse").str.extract(r"\((.*?)\)")
|
164
207
|
.alias("HorseID")
|
165
|
-
|
166
|
-
|
208
|
+
)
|
209
|
+
)
|
210
|
+
|
211
|
+
return race_data
|
167
212
|
|
168
|
-
|
213
|
+
|
214
|
+
def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
|
215
|
+
df = _extract_race_data(date,venue_code,race_number)
|
216
|
+
return _clean_race_data(df)
|
hkjc/processing.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Tuple, List, Union
|
|
6
6
|
from .live_odds import live_odds
|
7
7
|
from .strategy import qpbanker, place_only
|
8
8
|
from .harville_model import fit_harville_to_odds
|
9
|
-
from .historical import
|
9
|
+
from .historical import _extract_horse_data, _extract_race_data, _clean_horse_data
|
10
10
|
from .utils import _validate_date
|
11
11
|
|
12
12
|
import polars as pl
|
@@ -23,8 +23,7 @@ def _all_subsets(lst): return [list(x) for r in range(
|
|
23
23
|
# ======================================
|
24
24
|
# Historical data processing functions
|
25
25
|
# ======================================
|
26
|
-
|
27
|
-
'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
|
26
|
+
|
28
27
|
|
29
28
|
|
30
29
|
def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl.DataFrame]:
|
@@ -33,7 +32,7 @@ def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl
|
|
33
32
|
range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
|
34
33
|
for race_number in iter_date:
|
35
34
|
try:
|
36
|
-
dfs.append(
|
35
|
+
dfs.append(_extract_race_data(date.strftime('%Y/%m/%d'),
|
37
36
|
venue_code, race_number))
|
38
37
|
except:
|
39
38
|
if race_number == 1:
|
@@ -51,7 +50,7 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
|
|
51
50
|
|
52
51
|
dfs = []
|
53
52
|
|
54
|
-
for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True)):
|
53
|
+
for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True), leave=False, desc='Scanning for horse IDs ...'):
|
55
54
|
for venue_code in ['ST', 'HV']:
|
56
55
|
dfs += _historical_process_single_date_venue(date, venue_code)
|
57
56
|
|
@@ -62,35 +61,9 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
|
|
62
61
|
horse_ids = pl.concat(dfs)['HorseID'].unique()
|
63
62
|
|
64
63
|
# Use horse track records
|
65
|
-
dfs = [
|
66
|
-
df = (
|
67
|
-
|
68
|
-
pl.col('Date').str.strptime(pl.Date, '%m/%d/%y')
|
69
|
-
).filter(pl.col('Date').is_between(start_dt, end_dt))
|
70
|
-
.filter(~pl.col('Pla').is_in(incidents))
|
71
|
-
.with_columns(
|
72
|
-
pl.col('Pla').str.split(' ').list.first().alias('Pla')
|
73
|
-
)
|
74
|
-
)
|
75
|
-
|
76
|
-
df = df.with_columns([
|
77
|
-
pl.col('Pla').cast(pl.Int64, strict=False),
|
78
|
-
pl.col('HorseNo').cast(pl.Int64, strict=False),
|
79
|
-
pl.col('ActWt').cast(pl.Int64, strict=False),
|
80
|
-
pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
|
81
|
-
pl.col('Dr').cast(pl.Int64, strict=False),
|
82
|
-
pl.col('RaceDistance').cast(pl.Int64, strict=False),
|
83
|
-
pl.col('WinOdds').cast(pl.Float64, strict=False)
|
84
|
-
])
|
85
|
-
|
86
|
-
df = df.with_columns(
|
87
|
-
(
|
88
|
-
pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
|
89
|
-
pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
|
90
|
-
).cast(pl.Float64).alias("FinishTime")
|
91
|
-
)
|
92
|
-
|
93
|
-
return df
|
64
|
+
dfs = [_extract_horse_data(horse_id) for horse_id in tqdm(horse_ids, desc='Processing horses ...', leave=False)]
|
65
|
+
df = pl.concat(dfs)
|
66
|
+
return _clean_horse_data(df)
|
94
67
|
|
95
68
|
|
96
69
|
# ==========================
|
@@ -1,14 +1,14 @@
|
|
1
1
|
hkjc/__init__.py,sha256=TI7PVhmoWSvYX-xdTEdaT3jfY99LiYQFRQZaIwBhJd8,785
|
2
2
|
hkjc/analysis.py,sha256=0042_NMIkQCl0J6B0P4TFfrBDCnm2B6jsCZKOEO30yI,108
|
3
3
|
hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
|
4
|
-
hkjc/historical.py,sha256=
|
4
|
+
hkjc/historical.py,sha256=yQQAx8vlr2EqcPazpYp1x2ku7dy3imQoDWImHCRv1QA,8330
|
5
5
|
hkjc/live_odds.py,sha256=G4ELBBp1d2prxye9kKzu2pwtS4vSfRPOmEuT7-Nd-3A,4741
|
6
|
-
hkjc/processing.py,sha256=
|
6
|
+
hkjc/processing.py,sha256=xrvEUgu_jz8ZxevOsRsYz0T7pWyNtSCMI6LUYByOLOw,6812
|
7
7
|
hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
|
9
9
|
hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
|
10
10
|
hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
|
11
11
|
hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
|
12
|
-
hkjc-0.3.
|
13
|
-
hkjc-0.3.
|
14
|
-
hkjc-0.3.
|
12
|
+
hkjc-0.3.17.dist-info/METADATA,sha256=gKSkXKYo_HCg2S4ZeAjnqZniWV0V2kGpRH_g25K9Rmo,481
|
13
|
+
hkjc-0.3.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
14
|
+
hkjc-0.3.17.dist-info/RECORD,,
|
File without changes
|