hkjc-0.3.18-py3-none-any.whl → hkjc-0.3.21-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hkjc/__init__.py +4 -4
- hkjc/features.py +6 -0
- hkjc/harville_model.py +122 -31
- hkjc/historical.py +29 -44
- hkjc/live.py +22 -18
- hkjc/processing.py +16 -3
- {hkjc-0.3.18.dist-info → hkjc-0.3.21.dist-info}/METADATA +1 -1
- hkjc-0.3.21.dist-info/RECORD +14 -0
- hkjc/analysis.py +0 -3
- hkjc-0.3.18.dist-info/RECORD +0 -14
- {hkjc-0.3.18.dist-info → hkjc-0.3.21.dist-info}/WHEEL +0 -0
hkjc/__init__.py
CHANGED
@@ -4,10 +4,10 @@ This module re-exports commonly used symbols from the submodules.
 """
 from importlib.metadata import version as _version
 
-__all__ = ["live", "
+__all__ = ["live", "features",
            "generate_all_qp_trades", "generate_all_pla_trades", "pareto_filter",
-
-
+           "speedpro_energy", "speedmap", "harveille_model",
+           "generate_historical_data"]
 
 try:
     __version__ = _version(__name__)
@@ -17,4 +17,4 @@ except Exception:  # pragma: no cover - best-effort version resolution
 from .processing import generate_all_qp_trades, generate_all_pla_trades, generate_historical_data
 from .utils import pareto_filter
 from .speedpro import speedmap, speedpro_energy
-from . import harville_model, live
+from . import harville_model, live, features
hkjc/features.py
ADDED
@@ -0,0 +1,6 @@
+""" Polars expressions for commonly-used analysis features, subject to frequent changes.
+"""
+import polars as pl
+
+rating_diff = (pl.col('Rtg').max().over('RaceId')-pl.col('Rtg')).alias('RtgDiff')
+frontrunner_pct = (pl.col('FavoriteRunningStyle')=='FrontRunner').mean().over('RaceId').alias('FRPct')
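The two expressions added in features.py are plain Polars window expressions. A minimal usage sketch, assuming a frame that already carries the RaceId, Rtg, and FavoriteRunningStyle columns produced by the historical pipeline (the sample values below are made up):

    import polars as pl
    from hkjc import features

    # Hypothetical race data with the columns the expressions expect
    df = pl.DataFrame({
        'RaceId': ['20240101ST1'] * 3,
        'Rtg': [80, 74, 62],
        'FavoriteRunningStyle': ['FrontRunner', 'Closer', 'FrontRunner'],
    })

    # RtgDiff: rating points below the top-rated horse in the same race
    # FRPct: share of runners in the race whose favourite style is FrontRunner
    out = df.with_columns(features.rating_diff, features.frontrunner_pct)
    print(out)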
hkjc/harville_model.py
CHANGED
@@ -198,10 +198,12 @@ class HarvilleModel:
         lambda_qin (float): Weight for Qin pool loss
         lambda_quinella (float): Weight for Quinella pool loss
         lambda_banker (float): Weight for Banker pool loss
+        takeout_rate (float): House take out rate (e.g., 0.175 = 17.5%)
     """
 
     def __init__(self, n_horses: int, lambda_win: float = LAMBDA_DEFAULTS['WIN'], lambda_qin: float = LAMBDA_DEFAULTS['QIN'],
-                 lambda_quinella: float = LAMBDA_DEFAULTS['QPL'], lambda_banker: float = LAMBDA_DEFAULTS['PLA']
+                 lambda_quinella: float = LAMBDA_DEFAULTS['QPL'], lambda_banker: float = LAMBDA_DEFAULTS['PLA'],
+                 takeout_rate: float = 0.175) -> None:
         """
         Initialize model.
 
@@ -211,6 +213,9 @@ class HarvilleModel:
             lambda_qin: Weight for Qin odds (prob pair finishes 1st-2nd)
             lambda_quinella: Weight for Quinella odds (prob pair in top 3)
             lambda_banker: Weight for Banker odds (prob horse in top 3)
+            takeout_rate: House take out rate as decimal (default 0.175 = 17.5%).
+                The observed odds include the house's take, which makes
+                them higher than true odds. This parameter adjusts for that.
 
         Raises:
             ValueError: If n_horses > 20 (exponential complexity warning)
@@ -223,9 +228,51 @@ class HarvilleModel:
         self.lambda_qin = lambda_qin
         self.lambda_quinella = lambda_quinella
         self.lambda_banker = lambda_banker
+        self.takeout_rate = takeout_rate
         self._eval_count = 0
         self.result = None
 
+    def _adjust_for_takeout(self, probs: Optional[np.ndarray]) -> Optional[np.ndarray]:
+        """
+        Adjust observed probabilities to remove house takeout rate.
+
+        Observed odds from the betting market include the house's take, causing
+        the sum of implied probabilities to exceed 1.0. This method adjusts them
+        to represent true probabilities.
+
+        Args:
+            probs: Observed probabilities (can be 1D or 2D array)
+
+        Returns:
+            Adjusted probabilities with takeout removed, or None if input is None
+        """
+        if probs is None:
+            return None
+
+        # Multiply by (1 - takeout_rate) to remove the house edge
+        adjusted = probs * (1.0 - self.takeout_rate)
+
+        return adjusted
+
+    def _probs_to_market_odds(self, probs: np.ndarray) -> np.ndarray:
+        """
+        Convert fitted probabilities to market odds including takeout rate.
+
+        This converts true probabilities (which sum to 1.0) back to decimal odds
+        as they would appear in the betting market, which includes the house's
+        takeout rate. The resulting odds can be directly compared to observed odds.
+
+        Args:
+            probs: Fitted probabilities (1D or 2D array)
+
+        Returns:
+            Market odds (decimal format) with takeout reintroduced
+        """
+
+        # Convert true probabilities to market odds with takeout
+        # Market odds are worse (higher) than fair odds due to house edge
+        return (1.0 - self.takeout_rate) / probs
+
     def _loss(self, theta: np.ndarray, W_obs: Optional[np.ndarray],
               Qin_obs: Optional[np.ndarray], Q_obs: Optional[np.ndarray],
               b_obs: Optional[np.ndarray]) -> float:
@@ -290,26 +337,34 @@ class HarvilleModel:
 
         Returns:
             Dictionary containing:
-                - theta: Fitted strength parameters (n,)
-                - W_fitted: Fitted Win probabilities (n,)
-                - Qin_fitted: Fitted Qin probabilities (n, n)
-                - Q_fitted: Fitted Quinella probabilities (n, n)
-                - b_fitted: Fitted Banker probabilities (n,)
-                - P_fitted: Full place probability matrix (n, n), P[i,j] =
-                    prob horse i finishes in position j
-                - loss: Final loss value
                 - success: Whether optimization converged
                 - message: Optimizer status message
                 - n_eval: Number of loss function evaluations
+                - loss: Final loss value
+                - prob_fit: Dictionary of fitted probabilities (sum to 1.0)
+                    - theta: Fitted strength parameters (n,)
+                    - W: Win probabilities (n,)
+                    - Qin: Qin probabilities (n, n)
+                    - Q: Quinella probabilities (n, n)
+                    - b: Banker probabilities (n,)
+                    - P: Full place probability matrix (n, n), P[i,j] =
+                        prob horse i finishes in position j
+                - odds_fit: Dictionary of fitted market odds (directly comparable to observed)
+                    - WIN: Win odds (n,)
+                    - QIN: Qin odds (n, n)
+                    - QPL: Quinella Place odds (n, n)
+                    - PLA: Place odds (n,)
 
         Raises:
             ValueError: If no odds provided or shapes don't match n_horses
 
         Example:
-            >>> opt =
+            >>> opt = HarvilleModel(n_horses=10, takeout_rate=0.175)
             >>> results = opt.fit(W_obs=win_probs, Q_obs=quinella_probs)
-            >>> print(f"Fitted strengths: {results['theta']}")
+            >>> print(f"Fitted strengths: {results['prob_fit']['theta']}")
             >>> print(f"Converged: {results['success']}")
+            >>> # Compare fitted odds to observed odds
+            >>> diff = results['odds_fit']['WIN'] - observed_win_odds
         """
         if W_obs is None and Qin_obs is None and Q_obs is None and b_obs is None:
             raise ValueError("At least one type of odds must be provided")
@@ -323,6 +378,12 @@ class HarvilleModel:
         if b_obs is not None and b_obs.shape != (self.n,):
             raise ValueError(f"b_obs must be ({self.n},)")
 
+        # Adjust observed probabilities for house takeout rate
+        W_obs = self._adjust_for_takeout(W_obs)
+        Qin_obs = self._adjust_for_takeout(Qin_obs)
+        Q_obs = self._adjust_for_takeout(Q_obs)
+        b_obs = self._adjust_for_takeout(b_obs)
+
         if theta_init is None:
             if W_obs is not None:
                 theta_init = W_obs / W_obs.sum()
@@ -356,27 +417,41 @@ class HarvilleModel:
 
         W_fitted, Qin_fitted, Q_fitted, b_fitted, P_fitted = _compute_probabilities(theta_opt)
 
+        # Convert fitted probabilities to market odds (with takeout reintroduced)
+        WIN_odds_fitted = self._probs_to_market_odds(W_fitted)
+        PLA_odds_fitted = self._probs_to_market_odds(b_fitted)
+        QIN_odds_fitted = self._probs_to_market_odds(Qin_fitted)
+        QPL_odds_fitted = self._probs_to_market_odds(Q_fitted)
+
         self.result = {
-            'theta': theta_opt,
-            'W_fitted': W_fitted,
-            'Qin_fitted': Qin_fitted,
-            'Q_fitted': Q_fitted,
-            'b_fitted': b_fitted,
-            'P_fitted': P_fitted,
-            'loss': result.fun,
             'success': result.success,
             'message': result.message,
-            'n_eval': self._eval_count
+            'n_eval': self._eval_count,
+            'loss': result.fun,
+            'prob_fit': {
+                'theta': theta_opt,
+                'W': W_fitted,
+                'Qin': Qin_fitted,
+                'Q': Q_fitted,
+                'b': b_fitted,
+                'P': P_fitted
+            },
+            'odds_fit': {
+                'WIN': WIN_odds_fitted,
+                'QIN': QIN_odds_fitted,
+                'QPL': QPL_odds_fitted,
+                'PLA': PLA_odds_fitted
+            }
         }
 
         return self.result
 
-def fit_harville_to_odds(odds : dict[str, np.ndarray], lambdas : dict[str, float] = None) -> dict:
+def fit_harville_to_odds(odds : dict[str, np.ndarray], lambdas : dict[str, float] = None, takeout_rate: float = 0.175) -> dict:
     """
     Fit Harville model to observed betting odds.
 
-    At least one odds type must be provided. All odds should be
-    (not
+    At least one odds type must be provided. All odds should be decimal odds
+    (not probabilities). Matrices should be symmetric where applicable.
 
     Args:
         odds: Dictionary of odds arrays with types as keys.:
@@ -384,20 +459,35 @@ def fit_harville_to_odds(odds : dict[str, np.ndarray], lambdas : dict[str, float
         lambdas: Optional dictionary of lambda weights for each odds type.
             Keys can be 'WIN', 'QIN', 'QPL', 'PLA'. Defaults to
            {'WIN': 1.0, 'QIN': 2.0, 'QPL': 1.5, 'PLA': 0.7}
+        takeout_rate: House take out rate as decimal (default 0.175 = 17.5%).
+            The house keeps this percentage of the betting pool, causing
+            observed odds to be higher than fair odds.
 
     Returns:
         Dictionary containing:
-            - theta: Fitted strength parameters (n,)
-            - W_fitted: Fitted Win probabilities (n,)
-            - Qin_fitted: Fitted Qin probabilities (n, n)
-            - Q_fitted: Fitted Quinella probabilities (n, n)
-            - b_fitted: Fitted Banker probabilities (n,)
-            - P_fitted: Full place probability matrix (n, n), P[i,j] =
-                prob horse i finishes in position j
-            - loss: Final loss value
             - success: Whether optimization converged
             - message: Optimizer status message
            - n_eval: Number of loss function evaluations
+            - loss: Final loss value
+            - prob_fit: Dictionary of fitted probabilities (sum to 1.0)
+                - theta: Fitted strength parameters (n,)
+                - W: Win probabilities (n,)
+                - Qin: Qin probabilities (n, n)
+                - Q: Quinella probabilities (n, n)
+                - b: Banker probabilities (n,)
+                - P: Full place probability matrix (n, n), P[i,j] =
+                    prob horse i finishes in position j
+            - odds_fit: Dictionary of fitted market odds (directly comparable to observed)
+                - WIN: Win odds (n,)
+                - QIN: Qin odds (n, n)
+                - QPL: Quinella Place odds (n, n)
+                - PLA: Place odds (n,)
+
+    Example:
+        >>> odds = {'WIN': np.array([3.5, 4.2, 5.0, 8.5, 12.0])}
+        >>> result = fit_harville_to_odds(odds, takeout_rate=0.175)
+        >>> print(result['prob_fit']['theta'])  # True winning probabilities
+        >>> print(result['odds_fit']['WIN'])  # Fitted market odds (compare to input)
     """
     n_horses = None
     W_obs = None
@@ -443,7 +533,8 @@ def fit_harville_to_odds(odds : dict[str, np.ndarray], lambdas : dict[str, float
         lambda_win=merged_lambdas['WIN'],
         lambda_qin=merged_lambdas['QIN'],
         lambda_quinella=merged_lambdas['QPL'],
-        lambda_banker=merged_lambdas['PLA']
+        lambda_banker=merged_lambdas['PLA'],
+        takeout_rate=takeout_rate
     )
     result = ho.fit(W_obs=W_obs, Qin_obs=Qin_obs, Q_obs=Q_obs, b_obs=b_obs)
     return result
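A rough usage sketch of the reworked return structure and takeout handling, following the docstrings above; the odds values are made up, and the arithmetic in the comments mirrors the _adjust_for_takeout logic:

    import numpy as np
    from hkjc import harville_model

    # Hypothetical decimal win odds for a 5-horse race
    win_odds = np.array([3.5, 4.2, 5.0, 8.5, 12.0])

    # With a 17.5% takeout, the implied "true" win probability of the 3.5 horse is
    # roughly (1 / 3.5) * (1 - 0.175) ≈ 0.236, so the adjusted implied probabilities
    # no longer sum to more than 1.
    result = harville_model.fit_harville_to_odds({'WIN': win_odds}, takeout_rate=0.175)

    print(result['success'], result['loss'])
    print(result['prob_fit']['W'])     # fitted win probabilities (sum to 1.0)
    print(result['odds_fit']['WIN'])   # fitted market odds, directly comparable to win_odds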
hkjc/historical.py
CHANGED
@@ -20,7 +20,7 @@ incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
 def _soupify(url: str) -> BeautifulSoup:
     """Fetch and parse a webpage and return BeautifulSoup object
     """
-    response = requests.get(url, timeout=
+    response = requests.get(url, timeout=180)
     response.raise_for_status()
     return BeautifulSoup(response.content, 'html.parser')
 
@@ -52,11 +52,13 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
         .alias("split_data").cast(pl.Int64, strict=False)
     ).unnest("split_data")
 
-    df = df.with_columns(
+    df = df.with_columns(
+        pl.col('FinishPosition').fill_null(pl.col('Position3')))
 
     df = df.with_columns([
         (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
-        pl.mean_horizontal("StartPosition", "Position2").alias(
+        pl.mean_horizontal("StartPosition", "Position2").alias(
+            "AvgStartPosition"),
     ]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
                     .when((pl.col("AvgStartPosition") <= 3) & (pl.col("StartPosition") <= 3)).then(pl.lit("FrontRunner"))
                     .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
@@ -77,35 +79,7 @@ def _extract_horse_data(horse_no: str) -> pl.DataFrame:
         pl.col('Date') != '')  # Remove empty rows
     horse_data = _classify_running_style(horse_data)
 
-
-    table = soup.find_all('table', class_='table_eng_text')
-    profile_data = _parse_html_table(table[0], skip_header=True)
-    profile_data = _parse_html_table(table[1], skip_header=True)
-
-    try:
-        current_rating = int(profile_data.filter(
-            pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
-        season_start_rating = int(profile_data.filter(pl.col(
-            "column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
-    except:
-        current_rating, season_start_rating = 0, 0
-
-    try:
-        last_rating = int(profile_data.filter(
-            pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
-    except:
-        last_rating = 0
-
-    horse_info = {
-        'HorseID': horse_no,
-        'CurrentRating': current_rating,
-        'SeasonStartRating': season_start_rating,
-        'LastRating': last_rating if current_rating == 0 else current_rating
-    }
-    horse_data = (horse_data.with_columns([
-        pl.lit(value).alias(key) for key, value in horse_info.items()
-    ])
-    )
+    horse_data = horse_data.with_columns(pl.lit(horse_no).alias('HorseNo'))
 
     return horse_data
 
@@ -124,16 +98,16 @@ def _clean_horse_data(df: pl.DataFrame) -> pl.DataFrame:
         pl.col('Dr').cast(pl.Int64, strict=False),
         pl.col('Rtg').cast(pl.Int64, strict=False),
         pl.col('Dist').cast(pl.Int64, strict=False),
-        pl.col('WinOdds').cast(pl.Float64, strict=False)
-        pl.col('RaceIndex').cast(pl.Int64, strict=False)
+        pl.col('WinOdds').cast(pl.Float64, strict=False)
     ])
 
-    df = df.
+    df = (df.filter(~pl.col('FinishTime').str.starts_with('--'))
+          .with_columns(
         (
-            pl.col("FinishTime").str.
-            pl.col("FinishTime").str.
-        ).cast(pl.Float64).alias("FinishTime")
-    )
+            pl.col("FinishTime").str.splitn(".", 2).struct.field("field_0").cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.splitn(".", 2).struct.field("field_1").cast(pl.Float64)
+        ).cast(pl.Float64).round(2).alias("FinishTime")
+    ))
 
     df = df.with_columns(
         pl.col('RCTrackCourse').str.split_exact(' / ', 2)
@@ -141,12 +115,22 @@ def _clean_horse_data(df: pl.DataFrame) -> pl.DataFrame:
         .alias('RCTrackCourse')
     ).unnest('RCTrackCourse')
 
+    df = df.with_columns(
+        pl.when(pl.col('Date').str.len_chars() <= 8)
+        .then(pl.col('Date').str.strptime(pl.Date, '%d/%m/%y', strict=False))
+        .otherwise(pl.col('Date').str.strptime(pl.Date, '%d/%m/%Y'))
+    ).with_columns(
+        pl.concat_str(pl.col('Date').dt.strftime('%Y%m%d'), pl.col(
+            'Venue'), pl.col('RaceIndex')).alias('RaceId')
+    ).drop("VideoReplay")
     return df
 
+
 def get_horse_data(horse_no: str) -> pl.DataFrame:
     df = _extract_horse_data(horse_no)
     return _clean_horse_data(df)
 
+
 def _clean_race_data(df: pl.DataFrame) -> pl.DataFrame:
     """ Clean and convert horse data to suitable data types
     """
@@ -165,13 +149,14 @@ def _clean_race_data(df: pl.DataFrame) -> pl.DataFrame:
 
     df = df.with_columns(
         (
-            pl.col("FinishTime").str.
-            pl.col("FinishTime").str.
-        ).cast(pl.Float64).alias("FinishTime")
+            pl.col("FinishTime").str.splitn(":", 2).struct.field("field_0").cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.splitn(":", 2).struct.field("field_1").cast(pl.Float64)
+        ).cast(pl.Float64).round(2).alias("FinishTime")
     )
 
     return df
 
+
 def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
     soup = _soupify_race_page(date, venue_code, race_number)
     table = soup.find('div', class_='race_tab').find('table')
@@ -211,5 +196,5 @@ def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataF
 
 
 def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
-    df = _extract_race_data(date,venue_code,race_number)
-    return _clean_race_data(df)
+    df = _extract_race_data(date, venue_code, race_number)
+    return _clean_race_data(df)
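For context, the new FinishTime handling converts a minutes/seconds string to seconds via str.splitn. A standalone sketch with made-up times in the horse-record format (the race-page variant uses ':' as the first separator instead of '.'):

    import polars as pl

    df = pl.DataFrame({'FinishTime': ['1.09.12', '1.10.05', '--']})

    df = (df.filter(~pl.col('FinishTime').str.starts_with('--'))
            .with_columns(
                (
                    pl.col('FinishTime').str.splitn('.', 2).struct.field('field_0').cast(pl.Int64) * 60 +
                    pl.col('FinishTime').str.splitn('.', 2).struct.field('field_1').cast(pl.Float64)
                ).round(2).alias('FinishTime')  # '1.09.12' -> 69.12 seconds
            ))
    print(df)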
hkjc/live.py
CHANGED
@@ -7,8 +7,6 @@ import requests
 from cachetools.func import ttl_cache
 import numpy as np
 
-from .utils import _validate_date, _validate_venue_code
-
 HKJC_LIVEODDS_ENDPOINT = "https://info.cld.hkjc.com/graphql/base/"
 
 RACEMTG_PAYLOAD = {
@@ -245,7 +243,7 @@ query racing($date: String, $venueCode: String, $oddsTypes: [OddsType], $raceNo:
 
 
 @ttl_cache(maxsize=12, ttl=1000)
-def _fetch_live_races(date: str, venue_code: str) -> dict:
+def _fetch_live_races(date: str=None, venue_code: str=None) -> dict:
     """Fetch live race data from HKJC GraphQL endpoint."""
     payload = RACEMTG_PAYLOAD.copy()
     payload["variables"] = payload["variables"].copy()
@@ -265,9 +263,10 @@ def _fetch_live_races(date: str, venue_code: str) -> dict:
     if r.status_code != 200:
         raise RuntimeError(f"Request failed: {r.status_code} - {r.text}")
 
-
+    data = r.json()['data']['raceMeetings'][0]
+    races = data['races']
 
-    race_info = {}
+    race_info = {'Date': data['date'], 'Venue': data['venueCode'], 'Races': {}}
     for race in races:
         race_num = race['no']
         race_name = race['raceName_en']
@@ -277,12 +276,15 @@ def _fetch_live_races(date: str, venue_code: str) -> dict:
         race_class = race['raceClass_en']
         race_course = race['raceCourse']['displayCode']
 
-        runners = [{'
-                    '
-                    '
+        runners = [{'No': runner['no'],
+                    'Name': runner['name_en'],
+                    'Dr': runner['barrierDrawNumber'],
+                    'Rtg': int(runner['currentRating']),
+                    'Wt': int(runner['currentWeight']),
+                    'Handicap': int(runner['handicapWeight']),
                     'HorseNo': runner['horse']['code']
-                    } for runner in race['runners']]
-        race_info[race_num]={
+                    } for runner in race['runners'] if runner['status'] != "Standby"]
+        race_info['Races'][race_num] = {
             'No': race_num,
             'Name': race_name,
             'Class': race_class,
@@ -290,13 +292,13 @@ def _fetch_live_races(date: str, venue_code: str) -> dict:
             'Dist': race_dist,
             'Going': race_going,
             'Track': race_track,
-            'Runners': runners
+            'Runners': runners
         }
     return race_info
 
 
 @ttl_cache(maxsize=12, ttl=30)
-def _fetch_live_odds(date: str, venue_code: str, race_number: int, odds_type: Tuple[str] = ('PLA',
+def _fetch_live_odds(date: str, venue_code: str, race_number: int, odds_type: Tuple[str] = ('PLA', )) -> List[dict]:
     """Fetch live odds data from HKJC GraphQL endpoint."""
     payload = LIVEODDS_PAYLOAD.copy()
     payload["variables"] = payload["variables"].copy()
@@ -329,14 +331,14 @@ def _fetch_live_odds(date: str, venue_code: str, race_number: int, odds_type: Tu
     ]
 
 
-def live_odds(date: str, venue_code: str, race_number: int, odds_type: List[str] = ['PLA', 'QPL']) -> dict:
+def live_odds(date: str, venue_code: str, race_number: int, odds_type: List[str] = ['WIN', 'PLA', 'QPL', 'QIN']) -> dict:
     """Fetch live odds as numpy arrays.
 
     Args:
         date (str): Date in 'YYYY-MM-DD' format.
         venue_code (str): Venue code, e.g., 'ST' for Shatin, 'HV' for Happy Valley.
        race_number (int): Race number.
-        odds_type (List[str]): Types of odds to fetch. Default is ['PLA', 'QPL']. Currently the following types are supported:
+        odds_type (List[str]): Types of odds to fetch. Default is ['WIN', 'PLA', 'QPL', 'QIN']. Currently the following types are supported:
            - 'WIN': Win odds
            - 'PLA': Place odds
            - 'QIN': Quinella odds
@@ -348,11 +350,13 @@ def live_odds(date: str, venue_code: str, race_number: int, odds_type: List[str]
         If odds_type is 'WIN','PLA', returns a 1D array of place odds.
         If odds_type is 'QIN','QPL', returns a 2D array of quinella place odds.
     """
-    _validate_date(date)
-    _validate_venue_code(venue_code)
-
     race_info = _fetch_live_races(date, venue_code)
-    N = len(race_info[race_number]['Runners'])
+    N = len(race_info['Races'][race_number]['Runners'])
+
+    if (race_info['Date'] != date) or (race_info['Venue'] != venue_code):
+        print(f"[WARNING] Requested {date} {venue_code} but server returned {race_info['Date']} {race_info['Venue']}.")
+        date = race_info['Date']
+        venue_code = race_info['Venue']
 
     data = _fetch_live_odds(date, venue_code, race_number,
                             odds_type=tuple(odds_type))
hkjc/processing.py
CHANGED
@@ -42,7 +42,15 @@ def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl
 
 
 def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
-    """Generate historical race dataset from start_date to end_date
+    """Generate historical race dataset from start_date to end_date (inclusive).
+
+    Args:
+        start_date (str): Date in 'YYYY-MM-DD' format.
+        end_date (str): Date in 'YYYY-MM-DD' format.
+
+    Returns:
+        pl.DataFrame: DataFrame with all records.
+    """
     _validate_date(start_date)
     _validate_date(end_date)
     start_dt = dt.strptime(start_date, '%Y-%m-%d')
@@ -50,7 +58,7 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
 
     dfs = []
 
-    for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True), leave=False, desc='Scanning for horse IDs ...'):
+    for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True, closed='both'), leave=False, desc='Scanning for horse IDs ...'):
         for venue_code in ['ST', 'HV']:
             dfs += _historical_process_single_date_venue(date, venue_code)
 
@@ -63,7 +71,12 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
     # Use horse track records
     dfs = [_extract_horse_data(horse_id) for horse_id in tqdm(horse_ids, desc='Processing horses ...', leave=False)]
     df = pl.concat(dfs)
-
+
+    try:
+        return _clean_horse_data(df).filter(pl.col('Date').is_between(start_dt, end_dt))
+    except:
+        print('Failed to clean data. Returning raw data for debug.')
+        return df
 
 
 # ==========================
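A short sketch of the documented call; the window below is a placeholder, and the selected columns are assumed from the cleaning steps in historical.py (RaceId, HorseNo, FinishTime):

    from hkjc import generate_historical_data

    # Both endpoints are now included (closed='both'), and rows are trimmed
    # back to the requested window after cleaning.
    df = generate_historical_data('2024-01-01', '2024-01-31')
    print(df.select('RaceId', 'HorseNo', 'FinishTime').head())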
hkjc-0.3.21.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+hkjc/__init__.py,sha256=XSm9N6YbZ2SzyxjO9aR26ctB4Z1-VeBImuroSgncUfk,737
+hkjc/features.py,sha256=LicwtKBpMzpz_dSX9bjoCLLaRUu8oeZo1AloTe7v7sI,298
+hkjc/harville_model.py,sha256=WSA_1EcNOHKGraP6WVHJ3FXZPGrDrjKhJc_q70KKx80,20188
+hkjc/historical.py,sha256=aONchf7CMNs2B-WVDS_GWg8g0U0ZEH-FjbfhdJwc_N0,7683
+hkjc/live.py,sha256=CfMeHRQfhKSmhQaexM99sdP0KRbIEqg2DIvNPc1gohk,10696
+hkjc/processing.py,sha256=hQnHxl6HYlFOeSLSOCVsemgTKcwt9_tYUQI-itpvjUg,7188
+hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
+hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
+hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
+hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
+hkjc-0.3.21.dist-info/METADATA,sha256=YuIC0EvFVS3Z-8cwdzczMV7qQxMYvIKtO442iUQu5Jg,480
+hkjc-0.3.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hkjc-0.3.21.dist-info/RECORD,,
hkjc/analysis.py
DELETED
hkjc-0.3.18.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
-hkjc/__init__.py,sha256=5A9MzcITYJDcA2UbIBpkimZBYSqS4pgRuQJhTagOfpE,753
-hkjc/analysis.py,sha256=0042_NMIkQCl0J6B0P4TFfrBDCnm2B6jsCZKOEO30yI,108
-hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
-hkjc/historical.py,sha256=v9k_R47Na5en5ftrocjIHofkNAUthE_lp4CyLaCTsQE,8280
-hkjc/live.py,sha256=GqctH-BVdIL6Vi1g8XHe3p8fZBopCQf5KACLAR0meP0,10249
-hkjc/processing.py,sha256=H0chtW_FBMMhK3IzcjYjrryd3fAPYimanc2fWuGiB0M,6807
-hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
-hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
-hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
-hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
-hkjc-0.3.18.dist-info/METADATA,sha256=aoXp6Fvn3EkuXyv6p5LClSbZa5XS_bfcUxMKBJXcNvw,480
-hkjc-0.3.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-hkjc-0.3.18.dist-info/RECORD,,
{hkjc-0.3.18.dist-info → hkjc-0.3.21.dist-info}/WHEEL
File without changes