hkjc 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hkjc/__init__.py CHANGED
@@ -6,7 +6,7 @@ from importlib.metadata import version as _version
6
6
 
7
7
# Public names re-exported by ``from hkjc import *``.  Every entry must match
# an attribute that actually exists on the package; the Harville entry refers
# to the ``harville_model`` submodule imported at the bottom of this file.
__all__ = ["live_odds", "qpbanker",
           "generate_all_qp_trades", "generate_pareto_qp_trades",
           "speedpro_df", "speedmap", "harville_model"]
10
10
 
11
11
  try:
12
12
  __version__ = _version(__name__)
@@ -15,4 +15,5 @@ except Exception: # pragma: no cover - best-effort version resolution
15
15
 
16
16
  from .live_odds import live_odds
17
17
  from .processing import generate_all_qp_trades, generate_pareto_qp_trades
18
- from .speedpro import speedmap, speedpro_df
18
+ from .speedpro import speedmap, speedpro_df
19
+ from . import harville_model
hkjc/harville_model.py ADDED
@@ -0,0 +1,362 @@
1
+ """
2
+ Harville Race Model Optimizer
3
+
4
+ Estimates horse racing outcome probabilities using the Harville model via dynamic
5
+ programming. Fits latent strength parameters from observed betting market odds across
6
+ multiple pool types (Win, Qin, Quinella, Banker).
7
+
8
+ The optimizer uses O(N * 2^N) complexity DP with Numba JIT compilation for speed.
9
+ Suitable for races with up to ~20 horses.
10
+
11
+ Example:
12
+ >>> optimizer = HarvilleOptimizer(n_horses=14)
13
+ >>> results = optimizer.fit(W_obs=win_odds, Qin_obs=qin_odds,
14
+ ... Q_obs=quinella_odds, b_obs=banker_odds)
15
+ >>> print(results['theta']) # Fitted strength parameters
16
+ """
17
+
18
+ import numpy as np
19
+ from scipy.optimize import minimize
20
+ from numba import njit
21
+ from typing import Tuple, Optional
22
+
23
+
24
@njit(cache=True)
def _popcount(mask: int) -> int:
    """Return the number of set bits in ``mask``.

    ``x & (x - 1)`` clears the lowest set bit of ``x``, so the loop body runs
    once per set bit rather than once per bit position.
    """
    bits_set = 0
    remaining = mask
    while remaining != 0:
        remaining = remaining & (remaining - 1)
        bits_set += 1
    return bits_set
31
+
32
+
33
@njit(cache=True)
def _precompute_mask_info(n: int) -> Tuple[np.ndarray, np.ndarray]:
    """Tabulate membership indicators and sizes for every subset of ``n`` items.

    Subsets are encoded as bitmasks in ``range(1 << n)``; bit ``i`` set means
    item ``i`` belongs to the subset.

    Returns:
        A pair ``(membership, sizes)`` where ``membership`` has shape
        ``(2**n, n)`` with ``membership[mask, i] == 1.0`` exactly when bit
        ``i`` of ``mask`` is set (0.0 otherwise), and ``sizes[mask]`` is the
        number of set bits in ``mask`` (int32).
    """
    n_subsets = 1 << n
    membership = np.zeros((n_subsets, n), dtype=np.float64)
    sizes = np.zeros(n_subsets, dtype=np.int32)

    for subset in range(n_subsets):
        sizes[subset] = _popcount(subset)
        for item in range(n):
            if (subset >> item) & 1:
                membership[subset, item] = 1.0

    return membership, sizes
46
+
47
+
48
@njit(cache=True)
def _compute_dp_vectorized(theta: np.ndarray, k_max: int) -> np.ndarray:
    """Run the subset DP for the Harville model.

    ``dp[k, mask]`` is the probability that the first ``k`` finishers are
    exactly the horses in ``mask`` (in any order).  Starting from
    ``dp[0, 0] = 1``, a state is extended by one horse ``i`` not yet in the
    mask with probability ``theta[i] / (1 - strength of horses in mask)``.

    The "remaining strength" is computed as ``1 - sum(theta[mask])``, so
    ``theta`` is expected to sum to 1.

    Args:
        theta: Strength of each horse, shape (n,).
        k_max: Number of finishing positions to expand.

    Returns:
        Array of shape ``(k_max + 1, 2**n)`` holding the DP table.
    """
    n_horses = len(theta)

    membership, sizes = _precompute_mask_info(n_horses)
    # Combined strength of the horses contained in every subset.
    subset_strength = membership @ theta

    dp = np.zeros((k_max + 1, 1 << n_horses))
    dp[0, 0] = 1.0  # nothing placed yet: the empty set is certain

    for placed in range(k_max):
        for subset in np.where(sizes == placed)[0]:
            reach_prob = dp[placed, subset]
            if reach_prob == 0:
                continue

            unplaced_strength = 1.0 - subset_strength[subset]
            # No usable strength left among the unplaced horses: dividing by
            # it would blow up, so this state is not expanded further.
            if unplaced_strength < 1e-12:
                continue

            for horse in range(n_horses):
                bit = 1 << horse
                if subset & bit:
                    continue
                dp[placed + 1, subset | bit] += reach_prob * theta[horse] / unplaced_strength

    return dp
80
+
81
+
82
@njit(cache=True)
def _extract_pair_in_top_k(dp: np.ndarray, n: int, k: int) -> np.ndarray:
    """Accumulate, for every pair of horses, the mass of size-``k`` subsets containing both.

    Args:
        dp: DP table indexed as ``dp[k, mask]``.
        n: Number of horses (bit width of the masks).
        k: Subset size to read from ``dp``.

    Returns:
        Symmetric ``(n, n)`` matrix ``M`` where ``M[i, j]`` is the sum of
        ``dp[k, mask]`` over all masks of size ``k`` containing both ``i``
        and ``j``.  The diagonal is filled too: ``M[i, i]`` is the sum over
        masks of size ``k`` that contain ``i``.
    """
    pair_prob = np.zeros((n, n))
    # Scratch buffer for the members of the current subset; all k slots are
    # overwritten for every subset of size k before they are read.
    members = np.empty(k, dtype=np.int32)

    for subset in range(1 << n):
        if _popcount(subset) != k:
            continue

        prob = dp[k, subset]
        if prob == 0:
            continue

        filled = 0
        for horse in range(n):
            if subset & (1 << horse):
                members[filled] = horse
                filled += 1

        for a in range(k):
            for b in range(k):
                pair_prob[members[a], members[b]] += prob

    return pair_prob
110
+
111
+
112
@njit(cache=True)
def _extract_top_k_probs(dp: np.ndarray, n: int, k_max: int) -> np.ndarray:
    """Sum DP mass per horse and subset size.

    Args:
        dp: DP table indexed as ``dp[k, mask]``.
        n: Number of horses (bit width of the masks).
        k_max: Largest subset size to read.

    Returns:
        Array ``T`` of shape ``(n, k_max + 1)`` where ``T[i, k]`` is the sum
        of ``dp[k, mask]`` over masks of size ``k`` containing horse ``i``.
        Column 0 is left at zero.
    """
    in_top = np.zeros((n, k_max + 1))

    # One pass over all subsets; each subset contributes to the column that
    # matches its own size.
    for subset in range(1 << n):
        size = _popcount(subset)
        if size < 1 or size > k_max:
            continue

        prob = dp[size, subset]
        if prob == 0:
            continue

        for horse in range(n):
            if subset & (1 << horse):
                in_top[horse, size] += prob

    return in_top
134
+
135
+
136
@njit(cache=True)
def _compute_probabilities(theta: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Derive all model quantities from the strength vector ``theta``.

    Column 3 of the top-k table is read unconditionally, so ``theta`` needs
    at least 3 entries.

    Returns:
        Tuple ``(W, Qin, Q, b, P)``:
        - ``P`` (n, n): ``P[i, j] = T[i, j + 1] - T[i, j]`` where ``T[i, k]``
          is the top-``k`` table from ``_extract_top_k_probs``.
        - ``W`` (n,): first column of ``P``.
        - ``Qin`` (n, n): pair matrix for subsets of size 2.
        - ``Q`` (n, n): pair matrix for subsets of size 3.
        - ``b`` (n,): column 3 of the top-``k`` table.
    """
    n_horses = len(theta)

    # Expand the DP over every finishing position so the full table is available.
    table = _compute_dp_vectorized(theta, n_horses)
    top_k = _extract_top_k_probs(table, n_horses, n_horses)

    # Differences of adjacent top-k columns.
    P = top_k[:, 1:] - top_k[:, :-1]

    W = P[:, 0]
    b = top_k[:, 3]
    Qin = _extract_pair_in_top_k(table, n_horses, 2)
    Q = _extract_pair_in_top_k(table, n_horses, 3)

    return W, Qin, Q, b, P
155
+
156
+
157
@njit(cache=True)
def _kl_divergence(p_obs: np.ndarray, p_model: np.ndarray) -> float:
    """Return KL(p_obs || p_model) after flattening, flooring and renormalizing.

    Both inputs are flattened, every entry is raised to at least 1e-10 so the
    logarithm and the division stay finite, and each vector is then rescaled
    to sum to 1 (skipped only if its sum does not exceed the floor).
    """
    floor = 1e-10

    obs = np.maximum(p_obs.ravel(), floor)
    model = np.maximum(p_model.ravel(), floor)

    obs_total = obs.sum()
    if obs_total > floor:
        obs = obs / obs_total

    model_total = model.sum()
    if model_total > floor:
        model = model / model_total

    ratio = obs / model
    return np.sum(obs * np.log(ratio))
173
+
174
+
175
class HarvilleOptimizer:
    """
    Fits Harville race model to betting market odds using dynamic programming.

    The Harville model assigns each horse a latent strength parameter theta_i, where
    the probability of finishing next among remaining horses is proportional to
    relative strength. This optimizer estimates theta from observed betting odds
    across multiple pool types.

    Default lambda weights (1.0, 2.0, 1.5, 0.7) reflect that early Win odds are
    biased by informed traders waiting until closing, while exotic pools provide
    more stable signals for ensemble estimation.

    Attributes:
        n (int): Number of horses
        lambda_win (float): Weight for Win pool loss
        lambda_qin (float): Weight for Qin pool loss
        lambda_quinella (float): Weight for Quinella pool loss
        lambda_banker (float): Weight for Banker pool loss
    """

    # Optimizers that fit() knows how to configure.
    _SUPPORTED_METHODS = ('L-BFGS-B', 'SLSQP')
    # The model evaluates top-3 quantities (it reads column 3 of the top-k
    # table), which do not exist for smaller fields.
    _MIN_HORSES = 3
    # The DP enumerates all 2^N subsets of horses.
    _MAX_HORSES = 20

    def __init__(self, n_horses: int, lambda_win: float = 1.0, lambda_qin: float = 2.0,
                 lambda_quinella: float = 1.5, lambda_banker: float = 0.7):
        """
        Initialize optimizer.

        Args:
            n_horses: Number of horses in race (between 3 and 20 inclusive)
            lambda_win: Weight for Win odds (prob horse finishes 1st)
            lambda_qin: Weight for Qin odds (prob pair finishes 1st-2nd)
            lambda_quinella: Weight for Quinella odds (prob pair in top 3)
            lambda_banker: Weight for Banker odds (prob horse in top 3)

        Raises:
            ValueError: If n_horses > 20 (exponential complexity) or
                n_horses < 3 (top-3 probabilities are undefined)
        """
        if n_horses > self._MAX_HORSES:
            raise ValueError("N > 20 may be too slow (2^N complexity)")
        if n_horses < self._MIN_HORSES:
            # The compiled kernels index the top-3 column without bounds
            # checks, so reject small fields here instead of reading
            # out-of-range memory later.
            raise ValueError(
                f"n_horses must be at least {self._MIN_HORSES} "
                "(top-3 probabilities are computed)"
            )

        self.n = n_horses
        self.lambda_win = lambda_win
        self.lambda_qin = lambda_qin
        self.lambda_quinella = lambda_quinella
        self.lambda_banker = lambda_banker
        self._eval_count = 0

    @staticmethod
    def _as_float_array(values) -> Optional[np.ndarray]:
        """Convert an optional array-like to a float64 ndarray (None passes through)."""
        if values is None:
            return None
        return np.asarray(values, dtype=np.float64)

    def loss(self, theta: np.ndarray, W_obs: Optional[np.ndarray],
             Qin_obs: Optional[np.ndarray], Q_obs: Optional[np.ndarray],
             b_obs: Optional[np.ndarray]) -> float:
        """
        Compute weighted KL divergence loss between observed and model odds.

        Args:
            theta: Strength parameters (will be normalized to simplex)
            W_obs: Observed Win probabilities (n,) or None
            Qin_obs: Observed Qin probabilities (n, n) or None
            Q_obs: Observed Quinella probabilities (n, n) or None
            b_obs: Observed Banker probabilities (n,) or None

        Returns:
            Scalar loss value (sum of weighted KL divergences)
        """
        self._eval_count += 1

        # Map any optimizer iterate onto the open simplex: strictly positive
        # entries that sum to one.
        theta = np.abs(theta) + 1e-10
        theta = theta / theta.sum()

        W_model, Qin_model, Q_model, b_model, _ = _compute_probabilities(theta)

        loss = 0.0

        if W_obs is not None:
            loss += self.lambda_win * _kl_divergence(W_obs, W_model)

        if Qin_obs is not None:
            loss += self.lambda_qin * _kl_divergence(Qin_obs, Qin_model)

        if Q_obs is not None:
            loss += self.lambda_quinella * _kl_divergence(Q_obs, Q_model)

        if b_obs is not None:
            loss += self.lambda_banker * _kl_divergence(b_obs, b_model)

        return loss

    def fit(self, W_obs: Optional[np.ndarray] = None,
            Qin_obs: Optional[np.ndarray] = None,
            Q_obs: Optional[np.ndarray] = None,
            b_obs: Optional[np.ndarray] = None,
            theta_init: Optional[np.ndarray] = None,
            method: str = 'L-BFGS-B') -> dict:
        """
        Fit Harville model to observed betting odds.

        At least one odds type must be provided. All odds should be probabilities
        (not decimal/fractional odds). Matrices should be symmetric where applicable.
        Array-likes (e.g. lists) are accepted and converted to float64 arrays.

        Args:
            W_obs: Win probabilities, shape (n,). W_obs[i] = prob horse i wins
            Qin_obs: Qin probabilities, shape (n, n). Qin_obs[i,j] = prob horses
                     i,j finish 1st-2nd in any order
            Q_obs: Quinella probabilities, shape (n, n). Q_obs[i,j] = prob horses
                   i,j both finish in top 3
            b_obs: Banker probabilities, shape (n,). b_obs[i] = prob horse i
                   finishes in top 3
            theta_init: Initial strength guess, shape (n,)
                        (default: W_obs if available, else uniform)
            method: Scipy optimizer ('L-BFGS-B' or 'SLSQP')

        Returns:
            Dictionary containing:
            - theta: Fitted strength parameters (n,)
            - W_fitted: Fitted Win probabilities (n,)
            - Qin_fitted: Fitted Qin probabilities (n, n)
            - Q_fitted: Fitted Quinella probabilities (n, n)
            - b_fitted: Fitted Banker probabilities (n,)
            - P_fitted: Full place probability matrix (n, n), P[i,j] =
                        prob horse i finishes in position j
            - loss: Final loss value
            - success: Whether optimization converged
            - message: Optimizer status message
            - n_eval: Number of loss function evaluations

        Raises:
            ValueError: If no odds provided, shapes don't match n_horses, or
                ``method`` is not one of the supported optimizers

        Example:
            >>> opt = HarvilleOptimizer(n_horses=10)
            >>> results = opt.fit(W_obs=win_probs, Q_obs=quinella_probs)
            >>> print(f"Fitted strengths: {results['theta']}")
            >>> print(f"Converged: {results['success']}")
        """
        if W_obs is None and Qin_obs is None and Q_obs is None and b_obs is None:
            raise ValueError("At least one type of odds must be provided")

        # Reject unknown optimizers up front; otherwise a typo would silently
        # select the SLSQP branch below.
        if method not in self._SUPPORTED_METHODS:
            raise ValueError(
                f"method must be one of {self._SUPPORTED_METHODS}, got {method!r}"
            )

        W_obs = self._as_float_array(W_obs)
        Qin_obs = self._as_float_array(Qin_obs)
        Q_obs = self._as_float_array(Q_obs)
        b_obs = self._as_float_array(b_obs)

        if W_obs is not None and W_obs.shape != (self.n,):
            raise ValueError(f"W_obs must be ({self.n},)")
        if Qin_obs is not None and Qin_obs.shape != (self.n, self.n):
            raise ValueError(f"Qin_obs must be ({self.n}, {self.n})")
        if Q_obs is not None and Q_obs.shape != (self.n, self.n):
            raise ValueError(f"Q_obs must be ({self.n}, {self.n})")
        if b_obs is not None and b_obs.shape != (self.n,):
            raise ValueError(f"b_obs must be ({self.n},)")

        if theta_init is None:
            if W_obs is not None:
                theta_init = W_obs / W_obs.sum()
            else:
                theta_init = np.ones(self.n) / self.n
        else:
            theta_init = self._as_float_array(theta_init)
            if theta_init.shape != (self.n,):
                raise ValueError(f"theta_init must be ({self.n},)")
            theta_init = theta_init / theta_init.sum()

        self._eval_count = 0

        def objective(x: np.ndarray) -> float:
            return self.loss(x, W_obs, Qin_obs, Q_obs, b_obs)

        bounds = [(1e-6, 1.0) for _ in range(self.n)]

        if method == 'L-BFGS-B':
            result = minimize(
                fun=objective,
                x0=theta_init,
                method='L-BFGS-B',
                bounds=bounds,
                options={'maxiter': 500, 'ftol': 1e-9, 'maxls': 50}
            )
        else:
            result = minimize(
                fun=objective,
                x0=theta_init,
                method='SLSQP',
                bounds=bounds,
                constraints={'type': 'eq', 'fun': lambda x: x.sum() - 1},
                options={'maxiter': 500, 'ftol': 1e-9}
            )

        # Apply the same simplex mapping the loss uses so the reported theta
        # matches the parameters the fitted probabilities are computed from.
        theta_opt = np.abs(result.x) + 1e-10
        theta_opt = theta_opt / theta_opt.sum()

        W_fitted, Qin_fitted, Q_fitted, b_fitted, P_fitted = _compute_probabilities(theta_opt)

        return {
            'theta': theta_opt,
            'W_fitted': W_fitted,
            'Qin_fitted': Qin_fitted,
            'Q_fitted': Q_fitted,
            'b_fitted': b_fitted,
            'P_fitted': P_fitted,
            'loss': result.fun,
            'success': result.success,
            'message': result.message,
            'n_eval': self._eval_count
        }
hkjc/processing.py CHANGED
@@ -6,6 +6,7 @@ from typing import Tuple, List
6
6
  from .live_odds import live_odds
7
7
  from .qpbanker import win_probability, expected_value, average_odds
8
8
  from .optimization import _pareto_filter
9
+ from .harville_model import HarvilleOptimizer
9
10
 
10
11
  import polars as pl
11
12
  import numpy as np
@@ -26,7 +27,7 @@ def _process_single_qp_trade(banker: int, covered: List[int], odds_pla: List[flo
26
27
  return (banker, covered, win_prob, exp_value, ave_odds)
27
28
 
28
29
 
29
- def generate_all_qp_trades(date: str, venue_code: str, race_number: int, rebate: float = 0.12) -> pl.DataFrame:
30
+ def generate_all_qp_trades(date: str, venue_code: str, race_number: int, rebate: float = 0.12, harville_fit=True) -> pl.DataFrame:
30
31
  """Generate all possible qp tickets for the specified race.
31
32
 
32
33
  Args:
@@ -34,14 +35,25 @@ def generate_all_qp_trades(date: str, venue_code: str, race_number: int, rebate:
34
35
  venue_code (str): Venue code, e.g., 'ST' for Shatin, 'HV' for Happy Valley.
35
36
  race_number (int): Race number.
36
37
  rebate (float, optional): The rebate percentage. Defaults to 0.12.
38
+ harville_fit (bool, optional): Whether to fit the odds using Harville model. Defaults to True.
37
39
 
38
40
  Returns:
39
41
  pl.DataFrame: DataFrame with all possible trades and their metrics.
40
42
  """
41
- odds = live_odds(date, venue_code, race_number, odds_type=['PLA', 'QPL'])
43
+
44
+ odds = live_odds(date, venue_code, race_number,
45
+ odds_type=['PLA', 'QPL', 'WIN', 'QIN'])
42
46
  N = len(odds['PLA'])
43
47
  candidates = np.arange(1, N+1)
44
48
 
49
+ if harville_fit:
50
+ ho = HarvilleOptimizer(N)
51
+ fit_res = ho.fit(1/odds['WIN'], 1/odds['QIN'],
52
+ 1/odds['QPL'], 1/odds['PLA'])
53
+ if fit_res['success']:
54
+ odds['PLA'] = 1/fit_res['b_fitted']
55
+ odds['QPL'] = 1/fit_res['Q_fitted']
56
+
45
57
  results = [_process_single_qp_trade(banker, covered, odds['PLA'], odds['QPL'], rebate)
46
58
  for banker in tqdm(candidates, desc="Processing bankers")
47
59
  for covered in _all_subsets(candidates[candidates != banker])]
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hkjc
3
- Version: 0.2.1
3
+ Version: 0.3.1
4
4
  Summary: Library for scraping HKJC data and performing basic analysis
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: cachetools>=6.2.0
7
7
  Requires-Dist: fastexcel>=0.16.0
8
+ Requires-Dist: numba>=0.62.1
8
9
  Requires-Dist: numpy>=2.3.3
9
10
  Requires-Dist: polars>=1.33.1
10
11
  Requires-Dist: pyarrow>=21.0.0
@@ -1,13 +1,13 @@
1
- hkjc/__init__.py,sha256=LPSYUYKnXLM7A6AC8Le8DJRP-D5smO6w9SXhYUJXbi8,572
1
+ hkjc/__init__.py,sha256=KBbWVwLXPPb93bk_h2Qt9t5OH8y6RrVUeH-ZYNKQAoQ,619
2
2
  hkjc/analysis.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ hkjc/harville_model.py,sha256=Kn9IeiaWBxDcbzZIGd3B6DAIA8MTaQuC5qBj-cSJfMM,12752
3
4
  hkjc/live_odds.py,sha256=i_g9ckQKA9GWbwPXNvbmNvm-dPbF9UJoGiWv6_bHzwA,4603
4
- hkjc/odds_fitting.py,sha256=abHa19Vv3yAjX4PPFhwoMldmG1DF1tXGXtYVaFszhJI,33
5
5
  hkjc/optimization.py,sha256=OArQ3w9bwcIV_lTNuE5za6AROoa90xk_gwAoGwQ-8RE,3784
6
- hkjc/processing.py,sha256=9AiTkjsx51sZtyA4XcfK-werwFWxdea0BeIEuNvGQYQ,2983
6
+ hkjc/processing.py,sha256=WLjIF-p7hX4aVJkhTuVebEdawxNcaP9eEOTvVXLz7i4,3480
7
7
  hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  hkjc/qpbanker.py,sha256=vhvYb5_nGrKgYgre9gGF6tgswovca5C9pZVOPGxEP1Q,4804
9
9
  hkjc/speedpro.py,sha256=vKnSz9yY1rfVmRo7GVxXLjsiQN-YgwxSbV0B7yuszS4,1702
10
10
  hkjc/visualization.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- hkjc-0.2.1.dist-info/METADATA,sha256=l4xH-xAdWLN8yDKwBg27J2o1Tpw42u0UfCIzMGji_xk,384
12
- hkjc-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
- hkjc-0.2.1.dist-info/RECORD,,
11
+ hkjc-0.3.1.dist-info/METADATA,sha256=yn9N5730YazXG0HrUTWth92pbbwDjVdq_p_5Y4MnYAY,413
12
+ hkjc-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
+ hkjc-0.3.1.dist-info/RECORD,,
hkjc/odds_fitting.py DELETED
@@ -1 +0,0 @@
1
- ## TODO: implement odds filtering
File without changes