PyPI - hkjc - Versions diffs - 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

hkjc 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

hkjc/__init__.py +3 -2
hkjc/harville_model.py +362 -0
hkjc/processing.py +14 -2
{hkjc-0.2.1.dist-info → hkjc-0.3.1.dist-info}/METADATA +2 -1
{hkjc-0.2.1.dist-info → hkjc-0.3.1.dist-info}/RECORD +6 -6
hkjc/odds_fitting.py +0 -1
{hkjc-0.2.1.dist-info → hkjc-0.3.1.dist-info}/WHEEL +0 -0

hkjc/__init__.py CHANGED Viewed

@@ -6,7 +6,7 @@ from importlib.metadata import version as _version
 __all__ = ["live_odds", "qpbanker",
 		    "generate_all_qp_trades", "generate_pareto_qp_trades",
-			"speedpro_df", "speedmap"]
+			"speedpro_df", "speedmap","harveille_model"]
 try:
 	__version__ = _version(__name__)
@@ -15,4 +15,5 @@ except Exception:  # pragma: no cover - best-effort version resolution
 from .live_odds import live_odds
 from .processing import generate_all_qp_trades, generate_pareto_qp_trades
-from .speedpro import speedmap, speedpro_df
+from .speedpro import speedmap, speedpro_df
+from . import harville_model

hkjc/harville_model.py ADDED Viewed

@@ -0,0 +1,362 @@
+"""
+Harville Race Model Optimizer
+Estimates horse racing outcome probabilities using the Harville model via dynamic
+programming. Fits latent strength parameters from observed betting market odds across
+multiple pool types (Win, Qin, Quinella, Banker).
+The optimizer uses O(N * 2^N) complexity DP with Numba JIT compilation for speed.
+Suitable for races with up to ~20 horses.
+Example:
+    >>> optimizer = HarvilleOptimizer(n_horses=14)
+    >>> results = optimizer.fit(W_obs=win_odds, Qin_obs=qin_odds,
+    ...                          Q_obs=quinella_odds, b_obs=banker_odds)
+    >>> print(results['theta'])  # Fitted strength parameters
+"""
+import numpy as np
+from scipy.optimize import minimize
+from numba import njit
+from typing import Tuple, Optional
+@njit(cache=True)
+def _popcount(mask: int) -> int:
+    """Return the number of set bits in ``mask``."""
+    count = 0
+    while mask:
+        count += 1
+        # mask & (mask - 1) clears the lowest set bit, so the loop body runs once per set bit.
+        mask &= mask - 1
+    return count
+@njit(cache=True)
+def _precompute_mask_info(n: int) -> Tuple[np.ndarray, np.ndarray]:
+    """Build lookup tables indexed by every bitmask over ``n`` items.
+    Args:
+        n: Number of items; tables have ``2 ** n`` rows.
+    Returns:
+        Tuple ``(mask_strength_coef, mask_popcount)``:
+            - mask_strength_coef: float64 array of shape (2**n, n); entry [mask, i] is 1.0
+              when bit ``i`` is set in ``mask`` and 0.0 otherwise.
+            - mask_popcount: int32 array of shape (2**n,); number of set bits in each mask.
+    """
+    max_mask = 1 << n
+    # Dense 0/1 membership matrix: allocates 2**n * n float64 values.
+    mask_strength_coef = np.zeros((max_mask, n), dtype=np.float64)
+    mask_popcount = np.zeros(max_mask, dtype=np.int32)
+    for mask in range(max_mask):
+        mask_popcount[mask] = _popcount(mask)
+        for i in range(n):
+            if mask & (1 << i):
+                mask_strength_coef[mask, i] = 1.0
+    return mask_strength_coef, mask_popcount
+@njit(cache=True)
+def _compute_dp_vectorized(theta: np.ndarray, k_max: int) -> np.ndarray:
+    """Fill the subset DP table used by the Harville model.
+    Starting from ``dp[0, 0] = 1``, each mask with ``k`` bits set spreads its mass to every
+    mask obtained by adding one unset bit ``i``, weighted by
+    ``theta[i] / (1 - sum of theta over the bits already in mask)``.
+    Args:
+        theta: Strength parameters, one per horse. The ``1.0 - s_mask`` term presumes they
+               sum to 1; the callers visible in this file normalize before calling.
+        k_max: Largest subset size to fill (number of DP layers after layer 0).
+    Returns:
+        Array of shape (k_max + 1, 2**n); ``dp[k, mask]`` is non-zero only for masks with
+        exactly ``k`` bits set.
+    """
+    n = len(theta)
+    max_mask = 1 << n
+    # NOTE: the lookup tables are rebuilt on every call (i.e. on every loss evaluation).
+    mask_strength_coef, mask_popcount = _precompute_mask_info(n)
+    # mask_strength[mask] = sum of theta over the horses contained in mask.
+    mask_strength = mask_strength_coef @ theta
+    dp = np.zeros((k_max + 1, max_mask))
+    dp[0, 0] = 1.0
+    for k in range(k_max):
+        valid_masks = np.where(mask_popcount == k)[0]
+        for mask in valid_masks:
+            if dp[k, mask] == 0:
+                continue
+            s_mask = mask_strength[mask]
+            remaining = 1.0 - s_mask
+            # Skip masks whose remaining strength is ~0 to avoid dividing by zero below.
+            if remaining < 1e-12:
+                continue
+            prob_current = dp[k, mask]
+            for i in range(n):
+                if not (mask & (1 << i)):
+                    next_mask = mask | (1 << i)
+                    dp[k + 1, next_mask] += prob_current * theta[i] / remaining
+    return dp
+@njit(cache=True)
+def _extract_pair_in_top_k(dp: np.ndarray, n: int, k: int) -> np.ndarray:
+    """Aggregate layer ``k`` of the DP table into a pairwise co-membership matrix.
+    Args:
+        dp: DP table as returned by ``_compute_dp_vectorized``; row ``k`` is read, so the
+            table must have more than ``k`` rows (not checked here).
+        n: Number of horses (bits per mask).
+        k: Subset size to aggregate.
+    Returns:
+        (n, n) symmetric array where entry [a, b] is the sum of ``dp[k, mask]`` over all
+        size-``k`` masks containing both ``a`` and ``b``. The diagonal [a, a] is the sum
+        over masks containing ``a``.
+    """
+    M = np.zeros((n, n))
+    max_mask = 1 << n
+    mask_popcount = np.zeros(max_mask, dtype=np.int32)
+    for mask in range(max_mask):
+        mask_popcount[mask] = _popcount(mask)
+    masks_size_k = np.where(mask_popcount == k)[0]
+    for mask in masks_size_k:
+        prob = dp[k, mask]
+        if prob == 0:
+            continue
+        # Decode the mask into the list of its k member indices.
+        horses = np.empty(k, dtype=np.int32)
+        idx = 0
+        for i in range(n):
+            if mask & (1 << i):
+                horses[idx] = i
+                idx += 1
+        # Credit every ordered pair of members (including i == j, which fills the diagonal).
+        for i in range(k):
+            for j in range(k):
+                M[horses[i], horses[j]] += prob
+    return M
+@njit(cache=True)
+def _extract_top_k_probs(dp: np.ndarray, n: int, k_max: int) -> np.ndarray:
+    """Aggregate the DP table into per-horse membership totals for each subset size.
+    Args:
+        dp: DP table as returned by ``_compute_dp_vectorized``; rows 1..k_max are read.
+        n: Number of horses (bits per mask).
+        k_max: Largest subset size to aggregate.
+    Returns:
+        (n, k_max + 1) array where entry [i, k] is the sum of ``dp[k, mask]`` over all
+        size-``k`` masks containing horse ``i``. Column 0 is left at zero.
+    """
+    T = np.zeros((n, k_max + 1))
+    max_mask = 1 << n
+    mask_popcount = np.zeros(max_mask, dtype=np.int32)
+    for mask in range(max_mask):
+        mask_popcount[mask] = _popcount(mask)
+    for k in range(1, k_max + 1):
+        masks_size_k = np.where(mask_popcount == k)[0]
+        for mask in masks_size_k:
+            prob = dp[k, mask]
+            if prob == 0:
+                continue
+            for i in range(n):
+                if mask & (1 << i):
+                    T[i, k] += prob
+    return T
+@njit(cache=True)
+def _compute_probabilities(theta: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Compute all model quantities derived from the strength vector ``theta``.
+    Args:
+        theta: Strength parameters, one per horse (at least one horse); expected to be
+               normalized by the caller.
+    Returns:
+        Tuple ``(W, Qin, Q, b, P)``:
+            - W: (n,) first column of ``P`` (finishing first).
+            - Qin: (n, n) pair co-membership in the top 2.
+            - Q: (n, n) pair co-membership in the top 3.
+            - b: (n,) membership in the top 3.
+            - P: (n, n) where P[i, j] = T[i, j + 1] - T[i, j], i.e. the difference between
+              being in the top ``j + 1`` and being in the top ``j``.
+        When fewer than 3 (or 2) horses run, "top 3" ("top 2") is clamped to the field size.
+    """
+    n = len(theta)
+    dp = _compute_dp_vectorized(theta, n)
+    T = _extract_top_k_probs(dp, n, n)
+    P = np.zeros((n, n))
+    for i in range(n):
+        for j in range(n):
+            P[i, j] = T[i, j + 1] - T[i, j]
+    W = P[:, 0]
+    # T and dp only have n + 1 columns/rows. Numba does not bounds-check by default, so an
+    # index of 3 with n < 3 would silently read out of bounds; clamp to the field size instead.
+    top_two = min(2, n)
+    top_three = min(3, n)
+    Qin = _extract_pair_in_top_k(dp, n, top_two)
+    Q = _extract_pair_in_top_k(dp, n, top_three)
+    b = T[:, top_three]
+    return W, Qin, Q, b, P
+@njit(cache=True)
+def _kl_divergence(p_obs: np.ndarray, p_model: np.ndarray) -> float:
+    """Return sum(p * log(p / q)) after flooring and renormalizing both inputs.
+    Both arrays are flattened, floored at ``eps = 1e-10`` and rescaled to sum to 1 before the
+    divergence is computed, so inputs of any (matching) shape and scale are accepted.
+    Args:
+        p_obs: Observed values (plays the role of ``p``).
+        p_model: Model values (plays the role of ``q``); must have as many elements as ``p_obs``.
+    Returns:
+        The KL divergence of the renormalized ``p_obs`` from the renormalized ``p_model``.
+    """
+    eps = 1e-10
+    # Flooring keeps log() and the division finite for zero entries.
+    p_obs_flat = np.maximum(p_obs.ravel(), eps)
+    p_model_flat = np.maximum(p_model.ravel(), eps)
+    sum_obs = p_obs_flat.sum()
+    sum_model = p_model_flat.sum()
+    if sum_obs > eps:
+        p_obs_flat = p_obs_flat / sum_obs
+    if sum_model > eps:
+        p_model_flat = p_model_flat / sum_model
+    return np.sum(p_obs_flat * np.log(p_obs_flat / p_model_flat))
+class HarvilleOptimizer:
+    """
+    Fits Harville race model to betting market odds using dynamic programming.
+    The Harville model assigns each horse a latent strength parameter theta_i, where
+    the probability of finishing next among remaining horses is proportional to
+    relative strength. This optimizer estimates theta from observed betting odds
+    across multiple pool types.
+    Pool naming used by this class: "Win" = horse finishes 1st, "Qin" = pair both in
+    the top 2, "Quinella" = pair both in the top 3, "Banker" = horse in the top 3.
+    Default lambda weights (1.0, 2.0, 1.5, 0.7) reflect that early Win odds are
+    biased by informed traders waiting until closing, while exotic pools provide
+    more stable signals for ensemble estimation.
+    Attributes:
+        n (int): Number of horses
+        lambda_win (float): Weight for Win pool loss
+        lambda_qin (float): Weight for Qin pool loss
+        lambda_quinella (float): Weight for Quinella pool loss
+        lambda_banker (float): Weight for Banker pool loss
+    """
+    def __init__(self, n_horses: int, lambda_win: float = 1.0, lambda_qin: float = 2.0,
+                 lambda_quinella: float = 1.5, lambda_banker: float = 0.7):
+        """
+        Initialize optimizer.
+        Args:
+            n_horses: Number of horses in race (3 <= n_horses <= 20)
+            lambda_win: Weight for Win odds (prob horse finishes 1st)
+            lambda_qin: Weight for Qin odds (prob pair finishes 1st-2nd)
+            lambda_quinella: Weight for Quinella odds (prob pair in top 3)
+            lambda_banker: Weight for Banker odds (prob horse in top 3)
+        Raises:
+            ValueError: If n_horses < 3 (top-3 quantities are undefined) or
+                        n_horses > 20 (exponential complexity warning)
+        """
+        # The model reads top-3 columns of arrays sized by n; with fewer than 3 horses the
+        # compiled helpers would index past the end of those arrays.
+        if n_horses < 3:
+            raise ValueError("N must be at least 3 (model uses top-3 probabilities)")
+        if n_horses > 20:
+            raise ValueError("N > 20 may be too slow (2^N complexity)")
+        self.n = n_horses
+        self.lambda_win = lambda_win
+        self.lambda_qin = lambda_qin
+        self.lambda_quinella = lambda_quinella
+        self.lambda_banker = lambda_banker
+        self._eval_count = 0
+    @staticmethod
+    def _as_float_array(values) -> Optional[np.ndarray]:
+        """Coerce an optional array-like to a float64 ndarray (None passes through)."""
+        if values is None:
+            return None
+        return np.asarray(values, dtype=np.float64)
+    def loss(self, theta: np.ndarray, W_obs: Optional[np.ndarray],
+             Qin_obs: Optional[np.ndarray], Q_obs: Optional[np.ndarray],
+             b_obs: Optional[np.ndarray]) -> float:
+        """
+        Compute weighted KL divergence loss between observed and model odds.
+        Args:
+            theta: Strength parameters (will be normalized to simplex)
+            W_obs: Observed Win probabilities (n,) or None
+            Qin_obs: Observed Qin probabilities (n, n) or None
+            Q_obs: Observed Quinella probabilities (n, n) or None
+            b_obs: Observed Banker probabilities (n,) or None
+        Returns:
+            Scalar loss value (sum of weighted KL divergences)
+        """
+        self._eval_count += 1
+        # abs() + tiny offset keeps every strength strictly positive before normalizing.
+        theta = np.abs(theta) + 1e-10
+        theta = theta / theta.sum()
+        W_model, Qin_model, Q_model, b_model, _ = _compute_probabilities(theta)
+        loss = 0.0
+        if W_obs is not None:
+            loss += self.lambda_win * _kl_divergence(W_obs, W_model)
+        if Qin_obs is not None:
+            loss += self.lambda_qin * _kl_divergence(Qin_obs, Qin_model)
+        if Q_obs is not None:
+            loss += self.lambda_quinella * _kl_divergence(Q_obs, Q_model)
+        if b_obs is not None:
+            loss += self.lambda_banker * _kl_divergence(b_obs, b_model)
+        return loss
+    def fit(self, W_obs: Optional[np.ndarray] = None,
+            Qin_obs: Optional[np.ndarray] = None,
+            Q_obs: Optional[np.ndarray] = None,
+            b_obs: Optional[np.ndarray] = None,
+            theta_init: Optional[np.ndarray] = None,
+            method: str = 'L-BFGS-B') -> dict:
+        """
+        Fit Harville model to observed betting odds.
+        At least one odds type must be provided. All odds should be probabilities
+        (not decimal/fractional odds). Matrices should be symmetric where applicable.
+        Array-like inputs (lists, tuples) are accepted and converted to float arrays.
+        Args:
+            W_obs: Win probabilities, shape (n,). W_obs[i] = prob horse i wins
+            Qin_obs: Qin probabilities, shape (n, n). Qin_obs[i,j] = prob horses
+                     i,j finish 1st-2nd in any order
+            Q_obs: Quinella probabilities, shape (n, n). Q_obs[i,j] = prob horses
+                   i,j both finish in top 3
+            b_obs: Banker probabilities, shape (n,). b_obs[i] = prob horse i
+                   finishes in top 3
+            theta_init: Initial strength guess (default: W_obs if available, else uniform)
+            method: Scipy optimizer; 'L-BFGS-B' selects L-BFGS-B, any other value
+                    selects 'SLSQP' with a sum-to-one equality constraint
+        Returns:
+            Dictionary containing:
+                - theta: Fitted strength parameters (n,)
+                - W_fitted: Fitted Win probabilities (n,)
+                - Qin_fitted: Fitted Qin probabilities (n, n)
+                - Q_fitted: Fitted Quinella probabilities (n, n)
+                - b_fitted: Fitted Banker probabilities (n,)
+                - P_fitted: Full place probability matrix (n, n), P[i,j] =
+                            prob horse i finishes in position j
+                - loss: Final loss value
+                - success: Whether optimization converged
+                - message: Optimizer status message
+                - n_eval: Number of loss function evaluations
+        Raises:
+            ValueError: If no odds provided or shapes don't match n_horses
+        Example:
+            >>> opt = HarvilleOptimizer(n_horses=10)
+            >>> results = opt.fit(W_obs=win_probs, Q_obs=quinella_probs)
+            >>> print(f"Fitted strengths: {results['theta']}")
+            >>> print(f"Converged: {results['success']}")
+        """
+        if W_obs is None and Qin_obs is None and Q_obs is None and b_obs is None:
+            raise ValueError("At least one type of odds must be provided")
+        # Coerce first so that plain lists hit the shape checks below (ValueError) instead
+        # of failing with AttributeError on `.shape`.
+        W_obs = self._as_float_array(W_obs)
+        Qin_obs = self._as_float_array(Qin_obs)
+        Q_obs = self._as_float_array(Q_obs)
+        b_obs = self._as_float_array(b_obs)
+        if W_obs is not None and W_obs.shape != (self.n,):
+            raise ValueError(f"W_obs must be ({self.n},)")
+        if Qin_obs is not None and Qin_obs.shape != (self.n, self.n):
+            raise ValueError(f"Qin_obs must be ({self.n}, {self.n})")
+        if Q_obs is not None and Q_obs.shape != (self.n, self.n):
+            raise ValueError(f"Q_obs must be ({self.n}, {self.n})")
+        if b_obs is not None and b_obs.shape != (self.n,):
+            raise ValueError(f"b_obs must be ({self.n},)")
+        if theta_init is None:
+            if W_obs is not None:
+                theta_init = W_obs / W_obs.sum()
+            else:
+                theta_init = np.ones(self.n) / self.n
+        else:
+            theta_init = self._as_float_array(theta_init)
+            if theta_init.shape != (self.n,):
+                raise ValueError(f"theta_init must be ({self.n},)")
+            theta_init = theta_init / theta_init.sum()
+        self._eval_count = 0
+        # Both solvers share the objective, start point, bounds and base options; only the
+        # solver name and the solver-specific extras differ.
+        solver_kwargs = {
+            'fun': lambda x: self.loss(x, W_obs, Qin_obs, Q_obs, b_obs),
+            'x0': theta_init,
+            'bounds': [(1e-6, 1.0) for _ in range(self.n)],
+        }
+        if method == 'L-BFGS-B':
+            solver_kwargs['method'] = 'L-BFGS-B'
+            solver_kwargs['options'] = {'maxiter': 500, 'ftol': 1e-9, 'maxls': 50}
+        else:
+            solver_kwargs['method'] = 'SLSQP'
+            solver_kwargs['constraints'] = {'type': 'eq', 'fun': lambda x: x.sum() - 1}
+            solver_kwargs['options'] = {'maxiter': 500, 'ftol': 1e-9}
+        result = minimize(**solver_kwargs)
+        # Apply the same positivity + normalization used inside loss() so the reported
+        # theta is the one the final loss value corresponds to.
+        theta_opt = np.abs(result.x) + 1e-10
+        theta_opt = theta_opt / theta_opt.sum()
+        W_fitted, Qin_fitted, Q_fitted, b_fitted, P_fitted = _compute_probabilities(theta_opt)
+        return {
+            'theta': theta_opt,
+            'W_fitted': W_fitted,
+            'Qin_fitted': Qin_fitted,
+            'Q_fitted': Q_fitted,
+            'b_fitted': b_fitted,
+            'P_fitted': P_fitted,
+            'loss': result.fun,
+            'success': result.success,
+            'message': result.message,
+            'n_eval': self._eval_count
+        }

hkjc/processing.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing import Tuple, List
 from .live_odds import live_odds
 from .qpbanker import win_probability, expected_value, average_odds
 from .optimization import _pareto_filter
+from .harville_model import HarvilleOptimizer
 import polars as pl
 import numpy as np
@@ -26,7 +27,7 @@ def _process_single_qp_trade(banker: int, covered: List[int], odds_pla: List[flo
     return (banker, covered, win_prob, exp_value, ave_odds)
-def generate_all_qp_trades(date: str, venue_code: str, race_number: int, rebate: float = 0.12) -> pl.DataFrame:
+def generate_all_qp_trades(date: str, venue_code: str, race_number: int, rebate: float = 0.12, harville_fit=True) -> pl.DataFrame:
     """Generate all possible qp tickets for the specified race.
     Args:
@@ -34,14 +35,25 @@ def generate_all_qp_trades(date: str, venue_code: str, race_number: int, rebate:
         venue_code (str): Venue code, e.g., 'ST' for Shatin, 'HV' for Happy Valley.
         race_number (int): Race number.
         rebate (float, optional): The rebate percentage. Defaults to 0.12.
+        harville_fit (bool, optional): Whether to fit the odds using Harville model. Defaults to True.
     Returns:
         pl.DataFrame: DataFrame with all possible trades and their metrics.
     """
-    odds = live_odds(date, venue_code, race_number, odds_type=['PLA', 'QPL'])
+    odds = live_odds(date, venue_code, race_number,
+                     odds_type=['PLA', 'QPL', 'WIN', 'QIN'])
     N = len(odds['PLA'])
     candidates = np.arange(1, N+1)
+    if harville_fit:
+        ho = HarvilleOptimizer(N)
+        fit_res = ho.fit(1/odds['WIN'], 1/odds['QIN'],
+                         1/odds['QPL'], 1/odds['PLA'])
+        if fit_res['success']:
+            odds['PLA'] = 1/fit_res['b_fitted']
+            odds['QPL'] = 1/fit_res['Q_fitted']
     results = [_process_single_qp_trade(banker, covered, odds['PLA'], odds['QPL'], rebate)
                for banker in tqdm(candidates, desc="Processing bankers")
                for covered in _all_subsets(candidates[candidates != banker])]

{hkjc-0.2.1.dist-info → hkjc-0.3.1.dist-info}/METADATA RENAMED Viewed

@@ -1,10 +1,11 @@
 Metadata-Version: 2.4
 Name: hkjc
-Version: 0.2.1
+Version: 0.3.1
 Summary: Library for scrapping HKJC data and perform basic analysis
 Requires-Python: >=3.11
 Requires-Dist: cachetools>=6.2.0
 Requires-Dist: fastexcel>=0.16.0
+Requires-Dist: numba>=0.62.1
 Requires-Dist: numpy>=2.3.3
 Requires-Dist: polars>=1.33.1
 Requires-Dist: pyarrow>=21.0.0

{hkjc-0.2.1.dist-info → hkjc-0.3.1.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
-hkjc/__init__.py,sha256=LPSYUYKnXLM7A6AC8Le8DJRP-D5smO6w9SXhYUJXbi8,572
+hkjc/__init__.py,sha256=KBbWVwLXPPb93bk_h2Qt9t5OH8y6RrVUeH-ZYNKQAoQ,619
 hkjc/analysis.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+hkjc/harville_model.py,sha256=Kn9IeiaWBxDcbzZIGd3B6DAIA8MTaQuC5qBj-cSJfMM,12752
 hkjc/live_odds.py,sha256=i_g9ckQKA9GWbwPXNvbmNvm-dPbF9UJoGiWv6_bHzwA,4603
-hkjc/odds_fitting.py,sha256=abHa19Vv3yAjX4PPFhwoMldmG1DF1tXGXtYVaFszhJI,33
 hkjc/optimization.py,sha256=OArQ3w9bwcIV_lTNuE5za6AROoa90xk_gwAoGwQ-8RE,3784
-hkjc/processing.py,sha256=9AiTkjsx51sZtyA4XcfK-werwFWxdea0BeIEuNvGQYQ,2983
+hkjc/processing.py,sha256=WLjIF-p7hX4aVJkhTuVebEdawxNcaP9eEOTvVXLz7i4,3480
 hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hkjc/qpbanker.py,sha256=vhvYb5_nGrKgYgre9gGF6tgswovca5C9pZVOPGxEP1Q,4804
 hkjc/speedpro.py,sha256=vKnSz9yY1rfVmRo7GVxXLjsiQN-YgwxSbV0B7yuszS4,1702
 hkjc/visualization.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hkjc-0.2.1.dist-info/METADATA,sha256=l4xH-xAdWLN8yDKwBg27J2o1Tpw42u0UfCIzMGji_xk,384
-hkjc-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-hkjc-0.2.1.dist-info/RECORD,,
+hkjc-0.3.1.dist-info/METADATA,sha256=yn9N5730YazXG0HrUTWth92pbbwDjVdq_p_5Y4MnYAY,413
+hkjc-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hkjc-0.3.1.dist-info/RECORD,,

hkjc/odds_fitting.py DELETED Viewed

	@@ -1 +0,0 @@
1	- ## TODO: implement odds filtering

{hkjc-0.2.1.dist-info → hkjc-0.3.1.dist-info}/WHEEL RENAMED Viewed

File without changes

hkjc 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

hkjc 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl