hkjc 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hkjc/__init__.py CHANGED
@@ -6,7 +6,8 @@ from importlib.metadata import version as _version
 
 __all__ = ["live_odds", "qpbanker",
            "generate_all_qp_trades", "generate_all_pla_trades", "pareto_filter",
-           "speedpro_energy", "speedmap", "harveille_model"]
+           "speedpro_energy", "speedmap", "harveille_model",
+           "generate_historical_data"]
 
 try:
     __version__ = _version(__name__)
@@ -14,7 +15,7 @@ except Exception:  # pragma: no cover - best-effort version resolution
     __version__ = "0.0.0"
 
 from .live_odds import live_odds
-from .processing import generate_all_qp_trades, generate_all_pla_trades
-from .optimization import pareto_filter
+from .processing import generate_all_qp_trades, generate_all_pla_trades, generate_historical_data
+from .utils import pareto_filter
 from .speedpro import speedmap, speedpro_energy
 from . import harville_model
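With this change the package root re-exports the new historical pipeline alongside the existing helpers. A minimal sketch of the intended import path (illustrative only, assuming the installed package imports cleanly in your environment):

```python
# Illustrative only: names re-exported from the package root after 0.3.11.
from hkjc import generate_historical_data, pareto_filter, live_odds

print(generate_historical_data.__module__)  # hkjc.processing
print(pareto_filter.__module__)             # hkjc.utils (was hkjc.optimization in 0.3.9)
```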
hkjc/historical.py CHANGED
@@ -3,11 +3,145 @@
 from __future__ import annotations
 
 import requests
+import polars as pl
+from bs4 import BeautifulSoup
+from cachetools.func import ttl_cache
 
-# TODO read and process all races from start date to end date
+from utils import _validate_date, _validate_venue_code, _parse_html_table
 
-# TODO query all basic info and race history for a specific horse
+HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
+HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId={horse_id}"
 
-# TODO classify running style & draw to determine blocking probability
 
-#
+@ttl_cache(maxsize=100, ttl=3600)
+def _soupify(url: str) -> BeautifulSoup:
+    """Fetch and parse a webpage and return BeautifulSoup object
+    """
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+    return BeautifulSoup(response.content, 'html.parser')
+
+
+def _soupify_race_page(date: str, venue_code: str, race_number: int) -> BeautifulSoup:
+    """Fetch and parse HKJC race results page and return BeautifulSoup object
+    """
+    url = HKJC_RACE_URL_TEMPLATE.format(
+        date=date, venue_code=venue_code, race_number=race_number)
+    return _soupify(url)
+
+
+def _soupify_horse_page(horse_id: str) -> BeautifulSoup:
+    """Fetch and parse HKJC race results page and return BeautifulSoup object
+    """
+    url = HKJC_HORSE_URL_TEMPLATE.format(horse_id=horse_id)
+    return _soupify(url)
+
+
+
+def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition") -> pl.DataFrame:
+    """Classify running style based on RunningPosition column
+    """
+    # Split the RunningPosition column into separate columns and convert to integers
+    df = df.with_columns(
+        pl.col(running_pos_col)
+        .str.split_exact(" ", n=3)
+        .struct.rename_fields(["StartPosition", "Position2", "Position3", "FinishPosition"])
+        # Give an alias to the struct for easier selection
+        .alias("split_data").cast(pl.Int64, strict=False)
+    ).unnest("split_data")
+
+    df = df.with_columns([
+        (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
+        pl.mean_horizontal("StartPosition", "Position2",
+                           "Position3", "FinishPosition").alias("AvgPosition"),
+    ]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
+                    .when((pl.col("PositionChange") <= 0) & pl.col("StartPosition") <= 3).then(pl.lit("FrontRunner"))
+                    .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
+                    .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
+
+    recent_style = df['RunningStyle'][:10].mode()[0]
+    df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))
+
+    return df
+
+
+def _extract_horse_data(horse_id: str) -> pl.DataFrame:
+    """Extract horse info and history from horse page
+    """
+    soup = _soupify_horse_page(horse_id)
+    table = soup.find('table', class_='bigborder')
+    horse_data = _parse_html_table(table).filter(
+        pl.col('Date') != '')  # Remove empty rows
+    horse_data = _classify_running_style(horse_data)
+
+    # Extract horse profile info
+    table = soup.find_all('table', class_='table_eng_text')
+    profile_data = _parse_html_table(table[0], skip_header=True)
+    country, age = profile_data.filter(pl.col("column_0").str.starts_with("Country"))['column_2'].item(0).split('/')
+    profile_data = _parse_html_table(table[1], skip_header=True)
+    current_rating = profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0)
+    season_start_rating = profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0)
+
+    horse_info = {
+        'HorseID': horse_id,
+        'OriginCountry': country.strip(),
+        'Age': int(age),
+        'CurrentRating': int(current_rating),
+        'SeasonStartRating': int(season_start_rating)
+    }
+    horse_data = (horse_data.with_columns([
+        pl.lit(value).alias(key) for key, value in horse_info.items()
+    ])
+    )
+    return horse_data
+
+
+def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
+    soup = _soupify_race_page(date, venue_code, race_number)
+    table = soup.find('div', class_='race_tab').find('table')
+    race_data = _parse_html_table(table)
+
+    # Extract the relevant race information
+    race_class = race_data.item(1, 0).split('-')[0].strip()
+    race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
+    race_name = race_data.item(2, 0).strip()
+    going = race_data.item(1, 2).strip()
+    course = race_data.item(2, 2).strip()
+
+    race_info = {'RaceDate': date,
+                 'Venue': venue_code,
+                 'RaceNumber': race_number,
+                 'RaceClass': race_class,
+                 'RaceDistance': race_dist,
+                 'RaceName': race_name,
+                 'Going': going,
+                 'Course': course}
+
+    # Extract the results table
+    table = soup.find('div', class_='performance').find('table')
+    race_data = (_parse_html_table(table)
+                 .with_columns([
+                     pl.lit(value).alias(key) for key, value in race_info.items()
+                 ])
+                 )
+
+    # Extract horse IDs from links
+    horse_ids = []
+    rows = table.find_all('tr')[1:]  # Skip header row
+    for row in rows:
+        horse_id = 'UNKNOWN'  # Horse link not found
+        links = row.find_all('a')
+        for link in links:
+            if 'href' in link.attrs and 'HorseId=' in link['href']:
+                horse_id = link['href'].split('HorseId=')[1]
+                break
+        horse_ids.append(horse_id)
+
+    race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
+
+    # Join with horse data
+    horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
+    horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
+    race_data = race_data.join(horse_data_df, on='HorseID', how='left')
+
+    return race_data
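For orientation, a minimal sketch of how these new helpers chain together for a single hypothetical race. Everything shown is private API; the date uses the `YYYY/MM/DD` form embedded in the results URL, and the sketch assumes network access to racing.hkjc.com and that the `utils` import at the top of `historical.py` resolves in your environment:

```python
# Illustrative only: scrape one hypothetical race and peek at the joined result.
from hkjc.historical import _extract_race_data

race_df = _extract_race_data("2025/01/01", "ST", 1)  # date, venue code, race number
print(race_df.columns)                               # race info plus per-horse history columns
print(race_df.select(["HorseID", "RunningStyle", "CurrentRating"]).head())
```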
hkjc/live_odds.py CHANGED
@@ -6,9 +6,10 @@ from typing import Tuple, List
 import requests
 from cachetools.func import ttl_cache
 import numpy as np
-from datetime import datetime as dt
 
-ENDPOINT = "https://info.cld.hkjc.com/graphql/base/"
+from .utils import _validate_date, _validate_venue_code
+
+HKJC_LIVEODDS_ENDPOINT = "https://info.cld.hkjc.com/graphql/base/"
 
 LIVEODDS_PAYLOAD = {
     "operationName": "racing",
@@ -70,7 +71,8 @@ def _fetch_live_odds(date: str, venue_code: str, race_number: int, odds_type: Tu
         "User-Agent": "python-hkjc-fetch/0.1",
     }
 
-    r = requests.post(ENDPOINT, json=payload, headers=headers, timeout=10)
+    r = requests.post(HKJC_LIVEODDS_ENDPOINT, json=payload,
+                      headers=headers, timeout=10)
    if r.status_code != 200:
        raise RuntimeError(f"Request failed: {r.status_code} - {r.text}")
 
@@ -104,11 +106,8 @@ def live_odds(date: str, venue_code: str, race_number: int, odds_type: List[str]
         If odds_type is 'WIN','PLA', returns a 1D array of place odds.
         If odds_type is 'QIN','QPL', returns a 2D array of quinella place odds.
     """
-    # validate date format
-    try:
-        dt.strptime(date, "%Y-%m-%d")
-    except Exception:
-        raise ValueError("Date must be in 'YYYY-MM-DD' format")
+    _validate_date(date)
+    _validate_venue_code(venue_code)
 
    mandatory_types = ['PLA']
 
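The practical effect of the new checks is that malformed input now fails fast with a `ValueError` before any HTTP request is sent. A small sketch (hypothetical date and race number, assuming the package imports cleanly in your environment):

```python
# Illustrative only: the validation added in 0.3.11 rejects bad input up front.
from hkjc.live_odds import live_odds

for date, venue in [("01/01/2025", "HV"), ("2025-01-01", "XX")]:
    try:
        live_odds(date, venue, 1, odds_type=["PLA"])
    except ValueError as exc:
        print(exc)  # bad date format first, then unknown venue code
```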
hkjc/processing.py CHANGED
@@ -1,22 +1,87 @@
 """Functions to batch process trades into dataframes for analysis.
 """
 from __future__ import annotations
-from typing import Tuple, List
+from typing import Tuple, List, Union
 
 from .live_odds import live_odds
 from .strategy import qpbanker, place_only
 from .harville_model import fit_harville_to_odds
+from .historical import _extract_race_data
+from .utils import _validate_date
 
 import polars as pl
 import numpy as np
 from itertools import combinations
 from tqdm import tqdm
+from datetime import datetime as dt
 
 
 def _all_subsets(lst): return [list(x) for r in range(
     1, len(lst)+1) for x in combinations(lst, r)]  # list subsets of a list
 
 
+# ======================================
+# Historical data processing functions
+# ======================================
+incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
+             'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
+
+
+def _historical_process_single_date_venue(date: str, venue_code: str) -> Union[pl.DataFrame, None]:
+    for race_number in range(1, 12):
+        try:
+            _extract_race_data(date.strftime('%Y/%m/%d'),
+                               venue_code, race_number)
+        except:
+            return None
+
+
+def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
+    """Generate historical race dataset from start_date to end_date"""
+    _validate_date(start_date)
+    _validate_date(end_date)
+    start_dt = dt.strptime(start_date, '%Y-%m-%d')
+    end_dt = dt.strptime(end_date, '%Y-%m-%d')
+
+    dfs = []
+
+    for date in pl.date_range(start_dt, end_dt, interval='1d'):
+        for venue_code in ['ST', 'HV']:
+            df = _historical_process_single_date_venue(date, venue_code)
+            if df is None:
+                continue
+            dfs.append(df)
+
+    df = (pl.concat(dfs)
+          .filter(~pl.col('Pla').is_in(incidents))
+          .with_columns(
+              pl.col('Pla').str.split(' ').list.first().alias('Pla')
+          )
+          )
+
+    df = df.with_columns([
+        pl.col('Pla').cast(pl.Int64, strict=False),
+        pl.col('HorseNo').cast(pl.Int64, strict=False),
+        pl.col('ActWt').cast(pl.Int64, strict=False),
+        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+        pl.col('Dr').cast(pl.Int64, strict=False),
+        pl.col('RaceDistance').cast(pl.Int64, strict=False),
+        pl.col('WinOdds').cast(pl.Float64, strict=False)
+    ])
+
+    df = df.with_columns(pl.col('Finish Time')
+                         .str.strptime(pl.Duration, format='%M:%S.%f', strict=False)
+                         .dt.total_seconds()
+                         .alias('Finish Time')
+                         )
+
+    return df
+
+
+# ==========================
+# Trade processing functions
+# ==========================
+
 def _process_single_qp_trade(banker: int, covered: List[int], pla_odds: np.ndarray, qpl_odds: np.ndarray, rebate: float) -> Tuple[int, List, float, float, float]:
     """Process a single qp trade.
     """
hkjc/strategy/place_only.py CHANGED
@@ -17,7 +17,7 @@ def win_probability(p_matrix: np.ndarray, covered: List[int]) -> float:
         float: probability
     """
 
-    win_prob = 1-np.prod(1-np.sum(p_matrix[covered, :3], axis=1))
+    win_prob = 1-np.prod(1-np.sum([p_matrix[c-1, :3] for c in covered], axis=1))
     return win_prob
 
 
@@ -35,7 +35,7 @@ def expected_value(pla_odds: np.ndarray, p_matrix: np.ndarray, covered: List[int
     """
     true_prob = np.sum(p_matrix[:, :3], axis=1)
     C = len(covered)
-    ev = np.sum((true_prob*(pla_odds-rebate))[covered])/C - (1-rebate)
+    ev = np.sum([(true_prob*(pla_odds-rebate))[c-1] for c in covered])/C - (1-rebate)
     return ev
 
 def average_odds(pla_odds: np.ndarray, covered: List[int]) -> float:
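The two one-line changes above reinterpret `covered`: previously its entries indexed `p_matrix` rows directly (0-based), now each entry `c` is treated as a 1-based horse number and looked up at row `c-1`. A self-contained check of the new `win_probability` arithmetic, using a hypothetical 4-horse place-probability matrix:

```python
# Illustrative only: reproduce the new win_probability arithmetic by hand.
import numpy as np

# p_matrix[i, j] = P(horse i+1 finishes in position j+1); values are hypothetical.
p_matrix = np.array([
    [0.40, 0.25, 0.15, 0.20],
    [0.30, 0.30, 0.20, 0.20],
    [0.20, 0.25, 0.35, 0.20],
    [0.10, 0.20, 0.30, 0.40],
])
covered = [1, 3]  # 1-based horse numbers, as the new code expects

place_probs = [p_matrix[c - 1, :3].sum() for c in covered]  # P(horse finishes top 3)
win_prob = 1 - np.prod([1 - p for p in place_probs])        # P(at least one covered horse places)
print(round(win_prob, 4))  # 0.96 for these numbers
```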
hkjc/utils.py ADDED
@@ -0,0 +1,189 @@
+import polars as pl
+from typing import List, Union
+from datetime import datetime as dt
+import bs4
+import re
+
+
+def _validate_date(date_str: str) -> bool:
+    # validate date format
+    try:
+        dt.strptime(date_str, "%Y-%m-%d")
+    except Exception:
+        raise ValueError("Date must be in 'YYYY-MM-DD' format")
+    return True
+
+
+def _validate_venue_code(venue_code: str) -> bool:
+    if venue_code not in ['HV', 'ST']:
+        raise ValueError(
+            "Venue code must be 'HV' (Happy Valley) or 'ST' (Sha Tin)")
+    return True
+
+
+def _parse_html_table(table: bs4.element.Tag, skip_header=False) -> pl.DataFrame:
+    """Parse an HTML table (HKJC format) into a Polars DataFrame
+    """
+    if table is None:
+        raise ValueError("No table found in HTML tag")
+
+    # Extract headers
+    headers = []
+    if not skip_header:
+        header_row = table.find('thead')
+        if header_row:
+            headers = [td.get_text(strip=True) for td in header_row.find_all('td')]
+        else:
+            # If no thead, try first tr
+            first_row = table.find('tr')
+            if first_row:
+                headers = [th.get_text(strip=True)
+                           for th in first_row.find_all('th')]
+                if not headers:
+                    # If no th tags, use td tags from first row as headers
+                    headers = [td.get_text(strip=True)
+                               for td in first_row.find_all('td')]
+
+    # Extract data rows
+    data = []
+    tbody = table.find('tbody')
+    rows = tbody.find_all('tr') if tbody else table.find_all('tr')
+
+    # Skip first row if it was used for headers
+    start_idx = 1 if not tbody and headers else 0
+
+    for row in rows[start_idx:]:
+        cells = row.find_all(['td', 'th'])
+        row_data = [cell.get_text(separator=' ',strip=True) for cell in cells]
+        if row_data:  # Skip empty rows
+            data.append(row_data)
+
+    # Create DataFrame
+    if not headers:
+        # Generate default column names if no headers found
+        headers = [f"column_{i}" for i in range(len(data[0]))] if data else []
+
+    # Ensure all rows have the same number of columns
+    if data:
+        max_cols = len(headers)
+        data = [row + [''] * (max_cols - len(row)) if len(row)
+                < max_cols else row[:max_cols] for row in data]
+
+    df = pl.DataFrame(data, schema=headers, orient='row')
+    # Clean column names by removing special characters
+    df.columns = [re.sub(r'[^\w]', '', col)
+                  for col in df.columns]
+    return df
+
+
+def pareto_filter(
+    df: pl.DataFrame,
+    groupby: List[str],
+    by: List[str],
+    maximize: Union[bool, List[bool]] = True
+) -> pl.DataFrame:
+    """
+    Filter dataframe to only include Pareto optimal rows within each group.
+
+    Args:
+        df: Input dataframe
+        groupby: Columns to group by (empty list for global filter)
+        by: Columns to consider for Pareto optimality
+        maximize: Whether to maximize (True) or minimize (False) each 'by' column
+
+    Returns:
+        DataFrame containing only Pareto optimal rows
+    """
+    if df.is_empty() or not by:
+        return df
+
+    # Normalize maximize to list
+    maximize_list = [maximize] * \
+        len(by) if isinstance(maximize, bool) else maximize
+
+    if len(maximize_list) != len(by):
+        raise ValueError(
+            f"Length of 'maximize' ({len(maximize_list)}) must equal length of 'by' ({len(by)})")
+
+    # Single objective: simple min/max filter
+    if len(by) == 1:
+        opt_expr = pl.col(by[0]).max(
+        ) if maximize_list[0] else pl.col(by[0]).min()
+        if groupby:
+            opt_expr = opt_expr.over(groupby)
+        return df.filter(pl.col(by[0]) == opt_expr)
+
+    # Two objectives: efficient skyline algorithm
+    if len(by) == 2:
+        temp_cols = ["__obj_0", "__obj_1"]
+
+        # Transform to maximization problem
+        df_temp = df.with_columns([
+            (pl.col(by[i]) * (1 if maximize_list[i] else -1)
+             ).alias(temp_cols[i])
+            for i in range(2)
+        ])
+
+        # Sort by first objective descending, then second descending (for stability)
+        groupby = groupby or []
+        sort_cols = (groupby if groupby else []) + temp_cols
+        sorted_df = df_temp.sort(sort_cols, descending=[
+            False] * len(groupby) + [True, True])
+
+        # Keep rows where second objective is not dominated by any previous row in group
+        if groupby:
+            max_so_far = pl.col(temp_cols[1]).cum_max().shift(
+                1, fill_value=float("-inf")).over(groupby)
+        else:
+            max_so_far = pl.col(temp_cols[1]).cum_max().shift(
+                1, fill_value=float("-inf"))
+
+        mask = pl.col(temp_cols[1]) > max_so_far
+        return sorted_df.filter(mask).drop(temp_cols)
+
+    # N objectives (N > 2): pairwise dominance check
+    df_with_id = df.with_row_index("__id")
+
+    # Self-join to compare all pairs
+    left = df_with_id.lazy()
+    right = df_with_id.lazy()
+
+    if groupby:
+        pairs = left.join(right, on=groupby, suffix="_r")
+    else:
+        pairs = left.join(right, how="cross", suffix="_r")
+
+    # Only compare different rows
+    pairs = pairs.filter(pl.col("__id") != pl.col("__id_r"))
+
+    # Build dominance conditions
+    dominance_conditions = []
+    for col, is_max in zip(by, maximize_list):
+        if is_max:
+            # right dominates left if right[col] >= left[col] for all cols
+            dominance_conditions.append(pl.col(f"{col}_r") >= pl.col(col))
+        else:
+            dominance_conditions.append(pl.col(f"{col}_r") <= pl.col(col))
+
+    # Strict dominance: all >= and at least one >
+    strict_conditions = []
+    for col, is_max in zip(by, maximize_list):
+        if is_max:
+            strict_conditions.append(pl.col(f"{col}_r") > pl.col(col))
+        else:
+            strict_conditions.append(pl.col(f"{col}_r") < pl.col(col))
+
+    is_dominated = pl.all_horizontal(
+        dominance_conditions) & pl.any_horizontal(strict_conditions)
+
+    # Find IDs of dominated rows
+    dominated_ids = (
+        pairs.filter(is_dominated)
+        .select("__id")
+        .unique()
+        .collect()
+        .get_column("__id")
+    )
+
+    # Return non-dominated rows
+    return df_with_id.filter(~pl.col("__id").is_in(dominated_ids)).drop("__id")
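`pareto_filter` itself is unchanged from the deleted `optimization.py` (see below), only relocated. A small self-contained sketch on made-up data with hypothetical column names, exercising the two-objective skyline branch (maximise expected value, minimise risk within each race), assuming the package and its polars dependency import cleanly:

```python
# Illustrative only: keep the per-race Pareto frontier of made-up trades.
import polars as pl
from hkjc.utils import pareto_filter

trades = pl.DataFrame({
    "Race":          [1, 1, 1, 2, 2],
    "ExpectedValue": [0.10, 0.05, 0.12, 0.02, 0.08],
    "Risk":          [0.30, 0.10, 0.50, 0.20, 0.20],
})

frontier = pareto_filter(trades, groupby=["Race"],
                         by=["ExpectedValue", "Risk"],
                         maximize=[True, False])
print(frontier)  # drops the Race-2 trade with lower expected value at the same risk
```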
hkjc-0.3.9.dist-info/METADATA → hkjc-0.3.11.dist-info/METADATA CHANGED
@@ -1,8 +1,9 @@
 Metadata-Version: 2.4
 Name: hkjc
-Version: 0.3.9
+Version: 0.3.11
 Summary: Library for scrapping HKJC data and perform basic analysis
 Requires-Python: >=3.11
+Requires-Dist: beautifulsoup4>=4.14.2
 Requires-Dist: cachetools>=6.2.0
 Requires-Dist: fastexcel>=0.16.0
 Requires-Dist: numba>=0.62.1
hkjc-0.3.11.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+hkjc/__init__.py,sha256=TI7PVhmoWSvYX-xdTEdaT3jfY99LiYQFRQZaIwBhJd8,785
+hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
+hkjc/historical.py,sha256=P1eRRHzwhdQ4xR1xspj-HO1OyTSGZJDjJ-V5Sj8Pstg,6026
+hkjc/live_odds.py,sha256=G4ELBBp1d2prxye9kKzu2pwtS4vSfRPOmEuT7-Nd-3A,4741
+hkjc/processing.py,sha256=HeJmEyHe0JHO2V68dmm8eD9EIvjUUrFg2dhoYvYaik8,7064
+hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
+hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
+hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
+hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
+hkjc-0.3.11.dist-info/METADATA,sha256=XxHC610mB4eZNVl5g_jGHBE1Rq5oW_VPXDzLwz0FR6k,452
+hkjc-0.3.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hkjc-0.3.11.dist-info/RECORD,,
hkjc/optimization.py DELETED
@@ -1,106 +0,0 @@
-import polars as pl
-from typing import List, Union
-
-def pareto_filter(
-    df: pl.DataFrame,
-    groupby: List[str],
-    by: List[str],
-    maximize: Union[bool, List[bool]] = True
-) -> pl.DataFrame:
-    """
-    Filter dataframe to only include Pareto optimal rows within each group.
-
-    Args:
-        df: Input dataframe
-        groupby: Columns to group by (empty list for global filter)
-        by: Columns to consider for Pareto optimality
-        maximize: Whether to maximize (True) or minimize (False) each 'by' column
-
-    Returns:
-        DataFrame containing only Pareto optimal rows
-    """
-    if df.is_empty() or not by:
-        return df
-
-    # Normalize maximize to list
-    maximize_list = [maximize] * len(by) if isinstance(maximize, bool) else maximize
-
-    if len(maximize_list) != len(by):
-        raise ValueError(f"Length of 'maximize' ({len(maximize_list)}) must equal length of 'by' ({len(by)})")
-
-    # Single objective: simple min/max filter
-    if len(by) == 1:
-        opt_expr = pl.col(by[0]).max() if maximize_list[0] else pl.col(by[0]).min()
-        if groupby:
-            opt_expr = opt_expr.over(groupby)
-        return df.filter(pl.col(by[0]) == opt_expr)
-
-    # Two objectives: efficient skyline algorithm
-    if len(by) == 2:
-        temp_cols = ["__obj_0", "__obj_1"]
-
-        # Transform to maximization problem
-        df_temp = df.with_columns([
-            (pl.col(by[i]) * (1 if maximize_list[i] else -1)).alias(temp_cols[i])
-            for i in range(2)
-        ])
-
-        # Sort by first objective descending, then second descending (for stability)
-        groupby = groupby or []
-        sort_cols = (groupby if groupby else []) + temp_cols
-        sorted_df = df_temp.sort(sort_cols, descending=[False] * len(groupby) + [True, True])
-
-        # Keep rows where second objective is not dominated by any previous row in group
-        if groupby:
-            max_so_far = pl.col(temp_cols[1]).cum_max().shift(1, fill_value=float("-inf")).over(groupby)
-        else:
-            max_so_far = pl.col(temp_cols[1]).cum_max().shift(1, fill_value=float("-inf"))
-
-        mask = pl.col(temp_cols[1]) > max_so_far
-        return sorted_df.filter(mask).drop(temp_cols)
-
-    # N objectives (N > 2): pairwise dominance check
-    df_with_id = df.with_row_index("__id")
-
-    # Self-join to compare all pairs
-    left = df_with_id.lazy()
-    right = df_with_id.lazy()
-
-    if groupby:
-        pairs = left.join(right, on=groupby, suffix="_r")
-    else:
-        pairs = left.join(right, how="cross", suffix="_r")
-
-    # Only compare different rows
-    pairs = pairs.filter(pl.col("__id") != pl.col("__id_r"))
-
-    # Build dominance conditions
-    dominance_conditions = []
-    for col, is_max in zip(by, maximize_list):
-        if is_max:
-            # right dominates left if right[col] >= left[col] for all cols
-            dominance_conditions.append(pl.col(f"{col}_r") >= pl.col(col))
-        else:
-            dominance_conditions.append(pl.col(f"{col}_r") <= pl.col(col))
-
-    # Strict dominance: all >= and at least one >
-    strict_conditions = []
-    for col, is_max in zip(by, maximize_list):
-        if is_max:
-            strict_conditions.append(pl.col(f"{col}_r") > pl.col(col))
-        else:
-            strict_conditions.append(pl.col(f"{col}_r") < pl.col(col))
-
-    is_dominated = pl.all_horizontal(dominance_conditions) & pl.any_horizontal(strict_conditions)
-
-    # Find IDs of dominated rows
-    dominated_ids = (
-        pairs.filter(is_dominated)
-        .select("__id")
-        .unique()
-        .collect()
-        .get_column("__id")
-    )
-
-    # Return non-dominated rows
-    return df_with_id.filter(~pl.col("__id").is_in(dominated_ids)).drop("__id")
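The body above is the same `pareto_filter` that now lives in `hkjc/utils.py` (reformatted but functionally identical), so the only migration for downstream code is the import path:

```python
# Before 0.3.11 (module removed in this release):
# from hkjc.optimization import pareto_filter

# From 0.3.11 onwards:
from hkjc.utils import pareto_filter

# The top-level re-export works in both versions:
from hkjc import pareto_filter
```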
hkjc-0.3.9.dist-info/RECORD DELETED
@@ -1,13 +0,0 @@
-hkjc/__init__.py,sha256=jgA3OiBaRifvNd5b5qR7VqdBTFfY1t9zQwhiQYh-Q4o,714
-hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
-hkjc/historical.py,sha256=wKTJi--0Mx_x0vO0ysOGD37oM8453woQK-cLzPOLgiQ,336
-hkjc/live_odds.py,sha256=HQZCvEMUG4YNVj2IaFshU5HD0j5mfBSSDhksNla-ERk,4768
-hkjc/optimization.py,sha256=p_NwPfl8qrcg2XWfHX4D7_jSRT819oVcctK-4VuvtME,3783
-hkjc/processing.py,sha256=bOc1j7xjeguMNDwJ2rovFf24xkdTSfaTD3O15J3JR2Q,4919
-hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
-hkjc/strategy/place_only.py,sha256=Dfzqr1PmWd9xHpylXO0Zlww9xMoIFPQ_gMHvRunw_1Q,2049
-hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
-hkjc-0.3.9.dist-info/METADATA,sha256=cF0R7g7vlmAGFh7x0r3s67uEQoufyYeMUuBmEsw25c8,413
-hkjc-0.3.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-hkjc-0.3.9.dist-info/RECORD,,