hkjc 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hkjc/__init__.py CHANGED
@@ -6,7 +6,8 @@ from importlib.metadata import version as _version
 
 __all__ = ["live_odds", "qpbanker",
            "generate_all_qp_trades", "generate_all_pla_trades", "pareto_filter",
-           "speedpro_energy", "speedmap", "harveille_model"]
+           "speedpro_energy", "speedmap", "harveille_model",
+           "generate_historical_data"]
 
 try:
     __version__ = _version(__name__)
@@ -14,7 +15,7 @@ except Exception:  # pragma: no cover - best-effort version resolution
     __version__ = "0.0.0"
 
 from .live_odds import live_odds
-from .processing import generate_all_qp_trades, generate_all_pla_trades
-from .optimization import pareto_filter
+from .processing import generate_all_qp_trades, generate_all_pla_trades, generate_historical_data
+from .utils import pareto_filter
 from .speedpro import speedmap, speedpro_energy
 from . import harville_model
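With this change the package root re-exports the new historical pipeline alongside the existing helpers. A minimal sketch of the intended import path (illustrative only, assuming the installed package imports cleanly in your environment):

```python
# Illustrative only: names re-exported from the package root after 0.3.11.
from hkjc import generate_historical_data, pareto_filter, live_odds

print(generate_historical_data.__module__)  # hkjc.processing
print(pareto_filter.__module__)             # hkjc.utils (was hkjc.optimization in 0.3.9)
```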
hkjc/historical.py CHANGED
@@ -3,11 +3,145 @@
 from __future__ import annotations
 
 import requests
+import polars as pl
+from bs4 import BeautifulSoup
+from cachetools.func import ttl_cache
 
-# TODO read and process all races from start date to end date
+from utils import _validate_date, _validate_venue_code, _parse_html_table
 
-# TODO query all basic info and race history for a specific horse
+HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
+HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId={horse_id}"
 
-# TODO classify running style & draw to determine blocking probability
 
-#
+@ttl_cache(maxsize=100, ttl=3600)
+def _soupify(url: str) -> BeautifulSoup:
+    """Fetch and parse a webpage and return BeautifulSoup object
+    """
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+    return BeautifulSoup(response.content, 'html.parser')
+
+
+def _soupify_race_page(date: str, venue_code: str, race_number: int) -> BeautifulSoup:
+    """Fetch and parse HKJC race results page and return BeautifulSoup object
+    """
+    url = HKJC_RACE_URL_TEMPLATE.format(
+        date=date, venue_code=venue_code, race_number=race_number)
+    return _soupify(url)
+
+
+def _soupify_horse_page(horse_id: str) -> BeautifulSoup:
+    """Fetch and parse HKJC race results page and return BeautifulSoup object
+    """
+    url = HKJC_HORSE_URL_TEMPLATE.format(horse_id=horse_id)
+    return _soupify(url)
+
+
+
+def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition") -> pl.DataFrame:
+    """Classify running style based on RunningPosition column
+    """
+    # Split the RunningPosition column into separate columns and convert to integers
+    df = df.with_columns(
+        pl.col(running_pos_col)
+        .str.split_exact(" ", n=3)
+        .struct.rename_fields(["StartPosition", "Position2", "Position3", "FinishPosition"])
+        # Give an alias to the struct for easier selection
+        .alias("split_data").cast(pl.Int64, strict=False)
+    ).unnest("split_data")
+
+    df = df.with_columns([
+        (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
+        pl.mean_horizontal("StartPosition", "Position2",
+                           "Position3", "FinishPosition").alias("AvgPosition"),
+    ]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
+                    .when((pl.col("PositionChange") <= 0) & pl.col("StartPosition") <= 3).then(pl.lit("FrontRunner"))
+                    .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
+                    .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
+
+    recent_style = df['RunningStyle'][:10].mode()[0]
+    df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))
+
+    return df
+
+
+def _extract_horse_data(horse_id: str) -> pl.DataFrame:
+    """Extract horse info and history from horse page
+    """
+    soup = _soupify_horse_page(horse_id)
+    table = soup.find('table', class_='bigborder')
+    horse_data = _parse_html_table(table).filter(
+        pl.col('Date') != '')  # Remove empty rows
+    horse_data = _classify_running_style(horse_data)
+
+    # Extract horse profile info
+    table = soup.find_all('table', class_='table_eng_text')
+    profile_data = _parse_html_table(table[0], skip_header=True)
+    country, age = profile_data.filter(pl.col("column_0").str.starts_with("Country"))['column_2'].item(0).split('/')
+    profile_data = _parse_html_table(table[1], skip_header=True)
+    current_rating = profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0)
+    season_start_rating = profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0)
+
+    horse_info = {
+        'HorseID': horse_id,
+        'OriginCountry': country.strip(),
+        'Age': int(age),
+        'CurrentRating': int(current_rating),
+        'SeasonStartRating': int(season_start_rating)
+    }
+    horse_data = (horse_data.with_columns([
+        pl.lit(value).alias(key) for key, value in horse_info.items()
+    ])
+    )
+    return horse_data
+
+
+def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
+    soup = _soupify_race_page(date, venue_code, race_number)
+    table = soup.find('div', class_='race_tab').find('table')
+    race_data = _parse_html_table(table)
+
+    # Extract the relevant race information
+    race_class = race_data.item(1, 0).split('-')[0].strip()
+    race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
+    race_name = race_data.item(2, 0).strip()
+    going = race_data.item(1, 2).strip()
+    course = race_data.item(2, 2).strip()
+
+    race_info = {'RaceDate': date,
+                 'Venue': venue_code,
+                 'RaceNumber': race_number,
+                 'RaceClass': race_class,
+                 'RaceDistance': race_dist,
+                 'RaceName': race_name,
+                 'Going': going,
+                 'Course': course}
+
+    # Extract the results table
+    table = soup.find('div', class_='performance').find('table')
+    race_data = (_parse_html_table(table)
+                 .with_columns([
+                     pl.lit(value).alias(key) for key, value in race_info.items()
+                 ])
+                 )
+
+    # Extract horse IDs from links
+    horse_ids = []
+    rows = table.find_all('tr')[1:]  # Skip header row
+    for row in rows:
+        horse_id = 'UNKNOWN'  # Horse link not found
+        links = row.find_all('a')
+        for link in links:
+            if 'href' in link.attrs and 'HorseId=' in link['href']:
+                horse_id = link['href'].split('HorseId=')[1]
+                break
+        horse_ids.append(horse_id)
+
+    race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
+
+    # Join with horse data
+    horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
+    horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
+    race_data = race_data.join(horse_data_df, on='HorseID', how='left')
+
+    return race_data
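For orientation, a minimal sketch of how these new helpers chain together for a single hypothetical race. Everything shown is private API; the date uses the `YYYY/MM/DD` form embedded in the results URL, and the sketch assumes network access to racing.hkjc.com and that the `utils` import at the top of `historical.py` resolves in your environment:

```python
# Illustrative only: scrape one hypothetical race and peek at the joined result.
from hkjc.historical import _extract_race_data

race_df = _extract_race_data("2025/01/01", "ST", 1)  # date, venue code, race number
print(race_df.columns)                               # race info plus per-horse history columns
print(race_df.select(["HorseID", "RunningStyle", "CurrentRating"]).head())
```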
hkjc/live_odds.py CHANGED
@@ -6,9 +6,10 @@ from typing import Tuple, List
 import requests
 from cachetools.func import ttl_cache
 import numpy as np
-from datetime import datetime as dt
 
-ENDPOINT = "https://info.cld.hkjc.com/graphql/base/"
+from .utils import _validate_date, _validate_venue_code
+
+HKJC_LIVEODDS_ENDPOINT = "https://info.cld.hkjc.com/graphql/base/"
 
 LIVEODDS_PAYLOAD = {
     "operationName": "racing",
@@ -70,7 +71,8 @@ def _fetch_live_odds(date: str, venue_code: str, race_number: int, odds_type: Tu
         "User-Agent": "python-hkjc-fetch/0.1",
     }
 
-    r = requests.post(ENDPOINT, json=payload, headers=headers, timeout=10)
+    r = requests.post(HKJC_LIVEODDS_ENDPOINT, json=payload,
+                      headers=headers, timeout=10)
    if r.status_code != 200:
        raise RuntimeError(f"Request failed: {r.status_code} - {r.text}")
 
@@ -104,11 +106,8 @@ def live_odds(date: str, venue_code: str, race_number: int, odds_type: List[str]
         If odds_type is 'WIN','PLA', returns a 1D array of place odds.
         If odds_type is 'QIN','QPL', returns a 2D array of quinella place odds.
     """
-    # validate date format
-    try:
-        dt.strptime(date, "%Y-%m-%d")
-    except Exception:
-        raise ValueError("Date must be in 'YYYY-MM-DD' format")
+    _validate_date(date)
+    _validate_venue_code(venue_code)
 
    mandatory_types = ['PLA']
 
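The practical effect of the new checks is that malformed input now fails fast with a `ValueError` before any HTTP request is sent. A small sketch (hypothetical date and race number, assuming the package imports cleanly in your environment):

```python
# Illustrative only: the validation added in 0.3.11 rejects bad input up front.
from hkjc.live_odds import live_odds

for date, venue in [("01/01/2025", "HV"), ("2025-01-01", "XX")]:
    try:
        live_odds(date, venue, 1, odds_type=["PLA"])
    except ValueError as exc:
        print(exc)  # bad date format first, then unknown venue code
```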
hkjc/processing.py CHANGED
@@ -1,22 +1,87 @@
 """Functions to batch process trades into dataframes for analysis.
 """
 from __future__ import annotations
-from typing import Tuple, List
+from typing import Tuple, List, Union
 
 from .live_odds import live_odds
 from .strategy import qpbanker, place_only
 from .harville_model import fit_harville_to_odds
+from .historical import _extract_race_data
+from .utils import _validate_date
 
 import polars as pl
 import numpy as np
 from itertools import combinations
 from tqdm import tqdm
+from datetime import datetime as dt
 
 
 def _all_subsets(lst): return [list(x) for r in range(
     1, len(lst)+1) for x in combinations(lst, r)]  # list subsets of a list
 
 
+# ======================================
+# Historical data processing functions
+# ======================================
+incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
+             'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
+
+
+def _historical_process_single_date_venue(date: str, venue_code: str) -> Union[pl.DataFrame, None]:
+    for race_number in range(1, 12):
+        try:
+            _extract_race_data(date.strftime('%Y/%m/%d'),
+                               venue_code, race_number)
+        except:
+            return None
+
+
+def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
+    """Generate historical race dataset from start_date to end_date"""
+    _validate_date(start_date)
+    _validate_date(end_date)
+    start_dt = dt.strptime(start_date, '%Y-%m-%d')
+    end_dt = dt.strptime(end_date, '%Y-%m-%d')
+
+    dfs = []
+
+    for date in pl.date_range(start_dt, end_dt, interval='1d'):
+        for venue_code in ['ST', 'HV']:
+            df = _historical_process_single_date_venue(date, venue_code)
+            if df is None:
+                continue
+            dfs.append(df)
+
+    df = (pl.concat(dfs)
+          .filter(~pl.col('Pla').is_in(incidents))
+          .with_columns(
+              pl.col('Pla').str.split(' ').list.first().alias('Pla')
+          )
+          )
+
+    df = df.with_columns([
+        pl.col('Pla').cast(pl.Int64, strict=False),
+        pl.col('HorseNo').cast(pl.Int64, strict=False),
+        pl.col('ActWt').cast(pl.Int64, strict=False),
+        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+        pl.col('Dr').cast(pl.Int64, strict=False),
+        pl.col('RaceDistance').cast(pl.Int64, strict=False),
+        pl.col('WinOdds').cast(pl.Float64, strict=False)
+    ])
+
+    df = df.with_columns(pl.col('Finish Time')
+                         .str.strptime(pl.Duration, format='%M:%S.%f', strict=False)
+                         .dt.total_seconds()
+                         .alias('Finish Time')
+                         )
+
+    return df
+
+
+# ==========================
+# Trade processing functions
+# ==========================
+
 def _process_single_qp_trade(banker: int, covered: List[int], pla_odds: np.ndarray, qpl_odds: np.ndarray, rebate: float) -> Tuple[int, List, float, float, float]:
     """Process a single qp trade.
     """
hkjc/strategy/place_only.py CHANGED
@@ -17,7 +17,7 @@ def win_probability(p_matrix: np.ndarray, covered: List[int]) -> float:
         float: probability
     """
 
-    win_prob = 1-np.prod(1-np.sum(p_matrix[covered, :3], axis=1))
+    win_prob = 1-np.prod(1-np.sum([p_matrix[c-1, :3] for c in covered], axis=1))
     return win_prob
 
 
@@ -35,7 +35,7 @@ def expected_value(pla_odds: np.ndarray, p_matrix: np.ndarray, covered: List[int
     """
     true_prob = np.sum(p_matrix[:, :3], axis=1)
     C = len(covered)
-    ev = np.sum((true_prob*(pla_odds-rebate))[covered])/C - (1-rebate)
+    ev = np.sum([(true_prob*(pla_odds-rebate))[c-1] for c in covered])/C - (1-rebate)
     return ev
 
 def average_odds(pla_odds: np.ndarray, covered: List[int]) -> float:
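The two one-line changes above reinterpret `covered`: previously its entries indexed `p_matrix` rows directly (0-based), now each entry `c` is treated as a 1-based horse number and looked up at row `c-1`. A self-contained check of the new `win_probability` arithmetic, using a hypothetical 4-horse place-probability matrix:

```python
# Illustrative only: reproduce the new win_probability arithmetic by hand.
import numpy as np

# p_matrix[i, j] = P(horse i+1 finishes in position j+1); values are hypothetical.
p_matrix = np.array([
    [0.40, 0.25, 0.15, 0.20],
    [0.30, 0.30, 0.20, 0.20],
    [0.20, 0.25, 0.35, 0.20],
    [0.10, 0.20, 0.30, 0.40],
])
covered = [1, 3]  # 1-based horse numbers, as the new code expects

place_probs = [p_matrix[c - 1, :3].sum() for c in covered]  # P(horse finishes top 3)
win_prob = 1 - np.prod([1 - p for p in place_probs])        # P(at least one covered horse places)
print(round(win_prob, 4))  # 0.96 for these numbers
```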
hkjc/utils.py ADDED
@@ -0,0 +1,189 @@
+import polars as pl
+from typing import List, Union
+from datetime import datetime as dt
+import bs4
+import re
+
+
+def _validate_date(date_str: str) -> bool:
+    # validate date format
+    try:
+        dt.strptime(date_str, "%Y-%m-%d")
+    except Exception:
+        raise ValueError("Date must be in 'YYYY-MM-DD' format")
+    return True
+
+
+def _validate_venue_code(venue_code: str) -> bool:
+    if venue_code not in ['HV', 'ST']:
+        raise ValueError(
+            "Venue code must be 'HV' (Happy Valley) or 'ST' (Sha Tin)")
+    return True
+
+
+def _parse_html_table(table: bs4.element.Tag, skip_header=False) -> pl.DataFrame:
+    """Parse an HTML table (HKJC format) into a Polars DataFrame
+    """
+    if table is None:
+        raise ValueError("No table found in HTML tag")
+
+    # Extract headers
+    headers = []
+    if not skip_header:
+        header_row = table.find('thead')
+        if header_row:
+            headers = [td.get_text(strip=True) for td in header_row.find_all('td')]
+        else:
+            # If no thead, try first tr
+            first_row = table.find('tr')
+            if first_row:
+                headers = [th.get_text(strip=True)
+                           for th in first_row.find_all('th')]
+                if not headers:
+                    # If no th tags, use td tags from first row as headers
+                    headers = [td.get_text(strip=True)
+                               for td in first_row.find_all('td')]
+
+    # Extract data rows
+    data = []
+    tbody = table.find('tbody')
+    rows = tbody.find_all('tr') if tbody else table.find_all('tr')
+
+    # Skip first row if it was used for headers
+    start_idx = 1 if not tbody and headers else 0
+
+    for row in rows[start_idx:]:
+        cells = row.find_all(['td', 'th'])
+        row_data = [cell.get_text(separator=' ',strip=True) for cell in cells]
+        if row_data:  # Skip empty rows
+            data.append(row_data)
+
+    # Create DataFrame
+    if not headers:
+        # Generate default column names if no headers found
+        headers = [f"column_{i}" for i in range(len(data[0]))] if data else []
+
+    # Ensure all rows have the same number of columns
+    if data:
+        max_cols = len(headers)
+        data = [row + [''] * (max_cols - len(row)) if len(row)
+                < max_cols else row[:max_cols] for row in data]
+
+    df = pl.DataFrame(data, schema=headers, orient='row')
+    # Clean column names by removing special characters
+    df.columns = [re.sub(r'[^\w]', '', col)
+                  for col in df.columns]
+    return df
+
+
+def pareto_filter(
+    df: pl.DataFrame,
+    groupby: List[str],
+    by: List[str],
+    maximize: Union[bool, List[bool]] = True
+) -> pl.DataFrame:
+    """
+    Filter dataframe to only include Pareto optimal rows within each group.
+
+    Args:
+        df: Input dataframe
+        groupby: Columns to group by (empty list for global filter)
+        by: Columns to consider for Pareto optimality
+        maximize: Whether to maximize (True) or minimize (False) each 'by' column
+
+    Returns:
+        DataFrame containing only Pareto optimal rows
+    """
+    if df.is_empty() or not by:
+        return df
+
+    # Normalize maximize to list
+    maximize_list = [maximize] * \
+        len(by) if isinstance(maximize, bool) else maximize
+
+    if len(maximize_list) != len(by):
+        raise ValueError(
+            f"Length of 'maximize' ({len(maximize_list)}) must equal length of 'by' ({len(by)})")
+
+    # Single objective: simple min/max filter
+    if len(by) == 1:
+        opt_expr = pl.col(by[0]).max(
+        ) if maximize_list[0] else pl.col(by[0]).min()
+        if groupby:
+            opt_expr = opt_expr.over(groupby)
+        return df.filter(pl.col(by[0]) == opt_expr)
+
+    # Two objectives: efficient skyline algorithm
+    if len(by) == 2:
+        temp_cols = ["__obj_0", "__obj_1"]
+
+        # Transform to maximization problem
+        df_temp = df.with_columns([
+            (pl.col(by[i]) * (1 if maximize_list[i] else -1)
+             ).alias(temp_cols[i])
+            for i in range(2)
+        ])
+
+        # Sort by first objective descending, then second descending (for stability)
+        groupby = groupby or []
+        sort_cols = (groupby if groupby else []) + temp_cols
+        sorted_df = df_temp.sort(sort_cols, descending=[
+            False] * len(groupby) + [True, True])
+
+        # Keep rows where second objective is not dominated by any previous row in group
+        if groupby:
+            max_so_far = pl.col(temp_cols[1]).cum_max().shift(
+                1, fill_value=float("-inf")).over(groupby)
+        else:
+            max_so_far = pl.col(temp_cols[1]).cum_max().shift(
+                1, fill_value=float("-inf"))
+
+        mask = pl.col(temp_cols[1]) > max_so_far
+        return sorted_df.filter(mask).drop(temp_cols)
+
+    # N objectives (N > 2): pairwise dominance check
+    df_with_id = df.with_row_index("__id")
+
+    # Self-join to compare all pairs
+    left = df_with_id.lazy()
+    right = df_with_id.lazy()
+
+    if groupby:
+        pairs = left.join(right, on=groupby, suffix="_r")
+    else:
+        pairs = left.join(right, how="cross", suffix="_r")
+
+    # Only compare different rows
+    pairs = pairs.filter(pl.col("__id") != pl.col("__id_r"))
+
+    # Build dominance conditions
+    dominance_conditions = []
+    for col, is_max in zip(by, maximize_list):
+        if is_max:
+            # right dominates left if right[col] >= left[col] for all cols
+            dominance_conditions.append(pl.col(f"{col}_r") >= pl.col(col))
+        else:
+            dominance_conditions.append(pl.col(f"{col}_r") <= pl.col(col))
+
+    # Strict dominance: all >= and at least one >
+    strict_conditions = []
+    for col, is_max in zip(by, maximize_list):
+        if is_max:
+            strict_conditions.append(pl.col(f"{col}_r") > pl.col(col))
+        else:
+            strict_conditions.append(pl.col(f"{col}_r") < pl.col(col))
+
+    is_dominated = pl.all_horizontal(
+        dominance_conditions) & pl.any_horizontal(strict_conditions)
+
+    # Find IDs of dominated rows
+    dominated_ids = (
+        pairs.filter(is_dominated)
+        .select("__id")
+        .unique()
+        .collect()
+        .get_column("__id")
+    )
+
+    # Return non-dominated rows
+    return df_with_id.filter(~pl.col("__id").is_in(dominated_ids)).drop("__id")
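`pareto_filter` itself is unchanged from the deleted `optimization.py` (see below), only relocated. A small self-contained sketch on made-up data with hypothetical column names, exercising the two-objective skyline branch (maximise expected value, minimise risk within each race), assuming the package and its polars dependency import cleanly:

```python
# Illustrative only: keep the per-race Pareto frontier of made-up trades.
import polars as pl
from hkjc.utils import pareto_filter

trades = pl.DataFrame({
    "Race":          [1, 1, 1, 2, 2],
    "ExpectedValue": [0.10, 0.05, 0.12, 0.02, 0.08],
    "Risk":          [0.30, 0.10, 0.50, 0.20, 0.20],
})

frontier = pareto_filter(trades, groupby=["Race"],
                         by=["ExpectedValue", "Risk"],
                         maximize=[True, False])
print(frontier)  # drops the Race-2 trade with lower expected value at the same risk
```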
hkjc-0.3.9.dist-info/METADATA → hkjc-0.3.11.dist-info/METADATA CHANGED
@@ -1,8 +1,9 @@
 Metadata-Version: 2.4
 Name: hkjc
-Version: 0.3.9
+Version: 0.3.11
 Summary: Library for scrapping HKJC data and perform basic analysis
 Requires-Python: >=3.11
+Requires-Dist: beautifulsoup4>=4.14.2
 Requires-Dist: cachetools>=6.2.0
 Requires-Dist: fastexcel>=0.16.0
 Requires-Dist: numba>=0.62.1
hkjc-0.3.11.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+hkjc/__init__.py,sha256=TI7PVhmoWSvYX-xdTEdaT3jfY99LiYQFRQZaIwBhJd8,785
+hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
+hkjc/historical.py,sha256=P1eRRHzwhdQ4xR1xspj-HO1OyTSGZJDjJ-V5Sj8Pstg,6026
+hkjc/live_odds.py,sha256=G4ELBBp1d2prxye9kKzu2pwtS4vSfRPOmEuT7-Nd-3A,4741
+hkjc/processing.py,sha256=HeJmEyHe0JHO2V68dmm8eD9EIvjUUrFg2dhoYvYaik8,7064
+hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
+hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
+hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
+hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
+hkjc-0.3.11.dist-info/METADATA,sha256=XxHC610mB4eZNVl5g_jGHBE1Rq5oW_VPXDzLwz0FR6k,452
+hkjc-0.3.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hkjc-0.3.11.dist-info/RECORD,,
hkjc/optimization.py DELETED
@@ -1,106 +0,0 @@
-import polars as pl
-from typing import List, Union
-
-def pareto_filter(
-    df: pl.DataFrame,
-    groupby: List[str],
-    by: List[str],
-    maximize: Union[bool, List[bool]] = True
-) -> pl.DataFrame:
-    """
-    Filter dataframe to only include Pareto optimal rows within each group.
-
-    Args:
-        df: Input dataframe
-        groupby: Columns to group by (empty list for global filter)
-        by: Columns to consider for Pareto optimality
-        maximize: Whether to maximize (True) or minimize (False) each 'by' column
-
-    Returns:
-        DataFrame containing only Pareto optimal rows
-    """
-    if df.is_empty() or not by:
-        return df
-
-    # Normalize maximize to list
-    maximize_list = [maximize] * len(by) if isinstance(maximize, bool) else maximize
-
-    if len(maximize_list) != len(by):
-        raise ValueError(f"Length of 'maximize' ({len(maximize_list)}) must equal length of 'by' ({len(by)})")
-
-    # Single objective: simple min/max filter
-    if len(by) == 1:
-        opt_expr = pl.col(by[0]).max() if maximize_list[0] else pl.col(by[0]).min()
-        if groupby:
-            opt_expr = opt_expr.over(groupby)
-        return df.filter(pl.col(by[0]) == opt_expr)
-
-    # Two objectives: efficient skyline algorithm
-    if len(by) == 2:
-        temp_cols = ["__obj_0", "__obj_1"]
-
-        # Transform to maximization problem
-        df_temp = df.with_columns([
-            (pl.col(by[i]) * (1 if maximize_list[i] else -1)).alias(temp_cols[i])
-            for i in range(2)
-        ])
-
-        # Sort by first objective descending, then second descending (for stability)
-        groupby = groupby or []
-        sort_cols = (groupby if groupby else []) + temp_cols
-        sorted_df = df_temp.sort(sort_cols, descending=[False] * len(groupby) + [True, True])
-
-        # Keep rows where second objective is not dominated by any previous row in group
-        if groupby:
-            max_so_far = pl.col(temp_cols[1]).cum_max().shift(1, fill_value=float("-inf")).over(groupby)
-        else:
-            max_so_far = pl.col(temp_cols[1]).cum_max().shift(1, fill_value=float("-inf"))
-
-        mask = pl.col(temp_cols[1]) > max_so_far
-        return sorted_df.filter(mask).drop(temp_cols)
-
-    # N objectives (N > 2): pairwise dominance check
-    df_with_id = df.with_row_index("__id")
-
-    # Self-join to compare all pairs
-    left = df_with_id.lazy()
-    right = df_with_id.lazy()
-
-    if groupby:
-        pairs = left.join(right, on=groupby, suffix="_r")
-    else:
-        pairs = left.join(right, how="cross", suffix="_r")
-
-    # Only compare different rows
-    pairs = pairs.filter(pl.col("__id") != pl.col("__id_r"))
-
-    # Build dominance conditions
-    dominance_conditions = []
-    for col, is_max in zip(by, maximize_list):
-        if is_max:
-            # right dominates left if right[col] >= left[col] for all cols
-            dominance_conditions.append(pl.col(f"{col}_r") >= pl.col(col))
-        else:
-            dominance_conditions.append(pl.col(f"{col}_r") <= pl.col(col))
-
-    # Strict dominance: all >= and at least one >
-    strict_conditions = []
-    for col, is_max in zip(by, maximize_list):
-        if is_max:
-            strict_conditions.append(pl.col(f"{col}_r") > pl.col(col))
-        else:
-            strict_conditions.append(pl.col(f"{col}_r") < pl.col(col))
-
-    is_dominated = pl.all_horizontal(dominance_conditions) & pl.any_horizontal(strict_conditions)
-
-    # Find IDs of dominated rows
-    dominated_ids = (
-        pairs.filter(is_dominated)
-        .select("__id")
-        .unique()
-        .collect()
-        .get_column("__id")
-    )
-
-    # Return non-dominated rows
-    return df_with_id.filter(~pl.col("__id").is_in(dominated_ids)).drop("__id")
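The body above is the same `pareto_filter` that now lives in `hkjc/utils.py` (reformatted but functionally identical), so the only migration for downstream code is the import path:

```python
# Before 0.3.11 (module removed in this release):
# from hkjc.optimization import pareto_filter

# From 0.3.11 onwards:
from hkjc.utils import pareto_filter

# The top-level re-export works in both versions:
from hkjc import pareto_filter
```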
hkjc-0.3.9.dist-info/RECORD DELETED
@@ -1,13 +0,0 @@
-hkjc/__init__.py,sha256=jgA3OiBaRifvNd5b5qR7VqdBTFfY1t9zQwhiQYh-Q4o,714
-hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
-hkjc/historical.py,sha256=wKTJi--0Mx_x0vO0ysOGD37oM8453woQK-cLzPOLgiQ,336
-hkjc/live_odds.py,sha256=HQZCvEMUG4YNVj2IaFshU5HD0j5mfBSSDhksNla-ERk,4768
-hkjc/optimization.py,sha256=p_NwPfl8qrcg2XWfHX4D7_jSRT819oVcctK-4VuvtME,3783
-hkjc/processing.py,sha256=bOc1j7xjeguMNDwJ2rovFf24xkdTSfaTD3O15J3JR2Q,4919
-hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
-hkjc/strategy/place_only.py,sha256=Dfzqr1PmWd9xHpylXO0Zlww9xMoIFPQ_gMHvRunw_1Q,2049
-hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
-hkjc-0.3.9.dist-info/METADATA,sha256=cF0R7g7vlmAGFh7x0r3s67uEQoufyYeMUuBmEsw25c8,413
-hkjc-0.3.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-hkjc-0.3.9.dist-info/RECORD,,