hkjc 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hkjc/analysis.py ADDED
@@ -0,0 +1,3 @@
1
+ # TODO:
2
+
3
+ # Generate filtered live odds, fav run style, dr, current rating, season start rating, track record
hkjc/historical.py CHANGED
@@ -10,7 +10,10 @@ from cachetools.func import ttl_cache
10
10
  from .utils import _parse_html_table
11
11
 
12
12
  HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
13
- HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId={horse_id}"
13
+ HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo={horse_no}"
14
+
15
+ incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
16
+ 'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
14
17
 
15
18
 
16
19
  @ttl_cache(maxsize=100, ttl=3600)
@@ -30,14 +33,13 @@ def _soupify_race_page(date: str, venue_code: str, race_number: int) -> Beautifu
30
33
  return _soupify(url)
31
34
 
32
35
 
33
- def _soupify_horse_page(horse_id: str) -> BeautifulSoup:
36
+ def _soupify_horse_page(horse_no: str) -> BeautifulSoup:
34
37
  """Fetch and parse HKJC race results page and return BeautifulSoup object
35
38
  """
36
- url = HKJC_HORSE_URL_TEMPLATE.format(horse_id=horse_id)
39
+ url = HKJC_HORSE_URL_TEMPLATE.format(horse_no=horse_no)
37
40
  return _soupify(url)
38
41
 
39
42
 
40
-
41
43
  def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition") -> pl.DataFrame:
42
44
  """Classify running style based on RunningPosition column
43
45
  """
@@ -50,25 +52,27 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
50
52
  .alias("split_data").cast(pl.Int64, strict=False)
51
53
  ).unnest("split_data")
52
54
 
55
+ df = df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
56
+
53
57
  df = df.with_columns([
54
58
  (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
55
59
  pl.mean_horizontal("StartPosition", "Position2",
56
60
  "Position3", "FinishPosition").alias("AvgPosition"),
57
61
  ]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
58
- .when((pl.col("PositionChange") <= 0) & pl.col("StartPosition") <= 3).then(pl.lit("FrontRunner"))
62
+ .when((pl.col("AvgPosition") <= 3.5) & (pl.col("StartPosition") <= 3)).then(pl.lit("FrontRunner"))
59
63
  .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
60
64
  .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
61
65
 
62
- recent_style = df['RunningStyle'][:10].mode()[0]
66
+ recent_style = df['RunningStyle'][:5].mode()[0]
63
67
  df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))
64
68
 
65
69
  return df
66
70
 
67
71
 
68
- def _extract_horse_data(horse_id: str) -> pl.DataFrame:
72
+ def _extract_horse_data(horse_no: str) -> pl.DataFrame:
69
73
  """Extract horse info and history from horse page
70
74
  """
71
- soup = _soupify_horse_page(horse_id)
75
+ soup = _soupify_horse_page(horse_no)
72
76
  table = soup.find('table', class_='bigborder')
73
77
  horse_data = _parse_html_table(table).filter(
74
78
  pl.col('Date') != '') # Remove empty rows
@@ -78,35 +82,113 @@ def _extract_horse_data(horse_id: str) -> pl.DataFrame:
78
82
  table = soup.find_all('table', class_='table_eng_text')
79
83
  profile_data = _parse_html_table(table[0], skip_header=True)
80
84
  profile_data = _parse_html_table(table[1], skip_header=True)
81
- current_rating = profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0)
82
- season_start_rating = profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0)
85
+
86
+ try:
87
+ current_rating = int(profile_data.filter(
88
+ pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
89
+ season_start_rating = int(profile_data.filter(pl.col(
90
+ "column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
91
+ except:
92
+ current_rating, season_start_rating = 0, 0
93
+
94
+ try:
95
+ last_rating = int(profile_data.filter(
96
+ pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
97
+ except:
98
+ last_rating = 0
83
99
 
84
100
  horse_info = {
85
- 'HorseID': horse_id,
86
- 'CurrentRating': int(current_rating),
87
- 'SeasonStartRating': int(season_start_rating)
101
+ 'HorseID': horse_no,
102
+ 'CurrentRating': current_rating,
103
+ 'SeasonStartRating': season_start_rating,
104
+ 'LastRating': last_rating if current_rating == 0 else current_rating
88
105
  }
89
106
  horse_data = (horse_data.with_columns([
90
107
  pl.lit(value).alias(key) for key, value in horse_info.items()
91
108
  ])
92
109
  )
110
+
93
111
  return horse_data
94
112
 
95
113
 
114
+ def _clean_horse_data(df: pl.DataFrame) -> pl.DataFrame:
115
+ """ Clean and convert horse data to suitable data types
116
+ """
117
+ df = df.with_columns(
118
+ pl.col('Pla').str.split(' ').list.first().alias('Pla')
119
+ ).filter(~pl.col('Pla').is_in(incidents))
120
+
121
+ df = df.with_columns([
122
+ pl.col('Pla').cast(pl.Int64, strict=False),
123
+ pl.col('ActWt').cast(pl.Int64, strict=False),
124
+ pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
125
+ pl.col('Dr').cast(pl.Int64, strict=False),
126
+ pl.col('Rtg').cast(pl.Int64, strict=False),
127
+ pl.col('Dist').cast(pl.Int64, strict=False),
128
+ pl.col('WinOdds').cast(pl.Float64, strict=False),
129
+ pl.col('RaceIndex').cast(pl.Int64, strict=False)
130
+ ])
131
+
132
+ df = df.with_columns(
133
+ (
134
+ pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_0").cast(pl.Int64) * 60 +
135
+ pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_1").cast(pl.Int64)
136
+ ).cast(pl.Float64).alias("FinishTime")
137
+ )
138
+
139
+ df = df.with_columns(
140
+ pl.col('RCTrackCourse').str.split_exact(' / ', 2)
141
+ .struct.rename_fields(['Venue', 'Track', 'Course'])
142
+ .alias('RCTrackCourse')
143
+ ).unnest('RCTrackCourse')
144
+
145
+ return df
146
+
147
+ def get_horse_data(horse_no: str) -> pl.DataFrame:
148
+ df = _extract_horse_data(horse_no)
149
+ return _clean_horse_data(df)
150
+
151
+ def _clean_race_data(df: pl.DataFrame) -> pl.DataFrame:
152
+ """ Clean and convert horse data to suitable data types
153
+ """
154
+ df = df.with_columns(
155
+ pl.col('Pla').str.split(' ').list.first().alias('Pla')
156
+ ).filter(~pl.col('Pla').is_in(incidents))
157
+
158
+ df = df.with_columns([
159
+ pl.col('Pla').cast(pl.Int64, strict=False),
160
+ pl.col('HorseNo').cast(pl.Int64, strict=False),
161
+ pl.col('ActWt').cast(pl.Int64, strict=False),
162
+ pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
163
+ pl.col('Dr').cast(pl.Int64, strict=False),
164
+ pl.col('WinOdds').cast(pl.Float64, strict=False)
165
+ ])
166
+
167
+ df = df.with_columns(
168
+ (
169
+ pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_0").cast(pl.Int64) * 60 +
170
+ pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_1").cast(pl.Int64)
171
+ ).cast(pl.Float64).alias("FinishTime")
172
+ )
173
+
174
+ return df
175
+
96
176
  def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
97
177
  soup = _soupify_race_page(date, venue_code, race_number)
98
178
  table = soup.find('div', class_='race_tab').find('table')
99
179
  race_data = _parse_html_table(table)
100
180
 
101
181
  # Extract the relevant race information
182
+ race_id = race_data.columns[0].replace(f'RACE{race_number}', '')
102
183
  race_class = race_data.item(1, 0).split('-')[0].strip()
103
184
  race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
104
185
  race_name = race_data.item(2, 0).strip()
105
186
  going = race_data.item(1, 2).strip()
106
187
  course = race_data.item(2, 2).strip()
107
188
 
108
- race_info = {'RaceDate': date,
189
+ race_info = {'Date': date,
109
190
  'Venue': venue_code,
191
+ 'RaceIndex': int(race_id),
110
192
  'RaceNumber': race_number,
111
193
  'RaceClass': race_class,
112
194
  'RaceDistance': race_dist,
@@ -120,25 +202,15 @@ def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataF
120
202
  .with_columns([
121
203
  pl.lit(value).alias(key) for key, value in race_info.items()
122
204
  ])
123
- )
124
-
125
- # Extract horse IDs from links
126
- horse_ids = []
127
- rows = table.find_all('tr')[1:] # Skip header row
128
- for row in rows:
129
- horse_id = 'UNKNOWN' # Horse link not found
130
- links = row.find_all('a')
131
- for link in links:
132
- if 'href' in link.attrs and 'HorseId=' in link['href']:
133
- horse_id = link['href'].split('HorseId=')[1]
134
- break
135
- horse_ids.append(horse_id)
136
-
137
- race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
138
-
139
- # Join with horse data
140
- horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
141
- horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
142
- race_data = race_data.join(horse_data_df, on='HorseID', how='left')
143
-
144
- return race_data
205
+ .with_columns(
206
+ pl.col("Horse").str.extract(r"\((.*?)\)")
207
+ .alias("HorseID")
208
+ )
209
+ )
210
+
211
+ return race_data
212
+
213
+
214
+ def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
215
+ df = _extract_race_data(date,venue_code,race_number)
216
+ return _clean_race_data(df)
hkjc/processing.py CHANGED
@@ -6,7 +6,7 @@ from typing import Tuple, List, Union
6
6
  from .live_odds import live_odds
7
7
  from .strategy import qpbanker, place_only
8
8
  from .harville_model import fit_harville_to_odds
9
- from .historical import _extract_race_data
9
+ from .historical import _extract_horse_data, _extract_race_data, _clean_horse_data
10
10
  from .utils import _validate_date
11
11
 
12
12
  import polars as pl
@@ -14,7 +14,6 @@ import numpy as np
14
14
  from itertools import combinations
15
15
  from tqdm import tqdm
16
16
  from datetime import datetime as dt
17
- from joblib import delayed, Parallel
18
17
 
19
18
 
20
19
  def _all_subsets(lst): return [list(x) for r in range(
@@ -24,17 +23,17 @@ def _all_subsets(lst): return [list(x) for r in range(
24
23
  # ======================================
25
24
  # Historical data processing functions
26
25
  # ======================================
27
- incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
28
- 'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
26
+
29
27
 
30
28
 
31
29
  def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl.DataFrame]:
32
30
  dfs = []
33
- iter_date = tqdm(range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
31
+ iter_date = tqdm(
32
+ range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
34
33
  for race_number in iter_date:
35
34
  try:
36
35
  dfs.append(_extract_race_data(date.strftime('%Y/%m/%d'),
37
- venue_code, race_number))
36
+ venue_code, race_number))
38
37
  except:
39
38
  if race_number == 1:
40
39
  iter_date.close()
@@ -51,38 +50,20 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
51
50
 
52
51
  dfs = []
53
52
 
54
- for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True)):
53
+ for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True), leave=False, desc='Scanning for horse IDs ...'):
55
54
  for venue_code in ['ST', 'HV']:
56
55
  dfs += _historical_process_single_date_venue(date, venue_code)
57
56
 
58
57
  if dfs == []:
59
- raise ValueError("Failed to obtain any race data. This could be due to invalid date range, or server requests limit. Please try again later.")
60
-
61
- df = (pl.concat(dfs)
62
- .filter(~pl.col('Pla').is_in(incidents))
63
- .with_columns(
64
- pl.col('Pla').str.split(' ').list.first().alias('Pla')
65
- )
66
- )
67
-
68
- df = df.with_columns([
69
- pl.col('Pla').cast(pl.Int64, strict=False),
70
- pl.col('HorseNo').cast(pl.Int64, strict=False),
71
- pl.col('ActWt').cast(pl.Int64, strict=False),
72
- pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
73
- pl.col('Dr').cast(pl.Int64, strict=False),
74
- pl.col('RaceDistance').cast(pl.Int64, strict=False),
75
- pl.col('WinOdds').cast(pl.Float64, strict=False)
76
- ])
77
-
78
- df = df.with_columns(
79
- (
80
- pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
81
- pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
82
- ).cast(pl.Float64).alias("FinishTime")
83
- )
58
+ raise ValueError(
59
+ "Failed to obtain any race data. This could be due to invalid date range, or server requests limit. Please try again later.")
84
60
 
85
- return df
61
+ horse_ids = pl.concat(dfs)['HorseID'].unique()
62
+
63
+ # Use horse track records
64
+ dfs = [_extract_horse_data(horse_id) for horse_id in tqdm(horse_ids, desc='Processing horses ...', leave=False)]
65
+ df = pl.concat(dfs)
66
+ return _clean_horse_data(df)
86
67
 
87
68
 
88
69
  # ==========================
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hkjc
3
- Version: 0.3.15
3
+ Version: 0.3.17
4
4
  Summary: Library for scrapping HKJC data and perform basic analysis
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: beautifulsoup4>=4.14.2
@@ -1,13 +1,14 @@
1
1
  hkjc/__init__.py,sha256=TI7PVhmoWSvYX-xdTEdaT3jfY99LiYQFRQZaIwBhJd8,785
2
+ hkjc/analysis.py,sha256=0042_NMIkQCl0J6B0P4TFfrBDCnm2B6jsCZKOEO30yI,108
2
3
  hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
3
- hkjc/historical.py,sha256=FRECc4pmozjFKkFeWN0vTzECF9QOS7URyJoCfTt5hlw,5805
4
+ hkjc/historical.py,sha256=yQQAx8vlr2EqcPazpYp1x2ku7dy3imQoDWImHCRv1QA,8330
4
5
  hkjc/live_odds.py,sha256=G4ELBBp1d2prxye9kKzu2pwtS4vSfRPOmEuT7-Nd-3A,4741
5
- hkjc/processing.py,sha256=XeVrF5KKkU3Oy-vqPvMgM22QHVTCVCuml2IsIGdRbYw,7483
6
+ hkjc/processing.py,sha256=xrvEUgu_jz8ZxevOsRsYz0T7pWyNtSCMI6LUYByOLOw,6812
6
7
  hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
8
  hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
8
9
  hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
9
10
  hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
10
11
  hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
11
- hkjc-0.3.15.dist-info/METADATA,sha256=2nQL1EImJ0hXnWFdGnpORIknMdaaFham-Pw9cgjUiO4,481
12
- hkjc-0.3.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
- hkjc-0.3.15.dist-info/RECORD,,
12
+ hkjc-0.3.17.dist-info/METADATA,sha256=gKSkXKYo_HCg2S4ZeAjnqZniWV0V2kGpRH_g25K9Rmo,481
13
+ hkjc-0.3.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
14
+ hkjc-0.3.17.dist-info/RECORD,,
File without changes