hkjc-0.3.14-py3-none-any.whl → hkjc-0.3.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hkjc/analysis.py ADDED
@@ -0,0 +1,3 @@
+ # TODO:
+
+ # Generate filtered live odds, fav run style, dr, current rating, season start rating, track record
hkjc/historical.py CHANGED
@@ -7,10 +7,10 @@ import polars as pl
  from bs4 import BeautifulSoup
  from cachetools.func import ttl_cache

- from .utils import _validate_date, _validate_venue_code, _parse_html_table
+ from .utils import _parse_html_table

  HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
- HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseId={horse_id}"
+ HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo={horse_no}"


  @ttl_cache(maxsize=100, ttl=3600)
@@ -30,10 +30,10 @@ def _soupify_race_page(date: str, venue_code: str, race_number: int) -> Beautifu
      return _soupify(url)


- def _soupify_horse_page(horse_id: str) -> BeautifulSoup:
+ def _soupify_horse_page(horse_no: str) -> BeautifulSoup:
      """Fetch and parse HKJC race results page and return BeautifulSoup object
      """
-     url = HKJC_HORSE_URL_TEMPLATE.format(horse_id=horse_id)
+     url = HKJC_HORSE_URL_TEMPLATE.format(horse_no=horse_no)
      return _soupify(url)

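The horse page is now addressed by the short HorseNo query parameter instead of the old HorseId. A minimal sketch of how the renamed template resolves to a URL ("G123" is a made-up placeholder, not a real horse number):

    # Sketch only; the template string is the one defined above in historical.py.
    HKJC_HORSE_URL_TEMPLATE = (
        "https://racing.hkjc.com/racing/information/English/Horse/"
        "Horse.aspx?HorseNo={horse_no}"
    )
    url = HKJC_HORSE_URL_TEMPLATE.format(horse_no="G123")  # placeholder horse number
    # -> https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo=G123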
@@ -50,6 +50,8 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
          .alias("split_data").cast(pl.Int64, strict=False)
      ).unnest("split_data")

+     df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
+
      df = df.with_columns([
          (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
          pl.mean_horizontal("StartPosition", "Position2",
@@ -59,16 +61,16 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
          .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
          .otherwise(pl.lit("Pacer")).alias("RunningStyle"))

-     recent_style = df['RunningStyle'][:10].mode()[0]
+     recent_style = df['RunningStyle'][:5].mode()[0]
      df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))

      return df


- def _extract_horse_data(horse_id: str) -> pl.DataFrame:
+ def get_horse_data(horse_no: str) -> pl.DataFrame:
      """Extract horse info and history from horse page
      """
-     soup = _soupify_horse_page(horse_id)
+     soup = _soupify_horse_page(horse_no)
      table = soup.find('table', class_='bigborder')
      horse_data = _parse_html_table(table).filter(
          pl.col('Date') != '')  # Remove empty rows
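_classify_running_style labels each past run from the parsed running positions and then takes the mode over the five most recent runs (previously ten). A self-contained sketch of the visible branches on toy data; the hunks above only show the Closer/Pacer split, so the earlier .when() branch is omitted here:

    import polars as pl

    # Toy history: one row per past run, most recent first.
    df = pl.DataFrame({
        "StartPosition":  [10, 8, 2, 7, 9],
        "FinishPosition": [3, 2, 1, 7, 4],
    })

    # with_columns returns a new frame, so results must be assigned back.
    df = df.with_columns(
        (pl.col("StartPosition") - pl.col("FinishPosition")).alias("PositionChange")
    ).with_columns(
        # Only the branches visible in this diff; the package adds one more .when() above these.
        pl.when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6))
        .then(pl.lit("Closer"))
        .otherwise(pl.lit("Pacer"))
        .alias("RunningStyle")
    )

    recent_style = df["RunningStyle"][:5].mode()[0]  # mode of the 5 most recent runs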
@@ -77,39 +79,73 @@ def _extract_horse_data(horse_id: str) -> pl.DataFrame:
      # Extract horse profile info
      table = soup.find_all('table', class_='table_eng_text')
      profile_data = _parse_html_table(table[0], skip_header=True)
-     country, age = profile_data.filter(pl.col("column_0").str.starts_with("Country"))['column_2'].item(0).split('/')
      profile_data = _parse_html_table(table[1], skip_header=True)
-     current_rating = profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0)
-     season_start_rating = profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0)
+
+     try:
+         current_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
+         season_start_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
+     except:
+         current_rating, season_start_rating = 0, 0
+
+     try:
+         last_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
+     except:
+         last_rating = 0

      horse_info = {
-         'HorseID': horse_id,
-         'OriginCountry': country.strip(),
-         'Age': int(age),
-         'CurrentRating': int(current_rating),
-         'SeasonStartRating': int(season_start_rating)
+         'HorseID': horse_no,
+         'CurrentRating': current_rating,
+         'SeasonStartRating': season_start_rating,
+         'LastRating' : last_rating if current_rating==0 else current_rating
      }
      horse_data = (horse_data.with_columns([
          pl.lit(value).alias(key) for key, value in horse_info.items()
      ])
      )
+
+     horse_data = horse_data.with_columns([
+         pl.col('Pla').cast(pl.Int64, strict=False),
+         pl.col('WinOdds').cast(pl.Int64, strict=False),
+         pl.col('ActWt').cast(pl.Int64, strict=False),
+         pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+         pl.col('Dr').cast(pl.Int64, strict=False),
+         pl.col('Rtg').cast(pl.Int64, strict=False),
+         pl.col('RaceIndex').cast(pl.Int64, strict=False),
+         pl.col('Dist').cast(pl.Int64, strict=False)
+     ])
+
+     horse_data = horse_data.with_columns(
+         (
+             pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
+             pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
+         ).cast(pl.Float64).alias("FinishTime")
+     )
+
+     horse_data = horse_data.with_columns(
+         pl.col('RCTrackCourse').str.split_exact(' / ', 2)
+         .struct.rename_fields(['Venue', 'Track', 'Course'])
+         .alias('RCTrackCourse')
+     ).unnest('RCTrackCourse')
+
      return horse_data


- def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
+ def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
      soup = _soupify_race_page(date, venue_code, race_number)
      table = soup.find('div', class_='race_tab').find('table')
      race_data = _parse_html_table(table)

      # Extract the relevant race information
+     race_id = race_data.columns[0].replace(f'RACE{race_number}','')
      race_class = race_data.item(1, 0).split('-')[0].strip()
      race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
      race_name = race_data.item(2, 0).strip()
      going = race_data.item(1, 2).strip()
      course = race_data.item(2, 2).strip()

-     race_info = {'RaceDate': date,
+     race_info = {'Date': date,
                   'Venue': venue_code,
+                  'RaceIndex': int(race_id),
                   'RaceNumber': race_number,
                   'RaceClass': race_class,
                   'RaceDistance': race_dist,
@@ -123,25 +159,10 @@ def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataF
          .with_columns([
              pl.lit(value).alias(key) for key, value in race_info.items()
          ])
+         .with_columns(
+             pl.col("Horse").str.extract(r"\((.*?)\)")
+             .alias("HorseID")
+         )
      )
-
-     # Extract horse IDs from links
-     horse_ids = []
-     rows = table.find_all('tr')[1:]  # Skip header row
-     for row in rows:
-         horse_id = 'UNKNOWN'  # Horse link not found
-         links = row.find_all('a')
-         for link in links:
-             if 'href' in link.attrs and 'HorseId=' in link['href']:
-                 horse_id = link['href'].split('HorseId=')[1]
-                 break
-         horse_ids.append(horse_id)
-
-     race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
-
-     # Join with horse data
-     horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
-     horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
-     race_data = race_data.join(horse_data_df, on='HorseID', how='left')

      return race_data
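Two of the new transforms are plain string work in polars: the horse's brand number is pulled out of the bracketed suffix of the Horse column, and an "m:ss.xx" finish time is converted to seconds by splitting on the colon. A minimal sketch on made-up rows (horse names and IDs are placeholders):

    import polars as pl

    df = pl.DataFrame({
        "Horse": ["HORSE A (A123)", "HORSE B (B456)"],  # placeholder names/IDs
        "FinishTime": ["1:33.20", "2:00.55"],
    })

    df = df.with_columns(
        # Bracketed suffix -> HorseID: "HORSE A (A123)" -> "A123"
        pl.col("Horse").str.extract(r"\((.*?)\)").alias("HorseID"),
        # "m:ss.xx" -> seconds: "1:33.20" -> 93.2
        (
            pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60
            + pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
        ).cast(pl.Float64).alias("FinishTime"),
    )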
hkjc/processing.py CHANGED
@@ -6,7 +6,7 @@ from typing import Tuple, List, Union
  from .live_odds import live_odds
  from .strategy import qpbanker, place_only
  from .harville_model import fit_harville_to_odds
- from .historical import _extract_race_data
+ from .historical import get_race_data, get_horse_data
  from .utils import _validate_date

  import polars as pl
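With the rename from _extract_race_data, the two scrapers are now part of the public surface that processing.py builds on. A usage sketch (date, venue and race index are placeholders; both calls hit racing.hkjc.com, so they only succeed for a real past race day):

    from hkjc.historical import get_race_data, get_horse_data

    # Placeholders: a past race date in YYYY/MM/DD form, venue 'ST' (Sha Tin) or 'HV' (Happy Valley).
    race_df = get_race_data("2024/01/01", "ST", 1)
    horse_df = get_horse_data(race_df["HorseID"][0])  # brand number extracted from the race card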
@@ -27,13 +27,19 @@ incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
               'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']


- def _historical_process_single_date_venue(date: str, venue_code: str) -> Union[pl.DataFrame, None]:
-     for race_number in range(1, 12):
+ def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl.DataFrame]:
+     dfs = []
+     iter_date = tqdm(
+         range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
+     for race_number in iter_date:
          try:
-             _extract_race_data(date.strftime('%Y/%m/%d'),
-                                venue_code, race_number)
+             dfs.append(get_race_data(date.strftime('%Y/%m/%d'),
+                                      venue_code, race_number))
          except:
-             return None
+             if race_number == 1:
+                 iter_date.close()
+                 return []
+     return dfs


  def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
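The per-venue helper now keeps whatever it managed to scrape and uses race 1 as a meeting probe: if even the first race fails to parse, the venue is assumed to have no card that day and an empty list comes back, while failures in later races are simply skipped. A condensed sketch of that control flow (fetch_race stands in for the bound get_race_data call):

    from tqdm import tqdm

    def collect_races(fetch_race) -> list:
        """Probe races 1-11; an empty list means no meeting was detected."""
        dfs = []
        progress = tqdm(range(1, 12), leave=False)
        for race_number in progress:
            try:
                dfs.append(fetch_race(race_number))
            except Exception:
                if race_number == 1:  # race 1 missing -> assume no card at this venue
                    progress.close()
                    return []
        return dfs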
@@ -47,16 +53,24 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:

      for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True)):
          for venue_code in ['ST', 'HV']:
-             df = _historical_process_single_date_venue(date, venue_code)
-             if df is None:
-                 continue
-             dfs.append(df)
-
-     df = (pl.concat(dfs)
-           .filter(~pl.col('Pla').is_in(incidents))
-           .with_columns(
-               pl.col('Pla').str.split(' ').list.first().alias('Pla')
-           )
+             dfs += _historical_process_single_date_venue(date, venue_code)
+
+     if dfs == []:
+         raise ValueError(
+             "Failed to obtain any race data. This could be due to invalid date range, or server requests limit. Please try again later.")
+
+     horse_ids = pl.concat(dfs)['HorseID'].unique()
+
+     # Use horse track records
+     dfs = [get_horse_data(horse_id) for horse_id in horse_ids]
+     df = (
+         pl.concat(dfs).with_columns(
+             pl.col('Date').str.strptime(pl.Date, '%m/%d/%y')
+         ).filter(pl.col('Date').is_between(start_dt, end_dt))
+         .filter(~pl.col('Pla').is_in(incidents))
+         .with_columns(
+             pl.col('Pla').str.split(' ').list.first().alias('Pla')
+         )
      )

      df = df.with_columns([
@@ -69,10 +83,11 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
          pl.col('WinOdds').cast(pl.Float64, strict=False)
      ])

-     df = df.with_columns(pl.col('Finish Time')
-                          .str.strptime(pl.Duration, format='%M:%S.%f', strict=False)
-                          .dt.total_seconds()
-                          .alias('Finish Time')
+     df = df.with_columns(
+         (
+             pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
+             pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
+         ).cast(pl.Float64).alias("FinishTime")
      )

      return df
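End to end, generate_historical_data now scrapes race pages only to harvest horse IDs, rebuilds the dataset from each horse's own track record, and trims it back to the requested window. A usage sketch (the date strings are placeholders and their exact format is whatever _validate_date accepts; ISO dates are assumed here):

    from hkjc.processing import generate_historical_data

    # Placeholder window; every day in it is probed for both ST and HV cards.
    df = generate_historical_data("2024-01-01", "2024-01-07")
    print(df.select("Date", "HorseID", "Pla", "FinishTime").head())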
hkjc-0.3.14.dist-info/METADATA → hkjc-0.3.16.dist-info/METADATA CHANGED
@@ -1,11 +1,12 @@
  Metadata-Version: 2.4
  Name: hkjc
- Version: 0.3.14
+ Version: 0.3.16
  Summary: Library for scrapping HKJC data and perform basic analysis
  Requires-Python: >=3.11
  Requires-Dist: beautifulsoup4>=4.14.2
  Requires-Dist: cachetools>=6.2.0
  Requires-Dist: fastexcel>=0.16.0
+ Requires-Dist: joblib>=1.5.2
  Requires-Dist: numba>=0.62.1
  Requires-Dist: numpy>=2.3.3
  Requires-Dist: polars>=1.33.1
hkjc-0.3.14.dist-info/RECORD → hkjc-0.3.16.dist-info/RECORD CHANGED
@@ -1,13 +1,14 @@
  hkjc/__init__.py,sha256=TI7PVhmoWSvYX-xdTEdaT3jfY99LiYQFRQZaIwBhJd8,785
+ hkjc/analysis.py,sha256=0042_NMIkQCl0J6B0P4TFfrBDCnm2B6jsCZKOEO30yI,108
  hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
- hkjc/historical.py,sha256=HRsy8O2NqJQ5Ljcs1ySppngL7kO0rqC49vkIKIDp1Do,6027
+ hkjc/historical.py,sha256=R_7z0yqLDj57G5JgvuPYDxKLBoqlhfpRnpTER6aeluM,6678
  hkjc/live_odds.py,sha256=G4ELBBp1d2prxye9kKzu2pwtS4vSfRPOmEuT7-Nd-3A,4741
- hkjc/processing.py,sha256=K3mlPiGaE5PlVcbjLpn0QWNpMNOFiaLXFqWGdIBe2xw,7082
+ hkjc/processing.py,sha256=KZFrGuCdCEJ5OI54PnrWhy-c9qx7mcWm12chc3HuDO8,7764
  hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
  hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
  hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
  hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
- hkjc-0.3.14.dist-info/METADATA,sha256=u-6OgmWRvNgS_RySOBRWzowDULmKE7Q0TNPzAQCIPg8,452
- hkjc-0.3.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- hkjc-0.3.14.dist-info/RECORD,,
+ hkjc-0.3.16.dist-info/METADATA,sha256=yusLSNb82ebKoie5iUxlO5XE9mBBfPiIBHmRepLXA9c,481
+ hkjc-0.3.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ hkjc-0.3.16.dist-info/RECORD,,