hkjc 0.3.16__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hkjc/historical.py CHANGED
@@ -12,6 +12,9 @@ from .utils import _parse_html_table
12
12
  HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
13
13
  HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo={horse_no}"
14
14
 
15
+ incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
16
+ 'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
17
+
15
18
 
16
19
  @ttl_cache(maxsize=100, ttl=3600)
17
20
  def _soupify(url: str) -> BeautifulSoup:
@@ -37,7 +40,6 @@ def _soupify_horse_page(horse_no: str) -> BeautifulSoup:
37
40
  return _soupify(url)
38
41
 
39
42
 
40
-
41
43
  def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition") -> pl.DataFrame:
42
44
  """Classify running style based on RunningPosition column
43
45
  """
@@ -50,14 +52,14 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
50
52
  .alias("split_data").cast(pl.Int64, strict=False)
51
53
  ).unnest("split_data")
52
54
 
53
- df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
55
+ df = df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
54
56
 
55
57
  df = df.with_columns([
56
58
  (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
57
59
  pl.mean_horizontal("StartPosition", "Position2",
58
60
  "Position3", "FinishPosition").alias("AvgPosition"),
59
61
  ]).with_columns(pl.when(pl.col("StartPosition").is_null()).then(pl.lit("--"))
60
- .when((pl.col("PositionChange") <= 0) & pl.col("StartPosition") <= 3).then(pl.lit("FrontRunner"))
62
+ .when((pl.col("AvgPosition") <= 3.5) & (pl.col("StartPosition") <= 3)).then(pl.lit("FrontRunner"))
61
63
  .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
62
64
  .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
63
65
 
@@ -67,7 +69,7 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
67
69
  return df
68
70
 
69
71
 
70
- def get_horse_data(horse_no: str) -> pl.DataFrame:
72
+ def _extract_horse_data(horse_no: str) -> pl.DataFrame:
71
73
  """Extract horse info and history from horse page
72
74
  """
73
75
  soup = _soupify_horse_page(horse_no)
@@ -82,13 +84,16 @@ def get_horse_data(horse_no: str) -> pl.DataFrame:
82
84
  profile_data = _parse_html_table(table[1], skip_header=True)
83
85
 
84
86
  try:
85
- current_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
86
- season_start_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
87
+ current_rating = int(profile_data.filter(
88
+ pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
89
+ season_start_rating = int(profile_data.filter(pl.col(
90
+ "column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
87
91
  except:
88
92
  current_rating, season_start_rating = 0, 0
89
-
93
+
90
94
  try:
91
- last_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
95
+ last_rating = int(profile_data.filter(
96
+ pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
92
97
  except:
93
98
  last_rating = 0
94
99
 
@@ -96,47 +101,85 @@ def get_horse_data(horse_no: str) -> pl.DataFrame:
96
101
  'HorseID': horse_no,
97
102
  'CurrentRating': current_rating,
98
103
  'SeasonStartRating': season_start_rating,
99
- 'LastRating' : last_rating if current_rating==0 else current_rating
104
+ 'LastRating': last_rating if current_rating == 0 else current_rating
100
105
  }
101
106
  horse_data = (horse_data.with_columns([
102
107
  pl.lit(value).alias(key) for key, value in horse_info.items()
103
108
  ])
104
109
  )
105
110
 
106
- horse_data = horse_data.with_columns([
111
+ return horse_data
112
+
113
+
114
+ def _clean_horse_data(df: pl.DataFrame) -> pl.DataFrame:
115
+ """ Clean and convert horse data to suitable data types
116
+ """
117
+ df = df.with_columns(
118
+ pl.col('Pla').str.split(' ').list.first().alias('Pla')
119
+ ).filter(~pl.col('Pla').is_in(incidents))
120
+
121
+ df = df.with_columns([
107
122
  pl.col('Pla').cast(pl.Int64, strict=False),
108
- pl.col('WinOdds').cast(pl.Int64, strict=False),
109
123
  pl.col('ActWt').cast(pl.Int64, strict=False),
110
124
  pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
111
125
  pl.col('Dr').cast(pl.Int64, strict=False),
112
126
  pl.col('Rtg').cast(pl.Int64, strict=False),
113
- pl.col('RaceIndex').cast(pl.Int64, strict=False),
114
- pl.col('Dist').cast(pl.Int64, strict=False)
127
+ pl.col('Dist').cast(pl.Int64, strict=False),
128
+ pl.col('WinOdds').cast(pl.Float64, strict=False),
129
+ pl.col('RaceIndex').cast(pl.Int64, strict=False)
115
130
  ])
116
131
 
117
- horse_data = horse_data.with_columns(
132
+ df = df.with_columns(
118
133
  (
119
- pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
120
- pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
134
+ pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_0").cast(pl.Int64) * 60 +
135
+ pl.col("FinishTime").str.split_exact(".", 1).struct.field("field_1").cast(pl.Int64)
121
136
  ).cast(pl.Float64).alias("FinishTime")
122
137
  )
123
138
 
124
- horse_data = horse_data.with_columns(
139
+ df = df.with_columns(
125
140
  pl.col('RCTrackCourse').str.split_exact(' / ', 2)
126
141
  .struct.rename_fields(['Venue', 'Track', 'Course'])
127
142
  .alias('RCTrackCourse')
128
143
  ).unnest('RCTrackCourse')
129
144
 
130
- return horse_data
145
+ return df
131
146
 
147
+ def get_horse_data(horse_no: str) -> pl.DataFrame:
148
+ df = _extract_horse_data(horse_no)
149
+ return _clean_horse_data(df)
132
150
 
133
- def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
151
+ def _clean_race_data(df: pl.DataFrame) -> pl.DataFrame:
152
+ """ Clean and convert horse data to suitable data types
153
+ """
154
+ df = df.with_columns(
155
+ pl.col('Pla').str.split(' ').list.first().alias('Pla')
156
+ ).filter(~pl.col('Pla').is_in(incidents))
157
+
158
+ df = df.with_columns([
159
+ pl.col('Pla').cast(pl.Int64, strict=False),
160
+ pl.col('HorseNo').cast(pl.Int64, strict=False),
161
+ pl.col('ActWt').cast(pl.Int64, strict=False),
162
+ pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
163
+ pl.col('Dr').cast(pl.Int64, strict=False),
164
+ pl.col('WinOdds').cast(pl.Float64, strict=False)
165
+ ])
166
+
167
+ df = df.with_columns(
168
+ (
169
+ pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_0").cast(pl.Int64) * 60 +
170
+ pl.col("FinishTime").str.split_exact(":", 1).struct.field("field_1").cast(pl.Int64)
171
+ ).cast(pl.Float64).alias("FinishTime")
172
+ )
173
+
174
+ return df
175
+
176
+ def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
134
177
  soup = _soupify_race_page(date, venue_code, race_number)
135
178
  table = soup.find('div', class_='race_tab').find('table')
136
179
  race_data = _parse_html_table(table)
137
180
 
138
181
  # Extract the relevant race information
139
- race_id = race_data.columns[0].replace(f'RACE{race_number}','')
182
+ race_id = race_data.columns[0].replace(f'RACE{race_number}', '')
140
183
  race_class = race_data.item(1, 0).split('-')[0].strip()
141
184
  race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
142
185
  race_name = race_data.item(2, 0).strip()
@@ -162,7 +205,12 @@ def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
162
205
  .with_columns(
163
206
  pl.col("Horse").str.extract(r"\((.*?)\)")
164
207
  .alias("HorseID")
165
- )
166
- )
208
+ )
209
+ )
210
+
211
+ return race_data
167
212
 
168
- return race_data
213
+
214
+ def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
215
+ df = _extract_race_data(date,venue_code,race_number)
216
+ return _clean_race_data(df)
hkjc/processing.py CHANGED
@@ -6,7 +6,7 @@ from typing import Tuple, List, Union
6
6
  from .live_odds import live_odds
7
7
  from .strategy import qpbanker, place_only
8
8
  from .harville_model import fit_harville_to_odds
9
- from .historical import get_race_data, get_horse_data
9
+ from .historical import _extract_horse_data, _extract_race_data, _clean_horse_data
10
10
  from .utils import _validate_date
11
11
 
12
12
  import polars as pl
@@ -23,8 +23,7 @@ def _all_subsets(lst): return [list(x) for r in range(
23
23
  # ======================================
24
24
  # Historical data processing functions
25
25
  # ======================================
26
- incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
27
- 'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
26
+
28
27
 
29
28
 
30
29
  def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl.DataFrame]:
@@ -33,7 +32,7 @@ def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl
33
32
  range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
34
33
  for race_number in iter_date:
35
34
  try:
36
- dfs.append(get_race_data(date.strftime('%Y/%m/%d'),
35
+ dfs.append(_extract_race_data(date.strftime('%Y/%m/%d'),
37
36
  venue_code, race_number))
38
37
  except:
39
38
  if race_number == 1:
@@ -51,7 +50,7 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
51
50
 
52
51
  dfs = []
53
52
 
54
- for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True)):
53
+ for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True), leave=False, desc='Scanning for horse IDs ...'):
55
54
  for venue_code in ['ST', 'HV']:
56
55
  dfs += _historical_process_single_date_venue(date, venue_code)
57
56
 
@@ -62,35 +61,9 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
62
61
  horse_ids = pl.concat(dfs)['HorseID'].unique()
63
62
 
64
63
  # Use horse track records
65
- dfs = [get_horse_data(horse_id) for horse_id in horse_ids]
66
- df = (
67
- pl.concat(dfs).with_columns(
68
- pl.col('Date').str.strptime(pl.Date, '%m/%d/%y')
69
- ).filter(pl.col('Date').is_between(start_dt, end_dt))
70
- .filter(~pl.col('Pla').is_in(incidents))
71
- .with_columns(
72
- pl.col('Pla').str.split(' ').list.first().alias('Pla')
73
- )
74
- )
75
-
76
- df = df.with_columns([
77
- pl.col('Pla').cast(pl.Int64, strict=False),
78
- pl.col('HorseNo').cast(pl.Int64, strict=False),
79
- pl.col('ActWt').cast(pl.Int64, strict=False),
80
- pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
81
- pl.col('Dr').cast(pl.Int64, strict=False),
82
- pl.col('RaceDistance').cast(pl.Int64, strict=False),
83
- pl.col('WinOdds').cast(pl.Float64, strict=False)
84
- ])
85
-
86
- df = df.with_columns(
87
- (
88
- pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
89
- pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
90
- ).cast(pl.Float64).alias("FinishTime")
91
- )
92
-
93
- return df
64
+ dfs = [_extract_horse_data(horse_id) for horse_id in tqdm(horse_ids, desc='Processing horses ...', leave=False)]
65
+ df = pl.concat(dfs)
66
+ return _clean_horse_data(df)
94
67
 
95
68
 
96
69
  # ==========================
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hkjc
3
- Version: 0.3.16
3
+ Version: 0.3.17
4
4
  Summary: Library for scrapping HKJC data and perform basic analysis
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: beautifulsoup4>=4.14.2
@@ -1,14 +1,14 @@
1
1
  hkjc/__init__.py,sha256=TI7PVhmoWSvYX-xdTEdaT3jfY99LiYQFRQZaIwBhJd8,785
2
2
  hkjc/analysis.py,sha256=0042_NMIkQCl0J6B0P4TFfrBDCnm2B6jsCZKOEO30yI,108
3
3
  hkjc/harville_model.py,sha256=MZjPLS-1nbEhp1d4Syuq13DtraKnd7TlNqBmOOCwxgc,15976
4
- hkjc/historical.py,sha256=R_7z0yqLDj57G5JgvuPYDxKLBoqlhfpRnpTER6aeluM,6678
4
+ hkjc/historical.py,sha256=yQQAx8vlr2EqcPazpYp1x2ku7dy3imQoDWImHCRv1QA,8330
5
5
  hkjc/live_odds.py,sha256=G4ELBBp1d2prxye9kKzu2pwtS4vSfRPOmEuT7-Nd-3A,4741
6
- hkjc/processing.py,sha256=KZFrGuCdCEJ5OI54PnrWhy-c9qx7mcWm12chc3HuDO8,7764
6
+ hkjc/processing.py,sha256=xrvEUgu_jz8ZxevOsRsYz0T7pWyNtSCMI6LUYByOLOw,6812
7
7
  hkjc/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  hkjc/speedpro.py,sha256=Y2Z3GYGeePc4sM-ZnCHXCI1N7L-_j9nrMqS3CC5BBSo,2031
9
9
  hkjc/utils.py,sha256=4CA_FPf_U3GvzoLkqBX0qDPZgrSvKJKvbP7VWqd5FiA,6323
10
10
  hkjc/strategy/place_only.py,sha256=lHPjTSj8PzghxncNBg8FI4T4HJigekB9a3bV7l7VtPA,2079
11
11
  hkjc/strategy/qpbanker.py,sha256=MQxjwsfhllKZroKS8w8Q3bi3HMjGc1DAyBIjNZAp3yQ,4805
12
- hkjc-0.3.16.dist-info/METADATA,sha256=yusLSNb82ebKoie5iUxlO5XE9mBBfPiIBHmRepLXA9c,481
13
- hkjc-0.3.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
14
- hkjc-0.3.16.dist-info/RECORD,,
12
+ hkjc-0.3.17.dist-info/METADATA,sha256=gKSkXKYo_HCg2S4ZeAjnqZniWV0V2kGpRH_g25K9Rmo,481
13
+ hkjc-0.3.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
14
+ hkjc-0.3.17.dist-info/RECORD,,
File without changes