hkjc 0.3.14__tar.gz → 0.3.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hkjc-0.3.16/2024-2025-hkjc.parquet (binary, added) +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/PKG-INFO +2 -1
- hkjc-0.3.16/process.py +4 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/pyproject.toml +2 -1
- hkjc-0.3.16/src/hkjc/analysis.py +3 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/historical.py +57 -36
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/processing.py +35 -20
- {hkjc-0.3.14 → hkjc-0.3.16}/uv.lock +12 -1
- {hkjc-0.3.14 → hkjc-0.3.16}/.python-version +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/README.md +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/__init__.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/harville_model.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/live_odds.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/py.typed +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/speedpro.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/strategy/place_only.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/strategy/qpbanker.py +0 -0
- {hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/utils.py +0 -0
{hkjc-0.3.14 → hkjc-0.3.16}/PKG-INFO

```diff
@@ -1,11 +1,12 @@
 Metadata-Version: 2.4
 Name: hkjc
-Version: 0.3.14
+Version: 0.3.16
 Summary: Library for scrapping HKJC data and perform basic analysis
 Requires-Python: >=3.11
 Requires-Dist: beautifulsoup4>=4.14.2
 Requires-Dist: cachetools>=6.2.0
 Requires-Dist: fastexcel>=0.16.0
+Requires-Dist: joblib>=1.5.2
 Requires-Dist: numba>=0.62.1
 Requires-Dist: numpy>=2.3.3
 Requires-Dist: polars>=1.33.1
```
hkjc-0.3.16/process.py (ADDED, +4 lines; contents not rendered in this view)

{hkjc-0.3.14 → hkjc-0.3.16}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "hkjc"
-version = "0.3.14"
+version = "0.3.16"
 description = "Library for scrapping HKJC data and perform basic analysis"
 readme = "README.md"
 requires-python = ">=3.11"
@@ -8,6 +8,7 @@ dependencies = [
     "beautifulsoup4>=4.14.2",
     "cachetools>=6.2.0",
     "fastexcel>=0.16.0",
+    "joblib>=1.5.2",
     "numba>=0.62.1",
     "numpy>=2.3.3",
     "polars>=1.33.1",
```
{hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/historical.py

```diff
@@ -7,10 +7,10 @@ import polars as pl
 from bs4 import BeautifulSoup
 from cachetools.func import ttl_cache
 
-from .utils import
+from .utils import _parse_html_table
 
 HKJC_RACE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate={date}&Racecourse={venue_code}&RaceNo={race_number}"
-HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?
+HKJC_HORSE_URL_TEMPLATE = "https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo={horse_no}"
 
 
 @ttl_cache(maxsize=100, ttl=3600)
```
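The horse page is now addressed by a `HorseNo` query parameter. The removed template is truncated in this rendering, but the old link-scraping code deleted further down splits hrefs on `HorseId=`, so this looks like a switch from the internal `HorseId` to the public horse number. Formatting the new template:

```python
# Hypothetical horse number "G123"; real values come from the race pages.
url = HKJC_HORSE_URL_TEMPLATE.format(horse_no="G123")
# https://racing.hkjc.com/racing/information/English/Horse/Horse.aspx?HorseNo=G123
```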
```diff
@@ -30,10 +30,10 @@ def _soupify_race_page(date: str, venue_code: str, race_number: int) -> Beautifu
     return _soupify(url)
 
 
-def _soupify_horse_page(
+def _soupify_horse_page(horse_no: str) -> BeautifulSoup:
     """Fetch and parse HKJC race results page and return BeautifulSoup object
     """
-    url = HKJC_HORSE_URL_TEMPLATE.format(
+    url = HKJC_HORSE_URL_TEMPLATE.format(horse_no=horse_no)
     return _soupify(url)
```
```diff
@@ -50,6 +50,8 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
         .alias("split_data").cast(pl.Int64, strict=False)
     ).unnest("split_data")
 
+    df.with_columns(pl.col('FinishPosition').fill_null(pl.col('Position3')))
+
     df = df.with_columns([
         (pl.col("StartPosition")-pl.col("FinishPosition")).alias("PositionChange"),
         pl.mean_horizontal("StartPosition", "Position2",
```
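One caveat on the added `fill_null` line: polars DataFrames are immutable and `with_columns` returns a new frame, so a call whose result is not assigned back to `df` has no effect. If the intent is to backfill a missing `FinishPosition` from `Position3`, the assigned form would be:

```python
# Assigned form of the added line; without `df = ...` the call is a no-op.
df = df.with_columns(pl.col("FinishPosition").fill_null(pl.col("Position3")))
```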
```diff
@@ -59,16 +61,16 @@ def _classify_running_style(df: pl.DataFrame, running_pos_col="RunningPosition")
     .when((pl.col("PositionChange") >= 1) & (pl.col("StartPosition") >= 6)).then(pl.lit("Closer"))
     .otherwise(pl.lit("Pacer")).alias("RunningStyle"))
 
-    recent_style = df['RunningStyle'][:
+    recent_style = df['RunningStyle'][:5].mode()[0]
     df = df.with_columns(pl.lit(recent_style).alias("FavoriteRunningStyle"))
 
     return df
 
 
-def _extract_horse_data(horse_id: str) -> pl.DataFrame:
+def get_horse_data(horse_no: str) -> pl.DataFrame:
     """Extract horse info and history from horse page
     """
-    soup = _soupify_horse_page(
+    soup = _soupify_horse_page(horse_no)
     table = soup.find('table', class_='bigborder')
     horse_data = _parse_html_table(table).filter(
         pl.col('Date') != '') # Remove empty rows
```
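`FavoriteRunningStyle` is now the modal `RunningStyle` over the five most recent rows (`[:5].mode()[0]`). Note that polars' `Series.mode` returns all tied values in no guaranteed order, so indexing `[0]` breaks ties arbitrarily. A toy illustration with the style labels used above:

```python
import polars as pl

styles = pl.Series("RunningStyle", ["Pacer", "Closer", "Pacer", "Closer", "Pacer"])
print(styles[:5].mode()[0])  # "Pacer", the most frequent style in the last five
```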
```diff
@@ -77,39 +79,73 @@ def _extract_horse_data(horse_id: str) -> pl.DataFrame:
     # Extract horse profile info
     table = soup.find_all('table', class_='table_eng_text')
     profile_data = _parse_html_table(table[0], skip_header=True)
-    country, age = profile_data.filter(pl.col("column_0").str.starts_with("Country"))['column_2'].item(0).split('/')
     profile_data = _parse_html_table(table[1], skip_header=True)
-
-
+
+    try:
+        current_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Current Rating"))['column_2'].item(0))
+        season_start_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Start of Season Rating"))['column_2'].item(0))
+    except:
+        current_rating, season_start_rating = 0, 0
+
+    try:
+        last_rating = int(profile_data.filter(pl.col("column_0").str.starts_with("Last Rating"))['column_2'].item(0))
+    except:
+        last_rating = 0
 
     horse_info = {
-        'HorseID':
-        '
-        '
-        '
-        'SeasonStartRating': int(season_start_rating)
+        'HorseID': horse_no,
+        'CurrentRating': current_rating,
+        'SeasonStartRating': season_start_rating,
+        'LastRating' : last_rating if current_rating==0 else current_rating
     }
     horse_data = (horse_data.with_columns([
         pl.lit(value).alias(key) for key, value in horse_info.items()
     ])
     )
+
+    horse_data = horse_data.with_columns([
+        pl.col('Pla').cast(pl.Int64, strict=False),
+        pl.col('WinOdds').cast(pl.Int64, strict=False),
+        pl.col('ActWt').cast(pl.Int64, strict=False),
+        pl.col('DeclarHorseWt').cast(pl.Int64, strict=False),
+        pl.col('Dr').cast(pl.Int64, strict=False),
+        pl.col('Rtg').cast(pl.Int64, strict=False),
+        pl.col('RaceIndex').cast(pl.Int64, strict=False),
+        pl.col('Dist').cast(pl.Int64, strict=False)
+    ])
+
+    horse_data = horse_data.with_columns(
+        (
+            pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
+        ).cast(pl.Float64).alias("FinishTime")
+    )
+
+    horse_data = horse_data.with_columns(
+        pl.col('RCTrackCourse').str.split_exact(' / ', 2)
+        .struct.rename_fields(['Venue', 'Track', 'Course'])
+        .alias('RCTrackCourse')
+    ).unnest('RCTrackCourse')
+
     return horse_data
 
 
-def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
+def get_race_data(date: str, venue_code: str, race_number: int) -> pl.DataFrame:
     soup = _soupify_race_page(date, venue_code, race_number)
     table = soup.find('div', class_='race_tab').find('table')
     race_data = _parse_html_table(table)
 
     # Extract the relevant race information
+    race_id = race_data.columns[0].replace(f'RACE{race_number}','')
     race_class = race_data.item(1, 0).split('-')[0].strip()
     race_dist = race_data.item(1, 0).split('-')[1].strip().rstrip('M')
     race_name = race_data.item(2, 0).strip()
     going = race_data.item(1, 2).strip()
     course = race_data.item(2, 2).strip()
 
-    race_info = {'
+    race_info = {'Date': date,
                  'Venue': venue_code,
+                 'RaceIndex': int(race_id),
                  'RaceNumber': race_number,
                  'RaceClass': race_class,
                  'RaceDistance': race_dist,
```
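Two of the new `get_horse_data` steps are worth unpacking. `FinishTime` strings like "1:09.54" become seconds (1 × 60 + 9.54 = 69.54), and the combined `RCTrackCourse` field is split into three columns through a struct. (Incidentally, `WinOdds` is cast to Int64 here but to Float64 in processing.py below; fractional odds survive only the latter.) A small sketch of the struct split on made-up values:

```python
import polars as pl

df = pl.DataFrame({"RCTrackCourse": ["ST / Turf / A", "HV / Turf / B"]})  # hypothetical
df = df.with_columns(
    pl.col("RCTrackCourse").str.split_exact(" / ", 2)            # struct with 3 fields
      .struct.rename_fields(["Venue", "Track", "Course"])
      .alias("RCTrackCourse")
).unnest("RCTrackCourse")                                        # 3 flat columns
# Venue: ["ST", "HV"], Track: ["Turf", "Turf"], Course: ["A", "B"]
```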
```diff
@@ -123,25 +159,10 @@ def _extract_race_data(date: str, venue_code: str, race_number: int) -> pl.DataF
     .with_columns([
         pl.lit(value).alias(key) for key, value in race_info.items()
     ])
+    .with_columns(
+        pl.col("Horse").str.extract(r"\((.*?)\)")
+        .alias("HorseID")
+    )
     )
-
-    # Extract horse IDs from links
-    horse_ids = []
-    rows = table.find_all('tr')[1:]  # Skip header row
-    for row in rows:
-        horse_id = 'UNKNOWN'  # Horse link not found
-        links = row.find_all('a')
-        for link in links:
-            if 'href' in link.attrs and 'HorseId=' in link['href']:
-                horse_id = link['href'].split('HorseId=')[1]
-                break
-        horse_ids.append(horse_id)
-
-    race_data = race_data.with_columns(pl.Series('HorseID', horse_ids))
-
-    # Join with horse data
-    horse_data_list = [_extract_horse_data(horse_id) for horse_id in horse_ids]
-    horse_data_df = pl.concat(horse_data_list).unique(subset=['HorseID'])
-    race_data = race_data.join(horse_data_df, on='HorseID', how='left')
 
     return race_data
```
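The row-by-row `<a href>` scraping (splitting each link on `HorseId=`) is gone; `HorseID` is now pulled from the `Horse` column with one vectorized regex, since result tables render runners as "NAME (ID)". The expensive per-horse join has also moved out of `get_race_data` into `generate_historical_data` below. A minimal sketch of the extraction with made-up rows:

```python
import polars as pl

df = pl.DataFrame({"Horse": ["HAPPY HORSE (G123)", "LUCKY STAR (H456)"]})  # hypothetical
df = df.with_columns(
    pl.col("Horse").str.extract(r"\((.*?)\)").alias("HorseID")  # first capture group
)
# HorseID -> ["G123", "H456"]
```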
{hkjc-0.3.14 → hkjc-0.3.16}/src/hkjc/processing.py

```diff
@@ -6,7 +6,7 @@ from typing import Tuple, List, Union
 from .live_odds import live_odds
 from .strategy import qpbanker, place_only
 from .harville_model import fit_harville_to_odds
-from .historical import
+from .historical import get_race_data, get_horse_data
 from .utils import _validate_date
 
 import polars as pl
```
```diff
@@ -27,13 +27,19 @@ incidents = ['DISQ', 'DNF', 'FE', 'ML', 'PU', 'TNP', 'TO',
              'UR', 'VOID', 'WR', 'WV', 'WV-A', 'WX', 'WX-A', 'WXNR']
 
 
-def _historical_process_single_date_venue(date: str, venue_code: str) ->
-
+def _historical_process_single_date_venue(date: str, venue_code: str) -> List[pl.DataFrame]:
+    dfs = []
+    iter_date = tqdm(
+        range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
+    for race_number in iter_date:
         try:
-
-
+            dfs.append(get_race_data(date.strftime('%Y/%m/%d'),
+                                     venue_code, race_number))
         except:
-
+            if race_number == 1:
+                iter_date.close()
+                return []
+    return dfs
 
 
 def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
```
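The helper now probes race numbers 1-11 for each date/venue; if race 1 already fails, the venue is assumed to have no meeting that day and the function bails out with an empty list, while failures on later races are simply skipped. One thing to watch: the bare `except:` also swallows `KeyboardInterrupt`. A variant sketch with a narrower clause (the reshaped function is mine, not the shipped code):

```python
from typing import List
import polars as pl
from tqdm import tqdm
from hkjc.historical import get_race_data

def _process_date_venue(date, venue_code: str) -> List[pl.DataFrame]:
    """Same early-exit logic, but catching Exception instead of a bare
    except, so Ctrl-C still interrupts a long scrape."""
    dfs: List[pl.DataFrame] = []
    races = tqdm(range(1, 12), desc=f"Processing {date} {venue_code} ...", leave=False)
    for race_number in races:
        try:
            dfs.append(get_race_data(date.strftime("%Y/%m/%d"), venue_code, race_number))
        except Exception:
            if race_number == 1:      # race 1 missing => no meeting at this venue
                races.close()
                return []
    return dfs
```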
```diff
@@ -47,16 +53,24 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
 
     for date in tqdm(pl.date_range(start_dt, end_dt, interval='1d', eager=True)):
         for venue_code in ['ST', 'HV']:
-
-
-
-
-
-
-
-
-
-            )
+            dfs += _historical_process_single_date_venue(date, venue_code)
+
+    if dfs == []:
+        raise ValueError(
+            "Failed to obtain any race data. This could be due to invalid date range, or server requests limit. Please try again later.")
+
+    horse_ids = pl.concat(dfs)['HorseID'].unique()
+
+    # Use horse track records
+    dfs = [get_horse_data(horse_id) for horse_id in horse_ids]
+    df = (
+        pl.concat(dfs).with_columns(
+            pl.col('Date').str.strptime(pl.Date, '%m/%d/%y')
+        ).filter(pl.col('Date').is_between(start_dt, end_dt))
+        .filter(~pl.col('Pla').is_in(incidents))
+        .with_columns(
+            pl.col('Pla').str.split(' ').list.first().alias('Pla')
+        )
     )
 
     df = df.with_columns([
```
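`generate_historical_data` now rebuilds the dataset from each horse's own track record: collect the `HorseID`s seen in the scraped races, pull every horse's history, then parse the `%m/%d/%y` dates, clip to the requested range, drop rows whose place is an incident code, and keep only the leading token of dead-heat places like "1 DH". A toy run of those last three steps:

```python
import polars as pl
from datetime import date

df = pl.DataFrame({"Date": ["09/15/24", "10/01/24"], "Pla": ["1 DH", "WV"]})  # made-up
incidents = ["WV"]  # abbreviated; the full code list appears above

out = (
    df.with_columns(pl.col("Date").str.strptime(pl.Date, "%m/%d/%y"))
      .filter(pl.col("Date").is_between(date(2024, 9, 1), date(2024, 12, 31)))
      .filter(~pl.col("Pla").is_in(incidents))                    # drop withdrawn row
      .with_columns(pl.col("Pla").str.split(" ").list.first())    # "1 DH" -> "1"
)
# one row left: Date 2024-09-15, Pla "1"
```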
```diff
@@ -69,10 +83,11 @@ def generate_historical_data(start_date: str, end_date: str) -> pl.DataFrame:
         pl.col('WinOdds').cast(pl.Float64, strict=False)
     ])
 
-    df = df.with_columns(
-
-
-
+    df = df.with_columns(
+        (
+            pl.col("FinishTime").str.split(":").list.get(0).cast(pl.Int64) * 60 +
+            pl.col("FinishTime").str.split(":").list.get(1).cast(pl.Float64)
+        ).cast(pl.Float64).alias("FinishTime")
     )
 
     return df
```
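This finish-time conversion is now duplicated verbatim here and in `get_horse_data` in historical.py; if the format handling ever changes, a shared expression helper would keep the two call sites in sync. A sketch (the helper name is hypothetical, not part of the package):

```python
import polars as pl

def finish_time_seconds(col: str = "FinishTime") -> pl.Expr:
    """'m:ss.SS' -> total seconds, e.g. '1:09.54' -> 69.54."""
    parts = pl.col(col).str.split(":")
    return (
        parts.list.get(0).cast(pl.Int64) * 60
        + parts.list.get(1).cast(pl.Float64)
    ).cast(pl.Float64).alias(col)

# usage: df = df.with_columns(finish_time_seconds())
```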
{hkjc-0.3.14 → hkjc-0.3.16}/uv.lock

```diff
@@ -110,12 +110,13 @@ wheels = [
 
 [[package]]
 name = "hkjc"
-version = "0.3.14"
+version = "0.3.16"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },
     { name = "cachetools" },
     { name = "fastexcel" },
+    { name = "joblib" },
     { name = "numba" },
     { name = "numpy" },
     { name = "polars" },
@@ -130,6 +131,7 @@ requires-dist = [
     { name = "beautifulsoup4", specifier = ">=4.14.2" },
     { name = "cachetools", specifier = ">=6.2.0" },
     { name = "fastexcel", specifier = ">=0.16.0" },
+    { name = "joblib", specifier = ">=1.5.2" },
     { name = "numba", specifier = ">=0.62.1" },
     { name = "numpy", specifier = ">=2.3.3" },
     { name = "polars", specifier = ">=1.33.1" },
@@ -148,6 +150,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },
 ]
 
+[[package]]
+name = "joblib"
+version = "1.5.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/5d/447af5ea094b9e4c4054f82e223ada074c552335b9b4b2d14bd9b35a67c4/joblib-1.5.2.tar.gz", hash = "sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55", size = 331077, upload-time = "2025-08-27T12:15:46.575Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" },
+]
+
 [[package]]
 name = "llvmlite"
 version = "0.45.1"
```