dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
|
|
7
|
+
from mlebench.utils import read_csv
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _save_split(
|
|
11
|
+
train_df: pd.DataFrame, test_df: pd.DataFrame, public_path: Path, private_path: Path
|
|
12
|
+
):
|
|
13
|
+
"""
|
|
14
|
+
Saves a given train/test split to the specified public and private directories.
|
|
15
|
+
|
|
16
|
+
This helper function ensures a consistent file structure and naming convention
|
|
17
|
+
for any data split.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
train_df: The training dataframe.
|
|
21
|
+
test_df: The testing dataframe (with labels).
|
|
22
|
+
public_path: The path to the public output directory.
|
|
23
|
+
private_path: The path to the private output directory.
|
|
24
|
+
"""
|
|
25
|
+
# Ensure output directories exist
|
|
26
|
+
public_path.mkdir(parents=True, exist_ok=True)
|
|
27
|
+
private_path.mkdir(parents=True, exist_ok=True)
|
|
28
|
+
|
|
29
|
+
# Prepare dataframes for saving
|
|
30
|
+
test_without_labels = test_df.drop(columns=["fare_amount"])
|
|
31
|
+
submission_df = test_df.copy()[["key"]]
|
|
32
|
+
submission_df["fare_amount"] = 11.35 # A sample constant value
|
|
33
|
+
|
|
34
|
+
# Write CSVs to public and private directories
|
|
35
|
+
train_df.to_csv(public_path / "train.csv", index=False)
|
|
36
|
+
test_without_labels.to_csv(public_path / "test.csv", index=False)
|
|
37
|
+
submission_df.to_csv(public_path / "sample_submission.csv", index=False)
|
|
38
|
+
test_df.to_csv(private_path / "test.csv", index=False)
|
|
39
|
+
|
|
40
|
+
# Run checks to ensure data integrity
|
|
41
|
+
assert set(train_df["key"]).isdisjoint(
|
|
42
|
+
set(test_df["key"])
|
|
43
|
+
), "Train and test sets share samples!"
|
|
44
|
+
assert test_df.shape[1] == 8, f"Test set should have 8 columns, but has {test_df.shape[1]}"
|
|
45
|
+
assert (
|
|
46
|
+
test_without_labels.shape[1] == 7
|
|
47
|
+
), f"Test set without labels should have 7 columns, but has {test_without_labels.shape[1]}"
|
|
48
|
+
assert train_df.shape[1] == 8, f"Train set should have 8 columns, but has {train_df.shape[1]}"
|
|
49
|
+
assert (
|
|
50
|
+
submission_df.shape[1] == 2
|
|
51
|
+
), f"Sample submission should have 2 columns, but has {submission_df.shape[1]}"
|
|
52
|
+
assert (
|
|
53
|
+
submission_df.shape[0] == test_df.shape[0]
|
|
54
|
+
), f"Sample submission should have {test_df.shape[0]} rows, but has {submission_df.shape[0]}"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the competition's public/private data splits.

    Produces the primary split under `public`/`private`, then applies the
    identical split logic to the resulting training set to create a local
    validation split under sibling `public_val`/`private_val` directories.
    """
    instructions = "GCP-Coupons-Instructions.rtf"

    def _split_and_save(source_df: pd.DataFrame, pub_dir: Path, priv_dir: Path) -> pd.DataFrame:
        # One split-save-copy round: split `source_df`, persist the results,
        # ship the instructions file, and hand back the training portion.
        split_train, split_test = train_test_split(source_df, test_size=9914, random_state=0)
        _save_split(
            train_df=split_train, test_df=split_test, public_path=pub_dir, private_path=priv_dir
        )
        shutil.copy(raw / instructions, pub_dir / instructions)
        return split_train

    # Load the raw training data.
    old_train = read_csv(raw / "train.csv")

    # --- 1. Original train/test split for the main competition files ---
    # The outputs in `public/` and `private/` are the canonical ones.
    train_orig = _split_and_save(old_train, public, private)

    # --- 2. Second split of `train_orig` for local validation ---
    # Same logic and test size; results land in the sibling *_val directories.
    _split_and_save(train_orig, public.parent / "public_val", private.parent / "private_val")
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn import metrics
|
|
5
|
+
|
|
6
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Validate a submission against the answers and align both frames.

    Raises InvalidSubmissionError when the submission lacks the required
    columns, when its contact_id's differ from the answers', or when the
    two frames have different lengths. On success, returns both frames
    sorted by `contact_id` so rows line up positionally.
    """
    # The answers frame is produced by our own pipeline; a malformed one
    # is an internal bug rather than a bad submission.
    assert "contact_id" in answers.columns, "contact_id column not found in answers"
    assert "contact" in answers.columns, "contact column not found in answers"

    required_columns = {"contact_id", "contact"}
    if not required_columns.issubset(set(submission.columns)):
        raise InvalidSubmissionError("Submission must have columns: contact_id, contact")

    if set(submission["contact_id"]) != set(answers["contact_id"]):
        raise InvalidSubmissionError(
            "Submission contact_id's inconsistent with answers contact_id's"
        )

    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission and answers have different lengths: submission has {len(submission)} samples, answers has {len(answers)} samples"
        )

    # Sort both frames on the shared key so they compare row-for-row.
    aligned_submission = submission.sort_values("contact_id")
    aligned_answers = answers.sort_values("contact_id")

    return aligned_submission, aligned_answers
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Return the Matthews correlation coefficient of the submission's contacts."""
    aligned_sub, aligned_ans = prepare_for_metric(submission, answers)
    return metrics.matthews_corrcoef(aligned_ans["contact"], aligned_sub["contact"])
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw NFL player-contact training data into public train / private test.

    Unique game plays are split 90/10 (``random_state=0``) so train and test
    never share a game play. The labels go to ``public`` (train half) and
    ``private`` (test half); the per-game-play feature CSVs and the videos go
    to ``public``; a "no contact" sample submission is also written.

    Fixes vs. the previous version: the integrity-check messages said
    "images" although mp4 videos are counted, and the three identical
    filter/save stanzas for the auxiliary CSVs are folded into one loop.
    """
    (public / "train").mkdir(exist_ok=True)
    (public / "test").mkdir(exist_ok=True)

    # Create train, test from train split. Ensure train, test come from different game plays.
    old_train = pd.read_csv(raw / "train_labels.csv")
    unique_game_play = old_train["game_play"].unique()
    new_train_game_play, new_test_game_play = train_test_split(
        unique_game_play, test_size=0.1, random_state=0
    )

    new_train = old_train[old_train["game_play"].isin(new_train_game_play)]
    new_test = old_train[old_train["game_play"].isin(new_test_game_play)]
    assert set(new_train["contact_id"]).isdisjoint(
        set(new_test["contact_id"])
    ), "Train and test label share samples!"

    new_train.to_csv(public / "train_labels.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Filter each auxiliary per-game-play table the same way; both halves are
    # public because they are features, not labels.
    for stem in ["baseline_helmets", "player_tracking", "video_metadata"]:
        table = pd.read_csv(raw / f"train_{stem}.csv")
        table[table["game_play"].isin(new_train_game_play)].to_csv(
            public / f"train_{stem}.csv", index=False
        )
        table[table["game_play"].isin(new_test_game_play)].to_csv(
            public / f"test_{stem}.csv", index=False
        )

    # Copy over videos: three camera views per game play.
    for game_play_type in ["All29", "Endzone", "Sideline"]:
        for game_play in new_train["game_play"].unique():
            shutil.copyfile(
                src=raw / "train" / f"{game_play}_{game_play_type}.mp4",
                dst=public / "train" / f"{game_play}_{game_play_type}.mp4",
            )

        for game_play in new_test["game_play"].unique():
            shutil.copyfile(
                src=raw / "train" / f"{game_play}_{game_play_type}.mp4",
                dst=public / "test" / f"{game_play}_{game_play_type}.mp4",
            )

    # Check integrity of the files copied (*3 for All29, Endzone, Sideline).
    num_train_videos_found = len(list(public.glob("train/*.mp4")))
    num_test_videos_found = len(list(public.glob("test/*.mp4")))
    num_expected_train_videos = len(new_train["game_play"].unique()) * 3
    num_expected_test_videos = len(new_test["game_play"].unique()) * 3

    assert (
        num_train_videos_found == num_expected_train_videos
    ), f"Expected {num_expected_train_videos} videos, found {num_train_videos_found}"
    assert (
        num_test_videos_found == num_expected_test_videos
    ), f"Expected {num_expected_test_videos} videos, found {num_test_videos_found}"

    # Create a sample submission file predicting "no contact" everywhere.
    submission_df = pd.DataFrame(
        {
            "contact_id": new_test["contact_id"],
            "contact": 0,
        }
    )
    submission_df.to_csv(public / "sample_submission.csv", index=False)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _process_and_save_split(
    train_game_play_ids: list,
    test_game_play_ids: list,
    raw_path: Path,
    public_path: Path,
    private_path: Path,
    old_train_labels_df: pd.DataFrame,
    old_train_baseline_helmets_df: pd.DataFrame,
    old_train_player_tracking_df: pd.DataFrame,
    old_train_video_metadata_df: pd.DataFrame,
):
    """Materialize one train/test split from the raw training data.

    Filters every supplied dataframe by game-play id, writes the resulting
    CSVs (labels split public/private, feature tables public), copies the
    corresponding videos, verifies the copy counts, and writes a sample
    submission. Output filenames are fixed so every split (original or
    validation) has an identical directory structure.
    """
    public_path.mkdir(exist_ok=True, parents=True)
    private_path.mkdir(exist_ok=True, parents=True)
    (public_path / "train").mkdir(exist_ok=True)
    (public_path / "test").mkdir(exist_ok=True)

    # Labels: train half is public, test half (the answers) is private.
    new_train = old_train_labels_df[
        old_train_labels_df["game_play"].isin(train_game_play_ids)
    ]
    new_test = old_train_labels_df[
        old_train_labels_df["game_play"].isin(test_game_play_ids)
    ]
    assert set(new_train["contact_id"]).isdisjoint(
        set(new_test["contact_id"])
    ), "Train and test label share samples!"

    new_train.to_csv(public_path / "train_labels.csv", index=False)
    new_test.to_csv(private_path / "test.csv", index=False)

    # Feature tables: both halves are public (they are inputs, not answers).
    auxiliary_tables = [
        ("baseline_helmets", old_train_baseline_helmets_df),
        ("player_tracking", old_train_player_tracking_df),
        ("video_metadata", old_train_video_metadata_df),
    ]
    for stem, table in auxiliary_tables:
        in_train = table["game_play"].isin(train_game_play_ids)
        in_test = table["game_play"].isin(test_game_play_ids)
        table[in_train].to_csv(public_path / f"train_{stem}.csv", index=False)
        table[in_test].to_csv(public_path / f"test_{stem}.csv", index=False)

    # Copy over videos (three camera views per game play).
    print(f"Copying videos to {public_path.name}...")
    for game_play_type in ["All29", "Endzone", "Sideline"]:
        for game_play in tqdm(
            new_train["game_play"].unique(),
            desc=f"Copying train videos ({game_play_type})",
        ):
            shutil.copyfile(
                src=raw_path / "train" / f"{game_play}_{game_play_type}.mp4",
                dst=public_path / "train" / f"{game_play}_{game_play_type}.mp4",
            )

        for game_play in tqdm(
            new_test["game_play"].unique(),
            desc=f"Copying test videos ({game_play_type})",
        ):
            shutil.copyfile(
                src=raw_path / "train" / f"{game_play}_{game_play_type}.mp4",
                dst=public_path / "test" / f"{game_play}_{game_play_type}.mp4",
            )

    # Sanity-check the copies: 3 videos per game play in each folder.
    found_train = len(list(public_path.glob("train/*.mp4")))
    found_test = len(list(public_path.glob("test/*.mp4")))
    expected_train = len(new_train["game_play"].unique()) * 3
    expected_test = len(new_test["game_play"].unique()) * 3

    assert (
        found_train == expected_train
    ), f"Expected {expected_train} images, found {found_train}"
    assert (
        found_test == expected_test
    ), f"Expected {expected_test} images, found {found_test}"

    # Sample submission: predict "no contact" for every test row.
    submission_df = pd.DataFrame(
        {
            "contact_id": new_test["contact_id"],
            "contact": 0,
        }
    )
    submission_df.to_csv(public_path / "sample_submission.csv", index=False)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Build the standard train/test split plus a secondary validation split.

    The original split (identical parameters to the original prepare.py) is
    written to ``public``/``private``. A validation split, carved out of the
    original training game plays with the same split parameters, is written
    to sibling ``public_val``/``private_val`` directories.
    """
    # Read every raw table exactly once and reuse it for both splits.
    old_train_labels = pd.read_csv(raw / "train_labels.csv")
    shared_frames = dict(
        old_train_labels_df=old_train_labels,
        old_train_baseline_helmets_df=pd.read_csv(raw / "train_baseline_helmets.csv"),
        old_train_player_tracking_df=pd.read_csv(raw / "train_player_tracking.csv"),
        old_train_video_metadata_df=pd.read_csv(raw / "train_video_metadata.csv"),
    )

    # Original split: 90/10 over unique game plays, fixed seed so the
    # outputs match the original prepare.py exactly.
    unique_game_play = old_train_labels["game_play"].unique()
    new_train_game_play, new_test_game_play = train_test_split(
        unique_game_play, test_size=0.1, random_state=0
    )

    print("--- Processing original train/test split ---")
    _process_and_save_split(
        train_game_play_ids=new_train_game_play,
        test_game_play_ids=new_test_game_play,
        raw_path=raw,
        public_path=public,
        private_path=private,
        **shared_frames,
    )
    print("--- Original split processing complete. ---\n")

    # Validation split: partition the training game plays again with the
    # same parameters; outputs go to sibling *_val directories.
    train_val_game_play, test_val_game_play = train_test_split(
        new_train_game_play, test_size=0.1, random_state=0
    )

    print("--- Processing validation train/test split ---")
    _process_and_save_split(
        train_game_play_ids=train_val_game_play,
        test_game_play_ids=test_val_game_play,
        raw_path=raw,
        public_path=public.parent / "public_val",
        private_path=private.parent / "private_val",
        **shared_frames,
    )
    print("--- Validation split processing complete. ---")
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn import metrics
|
|
5
|
+
|
|
6
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
    """Validate the submission and extract the four series needed for scoring.

    Returns:
        (true_formation, true_bandgap, pred_formation, pred_bandgap), i.e.
        the two ground-truth columns from ``answers`` followed by the two
        prediction columns from ``submission``.

    Raises:
        InvalidSubmissionError: if the submission is missing a target column
            or has the wrong number of rows.
    """
    assert (
        "formation_energy_ev_natom" in answers.columns
    ), "Expected 'formation_energy_ev_natom' column in answers"
    assert "bandgap_energy_ev" in answers.columns, "Expected 'bandgap_energy_ev' column in answers"

    # Check the two target columns in the same order as the original checks
    # so the first missing one produces the same error message.
    for column in ("formation_energy_ev_natom", "bandgap_energy_ev"):
        if column not in submission.columns:
            raise InvalidSubmissionError(f"Expected '{column}' column in submission")

    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Expected {len(answers)} rows in submission, got {len(submission)}"
        )

    return (
        answers["formation_energy_ev_natom"],
        answers["bandgap_energy_ev"],
        submission["formation_energy_ev_natom"],
        submission["bandgap_energy_ev"],
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Return the mean of the per-target RMSLE scores (lower is better)."""
    true_formation, true_bandgap, pred_formation, pred_bandgap = prepare_for_metric(
        submission, answers
    )
    rmsle_formation = metrics.root_mean_squared_log_error(true_formation, pred_formation)
    rmsle_bandgap = metrics.root_mean_squared_log_error(true_bandgap, pred_bandgap)
    return (rmsle_formation + rmsle_bandgap) / 2
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.model_selection import train_test_split
|
|
7
|
+
|
|
8
|
+
from mlebench.utils import extract, read_csv
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _relocate_geometry_files(
    geometry_paths, id_map, dest_root: Path, expected_count: int, split_name: str
):
    """Copy each geometry.xyz whose raw id is in ``id_map`` to dest_root/<new_id>/.

    ``id_map`` maps the raw per-sample directory id to the re-indexed id;
    paths not present in the map are skipped. Asserts the expected number of
    files ends up under ``dest_root``.
    """
    for src in geometry_paths:
        raw_id = int(Path(src).parts[-2])  # raw layout: .../<id>/geometry.xyz
        if raw_id not in id_map:
            continue
        dest_dir = dest_root / str(id_map[raw_id])
        dest_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy(src=src, dst=dest_dir / "geometry.xyz")

    found = len(list(dest_root.glob("**/*.xyz")))
    assert (
        found == expected_count
    ), f"Expected {expected_count} {split_name} geometry files, found {found}"


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Fixes vs. the previous version: the loop variable ``id`` shadowed the
    builtin, and the two near-identical geometry-copy loops are extracted
    into ``_relocate_geometry_files``.
    """
    # Extract only what we need
    extract(raw / "train.zip", raw / "train")
    extract(raw / "train.csv.zip", raw / "train.csv")
    extract(raw / "test.zip", raw / "test")
    extract(raw / "test.csv.zip", raw / "test.csv")

    # Create train, test from train split
    old_train = read_csv(raw / "train.csv/train.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)

    # Re-index ids to 1, 2, ... in both splits; keep the old->new maps so the
    # per-id geometry files can be relocated to match.
    old_train_id_to_new = {
        old_id: new_id for new_id, old_id in enumerate(new_train["id"], start=1)
    }  # id starts from 1
    new_train["id"] = new_train["id"].map(old_train_id_to_new)

    old_test_id_to_new = {
        old_id: new_id for new_id, old_id in enumerate(new_test["id"], start=1)
    }  # id starts from 1
    new_test["id"] = new_test["id"].map(old_test_id_to_new)

    new_test_without_labels = new_test.drop(
        columns=["formation_energy_ev_natom", "bandgap_energy_ev"]
    )

    # Copy over files: labelled test set is private, unlabelled copy is public.
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    new_test_without_labels.to_csv(public / "test.csv", index=False)

    # Relocate the per-sample geometry files into the re-indexed layout.
    train_paths = sorted(glob.glob(str(raw / "train/train/**/*.xyz")))
    _relocate_geometry_files(
        train_paths, old_train_id_to_new, public / "train", len(new_train), "train"
    )
    _relocate_geometry_files(
        train_paths, old_test_id_to_new, public / "test", len(new_test), "test"
    )

    # Create mock submission (constant predictions for both targets).
    sample_submission = pd.DataFrame(
        {"id": new_test["id"], "formation_energy_ev_natom": 0.1779, "bandgap_energy_ev": 1.8892}
    )
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    assert len(sample_submission) == len(
        new_test
    ), "Sample submission should have the same number of rows as the test set"
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.model_selection import train_test_split
|
|
7
|
+
|
|
8
|
+
from mlebench.utils import extract, read_csv
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _process_split(
    source_df: pd.DataFrame,
    test_size: float,
    random_state: int,
    source_geometry_paths: list,
    public_dir: Path,
    private_dir: Path,
) -> pd.DataFrame:
    """
    Helper function to perform a data split, re-index, and write all necessary files.

    Args:
        source_df: The DataFrame to split. Its "id" column must correspond to
            the per-sample directory names in ``source_geometry_paths``.
        test_size: The proportion of the dataset to allocate to the test split.
        random_state: The seed used by the random number generator.
        source_geometry_paths: A list of paths to all original geometry files.
        public_dir: The destination directory for public-facing files (train set, test features).
        private_dir: The destination directory for private-facing files (test labels).

    Returns:
        The training half of the split with its ORIGINAL ids preserved, so it
        can be fed into a subsequent ``_process_split`` call.

    Bug fix: the previous version returned the RE-INDEXED train frame. When
    the caller chained a second split on it, the geometry filter matched raw
    directory ids against the remapped 1..N ids and silently copied the wrong
    geometry files (the count-based asserts still passed). The written CSVs
    are unchanged by this fix because re-indexing depends only on row order.
    """
    # Ensure destination directories exist
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)

    # Create train, test from the source dataframe
    new_train, new_test = train_test_split(
        source_df, test_size=test_size, random_state=random_state
    )

    # Preserve the pre-remap train rows: this is the return value, and the
    # only form a subsequent split can safely re-split (ids still line up
    # with the raw geometry directories).
    train_with_original_ids = new_train.copy()

    # Make ids go 1, 2, ... for both train and test. Keep old ids so we can map ids of other files.
    # Work on copies to avoid chained-assignment on the split views.
    old_train_id_to_new = {
        old_id: new_id for new_id, old_id in enumerate(new_train["id"], start=1)
    }  # id starts from 1
    new_train = new_train.copy()
    new_train["id"] = new_train["id"].map(old_train_id_to_new)

    old_test_id_to_new = {
        old_id: new_id for new_id, old_id in enumerate(new_test["id"], start=1)
    }  # id starts from 1
    new_test = new_test.copy()
    new_test["id"] = new_test["id"].map(old_test_id_to_new)

    new_test_without_labels = new_test.drop(
        columns=["formation_energy_ev_natom", "bandgap_energy_ev"]
    )

    # Copy over files: labelled test set is private, unlabelled copy is public.
    new_train.to_csv(public_dir / "train.csv", index=False)
    new_test.to_csv(private_dir / "test.csv", index=False)
    new_test_without_labels.to_csv(public_dir / "test.csv", index=False)

    # --- Process and copy geometry files for the new train set ---
    train_geometry_dir = public_dir / "train"
    for src in source_geometry_paths:
        original_id = int(Path(src).parts[-2])
        if original_id not in old_train_id_to_new:  # Filter for train ids
            continue

        dest_dir = train_geometry_dir / str(old_train_id_to_new[original_id])
        dest_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy(src=src, dst=dest_dir / "geometry.xyz")
    assert len(list(train_geometry_dir.glob("**/*.xyz"))) == len(
        new_train
    ), f"Expected {len(new_train)} train geometry files in {public_dir}, found {len(list(train_geometry_dir.glob('**/*.xyz')))}"

    # --- Process and copy geometry files for the new test set ---
    test_geometry_dir = public_dir / "test"
    for src in source_geometry_paths:
        original_id = int(Path(src).parts[-2])
        if original_id not in old_test_id_to_new:  # Filter for test ids
            continue

        dest_dir = test_geometry_dir / str(old_test_id_to_new[original_id])
        dest_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy(src=src, dst=dest_dir / "geometry.xyz")
    assert len(list(test_geometry_dir.glob("**/*.xyz"))) == len(
        new_test
    ), f"Expected {len(new_test)} test geometry files in {public_dir}, found {len(list(test_geometry_dir.glob('**/*.xyz')))}"

    # Create mock submission (constant predictions for both targets).
    sample_submission = pd.DataFrame(
        {"id": new_test["id"], "formation_energy_ev_natom": 0.1779, "bandgap_energy_ev": 1.8892}
    )
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
    assert len(sample_submission) == len(
        new_test
    ), f"Sample submission in {public_dir} should have the same number of rows as its test set"

    return train_with_original_ids
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split (public_val/private_val) from the training data.
    """
    # Extract only what we need
    extract(raw / "train.zip", raw / "train")
    extract(raw / "train.csv.zip", raw / "train.csv")
    extract(raw / "test.zip", raw / "test")
    extract(raw / "test.csv.zip", raw / "test.csv")

    # Load initial data and geometry paths
    initial_train_df = read_csv(raw / "train.csv/train.csv")
    all_geometry_paths = sorted(glob.glob(str(raw / "train/train/**/*.xyz")))

    # First split: the original public/private datasets. Parameters are
    # identical to the original script so its outputs are unchanged.
    train_df_after_first_split = _process_split(
        source_df=initial_train_df,
        test_size=0.1,
        random_state=0,
        source_geometry_paths=all_geometry_paths,
        public_dir=public,
        private_dir=private,
    )

    # Second split: validation datasets carved out of the first split's
    # train set. A test_size of 1/9 applied to 90% of the original data
    # yields the same absolute test-set size as the first split
    # (1/9 * 0.9 * T = 0.1 * T).
    _process_split(
        source_df=train_df_after_first_split,
        test_size=1 / 9,
        random_state=0,
        source_geometry_paths=all_geometry_paths,
        public_dir=public.parent / "public_val",
        private_dir=private.parent / "private_val",
    )
|