dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,130 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import Dict
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm.auto import tqdm
8
+
9
+ from mlebench.utils import extract, read_csv
10
+
11
+
12
def _process_and_save_split(
    source_df: pd.DataFrame,
    public_path: Path,
    private_path: Path,
    raw_images_path: Path,
    test_size: float,
    random_state: int,
) -> pd.DataFrame:
    """
    Split ``source_df`` into train/held-out parts, write the standard CSV
    artifacts, and copy the corresponding image files into place.

    Writes ``train.csv`` and ``sample_submission.csv`` under ``public_path``,
    ``answers.csv`` under ``private_path``, and copies each image referenced by
    the split into ``public_path/train_images`` or ``public_path/test_images``.

    Args:
        source_df: The DataFrame to split; must carry 'image' and 'labels' columns.
        public_path: Output directory for participant-visible files.
        private_path: Output directory for the grading answer key.
        raw_images_path: Directory holding the source image files.
        test_size: Fraction of rows assigned to the held-out (answers) part.
        random_state: Seed passed to the splitter for reproducibility.

    Returns:
        The training portion of the split.
    """
    # Make sure both output roots exist before writing anything.
    for out_dir in (public_path, private_path):
        out_dir.mkdir(exist_ok=True, parents=True)

    new_train, answers = train_test_split(
        source_df, test_size=test_size, random_state=random_state
    )

    # The sample submission predicts the constant label "healthy" for every
    # held-out image.
    submission_df = answers.copy()
    submission_df["labels"] = "healthy"

    # Sanity checks before anything hits disk.
    assert len(answers) == len(submission_df), "Answers and submission should have the same length"
    overlap = set(new_train["image"]) & set(answers["image"])
    assert not overlap, "new_train and answers should not share any image"
    assert {"image", "labels"} <= set(new_train.columns), "Train DataFrame must have 'image' and 'labels' columns"
    assert {"image", "labels"} <= set(submission_df.columns), "Sample submission DataFrame must have 'image' and 'labels' columns"
    assert len(new_train) + len(answers) == len(source_df), "The combined length of new_train and answers should equal the length of the source dataframe"

    # Standard filenames expected by the grader / participants.
    answers.to_csv(private_path / "answers.csv", index=False)
    new_train.to_csv(public_path / "train.csv", index=False)
    submission_df.to_csv(public_path / "sample_submission.csv", index=False)

    (public_path / "test_images").mkdir(exist_ok=True)
    (public_path / "train_images").mkdir(exist_ok=True)

    # Copy images for each side of the split into its own subdirectory.
    copy_jobs = (
        (new_train, "train_images", "Copying Train Images"),
        (answers, "test_images", "Copying Test Images"),
    )
    for frame, subdir, action in copy_jobs:
        for file_id in tqdm(frame["image"], desc=f"{action} to {public_path.name}"):
            shutil.copyfile(
                src=raw_images_path / str(file_id),
                dst=public_path / subdir / str(file_id),
            )

    # Verify the copies landed (counts assume .jpg filenames in the 'image'
    # column — presumably true for this competition; confirm against raw data).
    train_copied = len(list(public_path.glob("train_images/*.jpg")))
    assert train_copied == len(new_train), f"Public train images in {public_path.name} should have the same number of images as the train DataFrame"
    test_copied = len(list(public_path.glob("test_images/*.jpg")))
    assert test_copied == len(answers), f"Public test images in {public_path.name} should have the same number of images as the answers DataFrame"

    return new_train
92
+
93
+
94
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the competition split plus a parallel validation split.

    The main split goes to ``public``/``private``; a second split, carved out
    of the main split's training rows, goes to sibling ``public_val``/
    ``private_val`` directories.
    """
    # Validation split lives next to the main output directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    full_train = read_csv(raw / "train.csv")
    images_dir = raw / "train_images"

    # Step 1: the original competition split (80/20). Outputs under `public`
    # and `private` match the original script exactly; the return value is the
    # 80% training portion.
    competition_train_df = _process_and_save_split(
        source_df=full_train,
        public_path=public,
        private_path=private,
        raw_images_path=images_dir,
        test_size=0.2,
        random_state=0,
    )

    # Step 2: re-split the training portion for validation. Taking 25% of the
    # 80% pool yields a validation set the same size as the original test set
    # (0.25 * 0.8 = 0.2 of the full data).
    _process_and_save_split(
        source_df=competition_train_df,
        public_path=public_val,
        private_path=private_val,
        raw_images_path=images_dir,
        test_size=0.25,
        random_state=0,
    )
@@ -0,0 +1,39 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import f1_score
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate a submission against the answers and align them for scoring.

    Both frames must carry "file" and "species" columns; rows are matched by
    sorting on "file" before the labels are extracted.

    Returns:
        Tuple (y_true, y_pred): species labels from answers and submission.

    Raises:
        InvalidSubmissionError: if the submission is missing a column, has a
            different length, or its file names do not match the answers.
    """
    # Answer checks — answers are produced by us, so plain asserts suffice.
    assert "file" in answers.columns, "Answers must have 'file' column"
    assert "species" in answers.columns, "Answers must have 'species' column"

    # Submission checks — user-supplied, so raise a gradeable error instead.
    for required in ("file", "species"):
        if required not in submission.columns:
            raise InvalidSubmissionError(f"Submission must have '{required}' column")
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length {len(submission)} != answers length {len(answers)}"
        )

    # Align rows by file name so labels line up positionally.
    submission_sorted = submission.sort_values("file")
    answers_sorted = answers.sort_values("file")
    if (submission_sorted["file"].values != answers_sorted["file"].values).any():
        raise InvalidSubmissionError("Submission and answers have mismatched 'file' columns")

    return answers_sorted["species"].tolist(), submission_sorted["species"].tolist()
35
+
36
+
37
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score the submission with micro-averaged F1 over species labels."""
    labels_true, labels_pred = prepare_for_metric(submission, answers)
    return f1_score(y_true=labels_true, y_pred=labels_pred, average="micro")
@@ -0,0 +1,91 @@
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm import tqdm
8
+
9
+ from mlebench.utils import extract, read_csv
10
+
11
+
12
def create_dataframe_from_directory(directory: str) -> pd.DataFrame:
    """
    Build a DataFrame describing a directory of labelled images.

    Args:
        directory (str): Path to a directory whose subdirectories are class
            labels, each containing `.png` image files.

    Returns:
        pd.DataFrame: Two columns: 'file' (the image file name) and 'species'
        (the name of the subdirectory the image was found in).
    """
    records = []
    for species in sorted(os.listdir(directory)):  # sorted for determinism
        species_dir = os.path.join(directory, species)
        if not os.path.isdir(species_dir):
            continue
        records.extend(
            {"file": image_name, "species": species}
            for image_name in sorted(os.listdir(species_dir))  # sorted for determinism
            if image_name.endswith(".png")
        )
    return pd.DataFrame(records)
31
+
32
+
33
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Args:
        raw: Directory containing the original `train/<species>/<file>.png` images.
        public: Output directory visible to agents (train/ and test/ images,
            sample_submission.csv).
        private: Output directory used only for grading (answers.csv).
    """

    # Directory containing the training images
    train_dir = raw / "train"
    old_train = create_dataframe_from_directory(train_dir)
    # Ratio chosen to reproduce the original competition's test-set size.
    test_ratio = 0.14  # 794/(4750+794) = 0.14
    train_df, test_df = train_test_split(old_train, test_size=test_ratio, random_state=0)

    # Create a sample submission file (constant placeholder label for every row)
    submission_df = test_df.copy()
    submission_df["species"] = "Sugar beet"

    # Checks on the split before anything is written to disk
    assert len(test_df) == len(submission_df), "Answers and submission should have the same length"
    assert not set(train_df["file"]).intersection(
        set(test_df["file"])
    ), "new_train and answers should not share any image"
    assert (
        "file" in train_df.columns and "species" in train_df.columns
    ), "Train DataFrame must have 'file' and 'species' columns"
    assert (
        "file" in submission_df.columns and "species" in submission_df.columns
    ), "Sample submission DataFrame must have 'file' and 'species' columns"
    assert len(train_df) + len(test_df) == len(
        old_train
    ), "The combined length of new_train and answers should equal the length of old_train"

    # Write CSVs: answers go to private (grading), sample submission to public
    test_df.to_csv(private / "answers.csv", index=False)
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    # Copy files (assumes `public` itself already exists)
    (public / "test").mkdir(exist_ok=True)
    (public / "train").mkdir(exist_ok=True)

    # Create nested folder structure for train (one subfolder per species label)
    for species in train_df["species"].unique():
        (public / "train" / species).mkdir(parents=True, exist_ok=True)

    # Train images keep their species subfolder (labels stay visible)
    for _, row in tqdm(train_df.iterrows(), desc="Copying Train Images", total=len(train_df)):
        src_path = train_dir / row["species"] / row["file"]
        dst_path = public / "train" / row["species"] / row["file"]
        shutil.copyfile(src=src_path, dst=dst_path)

    # Test images are flattened into test/ so their labels are hidden
    for _, row in tqdm(test_df.iterrows(), desc="Copying Test Images", total=len(test_df)):
        src_path = train_dir / row["species"] / row["file"]
        dst_path = public / "test" / row["file"]
        shutil.copyfile(src=src_path, dst=dst_path)

    # Checks on the copied files
    assert len(list(public.glob("train/**/*.png"))) == len(
        train_df
    ), f"Public train images should have the same number of images as the train DataFrame: number of files {len(list(public.glob('train/**/*.png')))} != len(train_df)={len(train_df)}"
    assert len(list(public.glob("test/*.png"))) == len(
        test_df
    ), f"Public test images should have the same number of images as the answers DataFrame: number of files {len(list(public.glob('test/*.png')))} != len(test_df)={len(test_df)}"
@@ -0,0 +1,158 @@
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm import tqdm
8
+
9
+ from mlebench.utils import extract, read_csv
10
+
11
+
12
def create_dataframe_from_directory(directory: str) -> pd.DataFrame:
    """
    Build a DataFrame describing a directory of labelled images.

    Args:
        directory (str): Path to a directory whose subdirectories are class
            labels, each containing `.png` image files.

    Returns:
        pd.DataFrame: Two columns: 'file' (the image file name) and 'species'
        (the name of the subdirectory the image was found in).
    """
    records = []
    for species in sorted(os.listdir(directory)):  # sorted for determinism
        species_dir = os.path.join(directory, species)
        if not os.path.isdir(species_dir):
            continue
        records.extend(
            {"file": image_name, "species": species}
            for image_name in sorted(os.listdir(species_dir))  # sorted for determinism
            if image_name.endswith(".png")
        )
    return pd.DataFrame(records)
31
+
32
+
33
def _process_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    public_path: Path,
    private_path: Path,
    source_images_dir: Path,
):
    """
    Helper function to process a train/test split and write all necessary files and folders.
    This function creates the public and private directories, generates CSVs, and copies image files.

    Args:
        train_df: Rows ('file', 'species') that become the visible training set.
        test_df: Rows that become the hidden test set / grading answers.
        public_path: Destination for agent-visible files (train/, test/,
            sample_submission.csv). Created if missing.
        private_path: Destination for the grading key (answers.csv). Created if missing.
        source_images_dir: Directory holding the original `<species>/<file>.png` images.
    """
    # Ensure destination directories exist
    public_path.mkdir(exist_ok=True)
    private_path.mkdir(exist_ok=True)

    # Create a sample submission file (constant placeholder label for every row)
    submission_df = test_df.copy()
    submission_df["species"] = "Sugar beet"

    # Checks on the inputs before anything is written to disk
    assert len(test_df) == len(submission_df), "Answers and submission should have the same length"
    assert not set(train_df["file"]).intersection(
        set(test_df["file"])
    ), "new_train and answers should not share any image"
    assert (
        "file" in train_df.columns and "species" in train_df.columns
    ), "Train DataFrame must have 'file' and 'species' columns"
    assert (
        "file" in submission_df.columns and "species" in submission_df.columns
    ), "Sample submission DataFrame must have 'file' and 'species' columns"

    # Write CSVs: answers go to private (grading), sample submission to public
    test_df.to_csv(private_path / "answers.csv", index=False)
    submission_df.to_csv(public_path / "sample_submission.csv", index=False)

    # Prepare image directories
    public_test_images_path = public_path / "test"
    public_train_images_path = public_path / "train"
    public_test_images_path.mkdir(exist_ok=True)
    public_train_images_path.mkdir(exist_ok=True)

    # Create nested folder structure for train (one subfolder per species label)
    for species in train_df["species"].unique():
        (public_train_images_path / species).mkdir(parents=True, exist_ok=True)

    # Use public path name for progress bar description
    desc_prefix = public_path.name.capitalize()

    # Train images keep their species subfolder (labels stay visible)
    for _, row in tqdm(
        train_df.iterrows(), desc=f"Copying {desc_prefix} Train Images", total=len(train_df)
    ):
        src_path = source_images_dir / row["species"] / row["file"]
        dst_path = public_train_images_path / row["species"] / row["file"]
        shutil.copyfile(src=src_path, dst=dst_path)

    # Test images are flattened into test/ so their labels are hidden
    for _, row in tqdm(
        test_df.iterrows(), desc=f"Copying {desc_prefix} Test Images", total=len(test_df)
    ):
        src_path = source_images_dir / row["species"] / row["file"]
        dst_path = public_test_images_path / row["file"]
        shutil.copyfile(src=src_path, dst=dst_path)

    # Final checks on copied files
    assert len(list(public_train_images_path.glob("**/*.png"))) == len(
        train_df
    ), f"Public train images should have the same number of images as the train DataFrame: number of files {len(list(public_train_images_path.glob('**/*.png')))} != len(train_df)={len(train_df)}"
    assert len(list(public_test_images_path.glob("*.png"))) == len(
        test_df
    ), f"Public test images should have the same number of images as the answers DataFrame: number of files {len(list(public_test_images_path.glob('*.png')))} != len(test_df)={len(test_df)}"
102
+
103
+
104
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    It then creates a secondary validation split (public_val, private_val) from the main training data.
    """
    # All source images live under raw/train/<species>/<file>.png
    images_dir = raw / "train"
    full_df = create_dataframe_from_directory(images_dir)
    test_ratio = 0.14  # 794/(4750+794) = 0.14

    # --- 1. Original split: main train and test sets ---
    main_train, main_test = train_test_split(full_df, test_size=test_ratio, random_state=0)
    assert len(main_train) + len(main_test) == len(
        full_df
    ), "The combined length of new_train and answers should equal the length of old_train"

    # Write the unmodified competition output to public/ and private/.
    _process_split(
        train_df=main_train,
        test_df=main_test,
        public_path=public,
        private_path=private,
        source_images_dir=images_dir,
    )

    # --- 2. Validation split: carve a second test set out of the training data ---
    # Ratio chosen so the validation set is approximately as large as the
    # original test set. Same random_state for reproducibility.
    val_ratio = len(main_test) / len(main_train)
    inner_train, inner_test = train_test_split(main_train, test_size=val_ratio, random_state=0)
    assert (
        len(inner_train) + len(inner_test) == len(main_train)
    ), "The combined length of train_val_df and test_val_df should equal the length of train_df"

    # Reusing the same helper guarantees an identical file/directory layout
    # under the parallel *_val directories.
    _process_split(
        train_df=inner_train,
        test_df=inner_test,
        public_path=public.parent / "public_val",
        private_path=private.parent / "private_val",
        source_images_dir=images_dir,
    )
@@ -0,0 +1,52 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import mean_squared_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate a submission and align it with the answers for RMSE scoring.

    Returns:
        Tuple (y_true, y_pred) of numpy arrays sorted by 'id'.

    Raises:
        InvalidSubmissionError: on length mismatch, missing columns,
            mismatched ids, non-numeric predictions, or NaNs.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column")

    if "MedHouseVal" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'MedHouseVal' column")

    # Align rows positionally by sorting both frames on id.
    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)

    if (submission["id"] != answers["id"]).any():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    # Pull out the target column from each frame.
    y_pred = submission["MedHouseVal"].values
    y_true = answers["MedHouseVal"].values

    # Predictions must coerce cleanly to float and contain no NaNs.
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError):
        raise InvalidSubmissionError("Predictions must be numeric")

    if np.isnan(y_pred).any():
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    return y_true, y_pred
43
+
44
+
45
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Root-mean-squared error of the submission against the answers."""
    true_vals, pred_vals = prepare_for_metric(submission, answers)
    # RMSE = sqrt of the mean squared error
    return np.sqrt(mean_squared_error(true_vals, pred_vals))
@@ -0,0 +1,25 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+
5
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare playground-series-s3e1 dataset.

    Copies train/test/sample-submission CSVs into `public` for agents and
    the held-out answers into `private` for grading.
    """
    # Load every CSV from the raw directory.
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Public files (visible to agents)
    for frame, name in ((train, "train"), (test, "test"), (sample_submission, "sample_submission")):
        frame.to_csv(public / f"{name}.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)

    # Validation checks
    assert len(test_answer) == len(sample_submission), \
        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
@@ -0,0 +1,55 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import mean_squared_log_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate a submission and align it with the answers for RMSLE scoring.

    Returns:
        Tuple (y_true, y_pred) of numpy arrays sorted by 'id'.

    Raises:
        InvalidSubmissionError: on length mismatch, missing columns,
            mismatched ids, non-numeric/NaN/negative predictions.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column")

    if "cost" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'cost' column")

    # Align rows positionally by sorting both frames on id.
    submission = submission.sort_values("id").reset_index(drop=True)
    answers = answers.sort_values("id").reset_index(drop=True)

    if (submission["id"] != answers["id"]).any():
        raise InvalidSubmissionError("Submission and answers id mismatch")

    # Pull out the target column from each frame.
    y_pred = submission["cost"].values
    y_true = answers["cost"].values

    # Predictions must coerce cleanly to float; RMSLE also forbids
    # NaNs and negative values (log of a negative is undefined).
    try:
        y_pred = y_pred.astype(float)
    except (ValueError, TypeError):
        raise InvalidSubmissionError("Predictions must be numeric")

    if np.isnan(y_pred).any():
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    if (y_pred < 0).any():
        raise InvalidSubmissionError("Predictions cannot be negative")

    return y_true, y_pred
46
+
47
+
48
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Root-mean-squared log error of the submission against the answers."""
    true_vals, pred_vals = prepare_for_metric(submission, answers)
    # RMSLE = sqrt of the mean squared log error
    return np.sqrt(mean_squared_log_error(true_vals, pred_vals))
@@ -0,0 +1,25 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+
5
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare playground-series-s3e11 dataset.

    Copies train/test/sample-submission CSVs into `public` for agents and
    the held-out answers into `private` for grading.
    """
    # Load every CSV from the raw directory.
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Public files (visible to agents)
    for frame, name in ((train, "train"), (test, "test"), (sample_submission, "sample_submission")):
        frame.to_csv(public / f"{name}.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)

    # Validation checks
    assert len(test_answer) == len(sample_submission), \
        f"Test answer ({len(test_answer)}) and sample submission ({len(sample_submission)}) must have same length"
@@ -0,0 +1,39 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import roc_auc_score
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """
    Validate the submission and assemble keyword arguments for roc_auc_score.

    Returns:
        dict with 'y_true' and 'y_score' arrays (columns EC1, EC2), both
        sorted by 'id' so rows correspond.

    Raises:
        InvalidSubmissionError: on missing columns, length mismatch, or
            mismatched ids.
    """
    if "id" not in submission.columns or "id" not in answers.columns:
        raise InvalidSubmissionError(
            "Both submission and answers DataFrames must contain an 'id' column."
        )

    if "EC1" not in submission.columns or "EC2" not in submission.columns:
        raise InvalidSubmissionError("Submission DataFrame must contain 'EC1' and 'EC2' columns.")

    # Answers are produced by us, so a plain assert is enough here.
    assert (
        "EC1" in answers.columns and "EC2" in answers.columns
    ), "Answers DataFrame must contain 'EC1' and 'EC2' columns."

    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission must be the same length as the answers.")

    # Align rows positionally by sorting both frames on id.
    answers_by_id = answers.sort_values("id")
    submission_by_id = submission.sort_values("id")

    if (submission_by_id["id"].values != answers_by_id["id"].values).any():
        raise InvalidSubmissionError("Submission and answers have mismatched 'id' columns")

    target_cols = ["EC1", "EC2"]
    return {
        "y_true": answers_by_id[target_cols].to_numpy(),
        "y_score": submission_by_id[target_cols].to_numpy(),
    }
34
+
35
+
36
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Macro-averaged ROC AUC (per-class metric averaged), as outlined in the comp description."""
    inputs = prepare_for_metric(submission, answers)
    return roc_auc_score(inputs["y_true"], inputs["y_score"], average="macro")
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.utils import read_csv
6
+
7
+
8
def prepare(raw: Path, public: Path, private: Path):
    """
    Split the raw train set into a public train/test pair and a private
    answer key, plus a constant-probability sample submission.
    """
    # 90/10 split of the raw training data.
    full_train = read_csv(raw / "train.csv")
    new_train, new_test = train_test_split(full_train, test_size=0.1, random_state=0)

    # Sample submission: constant 0.5 probability for both graded targets,
    # with every non-submission column dropped.
    sample_submission = new_test.copy()
    sample_submission["EC1"] = 0.5
    sample_submission["EC2"] = 0.5
    keep = ["id", "EC1", "EC2"]
    sample_submission.drop(sample_submission.columns.difference(keep), axis=1, inplace=True)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Private answer key keeps all target columns.
    new_test.to_csv(private / "test.csv", index=False)

    # Public train keeps everything; public test has every target stripped.
    new_train.to_csv(public / "train.csv", index=False)
    new_test.drop(["EC1", "EC2", "EC3", "EC4", "EC5", "EC6"], axis=1, inplace=True)
    new_test.to_csv(public / "test.csv", index=False)

    # Checks
    assert new_test.shape[1] == 32, "Public test set should have 32 columns"
    assert new_train.shape[1] == 38, "Public train set should have 38 columns"
    assert len(new_train) + len(new_test) == len(
        full_train
    ), "Length of new_train and new_test should equal length of old_train"