dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,92 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm import tqdm
6
+
7
+ from mlebench.utils import extract, get_logger, read_csv
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Args:
        raw: Directory holding the original download (train.csv, train_images.zip,
            unicode_translation.csv).
        public: Output directory for participant-visible files (train/test image zips,
            train.csv, sample_submission.csv, unicode_translation.csv).
        private: Output directory for held-out files (test.csv, gold_submission.csv).
    """
    # Extract images so we can split the train images
    extract(raw / "train_images.zip", raw / "train")

    # Create train, test from train split (fixed seed for reproducibility)
    old_train = read_csv(raw / "train.csv")

    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)

    # Save the new train and test
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Copy images
    (public / "train").mkdir(exist_ok=True)
    (public / "test").mkdir(exist_ok=True)

    for file_id in tqdm(new_train["image_id"], desc="Copying train images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "train" / f"{file_id}.jpg",
        )

    for file_id in tqdm(new_test["image_id"], desc="Copying test images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "test" / f"{file_id}.jpg",
        )

    # Sanity-check the split: no image lost, no image duplicated across splits
    assert len(list(public.glob("train/*.jpg"))) == len(new_train)
    assert len(list(public.glob("test/*.jpg"))) == len(new_test)
    assert len(new_train) + len(new_test) == len(old_train)

    # Create zips of the images
    logger.info("Re-zipping up new image directories...")
    shutil.make_archive(str(public / "train_images"), "zip", public / "train")
    shutil.make_archive(str(public / "test_images"), "zip", public / "test")
    # Remove the directories for consistency with the kaggle data
    shutil.rmtree(public / "train")
    shutil.rmtree(public / "test")

    # Copy unicode_translation
    shutil.copyfile(
        src=raw / "unicode_translation.csv",
        dst=public / "unicode_translation.csv",
    )

    assert (public / "train_images.zip").is_file()
    assert (public / "test_images.zip").is_file()
    assert (public / "unicode_translation.csv").is_file()

    # Make sample submission for new test set
    sample_submission = new_test.copy()
    # Same guess for all, as in original sample submission
    sample_submission["labels"] = "U+003F 1 1 U+FF2F 2 2"
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Make a gold submission in private/ (useful for testing)
    # This submission takes the "class x y w h" labels from test and converts them to
    # "class x+1 y+1" labels (the +1 moves the coord into the bbox, so that the metric picks it up)
    submission_labels = [_label_to_gold(label) for label in new_test["labels"]]
    gold_submission = new_test.copy()
    gold_submission["labels"] = submission_labels
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    assert len(gold_submission) == len(new_test)
    assert len(gold_submission) == len(sample_submission)


def _label_to_gold(label: str) -> str:
    """Convert one "class x y w h ..." label string to its gold "class x+1 y+1 ..." form.

    Labels are whitespace-separated groups of 5 tokens; the width/height tokens are
    dropped and 1 is added to each x and y so the point lands inside the bbox.
    """
    tokens = label.split()
    assert len(tokens) % 5 == 0
    classes, xs, ys = tokens[0::5], tokens[1::5], tokens[2::5]
    # +1 to xs and ys
    xs = [str(int(x) + 1) for x in xs]
    ys = [str(int(y) + 1) for y in ys]
    return " ".join(f"{c} {x} {y}" for c, x, y in zip(classes, xs, ys))
@@ -0,0 +1,149 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING
4
+
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+ from mlebench.utils import extract, get_logger, read_csv
9
+
10
+ if TYPE_CHECKING:
11
+ import pandas as pd
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
def _create_split_files(
    train_df: "pd.DataFrame",
    test_df: "pd.DataFrame",
    public_dir: Path,
    private_dir: Path,
    raw_dir: Path,
):
    """
    Helper function to create all necessary files for a given train/test split.

    This function populates the public and private directories with the respective
    data (images, CSVs, submissions) based on the provided dataframes.

    Args:
        train_df: Metadata rows (with "image_id" and "labels") for the new train split.
        test_df: Metadata rows for the new test split; its labels stay in private_dir.
        public_dir: Destination for participant-visible files.
        private_dir: Destination for grading-only files (test labels, gold submission).
        raw_dir: Location of the original data; images are read from raw_dir / "train".
    """
    # Save the new train and test metadata
    train_df.to_csv(public_dir / "train.csv", index=False)
    test_df.to_csv(private_dir / "test.csv", index=False)

    # Copy images. parents=True makes this robust when public_dir itself does not
    # exist yet (consistent with the mkdir convention in the sibling prepare scripts).
    (public_dir / "train").mkdir(parents=True, exist_ok=True)
    (public_dir / "test").mkdir(parents=True, exist_ok=True)
    raw_img_dir = raw_dir / "train"

    for file_id in tqdm(train_df["image_id"], desc=f"Copying train images to {public_dir.name}"):
        shutil.copyfile(
            src=raw_img_dir / f"{file_id}.jpg",
            dst=public_dir / "train" / f"{file_id}.jpg",
        )

    for file_id in tqdm(test_df["image_id"], desc=f"Copying test images to {public_dir.name}"):
        shutil.copyfile(
            src=raw_img_dir / f"{file_id}.jpg",
            dst=public_dir / "test" / f"{file_id}.jpg",
        )

    assert len(list(public_dir.glob("train/*.jpg"))) == len(train_df)
    assert len(list(public_dir.glob("test/*.jpg"))) == len(test_df)

    # Create zips of the images
    logger.info(f"Re-zipping up new image directories for {public_dir.name}...")
    shutil.make_archive(str(public_dir / "train_images"), "zip", public_dir / "train")
    shutil.make_archive(str(public_dir / "test_images"), "zip", public_dir / "test")
    # Remove the directories for consistency with the kaggle data
    shutil.rmtree(public_dir / "train")
    shutil.rmtree(public_dir / "test")

    # Copy unicode_translation
    shutil.copyfile(
        src=raw_dir / "unicode_translation.csv",
        dst=public_dir / "unicode_translation.csv",
    )

    assert (public_dir / "train_images.zip").is_file()
    assert (public_dir / "test_images.zip").is_file()
    assert (public_dir / "unicode_translation.csv").is_file()

    # Make sample submission for new test set
    sample_submission = test_df.copy()
    # Same guess for all, as in original sample submission
    sample_submission["labels"] = "U+003F 1 1 U+FF2F 2 2"
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # Make a gold submission in private/ (useful for testing)
    # This submission takes the "class x y w h" labels from test and converts them to
    # "class x+1 y+1" labels (the +1 moves the coord into the bbox, so that the metric picks it up)
    submission_labels = []
    for label in test_df["labels"]:
        # Labels have the form "class x y w h class x y w h class x y w h ... "
        label = label.split()
        assert len(label) % 5 == 0
        classes, xs, ys = label[0::5], label[1::5], label[2::5]
        # +1 to xs and ys
        xs = [str(int(x) + 1) for x in xs]
        ys = [str(int(y) + 1) for y in ys]
        new_label = [f"{c} {x} {y}" for c, x, y in zip(classes, xs, ys)]
        submission_labels.append(" ".join(new_label))
    gold_submission = test_df.copy()
    gold_submission["labels"] = submission_labels
    gold_submission.to_csv(private_dir / "gold_submission.csv", index=False)

    assert len(gold_submission) == len(test_df)
    assert len(gold_submission) == len(sample_submission)
98
+
99
+
100
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Additionally, creates a second train/validation split for local model development.
    """
    # Unpack the raw images so the train set can be re-split by image id.
    extract(raw / "train_images.zip", raw / "train")

    full_train = read_csv(raw / "train.csv")

    # --- Original Split (for final evaluation) ---
    logger.info("Creating original train/test split for 'public' and 'private' directories...")
    dev_split, holdout_split = train_test_split(full_train, test_size=0.1, random_state=0)

    _create_split_files(
        train_df=dev_split,
        test_df=holdout_split,
        public_dir=public,
        private_dir=private,
        raw_dir=raw,
    )
    assert len(dev_split) + len(holdout_split) == len(full_train)
    logger.info("Successfully created original split.")

    # --- New Validation Split (for local development) ---
    logger.info("Creating new train/validation split for 'public_val' and 'private_val' directories...")
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    for directory in (public_val, private_val):
        directory.mkdir(exist_ok=True)

    # The validation split mirrors the size of the original test split: the first
    # split held out 0.1 of the data and kept 0.9 for training, so carving
    # 0.1 / 0.9 == 1/9 out of the remaining training rows yields a validation set
    # with the same absolute size as the original test set.
    train_val, test_val = train_test_split(dev_split, test_size=1 / 9, random_state=0)

    _create_split_files(
        train_df=train_val,
        test_df=test_val,
        public_dir=public_val,
        private_dir=private_val,
        raw_dir=raw,
    )
    assert len(train_val) + len(test_val) == len(dev_split)
    logger.info("Successfully created validation split.")
@@ -0,0 +1,101 @@
1
# The 99 leaf species of the competition, in the fixed order used for the one-hot
# label columns of train.csv / test.csv and for the sample-submission columns.
# NOTE(review): spellings such as "Phildelphus" and "Salix_Intergra" look like
# typos but match the original dataset's column names — do not "fix" them.
CLASSES = [
    "Acer_Capillipes",
    "Acer_Circinatum",
    "Acer_Mono",
    "Acer_Opalus",
    "Acer_Palmatum",
    "Acer_Pictum",
    "Acer_Platanoids",
    "Acer_Rubrum",
    "Acer_Rufinerve",
    "Acer_Saccharinum",
    "Alnus_Cordata",
    "Alnus_Maximowiczii",
    "Alnus_Rubra",
    "Alnus_Sieboldiana",
    "Alnus_Viridis",
    "Arundinaria_Simonii",
    "Betula_Austrosinensis",
    "Betula_Pendula",
    "Callicarpa_Bodinieri",
    "Castanea_Sativa",
    "Celtis_Koraiensis",
    "Cercis_Siliquastrum",
    "Cornus_Chinensis",
    "Cornus_Controversa",
    "Cornus_Macrophylla",
    "Cotinus_Coggygria",
    "Crataegus_Monogyna",
    "Cytisus_Battandieri",
    "Eucalyptus_Glaucescens",
    "Eucalyptus_Neglecta",
    "Eucalyptus_Urnigera",
    "Fagus_Sylvatica",
    "Ginkgo_Biloba",
    "Ilex_Aquifolium",
    "Ilex_Cornuta",
    "Liquidambar_Styraciflua",
    "Liriodendron_Tulipifera",
    "Lithocarpus_Cleistocarpus",
    "Lithocarpus_Edulis",
    "Magnolia_Heptapeta",
    "Magnolia_Salicifolia",
    "Morus_Nigra",
    "Olea_Europaea",
    "Phildelphus",
    "Populus_Adenopoda",
    "Populus_Grandidentata",
    "Populus_Nigra",
    "Prunus_Avium",
    "Prunus_X_Shmittii",
    "Pterocarya_Stenoptera",
    "Quercus_Afares",
    "Quercus_Agrifolia",
    "Quercus_Alnifolia",
    "Quercus_Brantii",
    "Quercus_Canariensis",
    "Quercus_Castaneifolia",
    "Quercus_Cerris",
    "Quercus_Chrysolepis",
    "Quercus_Coccifera",
    "Quercus_Coccinea",
    "Quercus_Crassifolia",
    "Quercus_Crassipes",
    "Quercus_Dolicholepis",
    "Quercus_Ellipsoidalis",
    "Quercus_Greggii",
    "Quercus_Hartwissiana",
    "Quercus_Ilex",
    "Quercus_Imbricaria",
    "Quercus_Infectoria_sub",
    "Quercus_Kewensis",
    "Quercus_Nigra",
    "Quercus_Palustris",
    "Quercus_Phellos",
    "Quercus_Phillyraeoides",
    "Quercus_Pontica",
    "Quercus_Pubescens",
    "Quercus_Pyrenaica",
    "Quercus_Rhysophylla",
    "Quercus_Rubra",
    "Quercus_Semecarpifolia",
    "Quercus_Shumardii",
    "Quercus_Suber",
    "Quercus_Texana",
    "Quercus_Trojana",
    "Quercus_Variabilis",
    "Quercus_Vulcanica",
    "Quercus_x_Hispanica",
    "Quercus_x_Turneri",
    "Rhododendron_x_Russellianum",
    "Salix_Fragilis",
    "Salix_Intergra",
    "Sorbus_Aria",
    "Tilia_Oliveri",
    "Tilia_Platyphyllos",
    "Tilia_Tomentosa",
    "Ulmus_Bergmanniana",
    "Viburnum_Tinus",
    "Viburnum_x_Rhytidophylloides",
    "Zelkova_Serrata",
]
@@ -0,0 +1,44 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import log_loss
4
+
5
+ from mlebench.competitions.utils import one_hot_dfs_to_log_loss_inputs
6
+ from mlebench.grade_helpers import InvalidSubmissionError
7
+
8
+ from .classes import CLASSES
9
+
10
+
11
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """
    The submission and answers are already one-hotted
    """
    expected_columns = {"id"} | set(CLASSES)
    if set(submission.columns) != expected_columns:
        raise InvalidSubmissionError(
            "Submission DataFrame must have an 'id' column and a column for each class."
        )
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            "Submission and answers DataFrames must have the same number of rows."
        )
    tolerance = 1e-6
    probabilities = submission[CLASSES]
    row_sums = probabilities.sum(axis=1)
    if not np.all(np.isclose(row_sums, 1, atol=tolerance)):
        raise InvalidSubmissionError("Each row in submission should sum to one, as probabilities.")
    within_unit_interval = (probabilities >= 0) & (probabilities <= 1)
    if not within_unit_interval.all().all():
        raise InvalidSubmissionError(
            "All probabilities in submission DataFrame must be between 0 and 1."
        )

    assert (
        set(answers.columns) == expected_columns
    ), "Answers DataFrame must have an 'id' column and a column for each class."

    # Align the two one-hot frames on "id" and build the keyword arguments that
    # the grading metric consumes.
    return one_hot_dfs_to_log_loss_inputs(
        submission, answers, id_column="id", apply_softmax=False
    )
40
+
41
+
42
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Validate the submission and score it with multi-class log loss."""
    metric_kwargs = prepare_for_metric(submission, answers)
    return log_loss(**metric_kwargs)
@@ -0,0 +1,60 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.competitions.utils import df_to_one_hot
7
+ from mlebench.utils import extract, read_csv
8
+
9
+ from .classes import CLASSES
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """
    # Unpack only the archives this script needs.
    extract(raw / "train.csv.zip", raw)
    extract(raw / "images.zip", raw)

    # Re-split the original training data into new train/test sets.
    original_train = read_csv(raw / "train.csv")
    split_train, split_test = train_test_split(original_train, test_size=0.1, random_state=0)
    public_test = split_test.drop(columns=["species"])

    # One-hot the held-out labels so they match the sample-submission format.
    split_test = split_test[["id", "species"]]
    split_test = df_to_one_hot(split_test, "id", "species", classes=CLASSES)

    (public / "images").mkdir(exist_ok=True)
    (private / "images").mkdir(exist_ok=True)

    # Train and test images are published together under public/images.
    for image_id in split_train["id"]:
        shutil.copyfile(
            src=raw / "images" / f"{image_id}.jpg",
            dst=public / "images" / f"{image_id}.jpg",
        )

    for image_id in public_test["id"]:
        shutil.copyfile(
            src=raw / "images" / f"{image_id}.jpg",
            dst=public / "images" / f"{image_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(public_test) == len(
        split_test
    ), "Public and Private tests should have equal length"
    assert len(list(public.glob("images/*.jpg"))) == len(split_train) + len(
        public_test
    ), "Public images should have the same number of images as the sum of train and test"

    # Uniform-probability sample submission.
    sample_submission = split_test.copy()
    sample_submission[CLASSES] = 1 / len(CLASSES)

    # Write out all CSV artifacts.
    split_train.to_csv(public / "train.csv", index=False)
    split_test.to_csv(private / "test.csv", index=False)
    public_test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
@@ -0,0 +1,116 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from pandas import DataFrame
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.competitions.utils import df_to_one_hot
8
+ from mlebench.utils import extract, read_csv
9
+
10
+ from .classes import CLASSES
11
+
12
+
13
def _create_split_and_save(
    source_df: DataFrame,
    image_source_dir: Path,
    public_dir: Path,
    private_dir: Path,
    test_size: float,
    random_state: int,
) -> DataFrame:
    """Split ``source_df``, copy the split's images, and write all CSV artifacts.

    Returns the new training dataframe so callers can split it again.
    """
    split_train, split_test = train_test_split(
        source_df, test_size=test_size, random_state=random_state
    )
    unlabeled_test = split_test.drop(columns=["species"])

    # One-hot the held-out labels so they match the sample-submission format.
    split_test = split_test[["id", "species"]]
    split_test = df_to_one_hot(split_test, "id", "species", classes=CLASSES)

    # Create destination directories
    for directory in (public_dir / "images", private_dir / "images"):
        directory.mkdir(parents=True, exist_ok=True)

    # Train and test images are published together under public_dir/images.
    for image_id in split_train["id"]:
        shutil.copyfile(
            src=image_source_dir / f"{image_id}.jpg",
            dst=public_dir / "images" / f"{image_id}.jpg",
        )

    for image_id in unlabeled_test["id"]:
        shutil.copyfile(
            src=image_source_dir / f"{image_id}.jpg",
            dst=public_dir / "images" / f"{image_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(unlabeled_test) == len(
        split_test
    ), "Public and Private tests should have equal length"
    assert len(list(public_dir.glob("images/*.jpg"))) == len(split_train) + len(
        unlabeled_test
    ), "Public images should have the same number of images as the sum of train and test"

    # Uniform-probability sample submission.
    sample_submission = split_test.copy()
    sample_submission[CLASSES] = 1 / len(CLASSES)

    # Save all dataframes to their respective files
    split_train.to_csv(public_dir / "train.csv", index=False)
    split_test.to_csv(private_dir / "test.csv", index=False)
    unlabeled_test.to_csv(public_dir / "test.csv", index=False)
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    return split_train
69
+
70
+
71
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split in public_val/private_val directories.
    """
    # extract only what we need
    extract(raw / "train.csv.zip", raw)
    extract(raw / "images.zip", raw)

    full_train_df = read_csv(raw / "train.csv")
    images_dir = raw / "images"

    # Stage 1: the primary competition split into public/ and private/.
    # Its outputs must stay identical to those of the original script.
    remaining_train = _create_split_and_save(
        source_df=full_train_df,
        image_source_dir=images_dir,
        public_dir=public,
        private_dir=private,
        test_size=0.1,
        random_state=0,
    )

    # Stage 2: split the stage-1 training set once more to carve out a local
    # validation set, written to sibling public_val/ and private_val/ dirs.
    # A test_size of 0.1 / 0.9 makes the validation set the same size as the
    # original test set (which held out 0.1 of all rows, leaving 0.9).
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    validation_test_size = 0.1 / 0.9

    _create_split_and_save(
        source_df=remaining_train,
        image_source_dir=images_dir,
        public_dir=public_val,
        private_dir=private_val,
        test_size=validation_test_size,
        random_state=0,  # Use the same random state for consistency
    )
@@ -0,0 +1,44 @@
1
+ from pandas import DataFrame
2
+ from sklearn.metrics import cohen_kappa_score
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def prepare_for_grading(
    submission: DataFrame, answers: DataFrame, id_column: str, target_column: str
) -> tuple:
    """Validate the submission against the answers and extract aligned label arrays.

    Args:
        submission: Predicted values; must contain `id_column` and `target_column`.
        answers: Ground truth with the same columns and the same set of ids.
        id_column: Column used to align rows between the two frames.
        target_column: Column holding the values to grade.

    Returns:
        A ``(y_pred, y_true)`` tuple of numpy arrays, both sorted by id.

    Raises:
        InvalidSubmissionError: If the submission is malformed or its ids do not
            match the answers.
    """
    # Answers checks
    assert (
        target_column in answers.columns
    ), f"Answers must contain the target column '{target_column}'"
    assert id_column in answers.columns, f"Answers must contain the id column '{id_column}'"
    assert not answers.empty, "Answers DataFrame should not be empty"

    # Submission checks
    if target_column not in submission.columns:
        raise InvalidSubmissionError(f"Submission must contain the target column '{target_column}'")
    if id_column not in submission.columns:
        raise InvalidSubmissionError(f"Submission must contain the id column '{id_column}'")
    if submission.empty:
        raise InvalidSubmissionError("Submission DataFrame should not be empty")
    # Guard the length first: the element-wise id comparison below raises an
    # opaque broadcast ValueError when the arrays have different sizes.
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission must contain the same ids as the answers")

    # Sort the submissions and answers by id_column
    submission_sorted = submission.sort_values(by=id_column)
    answers_sorted = answers.sort_values(by=id_column)

    if (submission_sorted[id_column].values != answers_sorted[id_column].values).any():
        raise InvalidSubmissionError("Submission must contain the same ids as the answers")

    # Extract the target columns
    y_pred = submission_sorted[target_column].values
    y_true = answers_sorted[target_column].values

    return y_pred, y_true
38
+
39
+
40
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """Score an essay submission with quadratic-weighted Cohen's kappa."""
    y_pred, y_true = prepare_for_grading(
        submission=submission, answers=answers, id_column="essay_id", target_column="score"
    )
    # Quadratic weights penalize larger score disagreements more heavily.
    return cohen_kappa_score(y_pred, y_true, weights="quadratic")
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.utils import read_csv
7
+
8
+
9
+ def prepare(raw: Path, public: Path, private: Path):
10
+ """
11
+ Splits the data in raw into public and private datasets with appropriate test/train splits.
12
+ """
13
+
14
+ # Create train, test from original train split
15
+ old_train = read_csv(raw / "train.csv")
16
+ # Original train has 17307 rows. Original hidden test has approx 8k rows. We just take 10% of the original train as the test set.
17
+ new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
18
+ new_test = answers.drop(columns=["score"])
19
+
20
+ sample_submission = answers[["essay_id"]].copy()
21
+ sample_submission["score"] = np.random.RandomState(42).randint(
22
+ 1, 7, size=len(sample_submission)
23
+ )
24
+
25
+ # Checks
26
+ assert set(new_train["essay_id"]).isdisjoint(
27
+ set(new_test["essay_id"])
28
+ ), "Essay IDs in train and test sets are not disjoint"
29
+ assert len(new_train) + len(new_test) == len(
30
+ old_train
31
+ ), f"Train and test sets do not sum to original train set"
32
+ assert len(new_test) == len(
33
+ sample_submission
34
+ ), f"Test and sample submission sets do not have the same length"
35
+ assert (
36
+ new_train.columns.tolist() == old_train.columns.tolist()
37
+ ), f"Train set columns do not match original train set, got {new_train.columns.tolist()}"
38
+ assert new_test.columns.tolist() == [
39
+ "essay_id",
40
+ "full_text",
41
+ ], f"Test set columns do not match expected columns, got {new_test.columns.tolist()}"
42
+ assert sample_submission.columns.tolist() == [
43
+ "essay_id",
44
+ "score",
45
+ ], f"Sample submission set columns do not match expected columns, got {sample_submission.columns.tolist()}"
46
+
47
+ # Write CSVs
48
+ answers.to_csv(private / "answers.csv", index=False)
49
+ new_train.to_csv(public / "train.csv", index=False)
50
+ new_test.to_csv(public / "test.csv", index=False)
51
+ sample_submission.to_csv(public / "sample_submission.csv", index=False)