dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,199 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+
8
def get_date(s: str) -> str:
    """Extract a zero-padded ``YYYY-MM-DD`` date from a drive-ID-like string.

    The input is expected to look like ``YYYY-MM-DD-X`` where ``X`` is an
    arbitrary (possibly ``-``-separated) suffix, e.g. ``2021-04-29-US-MTV-1``.

    Args:
        s: The string to parse.

    Returns:
        The date portion normalised to ``YYYY-MM-DD`` (month and day are
        zero-padded to two digits).

    Raises:
        AssertionError: If ``s`` has fewer than 3 ``-``-separated parts, or if
            any of the first three parts is not purely numeric.
    """
    split = s.split("-")

    assert (
        len(split) >= 3
    ), f"Expected the string to have at least 3 parts separated by `-`. Got {len(split)} parts."

    year, month, day = split[:3]

    # `str.split` always yields strings, so a separate `isinstance(..., str)`
    # check on each component is redundant; only numeric-ness needs checking.
    assert year.isdigit(), f"Expected the year to be a string of digits. Got {year} instead."
    assert month.isdigit(), f"Expected the month to be a string of digits. Got {month} instead."
    assert day.isdigit(), f"Expected the day to be a string of digits. Got {day} instead."

    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
34
+
35
+
36
def _process_split(
    raw_data_path: Path,
    train_ids: list,
    test_ids: list,
    public_path: Path,
    private_path: Path,
) -> None:
    """
    Helper function to process a single data split.

    It populates the public and private directories with the provided train/test IDs,
    creating the necessary file structure and artifacts (like the sample submission).

    Args:
        raw_data_path: Root of the raw competition data; drives are read from
            ``raw_data_path / "train" / <drive_id>``.
        train_ids: Drive IDs copied (with labels) into ``public_path / "train"``.
        test_ids: Drive IDs copied into ``public_path / "test"`` with their
            ``ground_truth.csv`` files stripped out after label extraction.
        public_path: Output directory visible to agents.
        private_path: Output directory holding the held-out test labels.

    Raises:
        AssertionError: If the produced directories or labels are inconsistent
            with the requested split.
    """
    # Clean and create output directories
    shutil.rmtree(public_path, ignore_errors=True)
    shutil.rmtree(private_path, ignore_errors=True)
    public_path.mkdir(parents=True)
    private_path.mkdir(parents=True)
    (public_path / "train").mkdir()
    (public_path / "test").mkdir()

    for train_id in train_ids:
        shutil.copytree(
            src=raw_data_path / "train" / train_id,
            dst=public_path / "train" / train_id,
        )

    for test_id in test_ids:
        shutil.copytree(
            src=raw_data_path / "train" / test_id,
            dst=public_path / "test" / test_id,
        )

    # Construct test set by concatenating all ground truth csvs for the test journeys.
    # Layout is test/<drive_id>/<phone_id>/ground_truth.csv, hence parent.parent below.
    dfs = []
    for fpath in sorted((public_path / "test").rglob("ground_truth.csv")):
        drive_id = fpath.parent.parent.name
        phone_id = fpath.parent.name

        assert (
            drive_id in test_ids
        ), f"Expected the drive {drive_id} to be one of the new test instances. Got {drive_id} instead."

        raw_df = pd.read_csv(fpath)
        df = raw_df.copy()
        df.loc[:, "tripId"] = f"{drive_id}-{phone_id}"
        df = df[["tripId", "UnixTimeMillis", "LatitudeDegrees", "LongitudeDegrees"]]
        dfs.append(df)

    new_test_labels = pd.concat(dfs, ignore_index=True)
    # The output filename is 'test.csv' to match the competition structure.
    new_test_labels.to_csv(private_path / "test.csv", index=False)

    for fpath in (public_path / "test").rglob("ground_truth.csv"):
        fpath.unlink()  # don't include ground truth in public test data

    shutil.copytree(
        src=raw_data_path / "metadata",
        dst=public_path / "metadata",
    )

    # Each tripId is "<drive_id>-<phone_id>"; strip the trailing phone segment
    # to recover the set of distinct journeys present in the test labels.
    actual_journey_ids = set(["-".join(s.split("-")[:-1]) for s in new_test_labels["tripId"]])
    # BUGFIX: the failure message previously reported the number of unique
    # *trip* IDs, which is not what the condition checks; report the number of
    # unique journey IDs instead so a failure is diagnosable.
    assert len(actual_journey_ids) == len(test_ids), (
        f"Expected the new test instances to have {len(test_ids)} unique journey IDs. Got "
        f"{len(actual_journey_ids)} unique journey IDs."
    )

    sample_submission = new_test_labels.copy()
    # Fixed placeholder coordinates: every row points at the same location,
    # which is sufficient as a format example.
    sample_submission.loc[:, "LatitudeDegrees"] = 37.904611315634504
    sample_submission.loc[:, "LongitudeDegrees"] = -86.48107806249548

    assert len(sample_submission) == len(new_test_labels), (
        f"Expected the sample submission to have the same number of instances as the new test "
        f"instances. Got {len(sample_submission)} instances in the sample submission and "
        f"{len(new_test_labels)} new test instances."
    )

    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    # Sanity checks on the produced directory layout.
    assert sorted(list(public_path.glob("train/*"))) == sorted(
        set([public_path / "train" / drive_id for drive_id in train_ids])
    ), "Expected the public train directory to contain the new train instances."

    assert sorted(list(public_path.glob("test/*"))) == sorted(
        set([public_path / "test" / drive_id for drive_id in test_ids])
    ), "Expected the public test directory to contain the new test instances."

    assert (
        len(list((public_path / "test").rglob("ground_truth.csv"))) == 0
    ), "Expected the public test directory to not contain any ground truth files."

    assert len(list((public_path / "train").rglob("ground_truth.csv"))) >= len(train_ids), (
        "Expected the public train directory to contain at least one ground truth file per new "
        "train instance."
    )
131
+
132
+
133
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Prepare the competition data plus a parallel validation split.

    Stage 1 splits the raw training drives by date into the primary
    train/test sets, written to ``public``/``private``. Stage 2 applies the
    identical procedure to the stage-1 training drives, producing a
    validation split in sibling ``public_val``/``private_val`` directories.
    """
    # --- Stage 1: Original Split (Train / Test) ---
    # Creates the primary competition data in `public` and `private`.
    all_drive_ids = sorted(d.name for d in (raw / "train").glob("*") if d.is_dir())
    unique_dates = sorted({get_date(drive_id) for drive_id in all_drive_ids})
    stage1_train_dates, stage1_test_dates = train_test_split(
        unique_dates, test_size=0.1, random_state=0
    )

    assert (
        len(stage1_train_dates) >= 1
    ), "Expected the new train set to have at least one date. Got 0 dates."
    assert (
        len(stage1_test_dates) >= 1
    ), "Expected the new test set to have at least one date. Got 0 dates."

    stage1_train_ids = sorted(i for i in all_drive_ids if get_date(i) in stage1_train_dates)
    stage1_test_ids = sorted(i for i in all_drive_ids if get_date(i) in stage1_test_dates)

    overlap = set(stage1_train_ids).intersection(set(stage1_test_ids))
    assert not overlap, (
        f"Expected the new train and test instances to be disjoint. Got an intersection of "
        f"{overlap}."
    )

    assert len(stage1_train_ids) + len(stage1_test_ids) == len(all_drive_ids), (
        f"Expected the number of new train and test instances to sum up to the number of old train "
        f"instances. Got {len(stage1_train_ids)} new train instances and {len(stage1_test_ids)} new test "
        f"instances which sum to {len(stage1_train_ids) + len(stage1_test_ids)} instead of "
        f"{len(all_drive_ids)}."
    )

    _process_split(
        raw_data_path=raw,
        train_ids=stage1_train_ids,
        test_ids=stage1_test_ids,
        public_path=public,
        private_path=private,
    )

    # --- Stage 2: Validation Split (Train_val / Test_val) ---
    # Re-splits the stage-1 training drives with the exact same methodology,
    # writing a validation dataset to parallel directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # The input for this split is the training set from the *first* split.
    val_input_dates = sorted({get_date(i) for i in stage1_train_ids})

    # Same parameters as stage 1 so the validation test set has a similar
    # proportional size to the original test set.
    train_val_dates, test_val_dates = train_test_split(
        val_input_dates, test_size=0.1, random_state=0
    )

    train_val_ids = sorted(i for i in stage1_train_ids if get_date(i) in train_val_dates)
    test_val_ids = sorted(i for i in stage1_train_ids if get_date(i) in test_val_dates)

    _process_split(
        raw_data_path=raw,
        train_ids=train_val_ids,
        test_ids=test_val_ids,
        public_path=public_val,
        private_path=private_val,
    )
@@ -0,0 +1,11 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import accuracy_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Return the accuracy of the "Transported" predictions, joined on "PassengerId"."""
    return accuracy_score(
        **prepare_for_accuracy_metric(
            submission=submission,
            answers=answers,
            target_column="Transported",
            id_column="PassengerId",
        )
    )
@@ -0,0 +1,23 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.utils import read_csv
6
+
7
+
8
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw training data into public train/test files, an all-False
    sample submission, and a private labelled test set for grading."""
    # 90/10 split of the original training data.
    full_train = read_csv(raw / "train.csv")
    train_df, test_df = train_test_split(full_train, test_size=0.1, random_state=0)

    # Public side: the training data and the unlabelled test rows.
    train_df.to_csv(public / "train.csv", index=False)
    test_df.drop("Transported", axis="columns").to_csv(public / "test.csv", index=False)

    # Sample submission: the test ids with every prediction set to False.
    sample = test_df[["PassengerId", "Transported"]].copy()
    sample["Transported"] = False
    sample.to_csv(public / "sample_submission.csv", index=False)

    # Private side keeps the labels for grading.
    test_df.to_csv(private / "test.csv", index=False)
@@ -0,0 +1,61 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.utils import read_csv
7
+
8
+
9
def _create_split_files(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    public_dir: Path,
    private_dir: Path,
):
    """Write one train/test split to its public and private directories.

    The public side gets `train.csv`, a label-free `test.csv`, and a
    constant-False `sample_submission.csv`; the private side keeps the
    labelled `test.csv` for grading.
    """
    # Sample submission: the test ids with every prediction set to False.
    sample = test_df[["PassengerId", "Transported"]].copy()
    sample["Transported"] = False
    sample.to_csv(public_dir / "sample_submission.csv", index=False)

    # Private side keeps the labels for grading.
    test_df.to_csv(private_dir / "test.csv", index=False)

    # Public side: the training data and the unlabelled test rows.
    train_df.to_csv(public_dir / "train.csv", index=False)
    unlabelled_test = test_df.drop("Transported", axis="columns")
    unlabelled_test.to_csv(public_dir / "test.csv", index=False)
29
+
30
+
31
def prepare(raw: Path, public: Path, private: Path):
    """Build the main competition split plus a parallel validation split.

    The main 90/10 split goes to `public`/`private`; a second split of the
    main training data (held-out set sized like the main test set) goes to
    sibling `public_val`/`private_val` directories with identical filenames.
    """
    # Make sure every output directory exists up front.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    for directory in (public, private, public_val, private_val):
        directory.mkdir(parents=True, exist_ok=True)

    # --- Main split: 90/10 train/test from the raw training data ---
    full_train = read_csv(raw / "train.csv")
    main_train, main_test = train_test_split(full_train, test_size=0.1, random_state=0)

    # Original public/private outputs are unchanged by the validation split.
    _create_split_files(main_train, main_test, public, private)

    # --- Validation split: carve a held-out set out of the main training data ---
    # Sized so the validation set is approximately as large as the main test set.
    val_fraction = len(main_test) / len(main_train)
    val_train, val_test = train_test_split(
        main_train, test_size=val_fraction, random_state=0
    )
    _create_split_files(val_train, val_test, public_val, private_val)
@@ -0,0 +1 @@
1
# Target classes for this competition, in the column order expected by
# submissions. Presumably author initials (EAP/HPL/MWS) — the codes are
# used verbatim as one-hot column names by the prepare/grade scripts.
CLASSES = ["EAP", "HPL", "MWS"]
@@ -0,0 +1,38 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import log_loss
4
+
5
+ from mlebench.competitions.utils import one_hot_dfs_to_log_loss_inputs
6
+ from mlebench.grade_helpers import InvalidSubmissionError
7
+
8
+ from .classes import CLASSES
9
+
10
# Absolute tolerance used when checking that each submission row's
# probabilities sum to one (floating-point slack).
tolerance = 1e-6
11
+
12
+
13
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Validate the submission's probability columns and build the kwargs for
    `sklearn.metrics.log_loss`.

    Raises:
        InvalidSubmissionError: if the submission has the wrong shape, rows
            that do not sum to one, or probabilities outside [0, 1].
    """
    expected_shape = (len(answers), len(CLASSES) + 1)
    if submission.shape != expected_shape:
        raise InvalidSubmissionError(
            f"Submission shape {submission.shape} does not match answers shape {answers.shape}."
        )

    # All columns after the id column are class probabilities.
    probabilities = submission.iloc[:, 1:]

    # Each row must sum to one, within floating-point tolerance.
    if not np.allclose(probabilities.sum(axis=1), 1, atol=tolerance):
        raise InvalidSubmissionError("Each row in submission should sum to one, as probabilities.")

    # Every individual probability must lie in [0, 1].
    if not ((probabilities >= 0) & (probabilities <= 1)).all().all():
        raise InvalidSubmissionError(
            "All probabilities in submission DataFrame must be between 0 and 1."
        )

    return one_hot_dfs_to_log_loss_inputs(
        submission, answers, id_column="id", apply_softmax=False
    )
34
+
35
+
36
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Multi-class log loss of the validated submission against the one-hot answers."""
    return log_loss(**prepare_for_metric(submission, answers))
@@ -0,0 +1,40 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.competitions.utils import df_to_one_hot
6
+ from mlebench.utils import extract, read_csv
7
+
8
+ from .classes import CLASSES
9
+
10
+
11
+ def prepare(raw: Path, public: Path, private: Path):
12
+ target_col = "author"
13
+ id_col = "id"
14
+
15
+ # extract only what we need
16
+ extract(raw / "train.zip", raw)
17
+
18
+ # Create train, test from train split
19
+ old_train = read_csv(raw / "train.csv")
20
+ new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
21
+ new_test_without_labels = new_test.drop(columns=[target_col])
22
+
23
+ # private test matches the format of sample submission
24
+ one_hot_new_test = df_to_one_hot(
25
+ new_test.drop(columns=["text"]),
26
+ id_column=id_col,
27
+ target_column=target_col,
28
+ classes=CLASSES,
29
+ )
30
+ # fill the sample submission with arbitrary values (matching kaggle.com)
31
+ sample_submission = one_hot_new_test.copy()
32
+ sample_submission["EAP"] = 0.403493538995863
33
+ sample_submission["HPL"] = 0.287808366106543
34
+ sample_submission["MWS"] = 0.308698094897594
35
+
36
+ # save files
37
+ new_train.to_csv(public / "train.csv", index=False)
38
+ new_test_without_labels.to_csv(public / "test.csv", index=False)
39
+ sample_submission.to_csv(public / "sample_submission.csv", index=False)
40
+ one_hot_new_test.to_csv(private / "test.csv", index=False)
@@ -0,0 +1,78 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.competitions.utils import df_to_one_hot
6
+ from mlebench.utils import extract, read_csv
7
+
8
+ from .classes import CLASSES
9
+
10
+
11
def _write_split(
    train_df,
    test_df,
    public_dir: Path,
    private_dir: Path,
    id_col: str,
    target_col: str,
) -> None:
    """Write one train/test split: public train/test/sample_submission files
    and the private one-hot encoded answer key."""
    test_without_labels = test_df.drop(columns=[target_col])

    # private test matches the format of sample submission
    one_hot_test = df_to_one_hot(
        test_df.drop(columns=["text"]),
        id_column=id_col,
        target_column=target_col,
        classes=CLASSES,
    )

    # fill the sample submission with arbitrary values (matching kaggle.com)
    sample_submission = one_hot_test.copy()
    sample_submission["EAP"] = 0.403493538995863
    sample_submission["HPL"] = 0.287808366106543
    sample_submission["MWS"] = 0.308698094897594

    train_df.to_csv(public_dir / "train.csv", index=False)
    test_without_labels.to_csv(public_dir / "test.csv", index=False)
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
    one_hot_test.to_csv(private_dir / "test.csv", index=False)


def prepare(raw: Path, public: Path, private: Path):
    """Build the main Spooky Author split plus a parallel validation split.

    The main 90/10 split is written to `public`/`private` (unchanged from the
    original prepare script); the main training set is then split again with
    the same parameters and written to sibling `public_val`/`private_val`
    directories using identical filenames.
    """
    target_col = "author"
    id_col = "id"

    # extract only what we need
    extract(raw / "train.zip", raw)

    # == Original split (for `public` and `private` directories) ==
    old_train = read_csv(raw / "train.csv")
    train_main, test_main = train_test_split(old_train, test_size=0.1, random_state=0)
    _write_split(train_main, test_main, public, private, id_col, target_col)

    # == Validation split (for `public_val` and `private_val` directories) ==
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    public_val.mkdir(parents=True, exist_ok=True)
    private_val.mkdir(parents=True, exist_ok=True)

    # Split the main training set again with the same parameters so the
    # validation set size is consistent with the main test set size.
    train_val, test_val = train_test_split(train_main, test_size=0.1, random_state=0)
    _write_split(train_val, test_val, public_val, private_val, id_col, target_col)
@@ -0,0 +1,65 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import root_mean_squared_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
9
+ if len(submission) != len(answers):
10
+ raise InvalidSubmissionError(
11
+ f"Expected submission to be the same length as answers, but got {len(submission)} "
12
+ f"instead of {len(answers)}."
13
+ )
14
+
15
+ to_predict = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C"]
16
+ expected_answer_columns = ["id_seqpos"] + to_predict + ["keep"]
17
+
18
+ assert set(answers.columns).issuperset(expected_answer_columns), (
19
+ f"Expected answers to have columns {expected_answer_columns}, but instead it has "
20
+ f"columns {answers.columns}."
21
+ )
22
+
23
+ # The submission csv contains two columns which aren't used for scoring: `deg_pH10` and
24
+ # `deg_50C`. These are nonetheless still required to be in the submission as per the
25
+ # competition rules. See the "Sample Submission" section of the competition overview page for
26
+ # more information:
27
+ # https://www.kaggle.com/competitions/stanford-covid-vaccine/overview/evaluation
28
+ expected_submission_columns = ["id_seqpos"] + to_predict + ["deg_pH10", "deg_50C"]
29
+
30
+ if not set(submission.columns).issuperset(expected_submission_columns):
31
+ raise InvalidSubmissionError(
32
+ f"Expected the submission to have columns {expected_submission_columns}, but instead "
33
+ f"it has columns {submission.columns}."
34
+ )
35
+
36
+ filtered_submission = submission[expected_submission_columns]
37
+
38
+ # Sort rows by `id_seqpos` and columns alphabetically
39
+ sorted_submission = filtered_submission.sort_values(by="id_seqpos").sort_index(axis=1)
40
+ sorted_answers = answers.sort_values(by="id_seqpos").sort_index(axis=1)
41
+
42
+ for i, (actual_id, expected_id) in enumerate(
43
+ zip(sorted_submission["id_seqpos"], sorted_answers["id_seqpos"])
44
+ ):
45
+ if actual_id == expected_id:
46
+ continue
47
+
48
+ raise InvalidSubmissionError(
49
+ f"Expected submission to have the same `id_seqpos` as answers, but got `{actual_id}` "
50
+ f"instead of `{expected_id}` on row {i} of the submission."
51
+ )
52
+
53
+ mask = sorted_answers["keep"]
54
+ new_submission = sorted_submission[mask]
55
+ new_answers = sorted_answers[mask]
56
+
57
+ errors = []
58
+
59
+ for column in to_predict:
60
+ y_pred = new_submission[column]
61
+ y_true = new_answers[column]
62
+ error = root_mean_squared_error(y_true=y_true, y_pred=y_pred)
63
+ errors.append(error)
64
+
65
+ return np.mean(errors)
@@ -0,0 +1,129 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+
5
+
6
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Split the OpenVaccine training data into new train and test sets.

    Writes:
      * ``public/train.json`` - the new training set (JSON lines).
      * ``public/test.json`` - the new test set without target columns.
      * ``public/sample_submission.csv`` - all-zero predictions, one row per
        sequence position of every test sample.
      * ``private/test.csv`` - the answers, exploded to one row per sequence
        position (``id_seqpos``), with a ``keep`` flag marking the positions
        that are actually scored.
    """
    old_train = pd.read_json(raw / "train.json", lines=True)
    old_test = pd.read_json(raw / "test.json", lines=True)
    old_sample_submission = pd.read_csv(raw / "sample_submission.csv")

    to_predict = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]
    test_size = 0.1
    n_test_samples = int(len(old_train) * test_size)

    # only put samples that pass the SN filter in the test set, as per comp data desc
    old_train["test"] = False
    test_indices = (
        old_train[old_train["SN_filter"] > 0].sample(n=n_test_samples, random_state=0).index
    )
    old_train.loc[test_indices, "test"] = True

    new_train = old_train[~old_train["test"]].copy().drop(columns=["test"])
    new_test = old_train[old_train["test"]].copy().drop(columns=["test"])
    old_train = old_train.drop(columns=["test"])

    # Create `test.csv` by exploding each list in the `reactivity` and `deg_*` columns, analogous
    # to `pd.explode`. Only the first `seq_scored` items are scored out of a possible `seq_length`
    # items. For each row, we keep track of whether it's scored or not with the `keep` column.
    records = []

    for _, row in new_test.iterrows():
        n = row["seq_scored"]

        # Every target list must cover exactly the scored positions.
        for target in to_predict:
            assert len(row[target]) == n

        # Scored positions: real values, keep=True.
        for j in range(n):
            records.append(
                {
                    "id_seqpos": f"{row['id']}_{j}",
                    "reactivity": row["reactivity"][j],
                    "deg_Mg_pH10": row["deg_Mg_pH10"][j],
                    "deg_pH10": row["deg_pH10"][j],
                    "deg_Mg_50C": row["deg_Mg_50C"][j],
                    "deg_50C": row["deg_50C"][j],
                    "keep": True,
                }
            )

        k = row["seq_length"]

        assert n < k

        # Unscored tail positions: zero-filled, keep=False (still required in
        # submissions, but ignored by the grader).
        for j in range(n, k):
            records.append(
                {
                    "id_seqpos": f"{row['id']}_{j}",
                    "reactivity": 0.0,
                    "deg_Mg_pH10": 0.0,
                    "deg_pH10": 0.0,
                    "deg_Mg_50C": 0.0,
                    "deg_50C": 0.0,
                    "keep": False,
                }
            )

    # Write `answers.csv`
    answers = pd.DataFrame(records)
    answers.to_csv(private / "test.csv", index=False, float_format="%.10f")

    # Write `train.json`
    # NOTE(review): this assumes the raw train.json already carries an `index`
    # field (otherwise the column-set sanity check below would fail) — confirm.
    new_train["index"] = range(len(new_train))
    new_train.to_json(public / "train.json", orient="records", lines=True)

    # Write `test.json`
    new_test_without_labels = new_test[old_test.columns].copy()
    new_test_without_labels["index"] = range(len(new_test_without_labels))
    new_test_without_labels.to_json(public / "test.json", orient="records", lines=True)

    # Write `sample_submission.csv`
    new_sample_submission = answers[["id_seqpos"] + to_predict].copy()
    new_sample_submission.loc[:, to_predict] = 0.0
    new_sample_submission.to_csv(
        public / "sample_submission.csv", index=False, float_format="%.10f"
    )

    # Sanity checks
    assert set(new_train.columns) == set(old_train.columns), (
        f"Expected the columns of the new train to be the same as the old train, but got "
        f"{set(new_train.columns)} instead of {set(old_train.columns)}."
    )

    assert set(new_test_without_labels.columns) == set(old_test.columns), (
        f"Expected the columns of the new test to be the same as the old test, but got "
        f"{set(new_test_without_labels.columns)} instead of {set(old_test.columns)}."
    )

    assert set(to_predict).intersection(set(new_test_without_labels.columns)) == set(), (
        f"Expected the columns to predict aren't included in the new test, but got "
        f"{set(to_predict) ^ set(new_test_without_labels.columns)} instead of the empty set."
    )

    assert set(new_sample_submission.columns) == set(old_sample_submission.columns), (
        f"Expected the columns of the new sample submission to be the same as the old sample "
        f"submission, but got {set(new_sample_submission.columns)} instead of "
        f"{set(old_sample_submission.columns)}."
    )

    assert len(answers) == len(new_sample_submission), (
        f"Expected the answers to have the same length as the new sample submission, but got "
        f"{len(answers)} instead of {len(new_sample_submission)}."
    )

    # we can use [0] because all sequences have the same length
    assert len(new_sample_submission) == (
        len(new_test_without_labels) * new_test_without_labels["seq_length"].iloc[0]
    ), (
        "Expected new_sample_submission length to be equal to max seq_length * len(new_test). "
        # Fixed: the message previously interpolated the whole `seq_length`
        # Series instead of the scalar first element.
        f"Got {len(new_sample_submission)} instead of "
        f"{len(new_test_without_labels) * new_test_without_labels['seq_length'].iloc[0]}."
    )

    assert len(new_train) + len(new_test) == len(old_train), (
        f"Expected the length of the new train set plus the length of the new test set to be "
        f"equal to the length of the old train set, but got {len(new_train) + len(new_test)} "
        f"instead of {len(old_train)}."
    )