dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dslighting/__init__.py +1 -1
- dslighting/core/agent.py +78 -62
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
- mlebench/README.md +39 -0
- mlebench/__init__.py +0 -0
- mlebench/cli.py +221 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
- mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
- mlebench/competitions/AI4Code/grade.py +70 -0
- mlebench/competitions/AI4Code/prepare.py +84 -0
- mlebench/competitions/AI4Code/prepare_val.py +159 -0
- mlebench/competitions/__init__.py +0 -0
- mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
- mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
- mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
- mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
- mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
- mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
- mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
- mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
- mlebench/competitions/bike-sharing-demand/grade.py +55 -0
- mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
- mlebench/competitions/billion-word-imputation/grade.py +37 -0
- mlebench/competitions/billion-word-imputation/prepare.py +107 -0
- mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
- mlebench/competitions/bms-molecular-translation/grade.py +40 -0
- mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
- mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
- mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
- mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
- mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
- mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
- mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
- mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
- mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
- mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
- mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
- mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
- mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
- mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
- mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
- mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
- mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
- mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
- mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
- mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
- mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
- mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
- mlebench/competitions/dog-breed-identification/dogs.py +124 -0
- mlebench/competitions/dog-breed-identification/grade.py +42 -0
- mlebench/competitions/dog-breed-identification/prepare.py +55 -0
- mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
- mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
- mlebench/competitions/ethanol-concentration/grade.py +23 -0
- mlebench/competitions/ethanol-concentration/prepare.py +90 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
- mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
- mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
- mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
- mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
- mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
- mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
- mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
- mlebench/competitions/google-quest-challenge/classes.py +32 -0
- mlebench/competitions/google-quest-challenge/grade.py +45 -0
- mlebench/competitions/google-quest-challenge/prepare.py +58 -0
- mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
- mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
- mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
- mlebench/competitions/handwriting/grade.py +23 -0
- mlebench/competitions/handwriting/prepare.py +179 -0
- mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
- mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
- mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
- mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
- mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
- mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
- mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
- mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
- mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
- mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
- mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
- mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
- mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
- mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
- mlebench/competitions/ili/grade.py +60 -0
- mlebench/competitions/ili/prepare.py +99 -0
- mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
- mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
- mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
- mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
- mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
- mlebench/competitions/instant-gratification/__init__.py +0 -0
- mlebench/competitions/instant-gratification/grade.py +55 -0
- mlebench/competitions/instant-gratification/prepare.py +25 -0
- mlebench/competitions/instant_gratification/__init__.py +0 -0
- mlebench/competitions/instant_gratification/grade.py +55 -0
- mlebench/competitions/instant_gratification/prepare.py +25 -0
- mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
- mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
- mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
- mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
- mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
- mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
- mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
- mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
- mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
- mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
- mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
- mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
- mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
- mlebench/competitions/leaf-classification/classes.py +101 -0
- mlebench/competitions/leaf-classification/grade.py +44 -0
- mlebench/competitions/leaf-classification/prepare.py +60 -0
- mlebench/competitions/leaf-classification/prepare_val.py +116 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
- mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
- mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
- mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
- mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
- mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
- mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
- mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
- mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
- mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
- mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
- mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
- mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
- mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
- mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
- mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
- mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
- mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
- mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
- mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
- mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
- mlebench/competitions/my-custom-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-01/prepare.py +2 -0
- mlebench/competitions/new-my-task-03/grade.py +107 -0
- mlebench/competitions/new-my-task-03/prepare.py +2 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
- mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
- mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
- mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
- mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
- mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
- mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
- mlebench/competitions/paddy-disease-classification/grade.py +35 -0
- mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
- mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
- mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
- mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
- mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
- mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
- mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
- mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
- mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
- mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e1/grade.py +52 -0
- mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
- mlebench/competitions/playground-series-s3e11/grade.py +55 -0
- mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
- mlebench/competitions/playground-series-s3e18/grade.py +39 -0
- mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
- mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
- mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e1/grade.py +52 -0
- mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
- mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
- mlebench/competitions/playground_series_s3e11/grade.py +55 -0
- mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
- mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
- mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
- mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
- mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
- mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
- mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
- mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
- mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
- mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
- mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
- mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
- mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
- mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
- mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
- mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
- mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
- mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
- mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
- mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
- mlebench/competitions/see-click-predict-fix/grade.py +66 -0
- mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
- mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
- mlebench/competitions/see_click_predict_fix/grade.py +66 -0
- mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
- mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
- mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
- mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
- mlebench/competitions/siim-covid19-detection/grade.py +194 -0
- mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
- mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
- mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
- mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
- mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
- mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
- mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
- mlebench/competitions/spaceship-titanic/grade.py +11 -0
- mlebench/competitions/spaceship-titanic/prepare.py +23 -0
- mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
- mlebench/competitions/spooky-author-identification/classes.py +1 -0
- mlebench/competitions/spooky-author-identification/grade.py +38 -0
- mlebench/competitions/spooky-author-identification/prepare.py +40 -0
- mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
- mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
- mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
- mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
- mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
- mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
- mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
- mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
- mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
- mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
- mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
- mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
- mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
- mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
- mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
- mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
- mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
- mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
- mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
- mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
- mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
- mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
- mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
- mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
- mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
- mlebench/competitions/utils.py +266 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
- mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
- mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
- mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
- mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
- mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
- mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
- mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
- mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
- mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
- mlebench/competitions/whale-categorization-playground/grade.py +41 -0
- mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
- mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
- mlebench/data.py +420 -0
- mlebench/grade.py +209 -0
- mlebench/grade_helpers.py +235 -0
- mlebench/metrics.py +75 -0
- mlebench/registry.py +332 -0
- mlebench/utils.py +346 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
- {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import zipfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from tqdm.auto import tqdm
|
|
8
|
+
|
|
9
|
+
from mlebench.utils import extract, get_logger
|
|
10
|
+
|
|
11
|
+
# Module-wide RNG with a fixed seed so the ~1% train/test split produced by
# this preparation script is reproducible across runs.
np_rng = np.random.RandomState(0)

# Shared logger for progress/status messages during preparation.
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def count_lines_in_file(file_path):
    """Return the number of lines in a text file.

    Streams the file line-by-line so arbitrarily large files (the corpus
    here has ~30M lines) can be counted without loading them into memory.
    """
    # Pin UTF-8 explicitly: relying on the platform default encoding can
    # mis-decode (or crash on) non-ASCII text in the corpus, and the
    # prepare_val.py variant of this helper already opens with UTF-8.
    with open(file_path, "r", encoding="utf-8") as file:
        return sum(1 for _line in file)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def compress_file_to_zip(src_file: Path, zip_file: Path):
    """Create a deflate-compressed zip at `zip_file` containing only `src_file`.

    The single archive entry is named after the source file's basename,
    with no directory components.
    """
    with zipfile.ZipFile(zip_file, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(src_file, arcname=src_file.name)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def remove_random_word(sentence: str) -> str:
    """
    Drop one randomly-chosen interior 'word' (whitespace-delimited token)
    from a sentence and return the result re-joined with single spaces.

    The first and last words are never removed. Punctuation counts as a
    word, since the corpus already separates it with whitespace.
    """
    tokens = sentence.split()
    # Pick an index strictly inside the sentence: never 0, never the last.
    drop_at = np_rng.randint(1, len(tokens) - 1)
    kept = tokens[:drop_at] + tokens[drop_at + 1 :]
    return " ".join(kept)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw corpus into public train/test files and a private answer file.

    Reads raw/train_v2.txt (one sentence per line) and routes ~1% of the
    sentences with more than two words into the test split: private/test.csv
    keeps the full sentence, while public/test_v2.txt gets the same sentence
    with one interior word removed (the word competitors must impute).
    Every other line is copied verbatim to the public training file. The
    public text files are then zipped to match what Kaggle distributes.
    """
    logger.info("Extracting raw / train_v2.txt.zip")
    extract(raw / "train_v2.txt.zip", raw)

    # Computed ahead of time; drives the tqdm bar and the loop guard below.
    total_lines = 30301028

    # Open everything as UTF-8 explicitly: the corpus contains non-ASCII
    # text and the platform default encoding is not guaranteed to be UTF-8.
    with (
        open(raw / "train_v2.txt", "r", encoding="utf-8") as old_train,
        open(public / "train_v2.txt", "w", encoding="utf-8") as public_train,
        open(public / "test_v2.txt", "w", encoding="utf-8") as public_test,
        open(private / "test.csv", "w", encoding="utf-8") as private_test,
    ):
        public_test.write('"id","sentence"\n')
        private_test.write('"id","sentence"\n')
        line_count = 0
        test_count = 0
        train_count = 0
        # There is one sentence per line.
        for sentence in tqdm(old_train, desc="Processing data", total=total_lines):
            # Put ~0.01 of the data in test, the rest in train, matching
            # Kaggle's original split. Sentences with only 2 words have no
            # interior word to remove -- keep them in train.
            if np_rng.uniform() <= 0.01 and len(sentence.strip().split()) > 2:
                # Drop the trailing linebreak and CSV-escape embedded quotes.
                clean_sentence = sentence.strip().replace('"', '""')
                removed_word_sentence = remove_random_word(clean_sentence)
                private_test.write(f'{test_count},"{clean_sentence}"\n')
                public_test.write(f'{test_count},"{removed_word_sentence}"\n')
                test_count += 1
            else:
                public_train.write(sentence)
                train_count += 1
            line_count += 1
            if line_count >= total_lines:
                break

    # The public files get compressed below (to match what's on kaggle.com),
    # so copy our sample submission to private first so we keep access to it.
    shutil.copy(public / "test_v2.txt", private / "sample_submission.csv")

    # Compress the public files.
    logger.info("Compressing train_v2.txt")
    compress_file_to_zip(public / "train_v2.txt", public / "train_v2.txt.zip")
    logger.info("Compressing test_v2.txt")
    compress_file_to_zip(public / "test_v2.txt", public / "test_v2.txt.zip")
    # Remove the uncompressed originals.
    (public / "train_v2.txt").unlink()
    (public / "test_v2.txt").unlink()

    # Checks
    assert not (public / "train_v2.txt").exists(), "public / 'train_v2.txt' should not exist"
    assert (public / "train_v2.txt.zip").exists(), "public / 'train_v2.txt.zip' should exist"
    assert not (public / "test_v2.txt").exists(), "public / 'test_v2.txt' should not exist"
    assert (public / "test_v2.txt.zip").exists(), "public / 'test_v2.txt.zip' should exist"

    private_test_line_count = count_lines_in_file(private / "test.csv")
    assert (
        # minus 1 to exclude the CSV header line
        private_test_line_count - 1
        == test_count
    ), "private / 'test.csv' has incorrect number of lines"
    assert (
        count_lines_in_file(private / "sample_submission.csv") == private_test_line_count
    ), "private / 'sample_submission.csv' has incorrect number of lines"
    assert (
        test_count + train_count == total_lines
    ), "Expected the number of test samples and train samples to sum to the total number of samples in the original train file"
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import zipfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from tqdm.auto import tqdm
|
|
8
|
+
|
|
9
|
+
from mlebench.utils import extract, get_logger
|
|
10
|
+
|
|
11
|
+
# Module-wide RNG with a fixed seed so the validation split produced by
# this script is reproducible across runs.
np_rng = np.random.RandomState(0)

# Shared logger for progress/status messages during preparation.
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def count_lines_in_file(file_path):
    """Return how many lines the UTF-8 text file at `file_path` contains.

    Iterates the file lazily so very large files are counted without
    being read into memory.
    """
    total = 0
    with open(file_path, "r", encoding="utf-8") as handle:
        for _ in handle:
            total += 1
    return total
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def compress_file_to_zip(src_file: Path, zip_file: Path):
    """Write `src_file` into a fresh zip archive at `zip_file`.

    Uses deflate compression; the archive holds a single entry named
    after the source file's basename.
    """
    with zipfile.ZipFile(zip_file, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.write(src_file, arcname=src_file.name)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def remove_random_word(sentence: str) -> str:
    """
    Return `sentence` with one randomly-selected interior word deleted.

    A 'word' is any whitespace-delimited token; punctuation counts, since
    the corpus pre-separates it with whitespace. The first and last tokens
    are never chosen, and the result is re-joined with single spaces.
    """
    parts = sentence.split()
    # Draw an interior index: at least 1, strictly before the last token.
    victim = np_rng.randint(1, len(parts) - 1)
    return " ".join(parts[:victim] + parts[victim + 1 :])
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _split_and_process_data(
    input_file: Path,
    output_train_file: Path,
    output_public_test_file: Path,
    output_private_test_file: Path,
    total_lines: int,
) -> tuple[int, int]:
    """
    Helper function to perform the core data splitting and processing logic.

    Reads from an input file and splits it into a train and test set based on a
    probabilistic condition, writing them to the specified output files.

    Args:
        input_file: Source text file, one sentence per line.
        output_train_file: Destination for train sentences (written verbatim,
            no header).
        output_public_test_file: Destination CSV of test sentences with one
            random word removed; header is '"id","sentence"'.
        output_private_test_file: Destination CSV of the original (unmodified)
            test sentences; header is '"id","sentence"'.
        total_lines: Expected line count of input_file; used only for the
            tqdm progress bar total.

    Returns:
        (train_count, test_count) — sentences written to the train file and to
        each test file respectively. Counts exclude the CSV header lines.

    NOTE(review): one np_rng.uniform() draw is consumed per input line
    (the uniform call is the first operand of the `and`), so the random
    stream — and therefore the split — is reproducible only if the
    condition's evaluation order is left unchanged.
    """
    with (
        open(input_file, "r", encoding="utf-8") as old_train,
        open(output_train_file, "w", encoding="utf-8") as public_train,
        open(output_public_test_file, "w", encoding="utf-8") as public_test,
        open(output_private_test_file, "w", encoding="utf-8") as private_test,
    ):
        public_test.write('"id","sentence"\n')
        private_test.write('"id","sentence"\n')
        test_count = 0
        train_count = 0
        # there is one sentence per line
        for sentence in tqdm(old_train, desc=f"Processing {input_file.name}", total=total_lines):
            # we will put ~0.01 of the data in test, the rest in train, matching kaggle's original split
            # some sentences only have 2 words, so can't remove a word -- keep them in train
            if np_rng.uniform() <= 0.01 and len(sentence.strip().split()) > 2:
                # get rid of linebreak and escape quotes
                sentence_clean = sentence.strip().replace('"', '""')
                removed_word_sentence = remove_random_word(sentence_clean)
                private_test.write(f'{test_count},"{sentence_clean}"\n')
                public_test.write(f'{test_count},"{removed_word_sentence}"\n')
                test_count += 1
            else:
                # train sentences are passed through untouched (newline included)
                public_train.write(sentence)
                train_count += 1
    return train_count, test_count
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the competition's public/private splits from the raw corpus, plus a
    parallel validation split in sibling `public_val`/`private_val` directories.

    Pipeline: extract the raw corpus zip; split it once into train/test
    (public/private); split that new train set a second time to produce the
    validation directories; then compress the public text files (to match the
    layout on kaggle.com) and verify line counts.

    Args:
        raw: Directory holding the downloaded competition data.
        public: Output directory for competitor-visible files.
        private: Output directory for grading-only files.
    """
    logger.info("Extracting raw / train_v2.txt.zip")
    extract(raw / "train_v2.txt.zip", raw)

    # Define and create the new validation directories
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    public_val.mkdir(exist_ok=True, parents=True)
    private_val.mkdir(exist_ok=True, parents=True)

    # --- 1. Original Split (raw -> train/test) ---
    logger.info("--- Generating original train/test split ---")
    # computed this ahead of time
    total_lines = 30301028
    original_train_count, original_test_count = _split_and_process_data(
        input_file=raw / "train_v2.txt",
        output_train_file=public / "train_v2.txt",
        output_public_test_file=public / "test_v2.txt",
        output_private_test_file=private / "test.csv",
        total_lines=total_lines,
    )
    assert (
        original_train_count + original_test_count == total_lines
    ), "Sum of train and test samples must equal total samples for original split."

    # --- 2. Second Split (train -> train_val/test_val) ---
    logger.info("--- Generating validation split from the new training set ---")
    # The input for the second split is the training set from the first split.
    val_split_input_file = public / "train_v2.txt"
    val_split_total_lines = original_train_count
    val_train_count, val_test_count = _split_and_process_data(
        input_file=val_split_input_file,
        output_train_file=public_val / "train_v2.txt",
        output_public_test_file=public_val / "test_v2.txt",
        output_private_test_file=private_val / "test.csv",
        total_lines=val_split_total_lines,
    )
    assert (
        val_train_count + val_test_count == val_split_total_lines
    ), "Sum of train_val and test_val samples must equal total samples for validation split."

    # --- 3. Process and Compress Original public/private directories ---
    logger.info("--- Compressing and cleaning up original public/private directories ---")
    # we will be compressing the public files (to match what's on kaggle.com)
    # so copy our sample submission to private so we have access to it
    shutil.copy(public / "test_v2.txt", private / "sample_submission.csv")

    # compress the public files
    logger.info("Compressing train_v2.txt")
    compress_file_to_zip(public / "train_v2.txt", public / "train_v2.txt.zip")
    logger.info("Compressing test_v2.txt")
    compress_file_to_zip(public / "test_v2.txt", public / "test_v2.txt.zip")
    # remove the original files
    (public / "train_v2.txt").unlink()
    (public / "test_v2.txt").unlink()

    # --- 4. Process and Compress New public_val/private_val directories ---
    logger.info("--- Compressing and cleaning up validation public_val/private_val directories ---")
    # Replicate the process for the validation set
    shutil.copy(public_val / "test_v2.txt", private_val / "sample_submission.csv")

    # compress the public_val files
    logger.info("Compressing validation train_v2.txt")
    compress_file_to_zip(public_val / "train_v2.txt", public_val / "train_v2.txt.zip")
    logger.info("Compressing validation test_v2.txt")
    compress_file_to_zip(public_val / "test_v2.txt", public_val / "test_v2.txt.zip")
    # remove the original files
    (public_val / "train_v2.txt").unlink()
    (public_val / "test_v2.txt").unlink()

    # --- 5. Final Checks ---
    logger.info("--- Running final checks ---")
    # Original Checks
    assert not (public / "train_v2.txt").exists(), "public / 'train_v2.txt' should not exist"
    assert (public / "train_v2.txt.zip").exists(), "public / 'train_v2.txt.zip' should exist"
    assert not (public / "test_v2.txt").exists(), "public / 'test_v2.txt' should not exist"
    assert (public / "test_v2.txt.zip").exists(), "public / 'test_v2.txt.zip' should exist"

    # the test CSVs carry a one-line header, hence the `- 1` below
    private_test_line_count = count_lines_in_file(private / "test.csv")
    assert (
        private_test_line_count - 1 == original_test_count
    ), "private / 'test.csv' has incorrect number of lines"
    assert (
        count_lines_in_file(private / "sample_submission.csv") == private_test_line_count
    ), "private / 'sample_submission.csv' has incorrect number of lines"

    # New Checks for Validation Set
    assert not (public_val / "train_v2.txt").exists(), "public_val / 'train_v2.txt' should not exist"
    assert (public_val / "train_v2.txt.zip").exists(), "public_val / 'train_v2.txt.zip' should exist"
    assert not (public_val / "test_v2.txt").exists(), "public_val / 'test_v2.txt' should not exist"
    assert (public_val / "test_v2.txt.zip").exists(), "public_val / 'test_v2.txt.zip' should exist"

    private_val_test_line_count = count_lines_in_file(private_val / "test.csv")
    assert (
        private_val_test_line_count - 1 == val_test_count
    ), "private_val / 'test.csv' has incorrect number of lines"
    assert (
        count_lines_in_file(private_val / "sample_submission.csv") == private_val_test_line_count
    ), "private_val / 'sample_submission.csv' has incorrect number of lines"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from Levenshtein import distance
|
|
4
|
+
|
|
5
|
+
from mlebench.grade_helpers import InvalidSubmissionError
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def edit_distance_array(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Mean Levenshtein edit distance between paired true/predicted strings."""
    pairwise = [distance(truth, pred) for truth, pred in zip(y_true, y_pred)]
    return np.mean(pairwise)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate a submission against the answers and return the aligned InChI columns.

    Both frames are sorted by `image_id` so rows pair up positionally, and the
    InChI values are coerced to strings.

    Returns:
        (answers_inchi, submission_inchi) — two string Series in matching order.

    Raises:
        InvalidSubmissionError: wrong length, missing required column, or
            mismatched ids between submission and answers.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission and answers must have the same length")

    # Required submission columns, each with its exact error message.
    for column, message in (
        ("image_id", "Submission must have an 'image_id' column"),
        ("InChI", "Submission must have a 'InChI' column"),
    ):
        if column not in submission.columns:
            raise InvalidSubmissionError(message)

    assert "image_id" in answers.columns, "Answers must have 'image_id' column"
    assert "InChI" in answers.columns, "Answers must have 'InChI' column"

    # sort_values returns new frames, so callers' frames are never mutated
    submission = submission.sort_values("image_id")
    answers = answers.sort_values("image_id")

    ids_match = (submission["image_id"].values == answers["image_id"].values).all()
    if not ids_match:
        raise InvalidSubmissionError("Submission and answers must have the same ids")

    # Convert to strings
    submission["InChI"] = submission["InChI"].astype(str)
    answers["InChI"] = answers["InChI"].astype(str)

    return answers["InChI"], submission["InChI"]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission as the mean edit distance between its InChI strings
    and the ground-truth InChI strings (lower is better)."""
    true_inchis, predicted_inchis = prepare_for_metric(submission, answers)
    return edit_distance_array(true_inchis, predicted_inchis)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def make_image_subpath(image_id: str) -> Path:
    """Return the relative image path `<c0>/<c1>/<c2>/<image_id>.png`, where
    c0..c2 are the first three characters of the image_id (the dataset's
    triple-nested directory layout)."""
    first, second, third = image_id[0], image_id[1], image_id[2]
    return Path(first, second, third, f"{image_id}.png")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _copy_images(df, src_root: Path, dst_root: Path, desc: str):
    """Copy every image listed in df's 'image_id' column from src_root into
    dst_root, preserving the triple-nested subdirectory layout."""
    for _, row in tqdm(df.iterrows(), total=len(df), desc=desc):
        # compute the nested subpath once and reuse it for src and dst
        subpath = make_image_subpath(row["image_id"])
        dst = dst_root / subpath
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(src=src_root / subpath, dst=dst)


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Writes train labels and images to `public`, held-out test labels to
    `private`, copies the held-out images into `public/test`, and emits a
    sample submission with a constant placeholder InChI.

    Args:
        raw: Directory holding the downloaded competition data.
        public: Output directory for competitor-visible files.
        private: Output directory for grading-only files.
    """
    # Load train data
    old_train = pd.read_csv(raw / "train_labels.csv")

    # Create train, test from train split (80/20, fixed seed for reproducibility)
    new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)
    new_train.to_csv(public / "train_labels.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Copy image files for both splits; all images come from the raw train set
    _copy_images(new_train, raw / "train", public / "train", "Copying train images")
    _copy_images(new_test, raw / "train", public / "test", "Copying test images")

    # Create sample submission with a constant placeholder InChI (water)
    sample_submission = new_test.copy()
    sample_submission["InChI"] = "InChI=1S/H2O/h1H2"
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Copy other files in the dataset (no modification needed)
    shutil.copyfile(src=raw / "extra_approved_InChIs.csv", dst=public / "extra_approved_InChIs.csv")

    # Checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total images in new_train ({len(new_train)}) and new_test ({len(new_test)})"
    assert len(list((public / "train").glob("**/*.png"))) == len(
        new_train
    ), f"Expected {len(new_train)} train images in public/train, but got {len(list((public / 'train').glob('**/*.png')))}"
    assert len(list((public / "test").glob("**/*.png"))) == len(
        new_test
    ), f"Expected {len(new_test)} test images in public/test, but got {len(list((public / 'test').glob('**/*.png')))}"

    assert "image_id" in sample_submission.columns, "Sample submission must have 'image_id' column"
    assert "InChI" in sample_submission.columns, "Sample submission must have 'InChI' column"
    assert len(sample_submission) == len(
        new_test
    ), f"Expected {len(new_test)} images in sample submission, but got {len(sample_submission)}"
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def make_image_subpath(image_id: str) -> Path:
    """Build the dataset's nested relative image path: the first three
    characters of `image_id` become three directory levels, followed by
    `<image_id>.png`."""
    level_1, level_2, level_3 = image_id[:3]
    return Path(level_1, level_2, level_3, f"{image_id}.png")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _create_split_files(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    raw_images_path: Path,
    public_path: Path,
    private_path: Path,
):
    """
    Helper function to generate the directory structure and files for a given split.
    This function populates the public and private directories with train/test data,
    images, and a sample submission file.

    Args:
        train_df: Labels for the training portion; written to
            `public_path/train_labels.csv`, images copied under `public_path/train`.
        test_df: Labels for the held-out portion; written to
            `private_path/test.csv`, images copied under `public_path/test`.
        raw_images_path: Root directory of the source images (nested layout
            produced by `make_image_subpath`).
        public_path: Competitor-visible output directory.
        private_path: Grading-only output directory.
    """
    # Create output directories
    # NOTE(review): mkdir is called without parents=True, so the parent of each
    # output directory must already exist — confirm against callers.
    public_path.mkdir(exist_ok=True)
    private_path.mkdir(exist_ok=True)

    # Save dataframes
    train_df.to_csv(public_path / "train_labels.csv", index=False)
    test_df.to_csv(private_path / "test.csv", index=False)

    # Copy train files
    desc_prefix = public_path.name
    for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc=f"Copying {desc_prefix} train images"):
        image_id = row["image_id"]
        src = raw_images_path / make_image_subpath(image_id)
        dst = public_path / "train" / make_image_subpath(image_id)
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(src=src, dst=dst)

    # Copy test files
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=f"Copying {desc_prefix} test images"):
        image_id = row["image_id"]
        src = raw_images_path / make_image_subpath(image_id)
        dst = public_path / "test" / make_image_subpath(image_id)
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(src=src, dst=dst)

    # Create sample submission
    # Constant placeholder prediction for every test row (InChI of water).
    sample_submission = test_df.copy()
    sample_submission["InChI"] = "InChI=1S/H2O/h1H2"
    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    # Checks
    assert len(list((public_path / "train").glob("**/*.png"))) == len(
        train_df
    ), f"Expected {len(train_df)} train images in {public_path}/train, but got {len(list((public_path / 'train').glob('**/*.png')))}"
    assert len(list((public_path / "test").glob("**/*.png"))) == len(
        test_df
    ), f"Expected {len(test_df)} test images in {public_path}/test, but got {len(list((public_path / 'test').glob('**/*.png')))}"

    assert "image_id" in sample_submission.columns, "Sample submission must have 'image_id' column"
    assert "InChI" in sample_submission.columns, "Sample submission must have 'InChI' column"
    assert len(sample_submission) == len(
        test_df
    ), f"Expected {len(test_df)} images in sample submission, but got {len(sample_submission)}"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split in parallel public_val and private_val directories.

    Args:
        raw: Directory holding the downloaded competition data.
        public: Output directory for competitor-visible files.
        private: Output directory for grading-only files.
    """
    # Load train data
    old_train = pd.read_csv(raw / "train_labels.csv")

    # ---- 1. Create the original train/test split ----
    # This split is for the main competition test set.
    new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)

    # Generate the files for the original public and private directories
    _create_split_files(
        train_df=new_train,
        test_df=new_test,
        raw_images_path=raw / "train",
        public_path=public,
        private_path=private,
    )

    # ---- 2. Create the new validation split ----
    # This second split is performed on the `new_train` set created above.
    # We want the new `test_val` to be the same size as the original `new_test`.
    # Original test size = 0.2 * total. Original train size = 0.8 * total.
    # New test size relative to train set = 0.2 / 0.8 = 0.25
    train_val, test_val = train_test_split(new_train, test_size=0.25, random_state=0)

    # Define the new parallel directories for the validation set
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Generate the files for the new validation directories
    _create_split_files(
        train_df=train_val,
        test_df=test_val,
        raw_images_path=raw / "train",
        public_path=public_val,
        private_path=private_val,
    )

    # ---- 3. Copy shared files and run final checks ----

    # Copy other files into both public and public_val directories
    shutil.copyfile(src=raw / "extra_approved_InChIs.csv", dst=public / "extra_approved_InChIs.csv")
    shutil.copyfile(src=raw / "extra_approved_InChIs.csv", dst=public_val / "extra_approved_InChIs.csv")

    # Original split checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total images in new_train ({len(new_train)}) and new_test ({len(new_test)})"

    # New validation split checks
    assert len(train_val) + len(test_val) == len(
        new_train
    ), f"Expected {len(new_train)} total images in train_val ({len(train_val)}) and test_val ({len(test_val)})"
    # Ensure the size of the validation test set is approx. the same as the original test set
    # (exact equality can be off by one because both splits round element counts)
    assert abs(len(test_val) - len(new_test)) <= 1, "Validation test set size should match original test set size"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from sklearn.metrics import accuracy_score
|
|
3
|
+
|
|
4
|
+
from mlebench.competitions.utils import prepare_for_accuracy_metric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission as plain classification accuracy on the 'label'
    column, with rows matched between submission and answers by 'image_id'."""
    metric_kwargs = prepare_for_accuracy_metric(
        submission=submission, answers=answers, target_column="label", id_column="image_id"
    )
    return accuracy_score(**metric_kwargs)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from sklearn.model_selection import train_test_split
|
|
5
|
+
from tqdm.auto import tqdm
|
|
6
|
+
|
|
7
|
+
from mlebench.competitions.utils import get_ids_from_tf_records
|
|
8
|
+
from mlebench.utils import read_csv
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def prepare(raw: Path, public: Path, private: Path):
    """
    Split the raw cassava training data into new train/test sets and lay out
    the public (competitor-visible) and private (grading) directories.

    The split is done at the TFRecord-file level; image-level test membership
    is then derived from the ids contained in the test TFRecords, so the CSV
    split stays consistent with the TFRecord split.

    Args:
        raw: Directory holding the downloaded competition data.
        public: Output directory for competitor-visible files.
        private: Output directory for grading-only files.
    """

    # need to split based on the TFRecord files, since not mentioned in the CSVs
    tfrecord_files = [
        path
        for path in sorted((raw / "train_tfrecords").iterdir())
        if path.is_file() and path.suffix == ".tfrec"
    ]

    # In the original there are 21397 train samples and they say test has ~15000 test samples, which is ~ 0.4/0.6 test/train split
    # We use 0.1 ratio to avoid removing too many samples from train
    new_train_tfrecords, new_test_tfrecords = train_test_split(
        tfrecord_files, test_size=0.1, random_state=0
    )

    # parse the IDs from the test tf records
    test_ids = []
    for path in new_test_tfrecords:
        test_ids.extend(get_ids_from_tf_records(path))

    old_train = read_csv(raw / "train.csv")

    # tag every row, then flip the rows whose image_id appears in a test TFRecord
    old_train["split"] = "train"
    old_train.loc[old_train["image_id"].isin(test_ids), "split"] = "test"

    new_train = old_train[old_train["split"] == "train"].drop(columns=["split"])
    new_test = old_train[old_train["split"] == "test"].drop(columns=["split"])

    # NOTE(review): constant placeholder label for the sample submission;
    # presumably maps to a class in label_num_to_disease_map.json — confirm.
    sample_submission = new_test.copy()
    sample_submission["label"] = 4

    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Copy TFRecords, renaming them to a sequential ld_train/ld_test scheme.
    # NOTE(review): `path.stem.split("-")[1]` assumes the raw filenames contain
    # exactly one '-' separating a prefix from a count suffix — confirm against
    # the raw data layout.
    (public / "train_tfrecords").mkdir(parents=True, exist_ok=True)
    for i, path in tqdm(
        enumerate(new_train_tfrecords),
        desc="Copying Train TFRecords",
        total=len(new_train_tfrecords),
    ):
        length = path.stem.split("-")[1]
        new_name = f"ld_train{i:02d}-{length}.tfrec"

        shutil.copy(path, public / "train_tfrecords" / new_name)

    (public / "test_tfrecords").mkdir(parents=True, exist_ok=True)
    for i, path in tqdm(
        enumerate(new_test_tfrecords), desc="Copying Test TFRecords", total=len(new_test_tfrecords)
    ):
        length = path.stem.split("-")[1]
        new_name = f"ld_test{i:02d}-{length}.tfrec"

        shutil.copy(path, public / "test_tfrecords" / new_name)

    # Copy the JPEG/PNG images for each split; both come from raw train_images
    (public / "train_images").mkdir(parents=True, exist_ok=True)
    for image_id in tqdm(new_train["image_id"], desc="Copying Train Images", total=len(new_train)):
        shutil.copy(raw / "train_images" / image_id, public / "train_images")

    (public / "test_images").mkdir(parents=True, exist_ok=True)
    for image_id in tqdm(new_test["image_id"], desc="Copying Test Images", total=len(new_test)):
        shutil.copy(raw / "train_images" / image_id, public / "test_images")

    shutil.copy(raw / "label_num_to_disease_map.json", public / "label_num_to_disease_map.json")

    # checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Expected new train and new test lengths to sum to old train length"
    assert len(sample_submission) == len(
        new_test
    ), "Expected sample submission length to be equal to new test length"

    assert len(new_train) == sum(
        1 for _ in (public / "train_images").iterdir()
    ), "Mismatch in number of expected train images copied"
    assert len(new_test) == sum(
        1 for _ in (public / "test_images").iterdir()
    ), "Mismatch in number of expected test images copied"

    assert len(new_train_tfrecords) == sum(
        1 for _ in (public / "train_tfrecords").iterdir()
    ), "Mismatch in number of expected train TFRecords copied"
    assert len(new_test_tfrecords) == sum(
        1 for _ in (public / "test_tfrecords").iterdir()
    ), "Mismatch in number of expected test TFRecords copied"

    assert new_train.columns.tolist() == [
        "image_id",
        "label",
    ], "Expected new train columns to be ['image_id', 'label']"
    assert new_test.columns.tolist() == [
        "image_id",
        "label",
    ], "Expected new test columns to be ['image_id', 'label']"
    assert sample_submission.columns.tolist() == [
        "image_id",
        "label",
    ], "Expected sample submission columns to be ['image_id', 'label']"

    assert set(new_train["image_id"]).isdisjoint(
        new_test["image_id"]
    ), "Expected train and test image IDs to be disjoint"
|