PyPI - dslighting - Versions diffs - 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl - Mend

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (352) hide show

mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py ADDED Viewed

@@ -0,0 +1,719 @@
+import json
+import shutil
+from pathlib import Path
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from mlebench.utils import get_logger
+logger = get_logger(__name__)
+def prepare(raw: Path, public: Path, private: Path):
+    """
+    Splits the raw train data into new train/test splits.
+    There isn't detailed documentation on how the train/test split of the raw data was made, but according to
+    this post https://www.kaggle.com/competitions/3d-object-detection-for-autonomous-vehicles/discussion/133895:
+    - It consists of raw camera and lidar data, plus an HD semantic map.
+    - 180 scenes, 25s each
+    - 638,000 2D and 3D annotations over 18,000 objects
+    - The dataset had nine classes with a large class imbalance.
+    The original train/test split:
+    - Train set 40% (train.csv has 22680 rows)
+    - Test set: public 30%, private 30% (sample submission has 27468 rows)
+    Since there are 180 scenes and >>180 samples, the samples are not independent: samples within a scene are highly correlated,
+    so we can't randomly split samples at an individual level. Instead, we split by scenes.
+    In practice, scenes are identified by "log tokens" in the data, corresponding to a single log file (listed in log.json).
+    (We also verified that the log IDs in the raw train/test splits are disjoint, which supports this choice.)
+    ## What's in the dataset?
+    ```
+    $ ls raw/
+    sample_submission.csv   test_data/      test_images/    test_lidar/     test_maps/
+    train.csv               train_data/     train_images/   train_lidar/    train_maps/
+    $ ls raw/train_data/
+    attribute.json          category.json  instance.json  map.json     sample_annotation.json  scene.json   visibility.json
+    calibrated_sensor.json  ego_pose.json  log.json       sample.json  sample_data.json        sensor.json
+    $ ls raw/train_images/ | wc -l
+    158757
+    $ ls raw/train_lidar | wc -l
+    30744
+    $ ls raw/test_data/
+    attribute.json  calibrated_sensor.json  category.json  ego_pose.json  log.json  map.json  sample.json  sample_data.json  scene.json  sensor.json  visibility.json
+    # test_data/ omits the instance.json and sample_annotation.json files
+    $ ls raw/test_images/ | wc -l
+    192276
+    $ ls raw/test_lidar/ | wc -l
+    27468
+    ```
+    """
+    DEV_MODE = False
+    (public / "test_data").mkdir(parents=True, exist_ok=True)
+    (public / "train_data").mkdir(parents=True, exist_ok=True)
+    private.mkdir(parents=True, exist_ok=True)
+    """
+    log.json
+    """
+    # Start the split at the log level, and the rest follows
+    with open(raw / "train_data" / "log.json") as f:
+        logs = json.load(f)
+    log_ids = [log["token"] for log in logs]
+    assert len(log_ids) == len(set(log_ids)), "Log IDs must be unique"
+    logger.info(f"Found {len(log_ids)} logs")
+    # Split the logs into train/test
+    # previous ratio had 180 train logs and 218 test logs; we'll split the 180 train logs into 80% new train and 20% new test
+    # (trying not to reduce the availability of training data, but need a large-ish set since there are 9 object classes)
+    train_log_ids, test_logs_ids = train_test_split(log_ids, test_size=0.2, random_state=0)
+    logger.info(f"Train logs: {len(train_log_ids)}, Test logs: {len(test_logs_ids)}")
+    with open(public / "train_data" / "log.json", "w") as f:
+        json.dump([log for log in logs if log["token"] in train_log_ids], f)
+    with open(public / "test_data" / "log.json", "w") as f:
+        json.dump([log for log in logs if log["token"] in test_logs_ids], f)
+    """
+    sample.json
+    """
+    # Create train/test sample splits following the log split
+    with open(raw / "train_data" / "sample.json") as f:
+        samples = json.load(f)
+    logger.info(f"Found {len(samples)} samples")
+    train_samples = [sample for sample in samples if sample["scene_token"] in train_log_ids]
+    test_samples = [sample for sample in samples if sample["scene_token"] in test_logs_ids]
+    logger.info(f"New train samples: {len(train_samples)}, new test samples: {len(test_samples)}")
+    assert len(train_samples) + len(test_samples) == len(
+        samples
+    ), f"New train ({len(train_samples)}) and test ({len(test_samples)}) samples must cover all samples ({len(samples)})"
+    with open(public / "train_data" / "sample.json", "w") as f:
+        json.dump(train_samples, f)
+    with open(public / "test_data" / "sample.json", "w") as f:
+        json.dump(test_samples, f)
+    """
+    Make train.csv
+    """
+    # train.csv has columns `Id` and `PredictionString`, with `PredictionString` in the following format: `center_x center_y center_z width length height yaw class_name`
+    with open(raw / "train.csv") as f:
+        train_df = pd.read_csv(f)
+    logger.info(f"Found {len(train_df)} train rows")
+    new_train_df = train_df[train_df["Id"].isin([sample["token"] for sample in train_samples])]
+    new_test_df = train_df[train_df["Id"].isin([sample["token"] for sample in test_samples])]
+    logger.info(f"Train rows: {len(new_train_df)}, Test rows: {len(new_test_df)}")
+    assert len(new_train_df) + len(new_test_df) == len(
+        train_df
+    ), f"New train ({len(new_train_df)}) and test ({len(new_test_df)}) annotations must cover all annotations ({len(train_df)})"
+    assert len(new_train_df) == len(
+        train_samples
+    ), f"New train rows ({len(new_train_df)}) must match train samples ({len(train_samples)})"
+    assert len(new_test_df) == len(
+        test_samples
+    ), f"New test rows ({len(new_test_df)}) must match test samples ({len(test_samples)})"
+    new_train_df.to_csv(public / "train.csv", index=False)
+    """
+    Make private test.csv
+    """
+    # test.csv is basically new_test_df, but the "PredictionString" column needs to have a "confidence" value added
+    # so the format becomes: `confidence center_x center_y center_z width length height yaw class_name`
+    def add_confidence(pred_string):
+        pred_tokens = pred_string.split(" ")
+        assert (
+            len(pred_tokens) % 8 == 0
+        ), f"Expected 8 tokens per object, but got {len(pred_tokens)}"
+        new_pred_tokens = []
+        for i in range(0, len(pred_tokens), 8):
+            new_pred_tokens.extend(["1.0"] + pred_tokens[i : i + 8])
+        return " ".join(new_pred_tokens)
+    # Apply the function to the entire 'PredictionString' column
+    new_test_df["PredictionString"] = new_test_df["PredictionString"].apply(add_confidence)
+    new_test_df.to_csv(private / "test.csv", index=False)
+    """
+    Make sample_submission.csv
+    """
+    # sample submission is the same as test.csv but with empty prediction strings
+    sample_submission = new_test_df[["Id"]].copy()
+    sample_submission["PredictionString"] = ""
+    sample_submission.to_csv(public / "sample_submission.csv", index=False)
+    """
+    Split sample_data.json
+    """
+    # sample_data.json is a list of all images and lidar files, and each entry has a `sample_token` field that identifies which sample it belongs to
+    with open(raw / "train_data" / "sample_data.json") as f:
+        sample_data = json.load(f)
+    logger.info(f"Found {len(sample_data)} train sample data")
+    new_train_sample_data, new_test_sample_data = [], []
+    for sample_datum in sample_data:
+        sample_token = sample_datum["sample_token"]
+        if sample_token in new_train_df["Id"].values:
+            new_train_sample_data.append(sample_datum)
+        elif sample_token in new_test_df["Id"].values:
+            new_test_sample_data.append(sample_datum)
+        else:
+            raise ValueError(
+                f"Sample data token {sample_token} doesn't belong to either new train or new test set"
+            )
+    logger.info(
+        f"New train sample data: {len(new_train_sample_data)}, new test sample data: {len(new_test_sample_data)}"
+    )
+    assert len(new_train_sample_data) + len(new_test_sample_data) == len(
+        sample_data
+    ), f"New train ({len(new_train_sample_data)}) and test ({len(new_test_sample_data)}) sample data must cover all sample data ({len(sample_data)})"
+    with open(public / "train_data" / "sample_data.json", "w") as f:
+        json.dump(new_train_sample_data, f)
+    with open(public / "test_data" / "sample_data.json", "w") as f:
+        json.dump(new_test_sample_data, f)
+    """
+    Copy over maps
+    """
+    # There is only one map which is identical in both raw train/test so no need to modify, just copy over
+    # $ diff raw/test_maps/map_raster_palo_alto.png raw/train_maps/map_raster_palo_alto.png # -> no output
+    (public / "test_maps").mkdir(parents=True, exist_ok=True)
+    (public / "train_maps").mkdir(parents=True, exist_ok=True)
+    shutil.copyfile(
+        src=raw / "train_maps" / "map_raster_palo_alto.png",
+        dst=public / "test_maps" / "map_raster_palo_alto.png",
+    )
+    shutil.copyfile(
+        src=raw / "train_maps" / "map_raster_palo_alto.png",
+        dst=public / "train_maps" / "map_raster_palo_alto.png",
+    )
+    """
+    Copy attribute.json
+    """
+    # attribute.json is a list of object states, there are 18 attributes in the train set including "object_action_walking", "object_action_parked", etc.
+    # The raw test set has an attribute.json file drawn from the same set of attributes, but only has 17 attributes (whichever attributes
+    # are present in the test set.) For simplicity, we'll just copy the full list of 18 attributes in both the new train and new test sets.
+    shutil.copyfile(
+        src=raw / "train_data" / "attribute.json", dst=public / "train_data" / "attribute.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "attribute.json", dst=public / "test_data" / "attribute.json"
+    )
+    """
+    Split calibrated_sensor.json
+    """
+    # calibrated_sensor.json is a list of sensor calibration parameters corresponding to the setup of the sensor at the time each sample was taken.
+    # This file will be split following the sample_data split (each sample_datum has a `calibrated_sensor_token`)
+    with open(raw / "train_data" / "calibrated_sensor.json") as f:
+        calibrated_sensors = json.load(f)
+    calibration_by_calibrated_sensor_token = {cal["token"]: cal for cal in calibrated_sensors}
+    new_train_calibrated_sensors, new_test_calibrated_sensors = [], []
+    for sample_datum in new_train_sample_data:
+        calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
+        if calibrated_sensor_token in [cal["token"] for cal in new_train_calibrated_sensors]:
+            continue  # Each calibrated sensor is used by multiple samples, we don't need to add it multiple times
+        new_train_calibrated_sensors.append(
+            calibration_by_calibrated_sensor_token[calibrated_sensor_token]
+        )
+    for sample_datum in new_test_sample_data:
+        calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
+        if calibrated_sensor_token in [cal["token"] for cal in new_test_calibrated_sensors]:
+            continue  # Each calibrated sensor is used by multiple samples, we don't need to add it multiple times
+        new_test_calibrated_sensors.append(
+            calibration_by_calibrated_sensor_token[calibrated_sensor_token]
+        )
+    logger.info(
+        f"New train calibrated sensors: {len(new_train_calibrated_sensors)}, new test calibrated sensors: {len(new_test_calibrated_sensors)}"
+    )
+    assert len(
+        set([cal["token"] for cal in new_train_calibrated_sensors + new_test_calibrated_sensors])
+    ) == len(
+        calibrated_sensors
+    ), f"New train and test calibrated sensors must cover all calibrated sensors ({len(calibrated_sensors)})"
+    with open(public / "train_data" / "calibrated_sensor.json", "w") as f:
+        json.dump(new_train_calibrated_sensors, f)
+    with open(public / "test_data" / "calibrated_sensor.json", "w") as f:
+        json.dump(new_test_calibrated_sensors, f)
+    """
+    Copy category.json
+    """
+    # category.json is the list of 9 object classes, and is the same for train/test
+    shutil.copyfile(
+        src=raw / "train_data" / "category.json", dst=public / "train_data" / "category.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "category.json", dst=public / "test_data" / "category.json"
+    )
+    """
+    Split ego_pose.json
+    """
+    # ego_pose.json is a list of vehicle poses, and will be split following the sample_data split
+    with open(raw / "train_data" / "ego_pose.json") as f:
+        ego_poses = json.load(f)
+    ego_pose_by_ego_pose_token = {ego["token"]: ego for ego in ego_poses}
+    new_train_ego_poses, new_test_ego_poses = [], []
+    for sample_datum in new_train_sample_data:
+        ego_pose_token = sample_datum["ego_pose_token"]
+        new_train_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
+    for sample_datum in new_test_sample_data:
+        ego_pose_token = sample_datum["ego_pose_token"]
+        new_test_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
+    logger.info(
+        f"New train ego poses: {len(new_train_ego_poses)}, new test ego poses: {len(new_test_ego_poses)}"
+    )
+    assert len(set([ego["token"] for ego in new_train_ego_poses + new_test_ego_poses])) == len(
+        ego_poses
+    ), f"New train and test ego poses must cover all ego poses ({len(ego_poses)})"
+    with open(public / "train_data" / "ego_pose.json", "w") as f:
+        json.dump(new_train_ego_poses, f)
+    with open(public / "test_data" / "ego_pose.json", "w") as f:
+        json.dump(new_test_ego_poses, f)
+    """
+    Create map.json
+    """
+    # map.json is the list of maps, and a list of logs that used those maps. But in the raw dataset, we only have one map,
+    # so this ends up being just a list of one map, which has a sublist of all the logs in the split.
+    # [{"log_tokens": [...], "category": "semantic_prior", "filename": "maps/map_raster_palo_alto.png", "token": "53992ee3023e5494b90c316c183be829"}]
+    with open(raw / "train_data" / "map.json") as f:
+        maps = json.load(f)
+    assert len(maps) == 1, "Expected only one map in the raw dataset"
+    # Just replace the list of "log_tokens" with the new train and test log IDs
+    new_train_maps = maps.copy()
+    new_train_maps[0]["log_tokens"] = train_log_ids
+    new_test_maps = maps.copy()
+    new_test_maps[0]["log_tokens"] = test_logs_ids
+    with open(public / "train_data" / "map.json", "w") as f:
+        json.dump(new_train_maps, f)
+    with open(public / "test_data" / "map.json", "w") as f:
+        json.dump(new_test_maps, f)
+    """
+    Split scene.json
+    """
+    # scene.json is a list of scenes corresponding exactly to each log file. The scenes describe the first and last
+    # samples in each scene, as well as how many samples are in each scene.
+    # We'll split this following the log split.
+    with open(raw / "train_data" / "scene.json") as f:
+        scenes = json.load(f)
+    logger.info(f"Found {len(scenes)} scenes")
+    new_train_scenes, new_test_scenes = [], []
+    for scene in scenes:
+        log_token = scene["log_token"]
+        if log_token in train_log_ids:
+            new_train_scenes.append(scene)
+        elif log_token in test_logs_ids:
+            new_test_scenes.append(scene)
+        else:
+            raise ValueError(
+                f"Scene log token {log_token} doesn't belong to either new train or new test set"
+            )
+    logger.info(
+        f"New train scenes: {len(new_train_scenes)}, new test scenes: {len(new_test_scenes)}"
+    )
+    assert len(new_train_scenes) + len(new_test_scenes) == len(
+        scenes
+    ), f"New train ({len(new_train_scenes)}) and test ({len(new_test_scenes)}) scenes must cover all scenes ({len(scenes)})"
+    with open(public / "train_data" / "scene.json", "w") as f:
+        json.dump(new_train_scenes, f)
+    with open(public / "test_data" / "scene.json", "w") as f:
+        json.dump(new_test_scenes, f)
+    """
+    Copy sensor.json
+    """
+    # sensor.json is a list of sensors used in the dataset (10 sensors in the raw train set).
+    # For simplicity, we'll just copy the full list of sensors in both the new train and new test sets.
+    shutil.copyfile(
+        src=raw / "train_data" / "sensor.json", dst=public / "train_data" / "sensor.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "sensor.json", dst=public / "test_data" / "sensor.json"
+    )
+    """
+    Copy visibility.json
+    """
+    # visibility.json is a list of 4 visibility classes describing how visible an annotated object is in a given sample.
+    # Both train and test use the same visibility classes, so we'll just copy these to the new train and new test sets.
+    shutil.copyfile(
+        src=raw / "train_data" / "visibility.json", dst=public / "train_data" / "visibility.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "visibility.json", dst=public / "test_data" / "visibility.json"
+    )
+    """
+    Split sample_annotation.json
+    """
+    # sample_annotation.json is the full list of object annotations (bounding boxes) from all samples,
+    # and will be split following the sample split.
+    with open(raw / "train_data" / "sample_annotation.json") as f:
+        sample_annotations = json.load(f)
+    logger.info(f"Found {len(sample_annotations)} train sample annotations")
+    new_train_sample_annotations, new_test_sample_annotations = [], []
+    for sample_annotation in sample_annotations:
+        sample_token = sample_annotation["sample_token"]
+        if sample_token in new_train_df["Id"].values:
+            new_train_sample_annotations.append(sample_annotation)
+        elif sample_token in new_test_df["Id"].values:
+            new_test_sample_annotations.append(sample_annotation)
+        else:
+            raise ValueError(
+                f"Sample annotation token {sample_token} doesn't belong to either new train or new test set"
+            )
+    logger.info(
+        f"New train sample annotations: {len(new_train_sample_annotations)}, new test sample annotations: {len(new_test_sample_annotations)}"
+    )
+    assert len(new_train_sample_annotations) + len(new_test_sample_annotations) == len(
+        sample_annotations
+    ), f"New train ({len(new_train_sample_annotations)}) and test ({len(new_test_sample_annotations)}) sample annotations must cover all sample annotations ({len(sample_annotations)})"
+    with open(public / "train_data" / "sample_annotation.json", "w") as f:
+        json.dump(new_train_sample_annotations, f)
+    # NOTE: don't export (public / "test_data" / "sample_annotation.json") since the test set doesn't provide sample annotations
+    """
+    Split instance.json
+    """
+    # instance.json is a list of object instances (e.g. the same pedestrian appearing in contiguous frames),
+    # and will be split following the sample_annotation.json split
+    with open(raw / "train_data" / "instance.json") as f:
+        instances = json.load(f)
+    logger.info(f"Found {len(instances)} train instances")
+    new_train_instance_ids = set([sa["instance_token"] for sa in new_train_sample_annotations])
+    new_test_instance_ids = set([sa["instance_token"] for sa in new_test_sample_annotations])
+    new_train_instances, new_test_instances = [], []
+    for instance in instances:
+        if instance["token"] in new_train_instance_ids:
+            new_train_instances.append(instance)
+        elif instance["token"] in new_test_instance_ids:
+            new_test_instances.append(instance)
+        else:
+            raise ValueError(
+                f"Instance {instance['token']} doesn't belong to either new train or new test set"
+            )
+    logger.info(
+        f"New train instances: {len(new_train_instances)}, new test instances: {len(new_test_instances)}"
+    )
+    assert len(new_train_instances) + len(new_test_instances) == len(
+        instances
+    ), f"New train ({len(new_train_instances)}) and test ({len(new_test_instances)}) instances must cover all instances ({len(instances)})"
+    with open(public / "train_data" / "instance.json", "w") as f:
+        json.dump(new_train_instances, f)
+    # NOTE: don't export (public / "test_data" / "instance.json") since the test set doesn't provide instance annotations
+    """
+    Copy over the heavy image and lidar data
+    """
+    (public / "test_images").mkdir(parents=True, exist_ok=True)
+    (public / "train_images").mkdir(parents=True, exist_ok=True)
+    (public / "test_lidar").mkdir(parents=True, exist_ok=True)
+    (public / "train_lidar").mkdir(parents=True, exist_ok=True)
+    if DEV_MODE:
+        sample_data = sample_data[:100]  # Just copy a few samples for testing
+    num_train_images, num_test_images = 0, 0
+    num_train_lidar, num_test_lidar = 0, 0
+    for sample_datum in tqdm(sample_data, desc="Copying images and lidar data"):
+        filename = Path(
+            sample_datum["filename"]
+        ).name  # `filename` looks like "images/host-a011_cam2_1233689008717605006.jpeg", but we don't use that parent directory
+        is_test = sample_datum["sample_token"] in new_test_df["Id"].values
+        if sample_datum["fileformat"] == "jpeg":
+            assert filename.endswith("jpeg"), f"Expected .jpeg, but got {filename}"
+            src_file = raw / "train_images" / filename
+            if not src_file.exists():
+                raise FileNotFoundError(f"{src_file} does not exist")
+            # Image
+            if is_test:
+                dst_file = public / "test_images" / filename
+                if dst_file.exists():
+                    logger.warning(f"Copying file to {dst_file}, but file already exists!")
+                else:
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_test_images += 1
+            else:
+                dst_file = public / "train_images" / filename
+                if dst_file.exists():
+                    logger.warning(f"Copying file to {dst_file}, but file already exists!")
+                else:
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_train_images += 1
+        elif sample_datum["fileformat"] == "bin":
+            assert filename.endswith("bin"), f"Expected .bin, but got {filename}"
+            src_file = raw / "train_lidar" / filename
+            if not src_file.exists():
+                raise FileNotFoundError(f"{src_file} does not exist")
+            # Lidar
+            if is_test:
+                dst_file = public / "test_lidar" / filename
+                if dst_file.exists():
+                    logger.warning(f"Copying file to {dst_file}, but file already exists!")
+                else:
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_test_lidar += 1
+            else:
+                dst_file = public / "train_lidar" / filename
+                if dst_file.exists():
+                    logger.warning(f"Copying file to {dst_file}, but file already exists!")
+                else:
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_train_lidar += 1
+        else:
+            raise ValueError(
+                f"Unexpected `fileformat` in sample data: {sample_datum['fileformat']}"
+            )
+    assert num_train_images + num_test_images + num_train_lidar + num_test_lidar == len(
+        set(sample_datum["filename"] for sample_datum in sample_data)
+    ), f"Expected image and lidar samples for new train/test to cover all samples ({len(sample_data)})"
+    assert num_train_images == len(
+        list((public / "train_images").glob("*.jpeg"))
+    ), f"Expected {num_train_images} train images, but got {len(list((public / 'train_images').glob('*.jpeg')))}"
+    assert num_test_images == len(
+        list((public / "test_images").glob("*.jpeg"))
+    ), f"Expected {num_test_images} test images, but got {len(list((public / 'test_images').glob('*.jpeg')))}"
+    assert num_train_lidar == len(
+        list((public / "train_lidar").glob("*.bin"))
+    ), f"Expected {num_train_lidar} train lidar files, but got {len(list((public / 'train_lidar').glob('*.bin')))}"
+    assert num_test_lidar == len(
+        list((public / "test_lidar").glob("*.bin"))
+    ), f"Expected {num_test_lidar} test lidar files, but got {len(list((public / 'test_lidar').glob('*.bin')))}"
+    """
+    ========================================
+    VALIDATION SPLIT - Second split on train
+    ========================================
+    """
+    # Create paths for validation directories
+    public_val = public.parent / "public_val"
+    private_val = private.parent / "private_val"
+    (public_val / "test_data").mkdir(parents=True, exist_ok=True)
+    (public_val / "train_data").mkdir(parents=True, exist_ok=True)
+    private_val.mkdir(parents=True, exist_ok=True)
+    # Second split on train_log_ids
+    train_val_log_ids, test_val_log_ids = train_test_split(train_log_ids, test_size=0.2, random_state=0)
+    logger.info(f"Validation split - Train logs: {len(train_val_log_ids)}, Test logs: {len(test_val_log_ids)}")
+    # Write validation log.json files
+    with open(public_val / "train_data" / "log.json", "w") as f:
+        json.dump([log for log in logs if log["token"] in train_val_log_ids], f)
+    with open(public_val / "test_data" / "log.json", "w") as f:
+        json.dump([log for log in logs if log["token"] in test_val_log_ids], f)
+    # Split train_samples into train_val and test_val
+    train_val_samples = [sample for sample in train_samples if sample["scene_token"] in train_val_log_ids]
+    test_val_samples = [sample for sample in train_samples if sample["scene_token"] in test_val_log_ids]
+    logger.info(f"Validation samples - Train: {len(train_val_samples)}, Test: {len(test_val_samples)}")
+    with open(public_val / "train_data" / "sample.json", "w") as f:
+        json.dump(train_val_samples, f)
+    with open(public_val / "test_data" / "sample.json", "w") as f:
+        json.dump(test_val_samples, f)
+    # Create validation train.csv from new_train_df
+    train_val_df = new_train_df[new_train_df["Id"].isin([sample["token"] for sample in train_val_samples])]
+    test_val_df = new_train_df[new_train_df["Id"].isin([sample["token"] for sample in test_val_samples])]
+    logger.info(f"Validation CSV - Train rows: {len(train_val_df)}, Test rows: {len(test_val_df)}")
+    train_val_df.to_csv(public_val / "train.csv", index=False)
+    # Create validation test.csv with confidence added
+    test_val_df_copy = test_val_df.copy()
+    test_val_df_copy["PredictionString"] = test_val_df_copy["PredictionString"].apply(add_confidence)
+    test_val_df_copy.to_csv(private_val / "test.csv", index=False)
+    # Create validation sample_submission.csv
+    sample_submission_val = test_val_df[["Id"]].copy()
+    sample_submission_val["PredictionString"] = ""
+    sample_submission_val.to_csv(public_val / "sample_submission.csv", index=False)
+    # Split sample_data for validation
+    train_val_sample_data, test_val_sample_data = [], []
+    for sample_datum in new_train_sample_data:
+        sample_token = sample_datum["sample_token"]
+        if sample_token in train_val_df["Id"].values:
+            train_val_sample_data.append(sample_datum)
+        elif sample_token in test_val_df["Id"].values:
+            test_val_sample_data.append(sample_datum)
+    logger.info(f"Validation sample data - Train: {len(train_val_sample_data)}, Test: {len(test_val_sample_data)}")
+    with open(public_val / "train_data" / "sample_data.json", "w") as f:
+        json.dump(train_val_sample_data, f)
+    with open(public_val / "test_data" / "sample_data.json", "w") as f:
+        json.dump(test_val_sample_data, f)
+    # Copy maps for validation
+    (public_val / "test_maps").mkdir(parents=True, exist_ok=True)
+    (public_val / "train_maps").mkdir(parents=True, exist_ok=True)
+    shutil.copyfile(
+        src=raw / "train_maps" / "map_raster_palo_alto.png",
+        dst=public_val / "test_maps" / "map_raster_palo_alto.png",
+    )
+    shutil.copyfile(
+        src=raw / "train_maps" / "map_raster_palo_alto.png",
+        dst=public_val / "train_maps" / "map_raster_palo_alto.png",
+    )
+    # Copy attribute.json for validation
+    shutil.copyfile(
+        src=raw / "train_data" / "attribute.json", dst=public_val / "train_data" / "attribute.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "attribute.json", dst=public_val / "test_data" / "attribute.json"
+    )
+    # Split calibrated_sensor.json for validation
+    train_val_calibrated_sensors, test_val_calibrated_sensors = [], []
+    for sample_datum in train_val_sample_data:
+        calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
+        if calibrated_sensor_token in [cal["token"] for cal in train_val_calibrated_sensors]:
+            continue
+        train_val_calibrated_sensors.append(
+            calibration_by_calibrated_sensor_token[calibrated_sensor_token]
+        )
+    for sample_datum in test_val_sample_data:
+        calibrated_sensor_token = sample_datum["calibrated_sensor_token"]
+        if calibrated_sensor_token in [cal["token"] for cal in test_val_calibrated_sensors]:
+            continue
+        test_val_calibrated_sensors.append(
+            calibration_by_calibrated_sensor_token[calibrated_sensor_token]
+        )
+    with open(public_val / "train_data" / "calibrated_sensor.json", "w") as f:
+        json.dump(train_val_calibrated_sensors, f)
+    with open(public_val / "test_data" / "calibrated_sensor.json", "w") as f:
+        json.dump(test_val_calibrated_sensors, f)
+    # Copy category.json for validation
+    shutil.copyfile(
+        src=raw / "train_data" / "category.json", dst=public_val / "train_data" / "category.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "category.json", dst=public_val / "test_data" / "category.json"
+    )
+    # Split ego_pose.json for validation
+    train_val_ego_poses, test_val_ego_poses = [], []
+    for sample_datum in train_val_sample_data:
+        ego_pose_token = sample_datum["ego_pose_token"]
+        train_val_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
+    for sample_datum in test_val_sample_data:
+        ego_pose_token = sample_datum["ego_pose_token"]
+        test_val_ego_poses.append(ego_pose_by_ego_pose_token[ego_pose_token])
+    with open(public_val / "train_data" / "ego_pose.json", "w") as f:
+        json.dump(train_val_ego_poses, f)
+    with open(public_val / "test_data" / "ego_pose.json", "w") as f:
+        json.dump(test_val_ego_poses, f)
+    # Create map.json for validation
+    train_val_maps = maps.copy()
+    train_val_maps[0]["log_tokens"] = train_val_log_ids
+    test_val_maps = maps.copy()
+    test_val_maps[0]["log_tokens"] = test_val_log_ids
+    with open(public_val / "train_data" / "map.json", "w") as f:
+        json.dump(train_val_maps, f)
+    with open(public_val / "test_data" / "map.json", "w") as f:
+        json.dump(test_val_maps, f)
+    # Split scene.json for validation
+    train_val_scenes, test_val_scenes = [], []
+    for scene in new_train_scenes:
+        log_token = scene["log_token"]
+        if log_token in train_val_log_ids:
+            train_val_scenes.append(scene)
+        elif log_token in test_val_log_ids:
+            test_val_scenes.append(scene)
+    with open(public_val / "train_data" / "scene.json", "w") as f:
+        json.dump(train_val_scenes, f)
+    with open(public_val / "test_data" / "scene.json", "w") as f:
+        json.dump(test_val_scenes, f)
+    # Copy sensor.json for validation
+    shutil.copyfile(
+        src=raw / "train_data" / "sensor.json", dst=public_val / "train_data" / "sensor.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "sensor.json", dst=public_val / "test_data" / "sensor.json"
+    )
+    # Copy visibility.json for validation
+    shutil.copyfile(
+        src=raw / "train_data" / "visibility.json", dst=public_val / "train_data" / "visibility.json"
+    )
+    shutil.copyfile(
+        src=raw / "train_data" / "visibility.json", dst=public_val / "test_data" / "visibility.json"
+    )
+    # Split sample_annotation.json for validation
+    train_val_sample_annotations, test_val_sample_annotations = [], []
+    for sample_annotation in new_train_sample_annotations:
+        sample_token = sample_annotation["sample_token"]
+        if sample_token in train_val_df["Id"].values:
+            train_val_sample_annotations.append(sample_annotation)
+        elif sample_token in test_val_df["Id"].values:
+            test_val_sample_annotations.append(sample_annotation)
+    with open(public_val / "train_data" / "sample_annotation.json", "w") as f:
+        json.dump(train_val_sample_annotations, f)
+    # Split instance.json for validation
+    train_val_instance_ids = set([sa["instance_token"] for sa in train_val_sample_annotations])
+    test_val_instance_ids = set([sa["instance_token"] for sa in test_val_sample_annotations])
+    train_val_instances, test_val_instances = [], []
+    for instance in new_train_instances:
+        if instance["token"] in train_val_instance_ids:
+            train_val_instances.append(instance)
+        elif instance["token"] in test_val_instance_ids:
+            test_val_instances.append(instance)
+    with open(public_val / "train_data" / "instance.json", "w") as f:
+        json.dump(train_val_instances, f)
+    # Copy images and lidar data for validation
+    (public_val / "test_images").mkdir(parents=True, exist_ok=True)
+    (public_val / "train_images").mkdir(parents=True, exist_ok=True)
+    (public_val / "test_lidar").mkdir(parents=True, exist_ok=True)
+    (public_val / "train_lidar").mkdir(parents=True, exist_ok=True)
+    num_train_val_images, num_test_val_images = 0, 0
+    num_train_val_lidar, num_test_val_lidar = 0, 0
+    for sample_datum in tqdm(new_train_sample_data, desc="Copying validation images and lidar data"):
+        filename = Path(sample_datum["filename"]).name
+        is_test_val = sample_datum["sample_token"] in test_val_df["Id"].values
+        if sample_datum["fileformat"] == "jpeg":
+            src_file = public / "train_images" / filename
+            if is_test_val:
+                dst_file = public_val / "test_images" / filename
+                if not dst_file.exists():
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_test_val_images += 1
+            else:
+                dst_file = public_val / "train_images" / filename
+                if not dst_file.exists():
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_train_val_images += 1
+        elif sample_datum["fileformat"] == "bin":
+            src_file = public / "train_lidar" / filename
+            if is_test_val:
+                dst_file = public_val / "test_lidar" / filename
+                if not dst_file.exists():
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_test_val_lidar += 1
+            else:
+                dst_file = public_val / "train_lidar" / filename
+                if not dst_file.exists():
+                    shutil.copyfile(src=src_file, dst=dst_file)
+                    num_train_val_lidar += 1
+    logger.info(f"Validation images - Train: {num_train_val_images}, Test: {num_test_val_images}")
+    logger.info(f"Validation lidar - Train: {num_train_val_lidar}, Test: {num_test_val_lidar}")

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl